提取整个页面的链接(通用方法)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
from html.parser import HTMLParser
from urllib import parse


class LinkFinder(HTMLParser):
    """Collect the internal links found in the ``<a href="...">`` tags of a page.

    Feed the page's HTML to :meth:`feed`; every anchor target that belongs to
    the same site as ``base_url`` is resolved to an absolute URL and stored in
    ``self.links``.

    Args:
        base_url: Root URL of the site; only links under it are kept.
        page_url: URL of the page being parsed; relative hrefs are resolved
            against it.
    """

    def __init__(self, base_url: str, page_url: str) -> None:
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        """Record the target of every ``<a>`` opening tag seen by ``feed()``."""
        if tag != 'a':
            return
        for attribute, value in attrs:
            if attribute != 'href':
                continue
            # A valueless attribute (``<a href>``) arrives as None; an empty
            # href carries no target either.
            href = value.strip() if value else ''
            if not href:
                continue
            try:
                # Relative links are relative to the page they appear on, not
                # to the site root.
                url = parse.urljoin(self.page_url or self.base_url, href)
                internal = self._is_internal(url)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip this link
                # rather than abort the whole page.
                continue
            if internal:
                self.links.add(url)

    def _is_internal(self, url: str) -> bool:
        """Return True when *url* lives under ``base_url``."""
        base = parse.urlsplit(self.base_url)
        if not base.netloc:
            # base_url given without a scheme (e.g. "www.example.com"): there
            # is no host component to compare, so keep the substring match.
            return self.base_url in url
        target = parse.urlsplit(url)
        # Compare components instead of substrings so an external URL that
        # merely mentions the base (in its query or as a host prefix) is
        # rejected.
        return (
            target.scheme == base.scheme
            and target.netloc.lower() == base.netloc.lower()
            and target.path.startswith(base.path.rstrip('/'))
        )

    def page_links(self):
        """Return the set of absolute internal URLs collected so far."""
        return self.links

    def error(self, message):
        """Ignore parser error reports instead of raising."""
        pass


if __name__ == '__main__':
    # Imported here so LinkFinder itself stays importable with only the
    # standard library installed.
    import requests

    page_url = 'https://www.csai.cn/baoxian/'
    finder = LinkFinder("https://www.csai.cn", page_url)
    # A timeout keeps the script from hanging forever on an unresponsive host.
    r = requests.get(page_url, timeout=10)
    # Do not parse an HTTP error page as if it were the real content.
    r.raise_for_status()
    finder.feed(r.text)
    # page_links() is already a set; sort for a stable, readable listing.
    urls = sorted(finder.page_links())
    print(urls)
效果如图


文章不错支持一下吧
谢谢分享,这正是我要找的 ,我的博客,欢迎回访