
First, a requests + BeautifulSoup version that pulls the title and the real URL of each Baidu search result:
import requests
from bs4 import BeautifulSoup as bs4  # import the libraries

url = "http://www.baidu.com/s?ie=UTF-8&wd=%E6%98%8A%E5%A4%A9seo"
res = requests.get(url)
soup = bs4(res.text, "lxml")

for z in soup.select('h3[class="t"]'):  # the list of Baidu search results
    link = z.select('a')[0]
    print(link.get_text())  # the result title
    # follow Baidu's redirect to get the real address of the result
    print(requests.get(link['href']).url)
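Baidu sometimes blocks or degrades responses that arrive without a browser User-Agent, so in practice it can help to send one explicitly. This is only a minimal sketch, assuming the default requests User-Agent is being filtered; the User-Agent string itself is just an example:

import requests
from bs4 import BeautifulSoup as bs4

# an example browser User-Agent; any recent one should work
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

url = "http://www.baidu.com/s?ie=UTF-8&wd=%E6%98%8A%E5%A4%A9seo"
res = requests.get(url, headers=headers)
soup = bs4(res.text, "lxml")
print(soup.title.get_text() if soup.title else "no <title> in response")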
The same scrape done with a scrapy Selector and XPath, which also resolves each result's real address with a lightweight HEAD request:
# -*- coding: utf-8 -*-
"""
@Time: 2018/5/16
@Author: songhao
@WeChat official account: zeropython
@File: bdspider.py
"""
from urllib.parse import quote

import requests
from scrapy import Selector

url1 = "http://www.baidu.com/s?wd={}".format(quote("雅昌"))
r = requests.get(url1)

selector = Selector(text=r.text, type="html")

for a in selector.xpath('//h3[contains(@class, "t")]'):
    # join every descendant text node to get the full title
    print("".join(a.xpath('.//text()').extract()).strip())
    urlen = a.xpath('./a/@href').extract_first()
    print(urlen)
    # HEAD requests only the headers; requests.head() does not follow
    # redirects by default, so Location holds the real destination
    print(requests.head(urlen).headers.get('Location'))
To get the text of nested tags, refer to the .//text() XPath above: it matches every descendant text node, so joining the extracted pieces returns the full title even when the h3 wraps an em or other inline tags. A self-contained sketch follows.
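As a minimal illustration (the HTML snippet here is made up for the example), both BeautifulSoup's get_text() and the XPath .//text() collect text from every nested tag:

from bs4 import BeautifulSoup
from scrapy import Selector

html = '<h3 class="t"><a href="#">Baidu <em>search</em> result</a></h3>'

# BeautifulSoup: get_text() walks all descendant tags
soup = BeautifulSoup(html, "lxml")
print(soup.h3.get_text())  # Baidu search result

# scrapy Selector: .//text() returns every descendant text node
sel = Selector(text=html, type="html")
print("".join(sel.xpath('//h3//text()').extract()).strip())  # Baidu search result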
Finally, the /link?url=... addresses that Baidu returns can be decoded back to the real destination in two ways:
# import the libraries
import requests
import urllib.request

bd_url = "https://www.baidu.com/link?url=mhMx_W4kSIqeHdckh0dvrBt4LDIxvTrf1XqoDQKAptW&" \
         "ck=5341.10.0.0.0.203.232.0&shh=www.baidu.com&sht=baiduhome_pg&wd=&" \
         "eqid=cd90b17a00034b1c000000035a645fd5"

# Decoding method 1: requests follows the redirect, so r.url is the real address
r = requests.get(bd_url)
print(r.url)

# Decoding method 2: urlopen also follows the redirect
# (note: urlopen lives in urllib.request on Python 3 but in urllib2 on Python 2.7)
r = urllib.request.urlopen(bd_url)
print(r.geturl())
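Both methods download the target page just to learn its address. A third option, in line with the HEAD trick above, is to read the Location header without fetching the body at all; a minimal sketch, assuming the /link?url=... endpoint answers with a 302 redirect:

import requests

bd_url = ("https://www.baidu.com/link?url=mhMx_W4kSIqeHdckh0dvrBt4LDIxvTrf1XqoDQKAptW&"
          "ck=5341.10.0.0.0.203.232.0&shh=www.baidu.com&sht=baiduhome_pg&wd=&"
          "eqid=cd90b17a00034b1c000000035a645fd5")

# requests.head() does not follow redirects by default, so the
# 302 response's Location header carries the real destination
r = requests.head(bd_url)
print(r.status_code)              # expected: 302
print(r.headers.get("Location"))  # the decoded real URL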

