![[实战]爬取iask 爱问问题 导入 MONGODB 数据库-Python 技术分享 Java技术分享 Python 爬虫技术_微信公众号:zeropython—昊天博客](https://hangzhou01.oss-cn-hangzhou.aliyuncs.com/uploads/2017/08/7803A611-9DBD-440E-A890-DF51050F964F-1024x447.jpg)
直接上源码:
https://github.com/huahuizi/Iask-crawl
# iask crawler — main module: walks the iask.sina.com.cn question listings,
# scrapes each question/answer page and stores the result in MongoDB.
# coding:utf-8
import requests, re
from pyquery import PyQuery as pq
from requests.exceptions import RequestException
from fake_useragent import UserAgent
from bs4 import BeautifulSoup as bs4
import pymongo

from Tools import Tool
from config import *

SITE = "http://iask.sina.com.cn"

# Random desktop-Chrome User-Agent to avoid trivial bot blocking.
headers = {
    'User-Agent': UserAgent().chrome,
}

# One shared client for the whole run.  The original opened a brand-new
# MongoClient for every saved document, leaking connections.
client = pymongo.MongoClient(host='localhost', port=27017)


def getHtml(url):
    """Fetch *url* and return its HTML text, or None on any failure."""
    try:
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def get_next_url_one(content):
    """Return the absolute URL of the last pagination link in a listing
    page, or None when there is no further page (or the markup changed)."""
    try:
        return SITE + bs4(content, "lxml").select('.btn-page')[-1]['href']
    except Exception as e:
        # Best-effort: a missing pagination bar simply ends the crawl.
        print(e)
        return None


def get_list_href(url):
    """Walk the listing pages starting at *url*, scraping every question.

    Rewritten as a loop: the original recursed once per page, which would
    hit Python's recursion limit on a long crawl and crashed with pq(None)
    once the final page was reached.
    """
    while url:
        html1 = getHtml(url)
        if html1 is None:  # network failure — stop the crawl cleanly
            print('页面获取失败', url)
            return
        doc = pq(html1)
        for a in doc('.question-title a').items():
            page_url = SITE + a.attr.href
            get_detail_page(page_url)
            print(a.text())
        url = get_next_url_one(html1)
        print('正在抓取', url)


def get_detail_page(url):
    """Scrape one question page: the first <pre> holds the title, the
    second holds the answer; save both to MongoDB."""
    html = getHtml(url)
    if html is None:  # fetch failed — the original crashed on bs4(None)
        return
    newselect = bs4(html, "lxml").select('pre')
    try:
        title = Tool().replace(newselect[0].text)
        answer = Tool().replace(newselect[1].text)
        save_mongo({'title': title, 'answer': answer})
    except IndexError as e:
        # Page without the expected two <pre> blocks — skip it.
        print(e)


def save_mongo(dic):
    """Insert one {'title': ..., 'answer': ...} document into MongoDB.

    Bug fix: the original used attribute access (client.MONGO_DB.MONGO_TB),
    which writes to a database literally named "MONGO_DB" — the names from
    config.py were silently ignored.  Item access uses the configured names.
    """
    client[MONGO_DB][MONGO_TB].insert_one(dic)
    print("正在保存", dic)


if __name__ == '__main__':
    url = "http://iask.sina.com.cn/c/213-goodAnswer-180-new.html"
    # Start from the first listing page directly; the original fetched it,
    # jumped straight to page 2 and never scraped page 1's questions.
    get_list_href(url)
# Tools — cleans scraped page text before it is stored.
# coding:utf-8
import re


# HTML-stripping helper for scraped content.
#
# Bug fix: the published patterns had their HTML tags mangled by the blog
# platform (tags were stripped, leaving bare spaces).  Most damagingly,
# replaceBR had become ' | ' — an alternation of two single spaces — which
# turned EVERY space in the text into a newline.  The patterns below restore
# the intended tag forms as raw strings, pre-compiled once at class level.
class Tool:
    # Remove sponsored-link blocks: <div class="link_layer ..."> ... </div>
    removeADLink = re.compile(r'<div class="link_layer.*?</div>')
    # Remove <img> tags and runs of 1-7 indentation spaces; the last
    # alternative was likely '&nbsp;' before the markup was mangled.
    removeImg = re.compile(r'<img.*?>| {1,7}|&nbsp;')
    # Remove hyperlink tags (keeping the link text).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Line-breaking tags become '\n'.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become '\t'.
    replaceTD = re.compile(r'<td>')
    # <br> (single or doubled) becomes '\n'.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any remaining tag is dropped.
    removeExtraTag = re.compile(r'<.*?>')
    # Collapse runs of blank lines to a single newline.
    removeNoneLine = re.compile(r'\n+')

    def replace(self, x):
        """Return *x* with HTML markup stripped and whitespace normalised."""
        x = self.removeADLink.sub("", x)
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        x = self.removeNoneLine.sub("\n", x)
        # strip() drops leading/trailing whitespace left by the substitutions.
        return x.strip()
#config 数据库 配置文件 MONGO_DB="IASK" MONGO_TB="AUSWER" |
