[实战]爬取iask 爱问问题 导入 MONGODB 数据库-Python 技术分享 Java技术分享 Python 爬虫技术_微信公众号:zeropython—昊天博客

直接上源码:

https://github.com/huahuizi/Iask-crawl

# iask 采集的主文件

# coding:utf-8
import requests, re
from pyquery import PyQuery as pq
from requests.exceptions import RequestException
from fake_useragent import UserAgent
SITE = "http://iask.sina.com.cn"
from bs4 import BeautifulSoup as bs4
from Tools import Tool
import pymongo
from config import *
# Default headers sent with every request; the User-Agent value comes from
# fake_useragent's ``chrome`` attribute.
headers = {"User-Agent": UserAgent().chrome}
def getHtml(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would freeze the whole crawl.

    Returns:
        The response text when the status code is 200, otherwise ``None``
        (non-200 status, timeout, or any other ``RequestException``).
    """
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if res.status_code != 200:
        return None
    return res.text

def get_next_url_one(content):
    """Return the absolute URL behind the last ``.btn-page`` element.

    ``content`` is parsed with the lxml parser. Any failure (unparsable
    input, no ``.btn-page`` element, missing ``href``) is printed and the
    function returns ``None``.
    """
    try:
        soup = bs4(content, "lxml")
        last_button = soup.select('.btn-page')[-1]
        next_url = SITE + last_button['href']
    except Exception as err:
        print(err)
        return None
    return next_url


def get_list_href(url):
    """Crawl a listing page and every page reachable through its next link.

    For each ``.question-title a`` link on a listing page the detail page is
    processed with ``get_detail_page`` and the link text is printed. The
    crawl then moves on to the URL returned by ``get_next_url_one``.

    The pages are walked in a loop rather than by recursion: one stack frame
    per page would eventually hit the recursion limit, and recursing with a
    missing next link never terminated.

    Args:
        url: URL of the first listing page to crawl.
    """
    visited = set()
    while url and url not in visited:
        # Remember visited pages so a pager that links backwards cannot
        # send the crawl round in circles.
        visited.add(url)

        html1 = getHtml(url)
        if not html1:
            # Download failed: there is nothing to parse and no next link.
            return

        doc = pq(html1)
        for a in doc('.question-title a').items():
            href = a.attr.href
            if not href:
                continue
            get_detail_page(SITE + href)
            print(a.text())

        nex = get_next_url_one(html1)
        if not nex:
            return
        print('正在抓取', nex)
        url = nex

def get_detail_page(url):
    """Extract the question and answer from a detail page and store them.

    The first two ``<pre>`` elements of the page are cleaned with
    ``Tool.replace`` and saved through ``save_mongo`` as ``title`` and
    ``answer``. Pages that cannot be downloaded or that contain fewer than
    two ``<pre>`` elements are skipped so one bad page does not stop the
    crawl.

    Args:
        url: Absolute URL of the question detail page.
    """
    html = getHtml(url)
    if not html:
        # getHtml returns None on failure; parsing None would raise.
        print('抓取失败,跳过', url)
        return

    newselect = bs4(html, "lxml").select('pre')
    if len(newselect) < 2:
        print('页面缺少问题或回答,跳过', url)
        return

    cleaner = Tool()
    try:
        title = cleaner.replace(newselect[0].text)
        answer = cleaner.replace(newselect[1].text)
        save_mongo({'title': title, 'answer': answer})
    except Exception as e:
        # Best effort: report the failure and continue with the next page.
        print(e)

def save_mongo(dic):
    """Insert ``dic`` into the MongoDB collection named by the config.

    The database and collection are looked up by subscript so that the
    *values* of ``MONGO_DB`` and ``MONGO_TB`` are used. Attribute access
    (``client.MONGO_DB.MONGO_TB``) would address a database and collection
    literally called "MONGO_DB" and "MONGO_TB".

    Args:
        dic: Document to insert.
    """
    # The context manager closes the connection once the insert is done, so
    # repeated calls do not accumulate open clients.
    with pymongo.MongoClient(host='localhost', port=27017) as client:
        client[MONGO_DB][MONGO_TB].insert_one(dic)
    print("正在保存", dic)

if __name__ == '__main__':
    # Hand the start page itself to get_list_href, which fetches it, saves
    # its questions and then follows the next-page link. Resolving the next
    # link here first meant the start page's own questions were never
    # crawled, and a failed download passed None on to the crawler.
    url = "http://iask.sina.com.cn/c/213-goodAnswer-180-new.html"
    get_list_href(url)

# Tools.py 采集内容处理文件(单独保存为 Tools.py,主文件通过 from Tools import Tool 导入)
#coding:utf-8
import re

#处理页面标签类
class Tool:
    """Strip HTML markup from a scraped fragment and tidy its line breaks.

    Each class attribute is a compiled pattern; ``replace`` applies them in
    a fixed order.

    NOTE(review): the ``replaceLine`` and ``replaceBR`` literals were
    restored from their accompanying comments (line-breaking tags and
    single/double ``<br>``) - confirm against the upstream Tools.py.
    """

    # Hyperlink-ad containers.
    removeADLink = re.compile(r'<div class="link_layer.*?</div>')
    # <img> tags, runs of 1-7 spaces, and &nbsp; entities.
    removeImg = re.compile(r'<img.*?>| {1,7}|&nbsp;')
    # Opening and closing anchor tags (the link text is kept).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that should become a line break.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile(r'<td>')
    # A single or double <br> becomes one line break.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag that is still left.
    removeExtraTag = re.compile(r'<.*?>')
    # Runs of consecutive newlines.
    removeNoneLine = re.compile(r'\n+')

    def replace(self, x):
        """Return ``x`` with markup removed and whitespace normalised.

        Order matters: ads, images and anchors are removed before the
        catch-all tag pattern runs, and structural tags are turned into
        ``\\n`` / ``\\t`` before that pattern would delete them.

        Args:
            x: HTML or text fragment to clean.

        Returns:
            The cleaned text with leading and trailing whitespace stripped.
        """
        x = self.removeADLink.sub("", x)
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        x = self.removeNoneLine.sub("\n", x)
        # strip() drops the leading/trailing whitespace left by removed tags.
        return x.strip()




# config.py 数据库配置文件(单独保存为 config.py,主文件通过 from config import * 导入)
# MongoDB database name.
MONGO_DB = "IASK"
# MongoDB collection (table) name.
MONGO_TB = "AUSWER"

HTTPX 基础教程-新乡seo|网站优化,网站建设_微信公众号:zeropython—昊天博客