import urllib.request as request
from bs4 import BeautifulSoup as bs
import asyncio
import aiohttp
import re


async def getPage(url, res_list):
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            res_list.append(await resp.text())


async def getTitle(url, res_list):
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            html = await resp.text()
            title = re.search("<title>(.*?)</title>", html, re.S).group(0)
            print(title)
            # with open('title.txt', 'a+') as f:
            #     print(title, url)
            #     f.write(title + "," + url + "\n")
            # print(type(await resp.text()))
            # res_list.append(await resp.text())


class parseListPage():
    def __init__(self, page_str):
        self.page_str = page_str

    def __enter__(self):
        page_str = self.page_str
        page = bs(page_str, 'lxml')
        # collect the article links on one list page
        articles = page.select('.txtList30 li')
        art_urls = []
        for a in articles:
            x = a.find('a')['href']
            art_urls.append(x)
        return art_urls

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass


page_num = 100
page_url_base = 'http://news.artron.net/morenews/list728/p'
page_urls = [page_url_base + str(i + 1) for i in range(page_num)]

loop = asyncio.get_event_loop()

# Stage 1: download all list pages concurrently
ret_list = []
tasks = [getPage(host, ret_list) for host in page_urls]
print(tasks)
loop.run_until_complete(asyncio.wait(tasks))

# Stage 2: parse the article URLs out of each list page
articles_url = []
for ret in ret_list:
    with parseListPage(ret) as tmp:
        articles_url += tmp

# Stage 3: fetch every article and extract its <title>
ret_list = []
tasks = [getTitle(url, ret_list) for url in articles_url]
loop.run_until_complete(asyncio.wait(tasks))
loop.close()


# Example 0
import asyncio
import aiohttp
import time

NUMBERS = range(12)
'''
1. Adding the async keyword to a function turns it into an asynchronous (coroutine) function.
Each thread has one event loop; the main thread creates it the first time asyncio.get_event_loop() is called.
Hand your asynchronous tasks to the loop's run_until_complete method and the event loop will schedule the coroutines.
As the name suggests, the method only returns once those tasks have completed.
await asyncio.wait(blocking_tasks) runs the tasks cooperatively until they are all done.
'''
URL = 'http://httpbin.org/get?a={}'


async def fetch_async(a):
    async with aiohttp.ClientSession() as session:
        async with session.get(URL.format(a)) as r:
            # await marks a point where the coroutine may be switched out: r.json() waits on I/O
            # (the network request), so the loop can run other work and resume here later.
            data = await r.json()
    return data['args']['a']


start = time.time()
event_loop = asyncio.get_event_loop()  # creates the event loop
tasks = [fetch_async(num) for num in NUMBERS]
results = event_loop.run_until_complete(asyncio.gather(*tasks))
for num, result in zip(NUMBERS, results):
    print('fetch({}) = {}'.format(num, result))
print('Use asyncio aiohttp : {}'.format(time.time() - start))
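On newer Python (3.7+) the same fetch pattern is usually written with asyncio.run() instead of creating and closing the loop by hand, and a Semaphore is commonly added so that the 100 list pages are not all requested at once. The sketch below is not from the original article: fetch, fetch_all, the limit of 10, and the use of raise_for_status() are illustrative assumptions, not the author's code.

# A minimal sketch, assuming Python 3.7+ and aiohttp 3.x.
import asyncio
import aiohttp


async def fetch(session, sem, url):
    # The semaphore caps how many requests are in flight at the same time.
    async with sem:
        async with session.get(url) as resp:
            resp.raise_for_status()
            return await resp.text()


async def fetch_all(urls, limit=10):
    # One shared session for all requests; gather returns results in the order of urls.
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, sem, u) for u in urls))


# usage: pages = asyncio.run(fetch_all(page_urls))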
Reference: http://blog.csdn.net/u014595019/article/details/52295642
