python提取页面内的url列表
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# Scan one page for links that contain the page's own URL and report the HTTP
# status of each such link.
import contextlib
import re
import time

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved urlopen() into urllib.request; alias it so the call
    # sites below read the same on both interpreters.
    import urllib.request as urllib2

from bs4 import BeautifulSoup

# Start time of the script as a whole. scanpage() keeps its own timer, so this
# value is not read anywhere below.
t = time.time()

# Links that should be skipped by scanpage(). Nothing in this script adds
# entries to it, so as written every matching link is checked.
websiteurls = {}


def _collect_links(soup, websiteurl):
    """Return ``{href: 0}`` for each ``<a href>`` whose value contains websiteurl."""
    found = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor.get("href")
        # Plain substring test: only hrefs that embed the scanned URL are
        # kept, so relative links are ignored.
        if websiteurl in href and href not in websiteurls:
            found[href] = 0
    return found


def _fetch_status(link, timeout):
    """Open link once and return its HTTP status code, closing the response."""
    with contextlib.closing(urllib2.urlopen(link, timeout=timeout)) as response:
        return response.getcode()


def scanpage(url, timeout=10):
    """Check every link on a page that contains the page's own URL.

    Downloads ``url``, collects the distinct ``href`` values that contain
    ``url`` as a substring, requests each one exactly once, and prints
    ``<index> <link> <status>`` followed by the seconds that request took.
    Links that cannot be opened print ``connect failed`` and are not counted.
    Finally prints the number of reachable links and the total elapsed time.

    Args:
        url: Address of the page to scan; also the substring a link must
            contain to be checked.
        timeout: Seconds to wait on each HTTP request before treating it as
            failed.

    Returns:
        dict mapping each checked link to its HTTP status code, or to 0 when
        the link could not be opened.

    Raises:
        Whatever ``urlopen`` raises if the page at ``url`` itself cannot be
        fetched.
    """
    websiteurl = url
    started = time.time()

    with contextlib.closing(urllib2.urlopen(websiteurl, timeout=timeout)) as page:
        html = page.read()

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(html, "html.parser")
    Upageurls = _collect_links(soup, websiteurl)

    n = 0
    for link in list(Upageurls):
        request_started = time.time()
        try:
            status = _fetch_status(link, timeout)
        except Exception:
            # Best-effort scan: any failure to open the link (bad URL, HTTP
            # error, timeout, ...) is reported and skipped. Ctrl-C and
            # SystemExit still propagate.
            print("connect failed")
            continue
        elapsed = time.time() - request_started

        Upageurls[link] = status
        print("%s %s %s" % (n, link, status))
        print(elapsed)
        n += 1

    print("total is " + repr(n) + " links")
    print(time.time() - started)
    return Upageurls


# Runs on import as well as on direct execution, as in the original script.
scanpage("http://news.163.com/")
