1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
#!/usr/bin/env python #coding=utf-8 import</code> <code class="python plain">urllib2 from</code> <code class="python plain">bs4 </code><code class="python keyword">import</code> <code class="python plain">BeautifulSoup import</code> <code class="python plain">re import</code> <code class="python plain">sys reload</code><code class="python plain">(sys) sys.setdefaultencoding(</code><code class="python string">'utf-8'</code><code class="python plain">) def</code> <code class="python plain">getEachArticle(url): # response = urllib2.urlopen('http://www.52duzhe.com/2015_01/duzh20150104.html') </code><code class="python plain">response </code><code class="python keyword">=</code> <code class="python plain">urllib2.urlopen(url) </code><code class="python plain">html </code><code class="python keyword">=</code> <code class="python plain">response.read() </code><code class="python plain">soup </code><code class="python keyword">=</code> <code class="python plain">BeautifulSoup(html)</code><code class="python comments">#.decode("utf-8").encode("gbk")) </code><code class="python comments">#for i in soup.find_all('div'): </code><code class="python comments"># print i,1 </code><code class="python plain">title</code><code class="python keyword">=</code><code class="python plain">soup.find(</code><code class="python string">"h1"</code><code class="python plain">).string </code><code class="python plain">writer</code><code class="python keyword">=</code><code class="python plain">soup.find(</code><code class="python functions">id</code><code class="python keyword">=</code><code class="python string">"pub_date"</code><code class="python plain">).string.strip() </code><code class="python plain">_from</code><code class="python keyword">=</code><code class="python plain">soup.find(</code><code class="python functions">id</code><code class="python keyword">=</code><code class="python string">"media_name"</code><code class="python plain">).string.strip() </code><code class="python 
plain">text</code><code class="python keyword">=</code><code class="python plain">soup.get_text()</code><code class="python comments">#.encode("utf-8") </code><code class="python plain">main</code><code class="python keyword">=</code><code class="python plain">re.split(</code><code class="python string">"BAIDU_CLB.*;"</code><code class="python plain">,text) </code><code class="python plain">result</code><code class="python keyword">=</code><code class="python plain">{</code><code class="python string">"title"</code><code class="python plain">:title,</code><code class="python string">"writer"</code><code class="python plain">:writer,</code><code class="python string">"from"</code><code class="python plain">:_from,</code><code class="python string">"context"</code><code class="python plain">:main[</code><code class="python value">1</code><code class="python plain">]} </code><code class="python keyword">return</code> <code class="python plain">result </code><code class="python comments">#new=open("new.txt","w") </code><code class="python comments">#new.write(result["title"]+"\n\n") </code><code class="python comments">#new.write(result["writer"]+" "+result["from"]) </code><code class="python comments">#new.write(result["context"]) </code><code class="python comments">#new.close() def</code> <code class="python plain">getCatalog(issue): </code><code class="python plain">url</code><code class="python keyword">=</code><code class="python string">"http://www.52duzhe.com/"</code><code class="python keyword">+</code><code class="python plain">issue[:</code><code class="python value">4</code><code class="python plain">]</code><code class="python keyword">+</code><code class="python string">"_"</code><code class="python keyword">+</code><code class="python plain">issue[</code><code class="python keyword">-</code><code class="python value">2</code><code class="python plain">:]</code><code class="python keyword">+</code><code class="python string">"/" </code><code class="python 
plain">firstUrl</code><code class="python keyword">=</code><code class="python plain">url</code><code class="python keyword">+</code><code class="python string">"duzh"</code><code class="python keyword">+</code><code class="python plain">issue</code><code class="python keyword">+</code><code class="python string">"01.html" </code><code class="python plain">firstUrl</code><code class="python keyword">=</code><code class="python plain">url</code><code class="python keyword">+</code><code class="python string">"index.html" </code><code class="python plain">duzhe</code><code class="python keyword">=</code><code class="python functions">dict</code><code class="python plain">() </code><code class="python plain">response </code><code class="python keyword">=</code> <code class="python plain">urllib2.urlopen(firstUrl) </code><code class="python plain">html </code><code class="python keyword">=</code> <code class="python plain">response.read() </code><code class="python plain">soup</code><code class="python keyword">=</code><code class="python plain">BeautifulSoup(html) </code><code class="python plain">firstUrl</code><code class="python keyword">=</code><code class="python plain">url</code><code class="python keyword">+</code><code class="python plain">soup.table.a.get(</code><code class="python string">"href"</code><code class="python plain">) </code><code class="python plain">response </code><code class="python keyword">=</code> <code class="python plain">urllib2.urlopen(firstUrl) </code><code class="python plain">html </code><code class="python keyword">=</code> <code class="python plain">response.read() </code><code class="python plain">soup </code><code class="python keyword">=</code> <code class="python plain">BeautifulSoup(html) </code><code class="python functions">all</code><code class="python keyword">=</code><code class="python plain">soup.find_all(</code><code class="python string">"h2"</code><code class="python plain">) </code><code class="python 
keyword">for</code> <code class="python plain">i </code><code class="python keyword">in</code> <code class="python functions">all</code><code class="python plain">: </code><code class="python functions">print</code> <code class="python plain">i.string </code><code class="python plain">duzhe[i.string]</code><code class="python keyword">=</code><code class="python functions">list</code><code class="python plain">() </code><code class="python keyword">for</code> <code class="python plain">link </code><code class="python keyword">in</code> <code class="python plain">i.parent.find_all(</code><code class="python string">"a"</code><code class="python plain">): </code><code class="python plain">href</code><code class="python keyword">=</code><code class="python plain">url</code><code class="python keyword">+</code><code class="python plain">link.get(</code><code class="python string">"href"</code><code class="python plain">) </code><code class="python functions">print</code> <code class="python plain">href </code><code class="python keyword">while</code> <code class="python value">1</code><code class="python plain">: </code><code class="python keyword">try</code><code class="python plain">: </code><code class="python plain">article</code><code class="python keyword">=</code><code class="python plain">getEachArticle(href) </code><code class="python keyword">break </code><code class="python keyword">except</code><code class="python plain">: </code><code class="python keyword">continue </code><code class="python plain">duzhe[i.string].append(article) </code><code class="python keyword">return</code> <code class="python plain">duzhe def</code> <code class="python plain">readDuZhe(duzhe): </code><code class="python keyword">for</code> <code class="python plain">eachColumn </code><code class="python keyword">in</code> <code class="python plain">duzhe: </code><code class="python keyword">for</code> <code class="python plain">eachArticle </code><code class="python 
keyword">in</code> <code class="python plain">duzhe[eachColumn]: </code><code class="python functions">print</code> <code class="python plain">eachArticle[</code><code class="python string">"title"</code><code class="python plain">] if</code> <code class="python plain">__name__ </code><code class="python keyword">=</code><code class="python keyword">=</code> <code class="python string">'__main__'</code><code class="python plain">: # issue=raw_input("issue(201501):") </code><code class="python plain">readDuZhe(getCatalog(</code><code class="python string">"201424"</code><code class="python plain">)) 文件重命名为 |
将上面的文件重命名为 crawler.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
#!/usr/bin/env python #coding=utf-8 """ </code><code class="python comments">Author: Anemone </code><code class="python comments">Filename: writetopdf.py </code><code class="python comments">Last modified: 2015-02-20 19:19 </code><code class="python comments">E-mail: anemone@82flex.com """ #coding=utf-8 import</code> <code class="python plain">reportlab.rl_config from</code> <code class="python plain">reportlab.pdfbase </code><code class="python keyword">import</code> <code class="python plain">pdfmetrics from</code> <code class="python plain">reportlab.pdfbase.ttfonts </code><code class="python keyword">import</code> <code class="python plain">TTFont from</code> <code class="python plain">reportlab.lib </code><code class="python keyword">import</code> <code class="python plain">fonts import</code> <code class="python plain">copy from</code> <code class="python plain">reportlab.platypus </code><code class="python keyword">import</code> <code class="python plain">Paragraph, SimpleDocTemplate,flowables from</code> <code class="python plain">reportlab.lib.styles </code><code class="python keyword">import</code> <code class="python plain">getSampleStyleSheet import</code> <code class="python plain">crawler def</code> <code class="python plain">writePDF(issue,duzhe): </code><code class="python plain">reportlab.rl_config.warnOnMissingFontGlyphs </code><code class="python keyword">=</code> <code class="python value">0 </code><code class="python plain">pdfmetrics.registerFont(TTFont(</code><code class="python string">'song'</code><code class="python plain">,</code><code class="python string">"simsun.ttc"</code><code class="python plain">)) </code><code class="python plain">pdfmetrics.registerFont(TTFont(</code><code class="python string">'hei'</code><code class="python plain">,</code><code class="python string">"msyh.ttc"</code><code class="python plain">)) </code><code class="python plain">fonts.addMapping(</code><code class="python string">'song'</code><code 
class="python plain">, </code><code class="python value">0</code><code class="python plain">, </code><code class="python value">0</code><code class="python plain">, </code><code class="python string">'song'</code><code class="python plain">) </code><code class="python plain">fonts.addMapping(</code><code class="python string">'song'</code><code class="python plain">, </code><code class="python value">0</code><code class="python plain">, </code><code class="python value">1</code><code class="python plain">, </code><code class="python string">'song'</code><code class="python plain">) </code><code class="python plain">fonts.addMapping(</code><code class="python string">'song'</code><code class="python plain">, </code><code class="python value">1</code><code class="python plain">, </code><code class="python value">0</code><code class="python plain">, </code><code class="python string">'hei'</code><code class="python plain">) </code><code class="python plain">fonts.addMapping(</code><code class="python string">'song'</code><code class="python plain">, </code><code class="python value">1</code><code class="python plain">, </code><code class="python value">1</code><code class="python plain">, </code><code class="python string">'hei'</code><code class="python plain">) </code><code class="python plain">stylesheet</code><code class="python keyword">=</code><code class="python plain">getSampleStyleSheet() </code><code class="python plain">normalStyle </code><code class="python keyword">=</code> <code class="python plain">copy.deepcopy(stylesheet[</code><code class="python string">'Normal'</code><code class="python plain">]) </code><code class="python plain">normalStyle.fontName </code><code class="python keyword">=</code><code class="python string">'song' </code><code class="python plain">normalStyle.fontSize </code><code class="python keyword">=</code> <code class="python value">11 </code><code class="python plain">normalStyle.leading </code><code class="python 
keyword">=</code> <code class="python value">11 </code><code class="python plain">normalStyle.firstLineIndent </code><code class="python keyword">=</code> <code class="python value">20 </code><code class="python plain">titleStyle </code><code class="python keyword">=</code> <code class="python plain">copy.deepcopy(stylesheet[</code><code class="python string">'Normal'</code><code class="python plain">]) </code><code class="python plain">titleStyle.fontName </code><code class="python keyword">=</code><code class="python string">'song' </code><code class="python plain">titleStyle.fontSize </code><code class="python keyword">=</code> <code class="python value">15 </code><code class="python plain">titleStyle.leading </code><code class="python keyword">=</code> <code class="python value">20 </code><code class="python plain">firstTitleStyle </code><code class="python keyword">=</code> <code class="python plain">copy.deepcopy(stylesheet[</code><code class="python string">'Normal'</code><code class="python plain">]) </code><code class="python plain">firstTitleStyle.fontName </code><code class="python keyword">=</code><code class="python string">'song' </code><code class="python plain">firstTitleStyle.fontSize </code><code class="python keyword">=</code> <code class="python value">20 </code><code class="python plain">firstTitleStyle.leading </code><code class="python keyword">=</code> <code class="python value">20 </code><code class="python plain">firstTitleStyle.firstLineIndent </code><code class="python keyword">=</code> <code class="python value">50 </code><code class="python plain">smallStyle </code><code class="python keyword">=</code> <code class="python plain">copy.deepcopy(stylesheet[</code><code class="python string">'Normal'</code><code class="python plain">]) </code><code class="python plain">smallStyle.fontName </code><code class="python keyword">=</code><code class="python string">'song' </code><code class="python plain">smallStyle.fontSize </code><code 
class="python keyword">=</code> <code class="python value">8 </code><code class="python plain">smallStyle.leading </code><code class="python keyword">=</code> <code class="python value">8 </code><code class="python plain">story </code><code class="python keyword">=</code> <code class="python plain">[] </code><code class="python plain">story.append(Paragraph(</code><code class="python string">"<b>读者{0}期</b>"</code><code class="python plain">.</code><code class="python functions">format</code><code class="python plain">(issue), firstTitleStyle)) </code><code class="python keyword">for</code> <code class="python plain">eachColumn </code><code class="python keyword">in</code> <code class="python plain">duzhe: </code><code class="python plain">story.append(Paragraph(</code><code class="python string">'__'</code><code class="python keyword">*</code><code class="python value">28</code><code class="python plain">, titleStyle)) </code><code class="python plain">story.append(Paragraph(</code><code class="python string">'<b>{0}</b>'</code><code class="python plain">.</code><code class="python functions">format</code><code class="python plain">(eachColumn), titleStyle)) </code><code class="python keyword">for</code> <code class="python plain">eachArticle </code><code class="python keyword">in</code> <code class="python plain">duzhe[eachColumn]: </code><code class="python plain">story.append(Paragraph(eachArticle[</code><code class="python string">"title"</code><code class="python plain">],normalStyle)) </code><code class="python plain">story.append(flowables.PageBreak()) </code><code class="python keyword">for</code> <code class="python plain">eachColumn </code><code class="python keyword">in</code> <code class="python plain">duzhe: </code><code class="python keyword">for</code> <code class="python plain">eachArticle </code><code class="python keyword">in</code> <code class="python plain">duzhe[eachColumn]: </code><code class="python plain">story.append(Paragraph(</code><code 
class="python string">"<b>{0}</b>"</code><code class="python plain">.</code><code class="python functions">format</code><code class="python plain">(eachArticle[</code><code class="python string">"title"</code><code class="python plain">]),titleStyle)) </code><code class="python plain">story.append(Paragraph(</code><code class="python string">" {0} {1}"</code><code class="python plain">.</code><code class="python functions">format</code><code class="python plain">(eachArticle[</code><code class="python string">"writer"</code><code class="python plain">],eachArticle[</code><code class="python string">"from"</code><code class="python plain">]),smallStyle)) </code><code class="python plain">para</code><code class="python keyword">=</code><code class="python plain">eachArticle[</code><code class="python string">"context"</code><code class="python plain">].split(</code><code class="python string">" "</code><code class="python plain">) </code><code class="python keyword">for</code> <code class="python plain">eachPara </code><code class="python keyword">in</code> <code class="python plain">para: </code><code class="python plain">story.append(Paragraph(eachPara,normalStyle)) </code><code class="python plain">story.append(flowables.PageBreak()) </code><code class="python comments">#story.append(Paragraph("context",normalStyle)) </code><code class="python plain">doc </code><code class="python keyword">=</code> <code class="python plain">SimpleDocTemplate(</code><code class="python string">"duzhe"</code><code class="python keyword">+</code><code class="python plain">issue</code><code class="python keyword">+</code><code class="python string">".pdf"</code><code class="python plain">) </code><code class="python functions">print</code> <code class="python string">"Writing PDF..." 
</code><code class="python plain">doc.build(story) def</code> <code class="python plain">main(issue): </code><code class="python plain">duzhe</code><code class="python keyword">=</code><code class="python plain">crawler.getCatalog(issue) </code><code class="python plain">writePDF(issue,duzhe) if</code> <code class="python plain">__name__ </code><code class="python keyword">=</code><code class="python keyword">=</code> <code class="python string">'__main__'</code><code class="python plain">: </code><code class="python plain">issue</code><code class="python keyword">=</code><code class="python functions">raw_input</code><code class="python plain">(</code><code class="python string">"Enter issue(201501):"</code><code class="python plain">) </code><code class="python plain">main(issue) 执行该文件即可 导入最上边的文件 |
