亚洲免费在线-亚洲免费在线播放-亚洲免费在线观看-亚洲免费在线观看视频-亚洲免费在线看-亚洲免费在线视频

Python爬取讀者并制作成PDF

系統 1798 0

學了下beautifulsoup后,做個網絡爬蟲,爬取讀者雜志并用reportlab制作成pdf..

crawler.py

復制代碼 代碼如下:

#!/usr/bin/env python
#coding=utf-8
"""
??? Author:???????? Anemone
??? Filename:?????? getmain.py
??? Last modified:? 2015-02-19 16:47
??? E-mail:???????? anemone@82flex.com
"""
import urllib2
from bs4 import BeautifulSoup
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
def getEachArticle(url):
#??? response = urllib2.urlopen('http://www.52duzhe.com/2015_01/duzh20150104.html')
??? response = urllib2.urlopen(url)
??? html = response.read()
??? soup = BeautifulSoup(html)#.decode("utf-8").encode("gbk"))
??? #for i in soup.find_all('div'):
??? #??? print i,1
??? title=soup.find("h1").string
??? writer=soup.find(id="pub_date").string.strip()
??? _from=soup.find(id="media_name").string.strip()
??? text=soup.get_text()#.encode("utf-8")
??? main=re.split("BAIDU_CLB.*;",text)
??? result={"title":title,"writer":writer,"from":_from,"context":main[1]}
??? return result
??? #new=open("new.txt","w")
??? #new.write(result["title"]+"\n\n")
??? #new.write(result["writer"]+"? "+result["from"])
??? #new.write(result["context"])
??? #new.close()
def getCatalog(issue):
??? url=" http://www.52duzhe.com/"+issue[:4]+"_"+issue[-2:]+"/"
??? firstUrl=url+"duzh"+issue+"01.html"
??? firstUrl=url+"index.html"
??? duzhe=dict()
??? response = urllib2.urlopen(firstUrl)
??? html = response.read()
??? soup=BeautifulSoup(html)
??? firstUrl=url+soup.table.a.get("href")
??? response = urllib2.urlopen(firstUrl)
??? html = response.read()
??? soup = BeautifulSoup(html)
??? all=soup.find_all("h2")
??? for i in all:
??????? print i.string
??????? duzhe[i.string]=list()
??????? for link in i.parent.find_all("a"):
??????????? href=url+link.get("href")
??????????? print href
??????????? while 1:
??????????????? try:
??????????????????? article=getEachArticle(href)
??????????????????? break
??????????????? except:
??????????????????? continue
??????????? duzhe[i.string].append(article)
??? return duzhe
def readDuZhe(duzhe):
??? for eachColumn in duzhe:
??????? for eachArticle in duzhe[eachColumn]:
??????????? print eachArticle["title"]
if __name__ == '__main__':
#??? issue=raw_input("issue(201501):")
??? readDuZhe(getCatalog("201424"))

getpdf.py

復制代碼 代碼如下:

#!/usr/bin/env python
#coding=utf-8
"""
??? Author:???????? Anemone
??? Filename:?????? writetopdf.py
??? Last modified:? 2015-02-20 19:19
??? E-mail:???????? anemone@82flex.com
"""
#coding=utf-8
import reportlab.rl_config
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib import fonts
import copy
from reportlab.platypus import Paragraph, SimpleDocTemplate,flowables
from reportlab.lib.styles import getSampleStyleSheet
import crawler
def writePDF(issue,duzhe):
??? reportlab.rl_config.warnOnMissingFontGlyphs = 0
??? pdfmetrics.registerFont(TTFont('song',"simsun.ttc"))
??? pdfmetrics.registerFont(TTFont('hei',"msyh.ttc"))
??? fonts.addMapping('song', 0, 0, 'song')
??? fonts.addMapping('song', 0, 1, 'song')
??? fonts.addMapping('song', 1, 0, 'hei')
??? fonts.addMapping('song', 1, 1, 'hei')
??? stylesheet=getSampleStyleSheet()
??? normalStyle = copy.deepcopy(stylesheet['Normal'])
??? normalStyle.fontName ='song'
??? normalStyle.fontSize = 11
??? normalStyle.leading = 11
??? normalStyle.firstLineIndent = 20
??? titleStyle = copy.deepcopy(stylesheet['Normal'])
??? titleStyle.fontName ='song'
??? titleStyle.fontSize = 15
??? titleStyle.leading = 20
??? firstTitleStyle = copy.deepcopy(stylesheet['Normal'])
??? firstTitleStyle.fontName ='song'
??? firstTitleStyle.fontSize = 20
??? firstTitleStyle.leading = 20
??? firstTitleStyle.firstLineIndent = 50
??? smallStyle = copy.deepcopy(stylesheet['Normal'])
??? smallStyle.fontName ='song'
??? smallStyle.fontSize = 8
??? smallStyle.leading = 8
??? story = []
??? story.append(Paragraph(" 讀者{0}期 ".format(issue), firstTitleStyle))
??? for eachColumn in duzhe:
??????? story.append(Paragraph('__'*28, titleStyle))
??????? story.append(Paragraph(' {0} '.format(eachColumn), titleStyle))
??????? for eachArticle in duzhe[eachColumn]:
??????????? story.append(Paragraph(eachArticle["title"],normalStyle))
??? story.append(flowables.PageBreak())
??? for eachColumn in duzhe:
??????? for eachArticle in duzhe[eachColumn]:
??????????? story.append(Paragraph(" {0} ".format(eachArticle["title"]),titleStyle))
??????????? story.append(Paragraph(" {0}? {1}".format(eachArticle["writer"],eachArticle["from"]),smallStyle))
??????????? para=eachArticle["context"].split("  ")
??????????? for eachPara in para:
??????????????? story.append(Paragraph(eachPara,normalStyle))
??????????? story.append(flowables.PageBreak())
??? #story.append(Paragraph("context",normalStyle))
??? doc = SimpleDocTemplate("duzhe"+issue+".pdf")
??? print "Writing PDF..."
??? doc.build(story)
def main(issue):
??? duzhe=crawler.getCatalog(issue)
??? writePDF(issue,duzhe)
if __name__ == '__main__':
??? issue=raw_input("Enter issue(201501):")
??? main(issue)

以上就是本文的全部內容了,希望大家能夠喜歡。


更多文章、技術交流、商務合作、聯系博主

微信掃碼或搜索:z360901061

微信掃一掃加我為好友

QQ號聯系: 360901061

您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧,狠狠點擊下面給點支持吧,站長非常感激您!手機微信長按不能支付解決辦法:請將微信支付二維碼保存到相冊,切換到微信,然后點擊微信右上角掃一掃功能,選擇支付二維碼完成支付。

【本文對您有幫助就好】

您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描上面二維碼支持博主2元、5元、10元、自定義金額等您想捐的金額吧,站長會非常 感謝您的哦?。。?/p>

發表我的評論
最新評論 總共0條評論
主站蜘蛛池模板: 日韩精品成人在线 | 国产精品原创视频 | 久久精品国产曰本波多野结衣 | 国产高清免费午夜在线视频 | 不卡中文一二三区 | 亚欧成人毛片一区二区三区四区 | 成年性视频bbixx | 成年女人18级毛片毛片免费观看 | 国产在线精品成人一区二区三区 | 国产日韩欧美一区二区 | 免费澳门一级毛片 | 国产精品久久久久久久久免费观看 | 亚洲欧美久久精品 | 色婷婷综合久久久久中文 | 分分操这里只有精品 | 日本欧美一二三区色视频 | 午夜网站在线观看免费网址免费 | 午夜国产在线 | 国产1区2区3区在线观看 | 亚洲在线网站 | 久久精品国产亚洲a不卡 | 欧美香蕉视频 | 久久久久国产精品免费 | 九九草在线观看 | 国产精品久久久久久久久久一区 | 日韩中文在线观看 | 日本中文字幕有码 | 欧美成人h版影片在线观看 欧美成人h精品网站 | 国产亚洲女人久久久久久 | 国产成人精品日本亚洲语音1 | 亚洲精品国产成人中文 | 97xxxx| 久久亚洲精品永久网站 | 香蕉久久ac一区二区三区 | 欧美aⅴ | 亚洲乱码视频 | 影音先锋在线亚洲精品推荐 | 久久www视频 | 亚洲成a人v在线观看 | 奇米777影视 | 四虎影院在线免费播放 |