python實現kindle每天推送部落格2----python實現爬取部落格內容
阿新 • • 發佈:2019-02-09
#!/usr/bin/env python
# coding=utf-8
#
# Copyright 2017 liuxinxing
#
# Crawl the OSChina blog index page, follow every recommended blog link,
# and save each article's plain-text body to ./data/<title>.txt.
# NOTE: this is a Python 2 script (urllib2, unicode/str split).

from bs4 import BeautifulSoup
import urllib2
import datetime
import os
import re
import sys
import time

import PyRSS2Gen

# HACK: legacy Python 2 workaround so implicit str<->unicode conversions
# use UTF-8 instead of ASCII. Remove when porting to Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')


class RssSpider(object):
    """Scrape oschina.net blog posts and store each article as a text file."""

    def __init__(self):
        # RSS feed container; only metadata is populated here. Kept for
        # compatibility with the companion script that builds oschina.xml.
        self.myrss = PyRSS2Gen.RSS2(
            title='OSChina',
            link='http://my.oschina.net',
            description=str(datetime.date.today()),
            pubDate=datetime.datetime.now(),
            lastBuildDate=datetime.datetime.now(),
            items=[],
        )
        self.xmlpath = r'./oschina.xml'
        self.baseurl = "http://www.oschina.net/blog"

    def useragent(self, url):
        """Fetch *url* with browser-like headers and return the raw body.

        The fake User-Agent/Referer avoid the site's trivial bot blocking.
        """
        i_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/36.0.1985.125 Safari/537.36",
            "Referer": 'http://baidu.com/',
        }
        req = urllib2.Request(url, headers=i_headers)
        return urllib2.urlopen(req).read()

    def enterpage(self, url):
        """Open one blog post and return ``(title, plain_text_body)``.

        Returns ``(None, None)`` when the page has no title or no
        ``BlogContent`` div (e.g. an error page), so callers can skip it
        instead of crashing.
        """
        rsp = self.useragent(url)
        soup = BeautifulSoup(rsp, "html.parser")
        # BUG FIX: soup.title (or its .string) is None on malformed/error
        # pages; the original crashed with AttributeError here.
        if soup.title is None or soup.title.string is None:
            return None, None
        ititle = soup.title.string
        div = soup.find('div', {'class': 'BlogContent'})
        # BUG FIX: guard a missing content div instead of calling
        # get_text() on None.
        if div is None:
            return None, None
        return ititle, div.get_text()

    def getcontent(self):
        """Walk the recommended-posts box on the index page and save each post."""
        rsp = self.useragent(self.baseurl)
        soup = BeautifulSoup(rsp, "html.parser")
        ul = soup.find('div', {'id': 'topsOfRecommend'})
        # Page layout changed or the request was blocked: nothing to do.
        if ul is None:
            return
        for div in ul.findAll('div', {'class': 'box-aw'}):
            if div is None:
                continue
            alink = div.find('a')
            if alink is None:
                continue
            link = alink.get('href')
            if not self.isbloglink(link):
                continue
            title, doc = self.enterpage(link)
            # Skip pages enterpage() could not parse.
            if title and doc:
                self.savefile(title, doc)

    def isbloglink(self, link):
        """Return True when *link* looks like a blog-post URL (contains /blog/)."""
        # BUG FIX: <a> tags without an href yield None, which the original
        # passed straight into re.search and crashed.
        if not link:
            return False
        return re.search(r".*/blog/.*", link) is not None

    def savefile(self, title, doc):
        """Write *doc* to ``./data/<title>.txt``, creating the directory if needed."""
        # BUG FIX: the original assumed ./data already existed and raised
        # IOError on a fresh checkout.
        if not os.path.isdir("./data"):
            os.makedirs("./data")
        # BUG FIX: page titles can contain path separators and other
        # characters that are illegal in file names; replace them.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
        # get_text() returns unicode in Python 2; encode explicitly instead
        # of relying on the setdefaultencoding hack.
        if isinstance(doc, unicode):
            doc = doc.encode('utf-8')
        with open("./data/" + safe_title + ".txt", 'wb') as f:
            f.write(doc)


if __name__ == '__main__':
    rssSpider = RssSpider()
    rssSpider.getcontent()
    # rssSpider.enterpage("https://my.oschina.net/diluga/blog/1501203")