Python爬取貼吧帖子內容
阿新 • • 發佈:2018-12-19
# -*- coding: utf-8 -*-
"""Download the text of a Baidu Tieba thread into a local ``.txt`` file.

Created on Sun Nov  4 09:58:09 2018

@author: wangf
"""

import re
import urllib.error
import urllib.request

# NOTE: ``urllib.request`` / ``urllib.error`` are imported explicitly because a
# bare ``import urllib`` does not load those submodules.  The previously
# imported (and never used) third-party ``requests`` package is not needed.

# Characters that are path separators or are rejected in file names on common
# file systems; thread titles come from the web page, so they are not trusted.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


class Tool:
    """Turn the HTML of a single post into plain text."""

    # <img> tags and runs of seven spaces are dropped.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Hyperlink tags are dropped; the link text is kept.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that end a line become "\n".
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # A table cell opener becomes a tab.
    replaceTD = re.compile(r'<td>')
    # A paragraph opener becomes a newline followed by an indent.
    replacePara = re.compile(r'<p.*?>')
    # One or two consecutive <br> become a single "\n".
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag still left at the end is removed.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with its HTML markup converted to plain text.

        The substitutions run in a fixed order: specific tags are rewritten
        first and the catch-all tag removal runs last, so it only sees the
        tags nothing else handled.  Leading and trailing whitespace is
        stripped from the result.
        """
        substitutions = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n "),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, replacement in substitutions:
            x = pattern.sub(replacement, x)
        return x.strip()


class BDTB:
    """Crawler that saves the posts of one Baidu Tieba thread to a text file."""

    def __init__(self, baseUrl, seeLZ, floorTag):
        """Store the crawl settings; no network or file access happens here.

        baseUrl  -- thread URL without a query string.
        seeLZ    -- value sent as the ``see_lz`` query parameter (the script
                    prompts for 1 = thread starter only, 0 = everyone).
        floorTag -- 1 / '1' to write a separator line before every post.
        """
        self.baseUrl = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.headers = {
            'User-Agent': r'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT) '
                          r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
            'Referer': r'https://www.qiushibaike.com/',
            'Connection': 'keep-alive'
        }
        self.tool = Tool()
        self.file = None
        # Number written in the next separator line; counts posts across pages.
        self.floor = 1
        # File name (without extension) used when the thread has no title.
        self.defaultTitle = u'百度貼吧'
        # Whether a separator line is written before every post.
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch page ``pageNum`` of the thread and return its HTML as ``str``.

        Returns ``None`` when the request fails with ``URLError``; the reason
        is printed when the exception carries one.
        """
        url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
        try:
            req = urllib.request.Request(url, headers=self.headers)
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(req) as res:
                return res.read().decode()
        except urllib.error.URLError as e:
            if hasattr(e, "reason"):
                print("連線百度貼吧失敗,錯誤原因:", e.reason)
            return None

    def getTitle(self, page):
        """Return the thread title found in ``page``, or ``None``.

        ``None`` is also returned when ``page`` itself is ``None`` or empty
        (for example after a failed download).
        """
        if not page:
            return None
        pattern = re.compile('<h3 class="core_title_txt.*?title="(.*?)"', re.S)
        result = re.findall(pattern, page)
        return result[0] if result else None

    def getPageNum(self, page):
        """Return the thread's page count as a string, or ``None``.

        The value is the text of the second ``<span class="red">`` inside the
        ``l_reply_num`` list item.  ``None`` is returned when the element is
        missing or when ``page`` is ``None`` or empty.
        """
        if not page:
            return None
        pattern = re.compile(
            '<li class="l_reply_num.*?</span>.*?<span class="red">(.*?)</span>.*?</li>',
            re.S)
        result = re.findall(pattern, page)
        return result[0] if result else None

    def getContent(self, page):
        """Return the plain-text body of every post in ``page``.

        Each entry is wrapped in a leading and a trailing newline.  An empty
        list is returned when ``page`` is ``None`` or contains no posts.
        """
        if not page:
            return []
        pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
        return ["\n" + self.tool.replace(item) + '\n'
                for item in re.findall(pattern, page)]

    def setFileTitle(self, title):
        """Open ``<title>.txt`` for writing and keep the handle in ``self.file``.

        Characters that cannot safely appear in a file name are replaced by
        ``_``.  ``self.defaultTitle`` is used when ``title`` is ``None`` or
        empty.  The file is written as UTF-8 so the result does not depend on
        the platform's default encoding.  The caller closes ``self.file``.
        """
        name = ''
        if title is not None:
            name = _INVALID_FILENAME_CHARS.sub('_', title).strip()
        if not name:
            name = self.defaultTitle
        self.file = open(name + '.txt', 'w+', encoding='utf-8')

    def writeData(self, contents):
        """Append every post in ``contents`` to ``self.file``.

        When ``floorTag`` is 1 (given as ``int`` or ``str``) a numbered
        separator line precedes each post.  ``self.floor`` is advanced once
        per post whether or not the separator is written.
        """
        # str() so that an integer 1 works the same as the '1' read by input().
        writeFloor = str(self.floorTag) == '1'
        for item in contents:
            if writeFloor:
                floorLine = '\n' + str(self.floor) + ' 樓-----------------------------------------------------------------------------------------\n'
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Download the whole thread and write it to the output file.

        Prints a message and returns without creating a file when the first
        page cannot be fetched or has no page count.
        """
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        if pageNum is None:
            print('URL已失效')
            return
        # Open the output file only once the thread is known to be reachable,
        # so a dead URL leaves no empty, unclosed file behind.
        self.setFileTitle(self.getTitle(indexPage))
        try:
            print('該帖子共有%s頁' % (str(pageNum)))
            for i in range(1, int(pageNum) + 1):
                print('正在寫入第%s頁資料' % (str(i)))
                # Page 1 was already downloaded to read the title and count.
                page = indexPage if i == 1 else self.getPage(i)
                self.writeData(self.getContent(page))
        except IOError as e:
            # Python 3 exceptions have no ``.message`` attribute.
            print('寫入異常,原因:', e)
        finally:
            self.file.close()
            print('寫入任務完成')


def main():
    """Prompt for the thread id and options, then run the crawler."""
    print(u'請輸入帖子代號')
    # Example thread id: 3138733512
    baseURL = 'http://tieba.baidu.com/p/' + str(input(u'http://tieba.baidu.com/p/'))
    seeLZ = input("是否只獲取樓主發言,是輸入1,否輸入0\n")
    floorTag = input("是否寫入樓層資訊,是輸入1,否輸入0\n")
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()


if __name__ == '__main__':
    main()