1. 程式人生 > Python爬取貼吧帖子內容

Python爬取貼吧帖子內容

# -*- coding: utf-8 -*-
"""
Created on Sun Nov  4 09:58:09 2018

@author: wangf
"""

import re
import urllib
import urllib.error
import urllib.request

import requests
 
 
 
#Helper class that converts post HTML into plain text
class Tool:
    """Strip or translate HTML tags found in a post body.

    The compiled patterns below are applied by ``replace`` in a fixed
    order; tags that carry layout meaning become whitespace first and any
    tag still left at the end is removed.
    """

    #Remove <img> tags and runs of exactly 7 spaces
    removeImg = re.compile('<img.*?>| {7}|')
    #Remove hyperlink tags (opening and closing)
    removeAddr = re.compile('<a.*?>|</a>')
    #Tags that end a line; replaced with a newline
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    #Table cell <td>; replaced with a tab
    replaceTD= re.compile('<td>')
    #Paragraph opening tag; replaced with a newline plus four spaces
    replacePara = re.compile('<p.*?>')
    #Single or doubled <br>; replaced with one newline
    replaceBR = re.compile('<br><br>|<br>')
    #Any tag that survived the steps above; removed
    removeExtraTag = re.compile('<.*?>')

    def replace(self,x):
        """Return ``x`` with HTML tags converted to plain text.

        The substitutions run in the order listed; the result has leading
        and trailing whitespace stripped.
        """
        #Order matters: the catch-all tag removal must come last
        substitutions = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n    "),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        text = x
        for pattern, replacement in substitutions:
            text = pattern.sub(replacement, text)
        #strip() drops the surplus whitespace at both ends
        return text.strip()
 
#Baidu Tieba crawler class
class BDTB:
    """Download the posts of one Baidu Tieba thread into a text file.

    Parameters
    ----------
    baseUrl : str
        URL of the thread, without query string.
    seeLZ : str or int
        Value sent as the ``see_lz`` query parameter.
        NOTE(review): presumably 1 restricts the result to the thread
        starter's posts - confirm against the site.
    floorTag : str or int
        When equal to 1 (as a string or an integer) a separator line
        carrying the running post number is written before every post.
    """

    def __init__(self, baseUrl, seeLZ, floorTag):
        self.baseUrl = baseUrl
        self.seeLZ = '?see_lz='+str(seeLZ)
        # NOTE(review): the Referer points at a different site; presumably
        # left over from another crawler - confirm whether it is needed.
        self.headers =  {
        'User-Agent': r'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT) '
                    r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
         'Referer': r'https://www.qiushibaike.com/',
         'Connection': 'keep-alive'
        }
        self.tool = Tool()
        #Output file object; opened by setFileTitle
        self.file = None
        #Running number of the next post to be written
        self.floor = 1
        #File name used when the thread title cannot be extracted
        self.defaultTitle = u'百度貼吧'
        #Flag: whether to write a separator line before each post
        self.floorTag = floorTag

    #Fetch the HTML of the given page number of the thread
    def getPage(self,pageNum):
        """Return the decoded HTML of page ``pageNum``, or None on failure.

        A ``urllib.error.URLError`` is reported on stdout and turned into
        a ``None`` return value; other exceptions propagate.
        """
        url = self.baseUrl+self.seeLZ+'&pn='+str(pageNum)
        req = urllib.request.Request(url, headers = self.headers)
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(req) as res:
                return res.read().decode()
        except urllib.error.URLError as e:
            print("連線百度貼吧失敗,錯誤原因:",getattr(e, "reason", e))
            return None

    #Extract the thread title
    def getTitle(self, page):
        """Return the thread title found in ``page``, or None.

        ``page`` may be None (failed download); None is returned then.
        """
        if not page:
            return None
        match = re.search('<h3 class="core_title_txt.*?title="(.*?)"', page, re.S)
        return match.group(1) if match else None

    #Extract the number of pages of the thread
    def getPageNum(self,page):
        """Return the page count found in ``page`` as a string, or None.

        ``page`` may be None (failed download); None is returned then.
        """
        if not page:
            return None
        match = re.search('<li class="l_reply_num.*?</span>.*?<span class="red">(.*?)</span>.*?</li>', page, re.S)
        return match.group(1) if match else None

    #Extract the post bodies
    def getContent(self, page):
        """Return the cleaned text of every post body found in ``page``.

        Each entry is wrapped in a leading and a trailing newline. An
        empty list is returned when ``page`` is None or empty.
        """
        if not page:
            return []
        pattern = re.compile('<div id="post_content_.*?>(.*?)</div>',re.S)
        return ["\n"+self.tool.replace(item)+'\n' for item in pattern.findall(page)]

    def _safeFileName(self, title):
        """Return ``title`` with characters unsafe in file names replaced by '_'."""
        return re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip()

    def setFileTitle(self,title):
        """Open ``<title>.txt`` for writing and store it in ``self.file``.

        Falls back to ``self.defaultTitle`` when ``title`` is None or is
        empty after sanitising. The file is opened as UTF-8 so the result
        does not depend on the platform's default encoding.
        """
        name = self._safeFileName(title) if title is not None else ''
        if not name:
            name = self.defaultTitle
        # Do not leak a file left open by an earlier call.
        if self.file is not None and not self.file.closed:
            self.file.close()
        self.file = open(name + '.txt', 'w+', encoding='utf-8')

    def writeData(self, contents):
        """Write every entry of ``contents`` to ``self.file``.

        The post counter ``self.floor`` is advanced once per entry; the
        separator line is written only when ``floorTag`` equals 1.
        """
        # Accept both '1' (as read from input()) and the integer 1.
        writeFloor = str(self.floorTag) == '1'
        for item in contents:
            if writeFloor:
                floorLine = '\n'+str(self.floor) +' 樓-----------------------------------------------------------------------------------------\n'
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Download the whole thread and write it to the output file."""
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        # Validate before opening the output file so that an invalid URL
        # does not leave an empty, unclosed file behind.
        if pageNum is None:
            print ('URL已失效')
            return
        totalPages = int(pageNum)
        title = self.getTitle(indexPage)
        self.setFileTitle(title)
        try:
            print('該帖子共有%s頁'%(str(pageNum)))
            for i in range(1,totalPages+1):
                print('正在寫入第%s頁資料'%(str(i)))
                # Page 1 was already downloaded above; reuse it.
                page = indexPage if i == 1 else self.getPage(i)
                contents = self.getContent(page)
                self.writeData(contents)
        except IOError as e:
            # Python 3 exceptions have no .message attribute; print the
            # exception itself.
            print('寫入異常,原因:',e)
        finally:
            self.file.close()
            print('寫入任務完成')
                  
# Ask for the thread id; the prompt shows the URL prefix it is appended to.
print(u'請輸入帖子代號')
# Surrounding whitespace is stripped so a stray space cannot corrupt the URL.
baseURL = 'http://tieba.baidu.com/p/' + str(input(u'http://tieba.baidu.com/p/')).strip()
# Example: baseURL = 'http://tieba.baidu.com/p/3138733512'
# Both answers are stripped as well: the crawler compares floorTag with '1'.
seeLZ = input("是否只獲取樓主發言,是輸入1,否輸入0\n").strip()
floorTag = input("是否寫入樓層資訊,是輸入1,否輸入0\n").strip()
bdtb = BDTB(baseURL,seeLZ,floorTag)
bdtb.start()