Python爬蟲——爬取豆瓣top250完整程式碼

阿新 • • 發佈：2020-11-14

# -*- coding: utf-8 -*-

# 解析網頁
from bs4 import BeautifulSoup as bf
# 正則表示式
import re
# Excel表格操作
import xlwt
# 獲取URL得到html檔案
import urllib.request as req

# Module-level compiled patterns, each applied to the HTML text of one movie
# entry. (For a stricter design these fields could be wrapped in a class,
# e.g. `class Movie`.)
# Movie title
find_name = re.compile(r'<span class="title">(.*?)</span>')
# Link taken from an <a href="..."> tag
find_link = re.compile(r'<a href="(.*?)">')
# Cover image URL; re.S lets "." also match newline characters
find_imgSrc = re.compile(r'<img.*src="(.*?)"', re.S)
# Rating score
find_score = re.compile(r'<span class="rating_num".*>(.*?)</span>')
# Number of raters; the captured group keeps the trailing "人".
# NOTE(review): the saved pages presumably use the Simplified spelling
# "评价"; both spellings are accepted so either form of the page matches.
find_num = re.compile(r'<span>(\d*人)(?:評價|评价)</span>')
# One-line quote. The literal used to be broken across two lines, which left
# an unterminated string and made the whole module fail to parse.
find_inq = re.compile(r'<span class="inq">(.*?)</span>')


# Some movies lack certain fields, so a missing match yields an empty string.
def set_film(file, content):
    """Return the first match of ``file`` in ``content`` as a string.

    Args:
        file: A compiled pattern or a pattern string (anything accepted by
            ``re.findall``).
        content: The text to search.

    Returns:
        ``str()`` of the first ``re.findall`` result, or ``""`` when the
        pattern does not match at all.
    """
    # Scan once and reuse the result instead of running findall twice.
    matches = re.findall(file, content)
    return str(matches[0]) if matches else ""


# Save fetched HTML locally so pages need not be requested again (the author
# notes this avoids IP-related problems).
def write_html(path, html):
    """Write ``str(html)`` to ``path`` as UTF-8, replacing any existing file.

    Args:
        path: Destination file path.
        html: Content to store; converted with ``str()`` before writing.
    """
    # The context manager closes the file even if the write raises.
    with open(path, 'w', encoding='utf-8') as file:
        file.write(str(html))


# Parse every locally saved HTML page and collect the extracted fields.
def get_data():
    """Parse the ten saved pages and return one record per movie entry.

    Reads ``Data/html/html0.html`` through ``Data/html/html9.html`` and, for
    every ``<div class="item">`` element, extracts six fields with the
    module-level ``find_*`` patterns via ``set_film``.

    Returns:
        A list of ``[name, score, num, link, img_src, inq]`` lists of
        strings; a field that is not found is ``""``.
    """
    data_list = []
    # Offsets 0, 25, ..., 225 mirror the ?start= values of the paged URLs;
    # i // 25 is the page index used in the saved file names.
    for i in range(0, 250, 25):
        # Binary mode is kept from the original (the author reports repeated
        # errors otherwise). The context manager closes the file even when
        # parsing raises.
        with open('Data/html/html' + str(i//25) + '.html', 'rb') as html:
            bs = bf(html, 'html.parser')

            # class is a Python keyword, hence the class_ argument.
            f_list = bs.find_all('div', class_="item")

        for f in f_list:
            # Convert the tag to text once and reuse it for all six patterns.
            item_html = str(f)
            data_list.append([
                set_film(find_name, item_html),
                set_film(find_score, item_html),
                set_film(find_num, item_html),
                set_film(find_link, item_html),
                set_film(find_imgSrc, item_html),
                set_film(find_inq, item_html),
            ])

    return data_list


# Download each Douban list page and cache it as a local HTML file.
def save_douban_html(base_url):
    """Fetch ten pages and store them under ``Data/html/``.

    Args:
        base_url: URL prefix to which the start offset is appended, e.g.
            ``'https://movie.douban.com/top250?start='``.
    """
    for page in range(10):
        # Page k starts at offset 25 * k (0, 25, ..., 225).
        offset = page * 25
        # Pages are cached locally because, per the author, frequent
        # crawling may be flagged by the site.
        page_html = ask_url(base_url + str(offset))
        # Files are named html0.html, html1.html, ...
        write_html('Data/html/html' + str(page) + '.html', page_html)


# Fetch a URL and return its HTML text.
def ask_url(url):
    """Request ``url`` with a browser-like User-Agent and return the body.

    Args:
        url: The address to fetch.

    Returns:
        The response body decoded as UTF-8, or ``""`` if anything went
        wrong (the error is printed, not raised).
    """
    # Header sent to the server so the request presents itself as a browser.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0"
    }

    html = ""
    try:
        # Wrap the URL in a Request so the custom headers are attached.
        res = req.Request(url, headers=headers)
        # The timeout keeps one slow page from stalling the run; the
        # context manager closes the response in every case.
        with req.urlopen(res, timeout=10) as response:
            html = response.read().decode('utf-8')
    # Deliberately broad: this is a best-effort fetch that reports and moves on.
    except Exception as error:
        # The exception must be converted with str(); concatenating the
        # exception object itself raised TypeError inside this handler.
        print("Ask_url is Error : " + str(error))

    return html


# NOTE: split(), main() and the __main__ guard reflect the author's personal
# coding habits and are not essential to the scraping logic above.
# Build and return a dashed separator string.
def split(num):
    """Return ``num - 1`` copies of ``"------------"`` joined together.

    Args:
        num: Upper bound of ``range(1, num)``; values of 1 or less give ``""``.

    Returns:
        The separator string.
    """
    segment = "------------"
    return "".join(segment for _ in range(1, num))


# Print the full text of a file.
def read_file(file_name):
    """Read ``file_name`` as UTF-8 text and print its entire content.

    Args:
        file_name: Path of the text file to read.
    """
    # The context manager closes the file even if reading or printing raises.
    with open(file_name, 'r', encoding='utf-8') as file:
        print(file.read())


# Save the records to a plain-text file.
def save_data_txt(datas, save_file):
    """Write every field of every record on its own line.

    After each record a separator line (``split(10)``) is written.

    Args:
        datas: Iterable of records, each an iterable of strings.
        save_file: Path of the UTF-8 text file to create or overwrite.
    """
    # The separator is the same for every record, so build it once.
    separator = split(10) + '\n'
    # The context manager closes the file even if a write raises.
    with open(save_file, 'w', encoding='utf-8') as file:
        # A list cannot be written directly, so write it item by item.
        for data in datas:
            for dat in data:
                file.write(dat + '\n')
            file.write(separator)


# Build a cell style for the Excel sheet.
def set_font(bold, size, horz):
    """Create an ``xlwt.XFStyle`` with the requested font and alignment.

    Args:
        bold: Value assigned to the font's ``bold`` attribute.
        size: Value assigned to the font's ``height`` attribute
            (presumably xlwt's 1/20-point units - confirm).
        horz: When truthy, the cell is centered horizontally and vertically.

    Returns:
        The configured ``XFStyle`` object.
    """
    style = xlwt.XFStyle()

    font = style.font
    font.bold = bold
    font.height = size

    alignment = style.alignment
    # NOTE(review): the original comment called this "auto wrap", but the
    # value assigned is False, which presumably leaves wrapping off - confirm.
    alignment.wrap = False
    if horz:
        # 0x02 is horizontal centering and 0x01 is vertical centering,
        # according to the original author's comments.
        alignment.horz = 0x02
        alignment.vert = 0x01

    return style


# Save the records to an Excel (.xls) file.
def save_data_excel(datas, save_path):
    """Write a header row plus one row per record to sheet ``top250``.

    Args:
        datas: Iterable of records; each record is an iterable of cell values.
        save_path: Path of the workbook file to write.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('top250')

    # Widths of the first six columns (the 256*N form suggests N character
    # widths in xlwt units - confirm).
    column_widths = [256*20, 256*6, 256*12, 256*42, 256*72, 256*68]
    for col, width in enumerate(column_widths):
        sheet.col(col).width = width

    # Three styles, built as set_font(bold, size, centered).
    header_style = set_font(True, 240, True)
    centered_style = set_font(False, 220, True)
    default_style = set_font(False, 220, False)

    # Header row: sheet.write(row, column, value, style).
    titles = ['電影名稱', '評分', '評論數', '電影連結', '圖片連結', '電影名言']
    for col, title in enumerate(titles):
        sheet.write(0, col, title, header_style)

    # Data rows start at row 1, directly below the header.
    for row, data in enumerate(datas, start=1):
        for col, item in enumerate(data):
            # The first three columns are centered; the rest use the
            # non-centered style.
            cell_style = centered_style if col <= 2 else default_style
            sheet.write(row, col, item, cell_style)

    workbook.save(save_path)


# Driver that runs the parse-and-save pipeline.
def main():
    """Parse the cached pages, save them to Excel and print a separator."""
    base_url = "https://movie.douban.com/top250?start="

    # Step 1 - download (optional): fetch the pages and cache them locally.
    # One successful run is enough; afterwards everything works offline.
    # save_douban_html(base_url)

    # Step 3 alternative - plain-text output and echo to the console:
    # save_txt_path = 'Data/Text/top250.txt'
    # save_data_txt(datas, save_txt_path)
    # read_file('Data/Text/top250.txt')

    # Steps 2 and 3 - parse the cached HTML files, then write the records
    # to the Excel workbook.
    save_data_excel(get_data(), 'Data/excel/top250.xls')

    # Print the custom separator line.
    print(split(10))


# Script entry point: run main() only when this file is executed directly.
if __name__ == '__main__':
    main()