
Scraping Jinjiang Literature City novel comments with Python

Use Python to scrape comment data from Jinjiang Literature City (jjwxc.net) novels: comment ID, comment floor, commenter username, post time, chapter ID, and comment content.

I saw an author who wanted to back up their comments, so I had a look at the Jinjiang pages. There are no real anti-scraping measures and the comment content is written straight into the HTML, so plain regular expressions are enough.
Some chapters have a very large number of comments, so each CSV file stores the comments of a single chapter (which may not be of much practical value).
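As a quick sanity check (a minimal sketch, using the example novelid from the script below), you can fetch a single comment page and confirm that the comment markup really is embedded in the returned HTML:

import requests

# Comment pages are plain GET requests; novelid, chapterid and page are query parameters
url = "http://www.jjwxc.net/comment.php?novelid=2697774&chapterid=1&page=1"
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
resp.encoding = "gb18030"  # the site serves GB-encoded pages
print("data-commentid" in resp.text)  # True when comment markup is present in the raw HTML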

"""
根據晉江小說的novelid和需要爬取的起始和終止章節,儲存這些章節的評論資訊
評論資訊包括:評論id、評論樓層、評論使用者名稱、發表時間、章節id、評論內容

訪問晉江評論庫不需要購買章節,可以隨機找一本
以該連結為例, http://www.jjwxc.net/onebook.php?novelid=2697774
novelid = 2697774
獲取 1-88章評論

"""

import requests
import re
import csv


# http://www.jjwxc.net/onebook.php?novelid=2697774
novelid = int(input("Enter novelid: "))  # 2697774

chapter_start = int(input("Enter start chapter: "))  # 1
chapter_end = int(input("Enter end chapter: "))  # 88

headers_dict = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}


# Fetch an HTML page; jjwxc serves GB-encoded pages, so decode with gb18030
def getHTMLpage(url):
    page = requests.get(url, headers=headers_dict)
    page.encoding = "gb18030"
    page.close()
    return page


# Get the number of comment pages for a chapter
# (the pattern matches the site's simplified-Chinese comment summary; raises AttributeError if it is absent)
def get_summary(page):
    pattern1 = re.compile(r"共有<span class='redtext'>(?P<comment_count>\d+)</span>条评论,"
                          r"分<span class='redtext'>(?P<page_count>\d+)</span>页", re.S)
    result = pattern1.search(page.text)
    return int(result.group("page_count"))


# Extract the comment fields from one comment page
# (the pattern matches the simplified-Chinese markup of the comment list)
def get_re_result(page):
    pattern2 = re.compile(r'data-commentid="(?P<comment_id>.*?)"'
                          r'.*?<span class="coltext">.*?№(?P<comment_floor>\d+).*?网友'
                          r'.*?target="_blank">(?P<user_name>.*?)</a></span>'
                          r'.*?发表时间:(?P<comment_time>.*?)&nbsp'
                          r'.*?所评章节:.*?data-chapterid="(?P<chapter_id>.*?)">'
                          r'.*?mormalcomment_.*?>(?P<content>.*?)</span>', re.S)
    result = pattern2.finditer(page.text)
    return result


chapter_id = chapter_start
while chapter_id <= chapter_end:
    page_id = 1
    url_chap = f"https://www.jjwxc.net/comment.php?novelid={novelid}&chapterid={chapter_id}&page={page_id}"
    page1 = getHTMLpage(url_chap)

    # Skip locked chapters: get_summary() raises AttributeError when the page has no comment summary
    try:
        page_count = get_summary(page1)
    except AttributeError:
        chapter_id += 1
        continue

    with open(f'{novelid}_chapter{chapter_id:03}_comments.csv', 'w', encoding="utf-8", newline='') as f:
        fieldnames = ["comment_id", "comment_floor", "user_name", "comment_time", "chapter_id", "content"]
        csv_writer = csv.DictWriter(f, fieldnames=fieldnames)
        csv_writer.writeheader()

        while page_id <= page_count:
            page_url = f"https://www.jjwxc.net/comment.php?novelid={novelid}&chapterid={chapter_id}&page={page_id}"
            page2 = getHTMLpage(page_url)
            comments = get_re_result(page2)
            for comment in comments:
                dic = comment.groupdict()
                csv_writer.writerow(dic)
            page_id += 1
        print(f'chapter {chapter_id:03} has been saved.')

    chapter_id += 1
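After a run, each chapter ends up in its own CSV file named like 2697774_chapter001_comments.csv. A minimal sketch for reading one of them back, assuming the example novelid and chapter 1 from above:

import csv

# Read a per-chapter CSV produced by the script and print a few fields of each comment
with open("2697774_chapter001_comments.csv", encoding="utf-8", newline="") as f:
    for row in csv.DictReader(f):
        print(row["comment_floor"], row["user_name"], row["comment_time"])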