使用python爬取晉江文學城小說評論
阿新 • • 發佈:2021-08-21
使用 Python 爬取晉江文學城小說評論資訊：評論 id、評論樓層、評論使用者名稱、發表時間、章節 id、評論內容
之前看到有作者想備份評論，就去看了下晉江頁面，沒有什麼反爬措施，評論內容都直接寫在 HTML 裡了，所以直接用正則表示式提取。
有些章節的評論數過多，所以每個 CSV 檔案只儲存一章的評論（好像也沒什麼實用價值）。
"""Save the reader comments of selected chapters of a Jinjiang (jjwxc.net) novel.

Given a novel id plus a first and last chapter number, the script downloads
the comment pages of every chapter in that range and writes one CSV file per
chapter.  Each row holds: comment id, comment floor, user name, post time,
chapter id and comment content.

Reading the comment pages does not require purchasing the chapters, so any
novel can be used.  Example: for
http://www.jjwxc.net/onebook.php?novelid=2697774 enter novelid = 2697774 and
chapters 1 to 88.
"""
import csv
import re

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# Column order of the generated CSV files; matches the regex group names below.
FIELDNAMES = ["comment_id", "comment_floor", "user_name", "comment_time", "chapter_id", "content"]

headers_dict = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

# NOTE(review): the literals originally appeared only in Traditional Chinese
# with ASCII punctuation, which looks like an artifact of text conversion; the
# pages are presumably served in Simplified Chinese.  Both spellings and both
# punctuation widths are accepted so the patterns work either way -- confirm
# against a live page.
SUMMARY_PATTERN = re.compile(
    r"共有<span class='redtext'>(?P<comment_count>\d+)</span>(?:條評論|条评论)[,，]"
    r"分<span class='redtext'>(?P<page_count>\d+)</span>[頁页]",
    re.S,
)

COMMENT_PATTERN = re.compile(
    r'data-commentid="(?P<comment_id>.*?)"'
    r'.*?<span class="coltext">.*?№(?P<comment_floor>\d+).*?(?:網友|网友)'
    r'.*?target="_blank">(?P<user_name>.*?)</a></span>'
    # The post time ends at the next tag or line break, whichever comes first;
    # trailing whitespace is left out of the captured value.
    r'.*?(?:發表時間|发表时间)[:：](?P<comment_time>[^<\n]*?)\s*[<\n]'
    r'.*?(?:所評章節|所评章节)[:：].*?data-chapterid="(?P<chapter_id>.*?)">'
    r'.*?mormalcomment_.*?>(?P<content>.*?)</span>',
    re.S,
)


def getHTMLpage(url):
    """Download *url* and return the response, decoded as gb18030.

    Raises whatever ``requests`` raises on connection errors or when the
    server does not answer within ``REQUEST_TIMEOUT`` seconds.
    """
    # Imported lazily so the parsing helpers can be imported without the
    # third-party HTTP client being installed.
    import requests

    page = requests.get(url, headers=headers_dict, timeout=REQUEST_TIMEOUT)
    page.encoding = "gb18030"
    # The body has already been downloaded; closing only releases the connection.
    page.close()
    return page


def get_summary(page):
    """Return the number of comment pages announced on a chapter's comment page.

    Raises:
        AttributeError: if the page carries no comment summary (the caller
            treats this as a locked chapter).
    """
    result = SUMMARY_PATTERN.search(page.text)
    if result is None:
        raise AttributeError("comment summary not found on page")
    return int(result.group("page_count"))


def get_re_result(page):
    """Return an iterator of regex matches, one per comment found on *page*.

    Each match exposes the named groups listed in ``FIELDNAMES``.
    """
    return COMMENT_PATTERN.finditer(page.text)


def _comment_url(novelid, chapter_id, page_id):
    """Build the URL of one comment page of one chapter."""
    return f"https://www.jjwxc.net/comment.php?novelid={novelid}&chapterid={chapter_id}&page={page_id}"


def _save_chapter(novelid, chapter_id):
    """Write all comments of one chapter to a CSV file.

    Returns True when the file was written, False when the chapter was skipped
    because its comment page has no summary.
    """
    first_page = getHTMLpage(_comment_url(novelid, chapter_id, 1))
    try:
        page_count = get_summary(first_page)
    except AttributeError:
        return False

    with open(f'{novelid}_chapter{chapter_id:03}_comments.csv', 'w', encoding="utf-8", newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        csv_writer.writeheader()
        page = first_page
        for page_id in range(1, page_count + 1):
            # Page 1 was already downloaded to read the summary; reuse it.
            if page_id > 1:
                page = getHTMLpage(_comment_url(novelid, chapter_id, page_id))
            csv_writer.writerows(match.groupdict() for match in get_re_result(page))
    return True


def main():
    """Prompt for the novel id and chapter range, then save each chapter."""
    # http://www.jjwxc.net/onebook.php?novelid=2697774
    novelid = int(input("請輸入novelid:"))  # e.g. 2697774
    chapter_start = int(input("請輸入起始章節:"))  # e.g. 1
    chapter_end = int(input("請輸入終止章節:"))  # e.g. 88

    for chapter_id in range(chapter_start, chapter_end + 1):
        if _save_chapter(novelid, chapter_id):
            print(f'chapter {chapter_id:03} has been saved.')
        else:
            # Locked chapters have no comment summary; skip them.
            print(f'chapter {chapter_id:03} skipped (no comment summary found).')


if __name__ == "__main__":
    main()