用python爬取新筆趣閣的所有小說，使用xpath提取

阿新 • • 發佈：2021-11-19

執行後會在執行的目錄下面建立一個名為「筆趣閣」的目錄，小說會以每部一個目錄的方式分類，每部小說的章節都會存放在對應的小說目錄裡面

import time
import requests
from lxml import etree
import os

# Name of the root output directory; every file path in this script is built under it.
novel_name = "筆趣閣"
# Number of lines in the index file; set by get_novel_length() and read by get_data().
novel_length = 0


def main():
    """Run the crawl: build the novel index, count its entries, then download."""
    for step in (getContents, get_novel_length, get_data):
        step()


def get_data():
    """
    Download every chapter of every novel listed in the index file.

    Walks the first ``novel_length`` index entries. For each novel it fetches
    the novel page, extracts the chapter list and hands every chapter page to
    ``store_novel``, pausing one second after each chapter.
    """
    site_root = "https://www.xbiquge.la"

    for position in range(novel_length):
        # Index entry: novel page link first, novel name second.
        entry = get_href_name(index=position)
        novel_url, name = entry[0], entry[1]

        # [link, title] pairs for all chapters found on the novel's page.
        chapters = link_title(html=getHtml(url=novel_url))

        for chapter in chapters:
            chapter_link, title = chapter[0], chapter[1]
            # The chapter link is appended to the site root unchanged.
            chapter_html = getHtml(site_root + chapter_link)
            store_novel(novel_html=chapter_html, name=name, title=title)
            time.sleep(1)


def _safe_path_component(text):
    """Return *text* with characters that are not allowed in file names replaced by ``_``."""
    return text.translate(str.maketrans({ch: "_" for ch in '\\/:*?"<>|'}))


def store_novel(novel_html, name, title):
    """
    Save the body text of one chapter to ``<novel_name>/<name>/<title>.txt``.

    The chapter HTML is first written to the scratch file
    ``<novel_name>/novel.html`` and read back before it is parsed.
    NOTE(review): the original author reported that parsing the response
    directly only yielded the last paragraph; presumably the text-mode
    round trip (newline normalisation) is what avoids that. Confirm before
    removing the scratch file.

    The novel name and chapter title are sanitised before being used as path
    components, so a title containing ``/``, ``?``, ``:`` and similar
    characters no longer makes ``open`` fail. The chapter file is opened in
    append mode, so chapters sharing a title end up in the same file.

    :param novel_html: HTML text of the chapter page returned by the server
    :param name: name of the novel being crawled
    :param title: title of the chapter being crawled
    :return: None
    """
    # Create the novel directory (and the root directory if it is missing)
    # up front; exist_ok avoids the separate existence check.
    novel_dir = f"{novel_name}/{_safe_path_component(name)}"
    os.makedirs(novel_dir, exist_ok=True)

    # Round-trip the HTML through the scratch file (see the note above).
    scratch_path = f"{novel_name}/novel.html"
    with open(scratch_path, "w", encoding="utf-8") as f:
        f.write(novel_html)
    with open(scratch_path, "r", encoding="utf-8") as f:
        html = f.read()

    xml = etree.HTML(html)
    content = xml.xpath('//div[@id="content"]/text()')

    chapter_path = f"{novel_dir}/{_safe_path_component(title)}.txt"
    with open(chapter_path, "a", encoding="utf-8") as f:
        # One stripped text node per output line.
        f.writelines(f"{result.strip()}\n" for result in content)

    print(f"{name}\t{title} 爬取完成>>>")


def link_title(html):
    """
    Extract the chapter links and titles from a novel's page.

    Every ``<dd>`` under ``div#list/dl`` is inspected; the first ``<a>`` link
    and the first ``<a>`` text found in it form one entry. A ``<dd>`` that
    lacks either (no anchor, or an anchor without text) is skipped instead of
    raising ``IndexError``.

    :param html: HTML text of the novel page
    :return: list of ``[link, title]`` lists, one per chapter, in page order
    """
    xml = etree.HTML(html)
    chapters = []
    for dd in xml.xpath('//div[@id="list"]/dl/dd'):
        links = dd.xpath('./a/@href')
        titles = dd.xpath('./a/text()')
        # Skip entries that cannot yield both a link and a title.
        if not links or not titles:
            continue
        chapters.append([str(links[0]), str(titles[0])])
    return chapters


def get_href_name(index):
    """
    Return the link and novel name stored on one line of the index file.

    Each line of ``<novel_name>/biqvge.txt`` is written as the link, a space,
    and the novel name. Only the first run of whitespace is treated as the
    separator, so a name that itself contains spaces is returned whole
    instead of being cut at its first word.

    :param index: zero-based line number of the entry to read
    :return: list ``[link, name]`` (shorter if the line has fewer fields)
    :raises IndexError: if the file has no line at ``index``
    """
    with open(f"{novel_name}/biqvge.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
    # strip() first: with maxsplit the trailing newline would otherwise stay
    # attached to the name.
    return lines[index].strip().split(maxsplit=1)


def get_novel_length():
    """
    Count the lines of the index file and store the count in the
    module-level ``novel_length``.
    """
    global novel_length
    with open(f"{novel_name}/biqvge.txt", "r", encoding="utf-8") as index_file:
        novel_length = sum(1 for _ in index_file)


def getContents():
    """
    Build the novel index file ``<novel_name>/biqvge.txt``.

    Fetches the site's full novel listing, extracts each novel's link and
    name from the ``<li>`` items under ``div#main``, creates the
    ``novel_name`` directory if it does not exist, and writes one
    ``"<link> <name>"`` line per novel.

    The index is rewritten from scratch on every call. Appending, as the code
    did before, duplicated every entry each time the script was re-run.
    List items without a link or a name are skipped.
    """
    url = "https://www.xbiquge.la/xiaoshuodaquan/"
    xml = etree.HTML(getHtml(url=url))

    # (link, name) pairs in page order.
    entries = []
    for li in xml.xpath('//div[@id="main"]/div/ul/li'):
        titles = li.xpath('./a/text()')
        hrefs = li.xpath('./a/@href')
        if not titles or not hrefs:
            continue
        entries.append((str(hrefs[0]), str(titles[0])))

    # Create the root directory on first run; use the same name that is checked.
    if not os.path.exists(novel_name):
        print(f"沒有“{novel_name}”這個目錄，正在為你建立>>>>>")
        os.mkdir(novel_name)
        print("建立成功>>>>>")

    # Link first, name second, which is the order get_href_name() expects.
    with open(f"{novel_name}/biqvge.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{href} {title}\n" for href, title in entries)
    print("儲存成功")


def getHtml(url, timeout=10):
    """
    Fetch *url* with a plain GET request and return the body decoded as UTF-8.

    No extra request headers are sent. ``requests`` applies no timeout by
    default, so one is passed explicitly to keep a stalled connection from
    blocking the crawl indefinitely.

    :param url: address to request
    :param timeout: seconds to wait for the server before giving up
    :return: response body as text
    :raises requests.exceptions.RequestException: on connection failure or timeout
    :raises UnicodeDecodeError: if the body is not valid UTF-8
    """
    response = requests.get(url, timeout=timeout)
    return response.content.decode("utf-8")


# Start the crawl only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()