python爬蟲學習01--電子書爬取

阿新 • • 發佈：2020-07-13

python爬蟲學習01--電子書爬取

1.獲取網頁資訊

import requests        #匯入requests庫

'''

獲取網頁資訊

'''

if __name__ == '__main__':          # script entry point
    # Step 1: download one chapter page and dump its raw HTML.
    target = 'https://www.xsbiquge.com/78_78513/108078.html'  # chapter page to fetch

    # requests has no default timeout; without one an unresponsive server
    # would block this script forever.
    req = requests.get(url=target, timeout=10)

    # Fail loudly on 4xx/5xx instead of printing an error page as if it
    # were the chapter.
    req.raise_for_status()

    req.encoding = 'utf-8'          # decode the response body as UTF-8

    print(req.text)                 # print the raw HTML

2.引入BeautifulSoup對網頁內容進行解析

import requests        #匯入requests庫

from bs4 import BeautifulSoup  #引入BeautifulSoup庫

'''

引入BeautifulSoup對網頁內容進行解析

獲取網頁電子書文字資訊

'''

if __name__ == '__main__':          # script entry point
    # Step 2: download one chapter page and extract its content <div>.
    target = 'https://www.xsbiquge.com/78_78513/108078.html'  # chapter page to fetch

    # requests has no default timeout; without one an unresponsive server
    # would block this script forever.
    req = requests.get(url=target, timeout=10)

    # Fail loudly on 4xx/5xx instead of parsing an error page.
    req.raise_for_status()

    req.encoding = 'utf-8'          # decode the response body as UTF-8

    html = req.text                 # the page's HTML source

    bs = BeautifulSoup(html, 'lxml')  # parse the HTML with the lxml parser

    # find() returns only the FIRST <div id="content">, or None when the
    # page has no such element.
    texts = bs.find('div', id='content')

    print(texts)                    # print the tag (or None if it is missing)

3.切分資料，去掉空格，提取文字

import requests        #匯入requests庫

from bs4 import BeautifulSoup  #引入BeautifulSoup庫

'''

引入BeautifulSoup對網頁內容進行解析

獲取網頁電子書文字資訊

最後一句的 texts.text 會提取標籤內的所有文字，接著使用 strip 方法去掉首尾的空白字元（包含換行），

最後使用 split 方法以連續四個 \xa0（不換行空格）作為分隔符切分資料，因為每一段的開頭，都有四個這樣的空格

'''

if __name__ == '__main__':          # script entry point
    # Step 3: download one chapter page and split its text into paragraphs.
    target = 'https://www.xsbiquge.com/78_78513/108078.html'  # chapter page to fetch

    # requests has no default timeout; without one an unresponsive server
    # would block this script forever.
    req = requests.get(url=target, timeout=10)

    # Fail loudly on 4xx/5xx instead of parsing an error page.
    req.raise_for_status()

    req.encoding = 'utf-8'          # decode the response body as UTF-8

    html = req.text                 # the page's HTML source

    bs = BeautifulSoup(html, 'lxml')  # parse the HTML with the lxml parser

    # find() returns the first <div id="content">, or None when it is absent.
    texts = bs.find('div', id='content')

    if texts is None:
        # Give a readable message instead of "'NoneType' object has no
        # attribute 'text'" when the page layout is not what we expect.
        raise SystemExit('No <div id="content"> found at ' + target)

    # .text collects all text inside the tag, strip() trims surrounding
    # whitespace, and split() cuts on the run of four non-breaking spaces
    # (\xa0) that the tutorial says starts each paragraph.
    print(texts.text.strip().split('\xa0' * 4))

4.檢視章節列表

import requests        #匯入requests庫

from bs4 import BeautifulSoup  #引入BeautifulSoup庫

'''

檢視章節列表資訊

引入BeautifulSoup對網頁內容進行解析

獲取網頁電子書文字資訊

'''

if __name__ == '__main__':          # script entry point
    # Step 4: download the table-of-contents page and list its chapter links.
    target = 'https://www.xsbiquge.com/78_78513/'  # chapter index page of the book

    # requests has no default timeout; without one an unresponsive server
    # would block this script forever.
    req = requests.get(url=target, timeout=10)

    # Fail loudly on 4xx/5xx instead of parsing an error page.
    req.raise_for_status()

    req.encoding = 'utf-8'          # decode the response body as UTF-8

    html = req.text                 # the page's HTML source

    bs = BeautifulSoup(html, 'lxml')  # parse the HTML with the lxml parser

    # find() returns the first <div id="list">, or None when it is absent.
    chapter_list = bs.find('div', id='list')

    if chapter_list is None:
        # Give a readable message instead of an AttributeError on None.
        raise SystemExit('No <div id="list"> found at ' + target)

    chapters = chapter_list.find_all('a')   # every <a> tag inside the list

    for chapter in chapters:
        print(chapter)              # print each chapter's <a> tag

5.獲取章節目錄和章節連結

import requests        #匯入requests庫

from bs4 import BeautifulSoup  #引入BeautifulSoup庫

'''

獲取章節目錄資訊

引入BeautifulSoup對網頁內容進行解析

提取每個章節的名字，並拼接出每個章節的完整連結

'''

if __name__ == '__main__':          # script entry point
    # Step 5: print every chapter's title and absolute URL.
    from urllib.parse import urljoin    # local import: only this snippet needs it

    target = 'https://www.xsbiquge.com/78_78513/'  # chapter index page of the book

    # requests has no default timeout; without one an unresponsive server
    # would block this script forever.
    req = requests.get(url=target, timeout=10)

    # Fail loudly on 4xx/5xx instead of parsing an error page.
    req.raise_for_status()

    req.encoding = 'utf-8'          # decode the response body as UTF-8

    html = req.text                 # the page's HTML source

    bs = BeautifulSoup(html, 'lxml')  # parse the HTML with the lxml parser

    # find() returns the first <div id="list">, or None when it is absent.
    chapter_list = bs.find('div', id='list')

    if chapter_list is None:
        # Give a readable message instead of an AttributeError on None.
        raise SystemExit('No <div id="list"> found at ' + target)

    # href=True skips anchors without an href, which would otherwise make
    # the URL building below fail on None.
    chapters = chapter_list.find_all('a', href=True)

    for chapter in chapters:
        # get_text() always returns a str; .string is None when the <a>
        # contains nested markup, which would break the concatenation.
        print("《" + chapter.get_text() + "》")    # chapter title

        # urljoin resolves root-relative ("/78_78513/x.html"), relative and
        # absolute hrefs correctly against the index page URL.
        print(urljoin(target, chapter.get('href')))     # full chapter URL

6.整合資料，下載電子書檔案

import requests        #匯入requests庫

from bs4 import BeautifulSoup  #引入BeautifulSoup庫

import time

from tqdm import  tqdm

'''

整合前面的步驟：獲取章節目錄資訊

引入BeautifulSoup對網頁內容進行解析

逐章獲取電子書文字資訊，並寫入txt檔案

'''

def get_content(target, timeout=10):
    """Download one chapter page and return its text split into paragraphs.

    Args:
        target: URL of the chapter page to fetch.
        timeout: Seconds to wait for the server before giving up. requests
            has no default timeout, so omitting one could block forever.

    Returns:
        A list of strings obtained by splitting the text of the page's
        first ``<div id="content">`` on runs of four non-breaking spaces
        (``\\xa0``), which the tutorial says start each paragraph.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page contains no ``<div id="content">``.
    """
    req = requests.get(url=target, timeout=timeout)

    # Fail loudly on 4xx/5xx instead of parsing an error page.
    req.raise_for_status()

    req.encoding = 'utf-8'  # decode the response body as UTF-8

    bf = BeautifulSoup(req.text, 'lxml')  # parse the HTML with the lxml parser

    # find() returns the first matching tag, or None when there is none.
    texts = bf.find('div', id='content')

    if texts is None:
        # A named error with the URL is easier to diagnose than
        # "'NoneType' object has no attribute 'text'".
        raise ValueError('No <div id="content"> found at ' + str(target))

    # strip() trims surrounding whitespace before the paragraph split.
    return texts.text.strip().split('\xa0' * 4)

if __name__ == '__main__':          # script entry point
    # Step 6: download every chapter and append it to one text file.
    from urllib.parse import urljoin    # local import: only this snippet needs it

    book_name = '《元尊》.txt'         # output file; chapters are appended to it

    target = 'https://www.xsbiquge.com/78_78513/'  # chapter index page of the book

    # requests has no default timeout; without one an unresponsive server
    # would block this script forever.
    req = requests.get(url=target, timeout=10)

    # Fail loudly on 4xx/5xx instead of parsing an error page.
    req.raise_for_status()

    req.encoding = 'utf-8'          # decode the response body as UTF-8

    html = req.text                 # the page's HTML source

    chapter_bs = BeautifulSoup(html, 'lxml')    # parse the HTML with the lxml parser

    # find() returns the first <div id="list">, or None when it is absent.
    chapter_list = chapter_bs.find('div', id='list')

    if chapter_list is None:
        # Give a readable message instead of an AttributeError on None.
        raise SystemExit('No <div id="list"> found at ' + target)

    # href=True skips anchors without an href, which would otherwise make
    # the URL building below fail on None.
    chapters = chapter_list.find_all('a', href=True)

    # Open the output file once for the whole run rather than once per
    # chapter. Append mode is kept, so re-running adds to an existing file.
    with open(book_name, 'a', encoding='utf-8') as f:

        for chapter in tqdm(chapters):  # tqdm shows a progress bar

            # get_text() always returns a str; .string is None when the <a>
            # contains nested markup, which would break the concatenation.
            chapter_name = chapter.get_text()

            # urljoin resolves root-relative, relative and absolute hrefs
            # correctly against the index page URL.
            url = urljoin(target, chapter.get('href'))

            content = get_content(url)  # list of paragraph strings

            # Title line, then one paragraph per line, then a trailing newline.
            f.write("《" + chapter_name + "》")
            f.write('\n')
            f.write('\n'.join(content))
            f.write('\n')