1. 程式人生 > 實用技巧 >Python爬蟲爬取中國古詩詞網上的名句

Python爬蟲爬取中國古詩詞網上的名句

執行截圖:

 1 import requests
 2 from lxml import etree
 3 from urllib import request
 4 
 # Globals shared by the functions below: request headers + output file object.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'}
file = open('./古詩名句.txt', 'w', encoding='utf-8')


def index(pages=range(1, 12), timeout=10):
    """Download each listing page and pass its HTML to ``clean``.

    Args:
        pages: Iterable of page numbers substituted into the ``p`` query
            parameter of the listing URL. Defaults to pages 1 through 11.
        timeout: Seconds ``requests`` waits on the server before raising,
            so a stalled connection cannot hang the crawl indefinitely.
    """
    for num in pages:
        # Listing URL for this page number.
        base_url = 'https://so.gushiwen.cn/mingju/default.aspx?p={}&c=&t='.format(num)
        print('正在寫入', base_url, '中的資料資訊...')
        response = requests.get(base_url, headers=headers, timeout=timeout)
        # Force UTF-8 decoding of the body instead of relying on detection.
        response.encoding = 'utf-8'
        clean(response.text)


def clean(html):
    """Extract quote links, quote texts and poem titles, then store them.

    Args:
        html: HTML source of one listing page.

    The three XPath queries are evaluated independently and paired up by
    position in ``sto``.
    NOTE(review): if some ``div.cont`` lacks a second ``<a>``, the lists
    could fall out of step -- confirm against the live markup.
    """
    tree = etree.HTML(html)
    quote_links = tree.xpath('//div[@class="cont"]/a[1]/@href')
    quotes = tree.xpath('//div[@class="cont"]/a[1]/text()')
    titles = tree.xpath('//div[@class="cont"]/a[2]/text()')
    sto(quote_links, quotes, titles)


def sto(Mingjus_urls, Mingjus, Poem_titles):
    """Write one record per quote to the module-level ``file``.

    Each record is ``<quote>\\t<title>`` on one line followed by a line
    holding the absolute URL of the quote page.

    Args:
        Mingjus_urls: Site-relative hrefs of the quote pages.
        Mingjus: Quote texts.
        Poem_titles: Titles of the poems the quotes come from.

    The sequences are combined with ``zip``, so iteration stops at the
    shortest one.
    """
    for href, quote, title in zip(Mingjus_urls, Mingjus, Poem_titles):
        # The site prefix already ends with '/'; drop any leading slash on
        # the href so the joined URL does not contain '//' after the host.
        quote_url = 'https://so.gushiwen.cn/' + href.lstrip('/')
        record = quote + '\t' + title + '\n' + '名句網址:' + quote_url
        file.write(record + '\n')


if __name__ == '__main__':
    try:
        index()
    finally:
        # Always flush and release the output file, even if the crawl fails.
        file.close()
View Code