1. 程式人生 > python 爬 qidian 小說

python 爬qidian小說

判斷 re.sub replace break nbsp urllib step bre 第一章

  1 import re
  2 import urllib.request
  3 from bs4 import BeautifulSoup
  4 import time
  5 
  6 url=input("第一章網址:")
  7 
  8 def gethtml(url):
  9                                       #獲取頁面源代碼html
 10     page=urllib.request.urlopen(url)
 11     html=page.read().decode(utf-8)  #html是一個列表
12 soup=BeautifulSoup(html,html.parser) 13 14 return soup 15 16 def getcontent(soup,load): 17 18 content=soup.find_all("div",{"class":"read-content j_readContent"}) 19 20 content1=re.compile(r<p>([\s\S]*?)</p>) #匹配到段落內容 21 22
content2=content1.findall(str(content)) 23 24 content3=re.sub("</?\w+[^>]*>",‘‘,content2[0]) #除掉html標簽 25 26 content4=content3.replace(,。\n\n\0\0\0) #把以句號換位“。\n\n\0\0\0 兩個換行符三個空格” 到此,將章節內容獲取完畢 27 28 contentname=re.compile(r<h3 class="j_chapterName">(.*?)</h3>
) 29 30 contentname1=contentname.findall(str(soup)) #獲取章節名稱 31 32 book="----------------------------------------------------------------"+contentname1[0]+"------------------------------------------------------------\n\n\n"+content4 33 34 with open(load, a) as f: 35 36 f.write(book) 37 38 39 40 def nextcontent(soup): 41 42 content=soup.find_all("div",{"class":"chapter-control dib-wrap"}) 43 44 #print(str(content)) 45 46 step=re.compile(r<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">) 47 48 content1=step.findall(str(content)) 49 50 if content1 == []: #判斷該頁是否為最後一章,是,獲取最後一章(特殊)的url,不是,以常規方法獲取下一章url 51 52 step1=re.compile(r<a data-eid="qd_R118" href="(.*?)" id="j_chapterNext">) 53 54 content2=step1.findall(str(content)) 55 56 url="http:"+content2[0] 57 58 return url 59 else: 60 url="http:"+content1[0] 61 62 return url 63 64 def panduan(soup): 65 66 content=soup.find_all("div",{"class":"chapter-control dib-wrap"}) 67 68 #print(str(content)) 69 70 step=re.compile(r<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">) 71 72 content1=step.findall(str(content)) 73 74 return content1 75 #------------------------------------------------------------------------- 76 77 78 79 #------------------------------------------------------------------------- 80 81 82 soup=gethtml(url) 83 bookname=re.findall(r<h1>(.*?)</h1> ,str(soup)) #匹配書名 84 85 86 87 load="d:/88/%s.txt" % bookname[0] 88 i=0 89 while 1==1: 90 soup=gethtml(url) 91 getcontent(soup,load) 92 url=nextcontent(soup) 93 content1=panduan(soup) #在該章裏匹配下一章的url,若無法匹配到(輸出為[]空),說明沒有下一章 94 i+=1 95 print("第%d章下載完成" % i) 96 97 if content1 == []: # 98 break 99 100 time.sleep(0.2) 101

python 爬qidian小說