Notes on a Python web-novel scraper (all working drafts)
阿新 • Published: 2022-04-08
# Convert a structured text file into a dict.
# Input: a local file with two "|"-separated columns per line; output: a dict {url: title}.
def file2dict(filepath):
    listDict = dict()
    os.chdir(".")
    with open(filepath, encoding='utf-8') as f_menulist:
        lines = f_menulist.readlines()  # read everything, returned as a list
        filesum = len(lines)            # the number of chapters, i.e. lines in the menu file
        print("Total: " + str(filesum) + " links in menulist")
        # read all lines, or only a fixed number of them
        linesToRead = filesum
        #linesToRead = 13  # how many chapters (menu lines) to read in one run: last line number - first + 1
        linesStart = 0     # which line to start from; write 0 to start from the first line
        for i in range(linesStart, linesStart + linesToRead):
            # extract the URL and the chapter title
            currentline = lines[i].split("|")
            currenturl = currentline[0]
            currenttitle = currentline[1].strip()  # do NOT include the trailing CR/LF
            listDict[currenturl] = currenttitle
    return listDict  # fixed: the draft fused "f_menulist.close()return listDict"; "with" already closes the file
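For reference, a minimal usage sketch; the menulist.txt name and the sample url|title lines are illustrative, not taken from the drafts above:

# menulist.txt is assumed to hold one "url|title" pair per line, e.g.
#   https://example.com/book/1.html|第一章
#   https://example.com/book/2.html|第二章
import os

listDict = file2dict("menulist.txt")
for url, title in listDict.items():
    print(url, "->", title)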
#print(h)  # h (the HTML header) is not defined until htmlhead() is called below
listDict = file2dict(basedir + menufile)  # convert the local menu file into a dict
# print(listDict)
errcache = ""
lendic = 105  # len(listDict); a fixed number here means "stop after n chapters"
i = 0
firstchap = 1
for key, value in listDict.items():  # write a file for each success; collect failures in a list
    i = i + 1
    time.sleep(1.0)  # don't hit the website too frequently
    singlecontent = tryurl(key)  # key is the URL, value is the chapter title
    uf = get_file_by_urllib(key)  # use the file name in the URL as the local file name
    uf = os.path.basename(uf)
    #print(uf)

    if singlecontent[0] == 200:
        print(i, "/", lendic, "-----------", uf, "-------------")
        # print(singlecontent[1][100:250], "\n")
        soup = BeautifulSoup(singlecontent[1], 'lxml')
        contentPart = soup.find(id='content')  # the div that holds the chapter body
        # print(contentPart.text)
        fixedtxt = fixtxt(contentPart.text)  # clean up the body text
        # print(fixedtxt)
        stxt = "<h2>" + value + "</h2>\n<p>" + fixedtxt
        h, f = htmlhead("大魏能臣-黑男爵")
        if firstchap == 1:  # write the HTML file header once
            saveHtml(basedir + 'out.xhtml', h)
            firstchap = 0

        if i == lendic:  # after the last chapter, append the HTML file footer
            saveHtml(basedir + 'out.xhtml', stxt)
            saveHtml(basedir + 'out.xhtml', f)
        else:
            saveHtml(basedir + 'out.xhtml', stxt + chapdiv())  # chapdiv() emits Sigil's chapter-split marker
    else:
        errcache = errcache + key + "|" + value + "\n"
        print("XXXXXXXXXXX", singlecontent[1], "XXXXXXXXXXXXX")
    if i == lendic:  # include the 55th line in the list file
        print(errcache)
        break

menuDict = GetMenuHtml(myheader, menuurl)  # extract chapter URLs and titles from one page
saveTxt2File("menulist.txt", menuDict, urldomain)  # save them to a file
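tryurl, saveHtml, htmlhead, chapdiv and get_file_by_urllib are helpers defined elsewhere in these drafts. Judging from how it is called above, tryurl returns a (status_code, html_text) pair; a minimal sketch with requests, assuming the myheader dict shown at the end of these notes:

import requests

def tryurl(url):
    # return (status_code, html_text); on a network error, a fake non-200 code
    try:
        response = requests.get(url, headers=myheader, timeout=10)
        response.encoding = 'utf-8'
        return response.status_code, response.text
    except requests.RequestException as e:
        return -1, str(e)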
def fixtitle(str):
    ts = str.find("第", 0, 1)
    if ts == -1:   # title does not start with "第": prepend it
        str = "第" + str
    te = str.find("章")
    if te == -1:
        pass       # "章" is missing; handling was left unfinished in this draft
    return str

#main
for i in range(linesStart, linesStart + linesToRead):
    # extract the URL and the title
    currentline = lines[i].split("|")
    currenturl = currentline[0]
    currenttitle = currentline[1].strip()  # do NOT include the trailing CR/LF
    print("Line:" + str(i + 1) + "***hacking: " + currenturl)
    StatusCode, htmlcontent = tryurl(currenturl)
    #print(type(htmlcontent))
    j = 1.2  # retry counter: on a non-200 reply, sleep 0.2s longer each time (cap ~2s); give up after 3 tries
    while not (StatusCode == 200):  # keep retrying until the page is returned correctly
        if j > 3:
            sys.exit()
        else:
            j = j + 0.2
            print(str(j) + " " + str(StatusCode))
            time.sleep(0.8 + j)
            StatusCode, htmlcontent = tryurl(currenturl)

    # clean up and replace
    htmlcontent = htmlcontent.replace("<br/>", "\r\n")
    htmlcontent = htmlcontent.replace(" ", "")  # Chinese body text needs no spaces
    htmlcontent = htmlcontent.replace("<!--go-->", "")
    htmlcontent = htmlcontent.replace("<!--over-->", "")
    #saveHtml("1", htmlcontent)
    soup = BeautifulSoup(htmlcontent, features="lxml")
    #htmlcontent = soup.prettify()
    print('Finish:' + currenttitle + "\n")  # print the title of the chapter

    # extract the body text
    txtcontent = ""
    allli = ""
    results = soup.select('#content')
    #print("len:" + str(len(results)))
    allli = allli + currenttitle + "\n" + results[0].text + "\n"
    #saveHtml("2", allli)
    # remove the BOM if it exists
    # txtcontent = fixtxt(txtcontent, bc, patterns)
    # print(txtcontent)

    # save the current chapter to its own file
    thischap = allli + "\n\n"
    saveHtml(author + "_" + bookname + "_" + str(linesStart + 1) + "-" + str(linesStart + linesToRead), thischap)
    #allchap = allchap + thischap
    time.sleep(1)

# merge all chapters and save them as one file
#allchap = allchap + "\n\n"
#saveHtml(author + "_" + bookname + "_" + str(linesStart + 1) + "-" + str(linesStart + linesToRead), allchap)
print("Done!")
start_directory = r'.'
os.startfile(start_directory)
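saveHtml is called repeatedly with the same out.xhtml path, so it presumably appends rather than overwrites; a minimal sketch under that assumption:

def saveHtml(filename, text):
    # append text to the file, creating it on first use
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(text)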
--------------------
#read config info from an ini file
config = configparser.ConfigParser()
cfgfilename = 'cfg.ini'  # this ini file should be saved as ASCII, not UTF-8
config.read(cfgfilename)

author = config.get('article', 'author')
bookname = config.get('article', 'bookname')

menuurl = config.get('menu', 'menuurl')
print(menuurl)

baseurl = config.get('baseurl', 'baseurl')

print("cfg.ini: " + author + "_" + bookname + "_" + menuurl)

# regular expressions used to clean the text
patterns = []
#patterns.append(re.compile(r"<script>.+script>"))
#patterns.append(re.compile(r"\d.+?>"))  # non-greedy mode
#print("patterns: " + str(patterns))

#open the list file
#with open("_menulist.txt", encoding='UTF-8') as f_menulist:
#    lines = f_menulist.readlines()  # read everything, returned as a list
#    filesum = len(lines)            # the number of chapters, i.e. lines in the menu file
#    print("Total: " + str(filesum) + " links in menulist")

# read all lines, or only a fixed number of them
#linesToRead = filesum
linesToRead = 5  # how many chapters to read in one run
linesStart = 1   # which line to start from; write 1 to start from the first line

# the gzip lib is for the utf-8 encoding issue in the headers section:
# sometimes the server returns None instead of utf-8

def fixtxt(txtcontent, badchar, patterns):
    soup = BeautifulSoup(txtcontent, features="lxml")
    for k in soup.find_all('div', id="content"):  # fixed: was id_="content", which matches nothing
        #a = k.find_all()  # find the <a> tags under each matching tag
        print(k.text)      # print the text of each matching div

    #del soup.a['target','title']
    #del soup.img['class']

    txtcontent = soup.text
    print(soup)

    txtcontent = txtcontent.replace("<br/><br/>", "\n")
    txtcontent = txtcontent.strip()

    for t in badchar:  # badchar comes from the ini file
        print("XX: " + t)
        txtcontent = txtcontent.replace(t, "")

    #print(txtcontent)

    for pattern in patterns:  # patterns come from the ini file
        print(str(pattern))
        text = re.findall(pattern, txtcontent)
        if text:
            print("match: " + str(text))
            for j in text:
                txtcontent = txtcontent.replace(j, "")

    return txtcontent  # fixed: the draft re-assigned txtcontent = soup.text here, discarding all the cleanup


allchap = ""  # used to merge several chapters into one file
response = requests.get(menuurl, headers=myheader)  # get menuhtml (the draft stubbed this out with response = "")

#get and set charset
cs = response.encoding
print("Get charset: " + str(cs))

#response.encoding = 'utf-8'
#print("Set charset: " + response.encoding)

StatusCode = response.status_code
#print("HTTP return: " + str(StatusCode))

if StatusCode == 200:  # the page was returned correctly
    htmlcontent = response.content.decode(encoding="utf-8")
    #print(type(htmlcontent))

    #htmlcontent = htmlcontent.replace("<br/>", "\n")  # strip unpaired tags such as br/hr first
    #saveHtml("1", htmlcontent)
    soup = BeautifulSoup(htmlcontent, features="lxml")
    #htmlcontent = soup.prettify()

    # menu section starts here
    txtcontent = ""
    allli = ""
    results = soup.select('dl > dd > a')
    lines = len(results)
    print("Total chapters: " + str(lines))
    print(results[11])  # sanity check: print one of the links

    for i in range(0, lines):
        # extract the URL and the title
        print(str(i))

        t = results[i]
        currenturl = t.attrs['href']
        print(currenturl)
        currenttitle = t.text
        print(currenttitle)

        # fixed: the draft overwrote allchap each pass, so only the last chapter survived
        allchap = allchap + baseurl + currenturl + "|" + currenttitle + "\n"
        currenturl = baseurl + currenturl
        print("\n" + str(i) + "***hacking: " + currenturl)

        #response = requests.get(currenturl, headers=myheader, verify=False)  # add this if the server blocks crawlers; see: https://blog.csdn.net/win_turn/article/details/77142100

        #get and set charset
        #cs = response.encoding
        #print("Get charset: " + cs)

        #response.encoding = 'utf-8'
        #print("Set charset: " + response.encoding)

        #StatusCode = response.status_code
        #print("HTTP return: " + str(StatusCode))

    # merge all the lines and save them as the menu file
    saveHtml("_menulist", allchap)


def removeDup(oldtxt):
    lines_seen = set()  # use a set to drop duplicate lines
    # fixed: the draft referenced undefined f/outfile; the ".dedup" output name is assumed
    with open(oldtxt, encoding='utf-8') as f, \
         open(oldtxt + ".dedup", 'w', encoding='utf-8') as outfile:
        for line in f:
            line = line.strip('\n')
            if line not in lines_seen:
                outfile.write(line + '\n')
                lines_seen.add(line)
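The config.get() calls above imply a cfg.ini with at least the sections below; the URLs are placeholders, and the book/author names are taken from the htmlhead() call elsewhere in these notes:

[article]
author = 黑男爵
bookname = 大魏能臣

[menu]
menuurl = https://example.com/book/menu.html

[baseurl]
baseurl = https://example.com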
# The function that cleans the text
def fixtxt(txtcontent, badchar, patterns):
    soup = BeautifulSoup(txtcontent, features="lxml")
    for k in soup.find_all('div', id="content"):  # fixed: was id_="content", which matches nothing
        #a = k.find_all()  # find the <a> tags under each matching tag
        print(k.text)      # print the text of each matching div

    #del soup.a['target','title']
    #del soup.img['class']

    txtcontent = soup.text
    print(soup)

    txtcontent = txtcontent.replace("<br/><br/>", "\n")
    txtcontent = txtcontent.strip()

    for t in badchar:  # badchar comes from the ini file
        print("XX: " + t)
        txtcontent = txtcontent.replace(t, "")

    #print(txtcontent)

    for pattern in patterns:  # patterns come from the ini file
        print(str(pattern))
        text = re.findall(pattern, txtcontent)
        if text:
            print("match: " + str(text))
            for j in text:
                txtcontent = txtcontent.replace(j, "")

    return txtcontent  # fixed: the draft re-assigned txtcontent = soup.text here, discarding all the cleanup

myheader = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',}

allchap = ""  # used to merge several chapters into one file
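A quick usage sketch for fixtxt; the badchar strings and the pattern are illustrative stand-ins for what would normally be read from cfg.ini:

import re
from bs4 import BeautifulSoup

badchar = ["一秒記住", "手機使用者請訪問"]           # illustrative junk strings
patterns = [re.compile(r"<script>.+?</script>")]  # illustrative cleanup pattern

html = '<div id="content">Chapter body text...<br/><br/>一秒記住</div>'
print(fixtxt(html, badchar, patterns))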