
Some notes on a Python web-novel crawler; all of these are working drafts
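
For reference, the fragments below assume roughly this set of imports, inferred from the calls they make:

import os
import re
import sys
import time
import configparser

import requests
from bs4 import BeautifulSoup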

# Convert a structured text file into a dict.
# Input: a local file with two columns per line; output: a dict.
def file2dict(filepath):
    listDict = dict()
    os.chdir(".")
    with open(filepath, encoding='utf-8') as f_menulist:
        lines = f_menulist.readlines()  # read the whole file as a list of lines
        filesum = len(lines)            # number of chapters, i.e. number of lines in the menu file
        print("Total: " + str(filesum) + " links in menulist")

        # read everything, or only a given number of lines
        linesToRead = filesum  # how many chapters (menu lines) to read in one run: end line - start line + 1
        # linesToRead = 13
        linesStart = 0         # which line to start from; 0 means the first line
        for i in range(linesStart, linesStart + linesToRead):
            # get the url and the chapter title
            currentline = lines[i].split("|")
            currenturl = currentline[0]
            currenttitle = currentline[1].strip()  # drop the trailing CR/LF
            listDict[currenturl] = currenttitle
    return listDict
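
A quick check of the format it expects; the file name, urls and titles below are made-up examples:

# menulist.txt holds one "url|title" pair per line, for example:
#   https://example.com/book/1001.html|第一章 起兵
#   https://example.com/book/1002.html|第二章 出征
chapters = file2dict("menulist.txt")
for url, title in chapters.items():
    print(url, "->", title)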

 

# print(h)
listDict = file2dict(basedir + menufile)  # turn a local file into a dict
# print(listDict)
errcache = ""
lendic = 105  # len(listDict); a fixed number here means: stop after reading n chapters
i = 0
firstchap = 1
for key, value in listDict.items():  # write each successful chapter to the file, collect failures in a list
    i = i + 1
    time.sleep(1.0)  # don't hit the website too frequently
    singlecontent = tryurl(key)   # key is the url, value is the chapter title
    uf = get_file_by_urllib(key)  # use the file name from the url as the local file name
    uf = os.path.basename(uf)
    # print(uf)

    if singlecontent[0] == 200:
        print(i, "/", lendic, "-----------", uf, "-------------")
        # print(singlecontent[1][100:250], "\n")
        soup = BeautifulSoup(singlecontent[1], 'lxml')
        contentPart = soup.find(id='content')  # the div that holds the chapter body
        # print(contentPart.text)
        fixedtxt = fixtxt(contentPart.text)  # clean up the chapter body
        # print(fixedtxt)
        stxt = "<h2>" + value + "</h2>\n<p>" + fixedtxt
        h, f = htmlhead("大魏能臣-黑男爵")
        if firstchap == 1:  # write the html file header once
            saveHtml(basedir + 'out.xhtml', h)
            firstchap = 0

        if i == lendic:  # last chapter: append the html file footer
            saveHtml(basedir + 'out.xhtml', stxt)
            saveHtml(basedir + "out.xhtml", f)
        else:
            saveHtml(basedir + 'out.xhtml', stxt + chapdiv())  # chapdiv() is Sigil's chapter-split marker
    else:
        errcache = errcache + key + "|" + value + "\n"
        print("XXXXXXXXXXX", singlecontent[1], "XXXXXXXXXXXXX")
    if i == lendic:  # e.g. include the 55th line of the list file
        print(errcache)
        break
menuDict = GetMenuHtml(myheader, menuurl)          # extract each chapter's url and title from a page
saveTxt2File("menulist.txt", menuDict, urldomain)  # save them to a file
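
htmlhead() and chapdiv() are called above but their definitions are not part of these notes. A minimal sketch of what they might look like; the xhtml header and footer markup is an assumption, the split marker is the one Sigil's split-at-markers feature recognises:

def htmlhead(title):
    # return a (header, footer) pair for the xhtml output file (assumed structure)
    head = ('<?xml version="1.0" encoding="utf-8"?>\n'
            '<html xmlns="http://www.w3.org/1999/xhtml">\n'
            '<head><title>' + title + '</title></head>\n<body>\n')
    foot = '\n</body>\n</html>'
    return head, foot

def chapdiv():
    # Sigil's chapter-split marker: "split at markers" cuts the book into chapter files here
    return '\n<hr class="sigil_split_marker" />\n'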

 

def fixtitle(title):
    ts = title.find("第", 0, 1)
    # the title does not start with "第": prepend it
    if ts == -1:
        title = "第" + title
        te = title.find("章")
        if te == -1:
            pass  # still no "章"; left for later handling in this draft
    return title
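
For example:

print(fixtitle("三章 夜襲"))    # -> 第三章 夜襲
print(fixtitle("第四章 還朝"))  # already starts with "第", returned unchanged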
        
# main
for i in range(linesStart, linesStart + linesToRead):
    # get the url and the title
    currentline = lines[i].split("|")
    currenturl = currentline[0]
    currenttitle = currentline[1].strip()  # drop the trailing CR/LF

    print("Line:" + str(i + 1) + "***hacking: " + currenturl)

    StatusCode, htmlcontent = tryurl(currenturl)
    # print(type(htmlcontent))

    j = 1.2  # retry counter: on a non-200 response, sleep 0.2 s longer each time, capped around 2 s; give up after a few tries
    while not (StatusCode == 200):
        if j > 3:
            sys.exit()
        else:
            j = j + 0.2
            print(str(j) + " " + str(StatusCode))
            time.sleep(0.8 + j)

            StatusCode, htmlcontent = tryurl(currenturl)

    # clean up and replace
    htmlcontent = htmlcontent.replace("<br/>", "\r\n")
    htmlcontent = htmlcontent.replace(" ", "")
    htmlcontent = htmlcontent.replace("<!--go-->", "")
    htmlcontent = htmlcontent.replace("<!--over-->", "")
    # saveHtml("1", htmlcontent)
    soup = BeautifulSoup(htmlcontent, features="lxml")

    # htmlcontent = soup.prettify()
    print('Finish:' + currenttitle + "\n")  # print the title of the chapter

    # start extracting the chapter body
    txtcontent = ""
    allli = ""
    results = soup.select('#content')
    # print("len:" + str(len(results)))

    allli = allli + currenttitle + "\n" + results[0].text + "\n"
    # saveHtml("2", showsinthistype)
    # remove the BOM if there is one

    # txtcontent = fixtxt(txtcontent, bc, patterns)
    # print(txtcontent)

    # save the current chapter as its own file
    thischap = allli + "\n\n"
    saveHtml(author + "_" + bookname + "_" + str(linesStart + 1) + "-" + str(linesStart + linesToRead), thischap)

    # allchap = allchap + thischap
    time.sleep(1)

    # merge all chapters together and save them as one file
    # allchap = allchap + "\n\n"
    # saveHtml(author + "_" + bookname + "_" + str(linesStart + 1) + "-" + str(linesStart + linesToRead), allchap)


print("Done!")
start_directory = r'.'
os.startfile(start_directory)  # open the output folder (Windows only)
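
tryurl() and saveHtml() are used throughout these notes but their definitions did not make it in. A rough sketch based on how they are called; the timeout value and the append-mode writing are assumptions:

def tryurl(url):
    # fetch a page and return (status code, html text); on a network error return (0, error text)
    try:
        r = requests.get(url, headers=myheader, timeout=10)
        return r.status_code, r.content.decode("utf-8", errors="ignore")
    except requests.RequestException as e:
        return 0, str(e)


def saveHtml(filename, content):
    # append, so repeated calls build the output file up chapter by chapter
    with open(filename, "a", encoding="utf-8") as f:
        f.write(content)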

--------------------

# read config info from an ini file
config = configparser.ConfigParser()
cfgfilename = 'cfg.ini'  # this ini file should be saved as ASCII, not UTF-8
config.read(cfgfilename)

author = config.get('article', 'author')
bookname = config.get('article', 'bookname')

menuurl = config.get('menu', 'menuurl')
print(menuurl)

baseurl = config.get('baseurl', 'baseurl')

print("cfg.ini: " + author + "_" + bookname + "_" + menuurl)
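
The sections and keys read above imply a cfg.ini roughly like this; the values are placeholders:

; cfg.ini
[article]
author = 黑男爵
bookname = 大魏能臣

[menu]
menuurl = https://example.com/book/index.html

[baseurl]
baseurl = https://example.com
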
# regular expressions used to clean the text
patterns = []
# patterns.append(re.compile(r"<script>.+script>"))
# patterns.append(re.compile(r"\d.+?>"))  # non-greedy
# print("patterns: " + patterns)

# open the list file
# with open("_menulist.txt", encoding='UTF-8') as f_menulist:
#     lines = f_menulist.readlines()  # read the whole file as a list of lines
#     filesum = len(lines)            # number of chapters, i.e. number of lines in the menu file
#     print("Total: " + str(filesum) + " links in menulist")
#     # read everything or only a given number of lines
#     linesToRead = filesum
linesToRead = 5  # how many chapters to read in one run
linesStart = 1   # which line to start from; use 1 for the first line

# the gzip lib is for the utf-8 encoding issue (headers section);
# sometimes the server returns None instead of utf-8
def fixtxt(txtcontent, badchar, patterns):
    soup = BeautifulSoup(txtcontent, features="lxml")
    for k in soup.find_all('div', id="content"):  # the div that holds the chapter body
        # a = k.find_all()  # find the tags under each matching div
        print(k.text)       # the text under the matching div

        # del soup.a['target', 'title']
        # del soup.img['class']

        txtcontent = soup.text
        print(soup)

    txtcontent = txtcontent.replace("<br/><br/>", "\n")
    txtcontent = txtcontent.strip()

    for t in badchar:  # badchar comes from the ini file
        print("XX: " + t)
        txtcontent = txtcontent.replace(t, "")
        # print(txtcontent)

    for pattern in patterns:  # patterns come from the ini file
        print(str(pattern))
        text = re.findall(pattern, txtcontent)
        if text:
            print("match: " + str(text))
            for j in text:
                txtcontent = txtcontent.replace(j, "")

    return txtcontent
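
A hand-built call for illustration; the badchar list and the html string are made up, the regex is the one kept commented out above:

badchar = ["\xa0", "　"]                       # e.g. non-breaking / full-width spaces; normally read from cfg.ini
patterns = [re.compile(r"<script>.+script>")]
html = '<div id="content">第一章　夜襲<br/><br/>城頭之上……</div>'
print(fixtxt(html, badchar, patterns))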

allchap = ""  # used when merging several chapters into one file
# fetch the menu page
response = requests.get(menuurl, headers=myheader)

# get and set the charset
cs = response.encoding
print("Get charset: " + str(cs))

# response.encoding = 'utf-8'
# print("Set charset: " + response.encoding)

StatusCode = response.status_code
# print("HTTP return: " + str(StatusCode))

# get menuhtml

if StatusCode == 200:  # the request returned correctly
    htmlcontent = response.content.decode(encoding="utf-8")
    # print(type(htmlcontent))

    # htmlcontent = htmlcontent.replace("<br/>", "\n")  # strip unpaired tags first, such as br, hr, &nbsp;
    # saveHtml("1", htmlcontent)
    soup = BeautifulSoup(htmlcontent, features="lxml")

    # htmlcontent = soup.prettify()
    # menu section starts here
    txtcontent = ""
    allli = ""
    results = soup.select('dl > dd > a')
    lines = len(results)
    print("Total chapters: " + str(lines))
    print(results[11])

for i in range(0, lines):
    # get the url and the title
    print(str(i))

    t = results[i]
    currenturl = t.attrs['href']
    print(currenturl)
    currenttitle = t.text
    print(currenttitle)

    allchap = baseurl + currenturl + "|" + currenttitle
    currenturl = baseurl + currenturl
    print("\n" + str(i) + "***hacking: " + currenturl)

    # response = requests.get(currenturl, headers=myheader, verify=False)  # add this if the server blocks crawlers, see: https://blog.csdn.net/win_turn/article/details/77142100

    # get and set the charset
    # cs = response.encoding
    # print("Get charset: " + cs)

    # response.encoding = 'utf-8'
    # print("Set charset: " + response.encoding)

    # StatusCode = response.status_code
    # print("HTTP return: " + str(StatusCode))

    # collect every "url|title" line into one file
    allchap = allchap + "\n"
    saveHtml("_menulist", allchap)

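
The 'dl > dd > a' selector above expects the menu page to be structured roughly like the snippet below; the markup is illustrative, not taken from any particular site:

menu_html = """
<dl>
  <dd><a href="/book/1001.html">第一章 起兵</a></dd>
  <dd><a href="/book/1002.html">第二章 出征</a></dd>
</dl>
"""
links = BeautifulSoup(menu_html, "lxml").select("dl > dd > a")
print([(a.attrs["href"], a.text) for a in links])
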
def removeDup(oldtxt):
    # drop duplicate lines from the file at path oldtxt; the output file name is an assumption
    lines_seen = set()  # use a set to spot duplicates
    with open(oldtxt, encoding='utf-8') as f, \
         open(oldtxt + ".dedup.txt", 'w', encoding='utf-8') as outfile:
        for line in f:
            line = line.strip('\n')
            if line not in lines_seen:
                outfile.write(line + '\n')
                lines_seen.add(line)
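
For example, to deduplicate the menu list written above (output path as assumed in the sketch):

removeDup("_menulist")  # writes the unique lines to _menulist.dedup.txt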

 

myheader = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko', }