
9.3.2 Web Crawler


  Web crawlers are commonly used to fetch pages or files of interest from the Internet; combined with data processing and analysis techniques, they can extract deeper-level information. The code below implements a web crawler that collects all the links in a given page, and lets you specify both the keywords to match and the crawl depth.

import re
import os
import urllib.request as lib

def craw_links(url, depth, keywords, processed):
    '''
    :param url:       the URL to crawl
    :param depth:     crawl depth
    :param keywords:  tuple of keywords to search for
    :param processed: list of URLs that have already been processed
    :return: None
    '''
    if url.startswith(('http://', 'https://')):
        if url not in processed:
            # mark this url as processed
            processed.append(url)
        else:
            # avoid processing the same url again
            return

        print('Crawling ' + url + '...')
        fp = lib.urlopen(url)              # send a request to the url

        # Python 3 returns bytes, so we need to decode
        contents_decoded = fp.read().decode('utf-8')
        fp.close()                         # the crawled page text has now been read

        pattern = '|'.join(keywords)

        # if this page contains any of the keywords, save it to a file
        flag = False
        searched = None
        if pattern:
            # use a regular expression to match the keywords against the page text
            searched = re.search(pattern, contents_decoded)
        else:
            # if no filter keywords are given, save the current page
            flag = True

        if flag or searched:
            filename = url.replace(':', '_').replace('/', '_')
            with open(os.path.join('craw', filename), 'w', encoding='utf-8') as fp:
                fp.write(contents_decoded)

        # find all the links in the current page
        links = re.findall(r'href="(.*?)"', contents_decoded)

        # crawl all links in the current page
        for link in links:
            # handle relative paths
            if not link.startswith(('http://', 'https://')):
                try:
                    index = url.rindex('/')
                    link = url[0:index + 1] + link
                except ValueError:
                    pass
            if depth > 0 and link.endswith(('.htm', '.html')):
                craw_links(link, depth - 1, keywords, processed)

if __name__ == '__main__':
    processed = []
    keywords = ('datetime', 'KeyWord2')
    if not os.path.exists('craw') or not os.path.isdir('craw'):
        os.mkdir('craw')
    craw_links(r'https://docs.python.org/3/library/index.html', 1, keywords, processed)
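The listing resolves relative links by splicing the link onto the current URL up to its last slash. As a sketch of an alternative (not part of the original listing), the standard library's urllib.parse.urljoin does the same job and also resolves paths containing "../" correctly:

from urllib.parse import urljoin

# a hypothetical base page, used only to illustrate urljoin's behavior
base = 'https://docs.python.org/3/library/index.html'
print(urljoin(base, 'os.html'))                  # https://docs.python.org/3/library/os.html
print(urljoin(base, '../tutorial/index.html'))   # https://docs.python.org/3/tutorial/index.html

Inside craw_links, replacing the try/rindex block with link = urljoin(url, link) would handle both absolute and relative links in a single call, since urljoin returns an absolute link unchanged.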
