9.3.2 網頁爬蟲
阿新 • • 發佈:2018-05-04
標籤(keywords):exc、頁面、數據、dir、repl、datetime、ret、find
網頁爬蟲常用來在互聯網上爬取感興趣的頁面或文件,結合數據處理與分析技術可以得到更深層次的信息。下面的代碼實現了網頁爬蟲,可以抓取指定網頁中的所有鏈接,並且可以指定關鍵字和抓取深度。
1 import sys
2 import multiprocessing
3 import re
4 import os
5 import urllib.request as lib
6
def craw_links(url, depth, keywords, processed):
    """Recursively crawl *url* and save pages that match *keywords*.

    :param url: URL to crawl; only ``http://`` / ``https://`` URLs are handled.
    :param depth: remaining recursion depth; links are followed while > 0.
    :param keywords: tuple of keyword strings joined into a regex alternation;
        a page is saved when any keyword matches. Empty -> save every page.
    :param processed: mutable list of already-visited URLs, shared across
        recursive calls to avoid revisiting the same page.
    :return: None
    """
    if url.startswith(('http://', 'https://')):
        if url in processed:
            # avoid processing the same url again
            return
        # mark this url as processed
        processed.append(url)

        print('Crawing ' + url + '...')
        # Python 3 returns bytes, so decode to text; the context manager
        # guarantees the connection is closed even if decoding fails
        with lib.urlopen(url) as fp:
            contents_decoded = fp.read().decode('utf-8')

        # if this page contains any of the keywords, save it to a file
        pattern = '|'.join(keywords)
        if pattern:
            matched = re.search(pattern, contents_decoded) is not None
        else:
            # no keyword filter given: save the current page unconditionally
            matched = True

        if matched:
            # BUG FIX: the original wrote the always-empty `contents` list,
            # producing empty files; write the decoded page text instead.
            # os.path.join replaces the Windows-only 'craw\\' prefix.
            filename = url.replace(':', '_').replace('/', '_')
            with open(os.path.join('craw', filename), 'w', encoding='utf-8') as out:
                out.write(contents_decoded)

        # find all the links in the current page
        links = re.findall('href="(.*?)"', contents_decoded)

        # crawl all links found in the current page
        for link in links:
            # resolve a relative link against the current url's directory
            if not link.startswith(('http://', 'https://')):
                try:
                    index = url.rindex('/')
                    link = url[0:index + 1] + link
                except ValueError:
                    # url contains no '/', leave the link unchanged
                    pass
            if depth > 0 and link.endswith(('.htm', '.html')):
                craw_links(link, depth - 1, keywords, processed)
if __name__ == '__main__':
    # URLs already visited, shared across all recursive craw_links calls
    processed = []
    keywords = ('datetime', 'KeyWord2')
    # ensure the output directory exists (isdir alone is sufficient:
    # it is False both when the path is missing and when it is a file)
    if not os.path.isdir('craw'):
        os.mkdir('craw')
    craw_links('https://docs.python.org/3/library/index.html', 1, keywords, processed)
9.3.2 網頁爬蟲