Crawling 1,000 pages related to the Python entry on Baidu Baike with Python

1. Analyze the target and work out a crawling strategy

1) Entry page

Note the entry URL: https://baike.baidu.com/item/Python/407313?fr=aladdin

2) URL format

<a href="/item/%E5%8D%A1%E8%80%90%E5%9F%BA%E6%A2%85%E9%9A%86%E5%A4%A7%E5%AD%A6" target="_blank">卡耐基梅隆大學</a>

<a href="/item/MATLAB" target="_blank">MATLAB</a>

The entry hyperlinks are incomplete (relative) URLs, so each one must be joined with the site's base URL before it can be requested, as shown in the sketch below.
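urljoin from the standard library handles this joining; a minimal sketch using the two hrefs shown above:

from urllib.parse import urljoin

page_url = 'https://baike.baidu.com/item/Python/407313'

# relative hrefs copied from the entry page above
for href in ['/item/MATLAB',
             '/item/%E5%8D%A1%E8%80%90%E5%9F%BA%E6%A2%85%E9%9A%86%E5%A4%A7%E5%AD%A6']:
    # urljoin resolves the relative path against the page's own URL
    print(urljoin(page_url, href))
# https://baike.baidu.com/item/MATLAB
# https://baike.baidu.com/item/%E5%8D%A1%E8%80%90%E5%9F%BA%E6%A2%85%E9%9A%86%E5%A4%A7%E5%AD%A6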

3) Data format

<dd class="lemmaWgt-lemmaTitle-title">
    <h1>Python</h1>
    ......
</dd>

<div class="lemma-summary" label-module="lemmaSummary">
    <div class="para" label-module="para">Python是一種跨平臺的
    ......
    </div>
</div> 

The title is contained in <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>, and the summary in <div class="lemma-summary">...</div>.

4) Page encoding: UTF-8
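Before writing the full spider, a quick one-page check confirms the encoding and the two selectors above. This is only a sketch: it assumes the page can be fetched directly with urllib and that Baidu has not changed the class names.

from urllib import request
from bs4 import BeautifulSoup

url = 'https://baike.baidu.com/item/Python/407313'
# the page is served as UTF-8, so decode the raw bytes accordingly
html = request.urlopen(url).read().decode('utf-8')

soup = BeautifulSoup(html, 'html.parser')
title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1').get_text()
summary = soup.find('div', class_='lemma-summary').get_text()
print(title)         # Python
print(summary[:50])  # beginning of the summary paragraph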

2. Writing the code

1) Scheduler

from practice.spider.baike_spider import url_manager, html_parser, html_outputer, html_downloader


class SpiderMain(object):

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print(f'craw {count}:{new_url}')
                html_cont = str(self.downloader.download(new_url), 'utf-8')
                # print(html_cont)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 1000:
                    break

                count = count + 1
            except:
                print('craw failed')

        self.outputer.output_html()


if __name__ == '__main__':
    root_url = 'https://baike.baidu.com/item/Python/407313'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)

2) URL manager

class UrlManager(object):

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return

        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return

        for url in urls:
            self.add_new_url(url)
        # print('new_urls: ', self.new_urls)

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)

        return new_url

    def has_new_url(self):
        return len(self.new_urls) != 0
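A quick check of the deduplication behaviour (not part of the original code, just an illustration):

manager = UrlManager()
manager.add_new_url('https://baike.baidu.com/item/Python/407313')
manager.add_new_url('https://baike.baidu.com/item/Python/407313')  # duplicate, silently ignored
print(manager.has_new_url())   # True

url = manager.get_new_url()    # pops the URL and records it in old_urls
manager.add_new_url(url)       # already crawled, so it is not re-queued
print(manager.has_new_url())   # False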

3) HTML downloader

# import urllib.request
from urllib import request


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None

        response = request.urlopen(url)
        if response.getcode() != 200:
            return None

        return response.read()
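A bare urlopen call sends no browser-like headers, and Baidu Baike may reject or redirect such requests. If that happens, a variant that attaches a User-Agent header is a reasonable fallback; this is a sketch rather than part of the original project, and the header value is an arbitrary example:

from urllib import request


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None

        # wrap the URL in a Request so a User-Agent header can be attached
        req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        response = request.urlopen(req)
        if response.getcode() != 200:
            return None

        return response.read()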

4) HTML parser

from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re


class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):
        new_urls = set()

        # collect all entry links, i.e. <a> tags such as:
        '''<a target="_blank" href="/item/Unix%20shell">Unix shell</a>'''
        links = soup.find_all('a', href=re.compile(r'/item/'))

        for link in links:
            new_url = link['href']
            new_full_url = urljoin(page_url, new_url)
            new_urls.add(new_full_url)

        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}

        # url
        res_data['url'] = page_url

        # <dd class="lemmaWgt-lemmaTitle-title">
        # <h1>Python</h1>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser')

        new_urls = self._get_new_urls(page_url, soup)
        # print('new_urls: ', new_urls)
        new_data = self._get_new_data(page_url, soup)
        # print('new_data: ', new_data)

        return new_urls, new_data
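The parser can be exercised on its own against a single page before wiring everything together (assuming the downloader and parser modules above; the exact output depends on the live page):

from practice.spider.baike_spider import html_downloader, html_parser

downloader = html_downloader.HtmlDownloader()
parser = html_parser.HtmlParser()

page_url = 'https://baike.baidu.com/item/Python/407313'
html_cont = str(downloader.download(page_url), 'utf-8')

new_urls, new_data = parser.parse(page_url, html_cont)
print(len(new_urls))             # number of /item/ links found on the page
print(new_data['title'])         # Python
print(new_data['summary'][:50])  # beginning of the summary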

5) HTML outputer

from urllib.parse import unquote


class HtmlOutputer(object):

    def __init__(self):
        self.datas = []

    def output_html(self):

        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html>')
            fout.write('<body>')
            fout.write('<table border="1" cellspacing="0" cellpadding="0">')

            for data in self.datas:
                fout.write('<tr>')
                fout.write(f'<td>{unquote(data["url"], encoding="utf-8")}</td>')
                fout.write(f'<td>{data["title"]}</td>')
                fout.write(f'<td>{data["summary"]}</td>')
                fout.write('</tr>')

            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')

    def collect_data(self, data):
        if data is None:
            return

        self.datas.append(data)
        # print('self.datas: ', self.datas)
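The summaries are Chinese text and output.html carries no charset declaration, so some browsers may guess the encoding wrong when opening the file. If that happens, writing a <head> with a meta charset tag is a small optional tweak (not in the original code):

with open('output.html', 'w', encoding='utf-8') as fout:
    fout.write('<html>')
    fout.write('<head><meta charset="utf-8"></head>')  # tell the browser the file is UTF-8
    fout.write('<body>')
    # ... table rows written exactly as in output_html() above ...
    fout.write('</body>')
    fout.write('</html>')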

3. Crawl results

The crawler writes the collected data (URL, title, and summary of each entry) to output.html as a table, which can be opened directly in a browser.

Source code download: https://github.com/Nie-quan/spider.git