Crawling 1000 pages related to the Baidu Baike "Python" entry with Python
阿新 · Published 2020-10-10
1. Analyze the target and determine a crawling strategy
1) Entry page
Note this URL: https://baike.baidu.com/item/Python/407313?fr=aladdin
2) URL format
<a href="/item/%E5%8D%A1%E8%80%90%E5%9F%BA%E6%A2%85%E9%9A%86%E5%A4%A7%E5%AD%A6" target="_blank">卡耐基梅隆大學</a> <a href="/item/MATLAB" target="_blank">MATLAB</a>
An entry's hyperlink is an incomplete (relative) URL; it must be joined with the site's base URL before the page can be requested.
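For example, the standard library's urljoin performs exactly this joining (a minimal illustration using the MATLAB link above):

from urllib.parse import urljoin

page_url = 'https://baike.baidu.com/item/Python/407313'
print(urljoin(page_url, '/item/MATLAB'))
# https://baike.baidu.com/item/MATLAB

This is what the HTML parser below uses to turn each relative /item/ link into a full, requestable URL.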
3) Data format
<dd class="lemmaWgt-lemmaTitle-title"> <h1>Python</h1> ...... </dd>
<div class="lemma-summary" label-module="lemmaSummary"> <div class="para" label-module="para">Python是一種跨平臺的 ...... </div> </div>
The title lives in <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>, and the summary lives in <div class="lemma-summary">...</div>.
4) Page encoding: UTF-8
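A quick way to confirm this structure before writing the full parser is to probe a fragment with BeautifulSoup. A minimal sketch (the HTML string here stands in for a downloaded page):

from bs4 import BeautifulSoup

html = ('<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>'
        '<div class="lemma-summary" label-module="lemmaSummary">'
        '<div class="para" label-module="para">Python是一種跨平臺的......</div></div>')
soup = BeautifulSoup(html, 'html.parser')
print(soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1').get_text())  # Python
print(soup.find('div', class_='lemma-summary').get_text())  # Python是一種跨平臺的......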
2. Writing the code
1) Scheduler (main program)
from practice.spider.baike_spider import url_manager, html_parser, html_outputer, html_downloader


class SpiderMain(object):

    def __init__(self):
        # Wire up the four components: URL manager, downloader, parser, outputer
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print(f'craw {count}: {new_url}')
                # The downloader returns bytes; decode as UTF-8 (see 1.4)
                html_cont = str(self.downloader.download(new_url), 'utf-8')
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                # Stop after 1000 pages
                if count == 1000:
                    break

                count = count + 1
            except Exception:
                print('craw failed')

        self.outputer.output_html()


if __name__ == '__main__':
    root_url = 'https://baike.baidu.com/item/Python/407313'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
2) URL manager
class UrlManager(object):

    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return

        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return

        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)

        return new_url

    def has_new_url(self):
        return len(self.new_urls) != 0
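A quick check of the manager's deduplication behavior (illustrative only):

manager = UrlManager()
manager.add_new_url('https://baike.baidu.com/item/Python/407313')
manager.add_new_url('https://baike.baidu.com/item/Python/407313')  # duplicate, ignored
print(manager.has_new_url())  # True
url = manager.get_new_url()   # moves the URL into old_urls
manager.add_new_url(url)      # ignored: already crawled
print(manager.has_new_url())  # False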
3) HTML downloader
from urllib import request


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None

        response = request.urlopen(url)
        if response.getcode() != 200:
            return None

        # Return the raw response bytes; the scheduler handles decoding
        return response.read()
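Note that Baidu Baike may reject requests carrying urllib's default User-Agent. If downloads fail, a browser-like header can be sent via request.Request; this is a sketch under that assumption, not part of the original downloader:

from urllib import request


def download_with_headers(url):
    # Assumption: the site blocks urllib's default User-Agent,
    # so present a browser-like one instead.
    req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with request.urlopen(req) as response:
        if response.getcode() != 200:
            return None
        return response.read()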
4) HTML parser
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup


class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):
        new_urls = set()

        # Collect all entry links, i.e. <a> tags such as
        # <a target="_blank" href="/item/Unix%20shell">Unix shell</a>
        links = soup.find_all('a', href=re.compile(r'/item/'))

        for link in links:
            new_url = link['href']
            # Join the relative href with the page URL to get a full URL
            new_full_url = urljoin(page_url, new_url)
            new_urls.add(new_full_url)

        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}

        # url
        res_data['url'] = page_url

        # <dd class="lemmaWgt-lemmaTitle-title">
        # <h1>Python</h1>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None, None

        soup = BeautifulSoup(html_cont, 'html.parser')

        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)

        return new_urls, new_data
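Feeding the parser a small fragment shows both halves of its output (illustrative; a real page yields many more links):

parser = HtmlParser()
html = ('<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>'
        '<div class="lemma-summary">Python是一種跨平臺的......</div>'
        '<a href="/item/MATLAB" target="_blank">MATLAB</a>')
urls, data = parser.parse('https://baike.baidu.com/item/Python/407313', html)
print(urls)  # {'https://baike.baidu.com/item/MATLAB'}
print(data['title'], '|', data['summary'])  # Python | Python是一種跨平臺的......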
5) HTML outputer
from urllib.parse import unquote


class HtmlOutputer(object):

    def __init__(self):
        self.datas = []

    def output_html(self):
        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html>')
            fout.write('<body>')
            fout.write('<table border="1" cellspacing="0" cellpadding="0">')

            for data in self.datas:
                fout.write('<tr>')
                fout.write(f'<td>{unquote(data["url"], encoding="utf-8")}</td>')
                fout.write(f'<td>{data["title"]}</td>')
                fout.write(f'<td>{data["summary"]}</td>')
                fout.write('</tr>')

            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')

    def collect_data(self, data):
        if data is None:
            return

        self.datas.append(data)
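One caveat: the title and summary are written into output.html verbatim, so any markup characters they contain are interpreted as HTML. If that matters, the standard library's html.escape can be applied to each field before writing (a suggested hardening, not in the original code):

from html import escape

# Escape special characters before embedding text in the output table
print(escape('Python是一種<跨平臺>的 "語言"'))
# Python是一種&lt;跨平臺&gt;的 &quot;語言&quot;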
3. Crawl results
Source code download: https://github.com/Nie-quan/spider.git