
Web Scraping: Three Small Hands-On Projects


Crawling Baidu Tieba

Before writing the code, think through the functional blocks you will need; while writing, put down the name of each functional module first and fill it in afterwards.

Initialization

Initialize the necessary parameters and complete the basic setup. We crawl the lol forum on Baidu Tieba: the GET parameters in the crawl URL must be passed in, so a different forum topic and page number can be specified (see the sketch after the list below).

  • Forum (topic) name
  • Base URL
  • Request headers
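To make those GET parameters concrete, here is a minimal sketch of the assembled crawl URL, with the values taken from the source code further down; 'lol' and page 0 are just example inputs:

# kw = forum name, ie = encoding, pn = page placeholder filled in later
tieba_name_crawl = 'lol'
url_base = 'https://tieba.baidu.com/f?kw=' + tieba_name_crawl + '&ie=utf-8&pn={}'
print(url_base.format(0))   # https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=0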

Generating URLs

Generate the route address for each page.

  • Use a list comprehension to generate the addresses of multiple pages (sketched below)
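The list comprehension expands url_base into one route address per page, using the same range(4) as the source code below:

url_list = [url_base.format(i) for i in range(4)]   # four page URLs, pn = 0, 1, 2, 3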

Downloading

Send a GET request to each page's address and fetch the page.

Saving

Save the crawl results to files: write each page's result into a file named after that page.

Control flow

Wrap the crawling steps above in a run function so that external callers can use them easily; multithreading will be added here later.

  • Generate the route address of every page to crawl
  • Iterate over each route address with a for loop
  • Crawl each address, work out its page number, and save the result

Source code

import requests


class TiebaSpider:
    def __init__(self, tieba_name_crawl):
        """
        Initialize the necessary parameters and complete the basic setup.
        Crawl the lol forum on Baidu Tieba: the GET parameters in the crawl URL
        must be passed in (so the forum topic and page number can be specified).
        """
        self.tieba_name = tieba_name_crawl
        self.url_base = 'https://tieba.baidu.com/f?kw=' + tieba_name_crawl + '&ie=utf-8&pn={}'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def make_url(self):
        """
        Generate the route address of every page.
        :return: list of page URLs (built with a list comprehension)
        """
        return [self.url_base.format(i) for i in range(4)]

    def download_url(self, url_str):
        """
        Send a GET request to each page's address and fetch the page.
        :param url_str: the route address of one page
        :return: the crawled result
        """
        result = requests.get(url_str, headers=self.headers)
        return result.text

    def save_result(self, result, page_num):
        """
        Save the crawl result to a file.
        :param result: the crawl result of one page
        :param page_num: page number, used to name the file
        :return: writes each page's result to a correspondingly named file
        """
        # with open('./download/lol' + str(page_num) + '.html', 'ab') as f:
        #     f.write(result.encode('utf-8'))
        file_path = './download/{}~第{}頁.html'.format(self.tieba_name, page_num)  # <forum>~page N.html
        with open(file_path, 'wb') as f:
            f.write(result.encode('utf-8'))

    def run(self):
        """
        Wrap the crawling steps above in run() so external callers can use them;
        multithreading will be added here later.
        · Generate the route address of every page to crawl
        · Iterate over each route address with a for loop
        · Crawl each address, work out its page number, and save the result
        :return:
        """
        url_lists = self.make_url()
        for url_str in url_lists:
            result_str = self.download_url(url_str)
            p_num = url_lists.index(url_str) + 1
            self.save_result(result=result_str, page_num=p_num)


if __name__ == '__main__':
    tieba_spider = TiebaSpider('lol')
    tieba_spider.run()
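A small design note on run(): url_lists.index(url_str) rescans the list on every iteration just to recover the page number. A minimal alternative sketch of the same method (my suggestion, not the original author's code), using enumerate inside the same class:

    def run(self):
        # same behaviour as above, but enumerate yields the page number directly
        for p_num, url_str in enumerate(self.make_url(), start=1):
            result_str = self.download_url(url_str)
            self.save_result(result=result_str, page_num=p_num)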

Crawling Qiushibaike

import requests
from bs4 import BeautifulSoup
import lxml.html


class QiushiSpider:
    def __init__(self):
        """
        Initialize the necessary parameters and complete the basic setup.
        """
        # self.tieba_name = qiushi_name_crawl
        # https://www.qiushibaike.com/8hr/page/2/
        self.url_base = 'https://www.qiushibaike.com/8hr/page/{}/'
        # self.url_base = 'https://tieba.baidu.com/f?kw=' + qiushi_name_crawl + '&ie=utf-8&pn={}'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def make_url(self):
        return [self.url_base.format(i) for i in range(4)]

    def download_url(self, url_str):
        result = requests.get(url_str, headers=self.headers)
        # ----------
        # html = lxml.html.fromstring(result.text)
        # html_data = html.xpath('//div[@class="content"]/span[1]/text()')
        # data_all = []
        # # for h in html_data:
        # #     data_all.append(h)
        # return html_data
        # ----------
        return result.text

    def save_result(self, result, page_num):
        with open('./download/qiushi' + str(page_num) + '.html', 'ab') as f:
            f.write(result.encode('utf-8'))


# qiushi = QiushiSpider()
# qiushi_url = qiushi.make_url()
# j = 1
# for i in qiushi_url:
#     qiushi_text = qiushi.download_url(url_str=i)
#     qiushi.save_result(result=qiushi_text, page_num=j)
#     j += 1
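The driver at the bottom is left commented out in the original. A minimal sketch of running it, with enumerate supplying the page number instead of the manual j counter (that swap is my own choice; any counter works):

if __name__ == '__main__':
    qiushi = QiushiSpider()
    # one GET request and one saved file per generated page URL
    for page_num, url_str in enumerate(qiushi.make_url(), start=1):
        qiushi_text = qiushi.download_url(url_str=url_str)
        qiushi.save_result(result=qiushi_text, page_num=page_num)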

Crawling country information

BeautifulSoup approach

import requests
from bs4 import BeautifulSoup


class CountrySoup:
    def __init__(self, country_name):
        self.country_name = country_name
        self.url_base = 'http://example.webscraping.com/places/default/view/{}'.format(self.country_name)
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def download_url(self):
        result = requests.get(self.url_base, headers=self.headers)
        soup = BeautifulSoup(result.text, 'lxml')
        tr = soup.find(attrs={'id': 'places_country__row'})
        print(tr, type(tr))
        td = tr.find(attrs={'class': 'w2p_fw'})
        print(td, type(td))

        print(td.text)
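download_url only prints the country row. If the capital and land area are wanted as well (the same rows the lxml version below reads), here is a minimal, hypothetical helper, not part of the original post; fetch_country_details and its return dict are my own naming, and it assumes the row ids match those used in the lxml version:

import requests
from bs4 import BeautifulSoup

def fetch_country_details(country_name):
    # Sketch only: the same find() lookups as above, repeated for the
    # country, capital and area rows of the places table.
    url = 'http://example.webscraping.com/places/default/view/{}'.format(country_name)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}
    soup = BeautifulSoup(requests.get(url, headers=headers).text, 'lxml')
    details = {}
    for field in ('country', 'capital', 'area'):
        row = soup.find(attrs={'id': 'places_{}__row'.format(field)})
        details[field] = row.find(attrs={'class': 'w2p_fw'}).text
    return details

# Example: fetch_country_details('Bolivia-27')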

lxml approach

import requests
import lxml.html


class CountrySpider:
    def __init__(self, country_name):
        self.country_name = country_name
        self.url_base = 'http://example.webscraping.com/places/default/view/{}'.format(self.country_name)
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def download_url(self, url_str):
        result = requests.get(url_str, headers=self.headers)
        html = lxml.html.fromstring(result.text)
        data_country = html.xpath('//tr[@id="places_country__row"]/td[@class="w2p_fw"]/text()')
        data_capital = html.xpath('//tr[@id="places_capital__row"]/td[@class="w2p_fw"]/text()')
        data_area = html.xpath('//tr[@id="places_area__row"]/td[@class="w2p_fw"]/text()')
        # labels: country / capital / land area
        data_all = ['國家:' + data_country[0], '首都:' + data_capital[0], '國土面積:' + data_area[0]]
        return data_all
        # print(html_data)

    def save_result(self, result):
        print(type(result), result)
        for r in result:
            r = r + '\n'
            with open('./country.txt', 'ab') as f:
                f.write(r.encode('utf-8'))
        # with open('./country.txt', 'ab') as f:
        #     f.writelines(result)

    def run(self):
        result = self.download_url(self.url_base)
        self.save_result(result)


if __name__ == '__main__':
    # c = CountrySpider('Bolivia-27')
    # c.run()
    s = CountrySoup('Bolivia-27')  # uses the CountrySoup class defined above
    s.download_url()
