Python之簡單爬取網頁內容
阿新 • • 發佈:2018-12-22
爬取網頁通用流程
這樣看著雖然很麻煩，但是爬取網頁都離不開這四個步驟（產生網址列表、下載頁面、儲存內容、在主流程中依序執行）；以後如果要爬取更複雜的網頁內容，只需要在這個基礎上新增內容就 OK 了。
import os

import requests


class Qiushi:
    """Download the first ten listing pages of qiushibaike.com as HTML files.

    The workflow is: build the list of page URLs, download each page,
    and save the raw bytes to ``./download/``.
    """

    def __init__(self, name: str) -> None:
        """Store the site label and the request configuration.

        :param name: label used as the prefix of every saved file name.
        """
        self.name = name
        # ``{}`` is replaced by the 1-based page number in ``make_url``.
        self.url_base = 'https://www.qiushibaike.com/8hr/page/{}/'
        # Send a browser User-Agent instead of the default requests one.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'}

    def make_url(self):
        """Build the list of page URLs to download.

        :return: list of URL strings for pages 1 through 10.
        """
        return [self.url_base.format(i) for i in range(1, 11)]

    def download(self, url_str: str, timeout: float = 10) -> bytes:
        """Fetch one page with ``requests.get`` and return its raw body.

        :param url_str: URL of the page to fetch.
        :param timeout: seconds to wait for the server before giving up;
            without it a stalled connection would block forever.
        :return: response body as bytes.
        :raises requests.exceptions.RequestException: on connection
            failure or timeout.
        """
        result = requests.get(url_str, headers=self.headers, timeout=timeout)
        return result.content

    def save_content(self, html_content: bytes, page_num: int) -> None:
        """Write one downloaded page to ``./download/`` as an HTML file.

        :param html_content: raw page bytes to write.
        :param page_num: 1-based page number used in the file name.
        """
        download_dir = './download/'
        # Create the target directory on demand so a fresh checkout does
        # not fail with FileNotFoundError.
        os.makedirs(download_dir, exist_ok=True)
        file_name = download_dir + '{}--第{}頁.html'.format(self.name, page_num)
        with open(file_name, 'wb') as fb:
            fb.write(html_content)

    def run(self) -> None:
        """Download every page from ``make_url`` and save it in order."""
        # enumerate yields the page number directly; looking it up with
        # list.index() is a linear scan per page and returns the wrong
        # number if a URL ever appears twice.
        for page_num, url in enumerate(self.make_url(), start=1):
            html_content = self.download(url)
            self.save_content(html_content, page_num)


if __name__ == '__main__':
    qiushi = Qiushi('糗事百科')
    qiushi.run()
爬取成功後的結果