專案練習:電影列表爬蟲
阿新 • • 發佈:2018-12-02
# -*- coding:utf-8 -*-
# Author:Sure Feng

import json


class DoubanFileSpider(object):
    """Page through Douban movie search results and append them to a text file.

    For every URL template in ``start_tempt_url`` the spider requests
    successive pages, reads the ``"subjects"`` list from each JSON response,
    tags every entry with the template's country label and appends it to
    ``output_path`` as one JSON object per line.
    """

    # Items requested per page; must agree with ``page_limit=20`` in the
    # URL templates below, because a shorter page is treated as the last one.
    PAGE_SIZE = 20

    def __init__(self, output_path="douban.txt", timeout=10):
        """Set up the URL templates, request headers and output settings.

        Args:
            output_path: File that ``save_data`` appends to.
            timeout: Seconds to wait for the server before a request fails.
        """
        # Each template takes the ``page_start`` offset via ``str.format``.
        self.start_tempt_url = [
            {
                "url_tempt": "https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%8D%8E%E8%AF%AD&sort=recommend&page_limit=20&page_start={}",
                "country": "cn",
            },
            {
                "url_tempt": "https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%97%A5%E6%9C%AC&sort=recommend&page_limit=20&page_start={}",
                # NOTE(review): "janpan" looks like a typo for "japan"; kept
                # unchanged because the label is written to the output file.
                "country": "janpan",
            },
            {
                "url_tempt": "https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%AC%A7%E7%BE%8E&sort=recommend&page_limit=20&page_start={}",
                "country": "usa",
            },
        ]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Mobile Safari/537.36"
        }
        self.output_path = output_path
        self.timeout = timeout

    def parse_url(self, url):
        """Send a GET request and return the response body as text.

        Args:
            url: Fully formatted URL to fetch.

        Returns:
            The response content decoded as UTF-8.

        Raises:
            requests.RequestException: On connection failure, timeout or an
                HTTP error status.
        """
        # Imported here, in its only user, so the parsing and saving methods
        # stay importable when the HTTP dependency is not installed.
        import requests

        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of feeding an error page to the parser.
        response.raise_for_status()
        return response.content.decode()

    def save_data(self, list_str, country):
        """Append the entries to the output file, one JSON object per line.

        Args:
            list_str: Iterable of dicts describing movies.
            country: Label stored under the ``"country"`` key of each line.
        """
        with open(self.output_path, "a", encoding="utf-8") as f:
            for info in list_str:
                # Work on a copy so the caller's dicts are left untouched.
                record = dict(info)
                record["country"] = country
                f.write(json.dumps(record, ensure_ascii=False) + "\n")

    def get_content(self, json_str):
        """Extract the movie list from a JSON response body.

        Args:
            json_str: JSON text whose top-level object has a ``"subjects"`` key.

        Returns:
            The value stored under ``"subjects"``.

        Raises:
            json.JSONDecodeError: If ``json_str`` is not valid JSON.
            KeyError: If the decoded object has no ``"subjects"`` key.
        """
        return json.loads(json_str)["subjects"]

    def run(self):
        """Crawl every URL template page by page until a short page is seen."""
        for template in self.start_tempt_url:
            country = template["country"]
            page_start = 0
            while True:
                # Build the URL for the current page offset.
                start_url = template["url_tempt"].format(page_start)
                print(start_url)
                # Fetch, extract and persist this page.
                json_str = self.parse_url(start_url)
                content_list = self.get_content(json_str)
                self.save_data(content_list, country)
                # A page with fewer than PAGE_SIZE items is the last one.
                if len(content_list) < self.PAGE_SIZE:
                    break
                page_start += self.PAGE_SIZE


if __name__ == "__main__":
    douban_spider = DoubanFileSpider()
    douban_spider.run()