Scraping 智聯招聘 (Zhaopin) with Scrapy
阿新 · Published: 2019-01-11
Preparation
1. scrapy startproject Jobs
2. cd Jobs
3. scrapy genspider ZhaopinSpider www.zhaopin.com
4. scrapy crawl ZhaopinSpider
5. pip install diskcache
6. pip install tinydb
7. scrapy crawl ZhaopinSpider -o chongqing.json  (exports the scraped items; a quick check of this file is sketched below)
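Step 7 uses Scrapy's JSON feed exporter, which writes all yielded items into chongqing.json as a single JSON array. The snippet below is a minimal sanity check of that file (not part of the original post): it only prints the record count and the keys of the first record, since the exact field names depend on what the Zhaopin API returns.

# Quick sanity check of the feed file produced by:
#   scrapy crawl ZhaopinSpider -o chongqing.json
# Scrapy's JSON exporter writes one JSON array of item dicts.
import json

with open('chongqing.json', encoding='utf-8') as f:
    items = json.load(f)

print('records:', len(items))
if items:
    # Field names come from the fe-api response, plus the 'spiderName'
    # key added in parse_city().
    print('fields of first record:', sorted(items[0].keys()))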
ZhaopinSpider
# -*- coding: utf-8 -*-
import os
import json

from tinydb import TinyDB, Query
from furl import furl
import scrapy


class ZhaopinspiderSpider(scrapy.Spider):
    name = 'ZhaopinSpider'
    allowed_domains = ['www.zhaopin.com', 'sou.zhaopin.com', 'fe-api.zhaopin.com']
    start_urls = ['https://www.zhaopin.com/citymap']
    cache_db = TinyDB('ZhaopinSpider-cache.json')  # cache database
    allowed_cities = ['重慶', ]  # '成都', '上海', '深圳', '昆明', '杭州', '貴陽', '寧波']  ## cities allowed to be crawled
    F = furl('https://fe-api.zhaopin.com/c/i/sou?pageSize=90&kt=3')  # URL template
    PAGE_SIZE = 90  # page size

    def get_city_code(self, city_name):
        '''Get the city code (by city name)'''
        Q = Query()
        city = self.cache_db.get(Q.name.search(city_name))
        if isinstance(city, dict):
            return city['code']
        else:
            print('@' * 100)
            print(type(city))

    def init_city_info(self, response):
        '''Initialize the city information cache'''
        # Grab the <script> source that contains __INITIAL_STATE__
        script_text = response.xpath('//script[text()[contains(., "__INITIAL_STATE__")]]/text()').extract_first()
        # Strip leading/trailing whitespace
        script_text = script_text.strip()
        # Pre-process into JSON-conformant data (drop everything up to '=')
        script_json = script_text[script_text.index('=') + 1:]
        # Convert the JSON string into a dict
        script_dict = json.loads(script_json)
        '''
        # Save the fetched JSON to make debugging and inspection easier
        with open('text.json', 'wt', encoding='utf-8') as f:
            json.dump(script_dict, f, indent=4, ensure_ascii=False)
        '''
        '''
        city_list = []  # holds the city list
        # Flatten the cities in the dict into a list for easier lookup
        for ch in script_dict['cityList']['cityMapList']:
            city_list.extend(script_dict['cityList']['cityMapList'][ch])
        # Filter out Chongqing and get its city code
        city_code = (list(filter(lambda city: city['name'] == '重慶', city_list)) or [{'code': None}])[0]['code']
        '''
        for ch in script_dict['cityList']['cityMapList']:
            for city in script_dict['cityList']['cityMapList'][ch]:
                self.cache_db.insert(city)

    def parse(self, response):
        # if not os.path.exists('ZhaopinSpider-cache.json'):
        if not bool(self.cache_db.all()):
            self.init_city_info(response)
        # Iterate over every city to be crawled
        for city_name in self.allowed_cities:
            # Kick off the first request for this city
            # import ipdb; ipdb.set_trace()
            yield self.request_city(city_name)

    def request_city(self, city_name, page_start=0):
        '''Build the request object for crawling a specific city'''
        city_code = self.get_city_code(city_name)
        url_data = {
            'cityId': city_code,
            'kw': 'python',
            'start': page_start
        }
        # URL of the page to crawl
        url = self.F.copy().add(url_data).url
        # import ipdb; ipdb.set_trace()
        req = scrapy.Request(url, callback=self.parse_city, dont_filter=False)
        # Use meta to pass extra data; the callback can read it via response.meta
        req.meta['city_name'] = city_name
        req.meta['page_start'] = page_start
        return req

    def parse_city(self, response):
        '''Parse a specific result page'''
        # Parse the JSON response (response.text in newer Scrapy versions)
        resp_dict = json.loads(response.body_as_unicode())
        # Total number of records that can be crawled
        num_found = resp_dict['data']['numFound']
        # page_start of the current request
        page_start = response.meta['page_start']
        # start parameter needed for the next request
        next_start = page_start + self.PAGE_SIZE
        # import ipdb; ipdb.set_trace()
        # Check whether there is a next page
        if next_start < num_found:
            # City name of the current request
            city_name = response.meta['city_name']
            # Send the request for the next page
            yield self.request_city(city_name, page_start=next_start)
        # Parse the data
        for item in resp_dict['data']['results']:
            # TODO: parse the data and keep only the fields we need
            item['spiderName'] = self.name
            # Yield each record
            yield item
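The spider caches the city list scraped from the citymap page into ZhaopinSpider-cache.json via TinyDB, so later runs can skip init_city_info(). The standalone sketch below (my addition, not from the original post) queries that cache file the same way get_city_code() does; it assumes the spider has already run once and that each cached record has 'name' and 'code' keys, as the spider expects.

# Query the city cache outside the spider, using the same TinyDB lookup
# that ZhaopinspiderSpider.get_city_code() performs.
from tinydb import TinyDB, Query

db = TinyDB('ZhaopinSpider-cache.json')
Q = Query()

# Regex-style match on the 'name' field, same as in the spider
city = db.get(Q.name.search('重慶'))
if city is not None:
    print(city['name'], city['code'])
else:
    print('city not cached yet -- run the spider once to populate the cache')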