
Scraping Lagou Job Postings with the pyspider Crawler Framework

Requirements
  1. Walk the complete job-category directory.
  2. Click into each job category and scrape listings by region: position name, publish date, salary, required working experience, required education, hiring company, industry, and funding round.
  3. Open each position's detail page and scrape the HR's willingness to chat (response time), résumé-processing rate, and active hours. (A sample of the record each posting ends up as is sketched after this list.)
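For orientation, here is a hypothetical sample of one stored record. The field names are taken from the spider code further down; all values are invented placeholders (update_time and date are added at store time):

record = {
    "belonging": "数据开发",        # job sub-category
    "city": "北京",
    "district": "海淀区",
    "position_name": "数据工程师",
    "public_time": "2018-08-16",    # publish date, normalized to YYYY-MM-DD
    "salary": "15k-30k",
    "experience": "3-5年",          # required working years
    "education": "本科",            # required education
    "company": "XX科技有限公司",
    "company_belong": "移动互联网", # company's industry
    "rounds": "B轮",                # funding round
    "welfare": "弹性工作-年终奖",
    "chat_will": "1天内",           # HR willingness to chat (response time)
    "resume_processing": "85%",     # résumé-processing rate
    "active_time": "上午",          # HR active hours
}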
Code

The code is commented in detail, so I won't walk through it step by step. One further caveat: if you don't have a proxy, use this with care, since your IP can easily get banned. (A quick proxy sanity check is sketched below.)
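Before launching the spider, it may be worth confirming that the proxy actually forwards traffic. A minimal sanity check with the requests library; the address matches the crawl_config entry in the code below, so adjust it to your own proxy:

import requests

PROXY = "http://localhost:6666"  # same address as crawl_config["proxy"] in the spider below

try:
    r = requests.get("https://www.lagou.com/",
                     proxies={"http": PROXY, "https": PROXY},
                     timeout=10)
    print("proxy OK, status:", r.status_code)
except requests.RequestException as e:
    print("proxy check failed:", e)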

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-17 14:49:15
# Project: lagou

from pyspider.libs.base_handler import *
import re
import datetime

from pymongo import MongoClient

DB_NAME = 'research'
DB_COL = 'lagou_recruit'
client = MongoClient()  ## the original snippet never defined `client`; this assumes a local MongoDB on the default port
db = client[DB_NAME]
col = db[DB_COL]


class Handler(BaseHandler):
    crawl_config = {
        "headers": {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
        },
        "proxy": "http://localhost:6666"  ## self-hosted proxy service
    }

    url = 'https://www.lagou.com/'

    def format_date(self, date):
        return datetime.datetime.strptime(date, '%Y-%m-%d')

    def get_today(self):
        return datetime.datetime.strptime(
            datetime.datetime.now().strftime('%Y-%m-%d'), '%Y-%m-%d')

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl(self.url, callback=self.index_page)

    @config(age=60)
    def index_page(self, response):
        page = response.etree
        ## job-category list
        cat_list = page.xpath("//div[@class='mainNavs']/div[@class='menu_box']/div[@class='menu_sub dn']/dl")
        ## iterate over the categories
        for each in cat_list:
            ## top-level category
            b_title = each.xpath("./dt/span/text()")[0]
            print('-----------', b_title, '------------')
            ## sub-category list
            sub_list = each.xpath("./dd/a")
            for sub in sub_list:
                sub_title = sub.xpath("./text()")[0]
                link_url = sub.xpath("./@href")[0]
                print(sub_title, ' ', link_url)
                save = {"belonging": sub_title}
                self.crawl(link_url, callback=self.parse_categories, save=save)

    @config(age=60)
    def parse_categories(self, response):
        page = response.etree
        base_url = "https://www.lagou.com/jobs/list_"
        ## city list; drop the leading "全国" entry and the trailing one
        city_list = page.xpath("//div[@class='details']/div[@class='has-more']/div[@class='more more-positions workPosition']/li/a")[1:-1]
        ## iterate over the cities
        for each in city_list:
            city = each.xpath("./text()")[0]
            print(city)
            link_url = base_url + response.save["belonging"]
            params = {"px": "default", "city": city}
            save = {"belonging": response.save["belonging"], "city": city}
            self.crawl(link_url, callback=self.parse_city, params=params, save=save)

    @config(age=60)
    def parse_city(self, response):
        page = response.etree
        ## district list; drop the leading "不限" (no limit) entry
        district_list = page.xpath("//div[@class='contents' and @data-type='district']/a")[1:]
        print(response.url)
        ## iterate over the districts
        for num, each in enumerate(district_list):
            district = each.xpath("./text()")[0]
            print(district)
            params = {"district": district}
            link_url = response.url + "#%s" % num
            save = {"belonging": response.save["belonging"],
                    "city": response.save["city"],
                    "district": district}
            self.crawl(link_url, callback=self.parse_district, params=params, save=save)

    @config(age=60)
    def parse_district(self, response):
        page = response.etree
        ## without these request headers the JSON endpoint returns no data
        headers = {"Host": "www.lagou.com",
                   "Origin": "https://www.lagou.com",
                   "Referer": response.url,
                   "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"}
        base_url = 'https://www.lagou.com/jobs/positionAjax.json'
        ## pagination
        try:
            page_num = int(page.xpath("//div[@class='page-number']/span[@class='span totalNum']/text()")[0])
            print(page_num)
        except:
            return
        for each in range(1, page_num + 1):
            data = {"first": "false",
                    "pn": each,
                    "kd": response.save["belonging"]}
            params = {"px": "default",
                      "city": response.save["city"],
                      "district": response.save["district"],
                      "needAddtionalResult": "false"}
            link_url = base_url + "#%s" % each
            self.crawl(link_url, callback=self.parse_page, params=params,
                       method='POST', data=data, save=response.save, headers=headers)

    @config(age=60)
    def parse_page(self, response):
        page = response.json
        base_url = 'https://www.lagou.com/jobs/{}.html'
        ## result list
        contents = page["content"]["positionResult"]["result"]
        print(contents)
        ## iterate over the postings
        for each in contents:
            ## position name
            position_name = each["positionName"]
            print(position_name)
            ## publish time, normalized to YYYY-MM-DD
            public_time = each["formatCreateTime"]
            print(public_time)
            if re.findall('\d+:\d+', public_time):  ## "HH:MM" means posted today
                public_time = datetime.datetime.now().strftime('%Y-%m-%d')
                print(public_time)
            if re.findall(u'(\d+)天前', public_time):  ## "N天前" means N days ago
                delta = int(re.findall(u'(\d+)天前', public_time)[0])
                public_time = (datetime.datetime.now() + datetime.timedelta(days=-delta)).strftime('%Y-%m-%d')
                print(public_time)
            if re.findall(u'昨天', public_time):  ## "昨天" means yesterday
                public_time = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y-%m-%d')
                print(public_time)
            ## salary
            salary = each["salary"]
            print(salary)
            ## required working experience
            experience = each["workYear"]
            print(experience)
            ## required education
            education = each["education"]
            print(education)
            ## company
            company = each["companyFullName"]
            print(company)
            ## company's industry
            company_belong = each["industryField"]
            print(company_belong)
            ## funding round
            rounds = each["financeStage"]
            print(rounds)
            ## benefits
            welfare = '-'.join(each["companyLabelList"])
            print(welfare)
            print('----------------------------------------')
            save = {"belonging": response.save["belonging"],
                    "city": response.save["city"],
                    "district": response.save["district"],
                    "position_name": position_name,
                    "public_time": public_time,
                    "salary": salary,
                    "experience": experience,
                    "education": education,
                    "company": company,
                    "company_belong": company_belong,
                    "rounds": rounds,
                    "welfare": welfare}
            position_id = each["positionId"]
            link_url = base_url.format(position_id)
            self.crawl(link_url, callback=self.parse_detail, save=save)

    def parse_detail(self, response):
        page = response.etree
        try:
            ## the HR block on the detail page
            hr_info = page.xpath("//dd[@class='jd_publisher']/div/div[@class='publisher_data']")[0]
            ## willingness to chat (response time)
            chat_will = hr_info.xpath("./div[1]/span[@class='data']/text()")[0]
            print(chat_will)
            ## résumé-processing rate
            resume_processing = hr_info.xpath("./div[2]/span[@class='data']/text()")[0]
            print(resume_processing)
            ## active hours
            active_time = hr_info.xpath("./div[3]/span[@class='data']/text()")[0]
            print(active_time)
        except:
            chat_will = ''
            resume_processing = ''
            active_time = ''
        result = {"belonging": response.save["belonging"],
                  "city": response.save["city"],
                  "district": response.save["district"],
                  "position_name": response.save["position_name"],
                  "public_time": self.format_date(response.save["public_time"]),
                  "salary": response.save["salary"],
                  "experience": response.save["experience"],
                  "education": response.save["education"],
                  "company": response.save["company"],
                  "company_belong": response.save["company_belong"],
                  "rounds": response.save["rounds"],
                  "welfare": response.save["welfare"],
                  "chat_will": chat_will,
                  "resume_processing": resume_processing,
                  "active_time": active_time,
                  "update_time": datetime.datetime.now(),
                  "date": self.get_today()}
        yield result

    def on_result(self, result):
        if result is None:
            return
        update_key = {'position_name': result['position_name'],
                      'public_time': result['public_time'],
                      'city': result['city'],
                      'district': result['district'],
                      'company': result['company'],
                      'belonging': result['belonging']}
        col.update(update_key, {'$set': result}, upsert=True)