pyspider爬蟲框架之拉勾網招聘資訊爬取
阿新 • • 發佈:2019-01-27
需求
- 遍歷所有職位目錄
- 點選職位分類,進入之後按照地區抓取,職位名稱,釋出時間,薪酬,工作年限要求,學歷要求,招聘公司,所屬行業,所處輪次
- 進入職位詳情頁,抓取HR聊天意願(用時),簡歷處理,活躍時段。
程式碼
程式碼中有詳細的註解,這裡就不再一步一步講解了。另外需要注意:如果沒有搭建代理,請慎用本爬蟲,IP 很容易被封禁。
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-17 14:49:15
# Project: lagou
from pyspider.libs.base_handler import *
import re
import datetime
from pymongo import MongoClient
# MongoDB sink configuration: results are upserted into research.lagou_recruit.
DB_NAME = 'research'
DB_COL = 'lagou_recruit'
# BUG FIX: the original referenced an undefined name ``client`` (NameError at
# import time) -- the connection must be created first. Defaults to
# localhost:27017; point this at the real server if it runs elsewhere.
client = MongoClient()
db = client[DB_NAME]
col = db[DB_COL]
class Handler(BaseHandler):
    """Crawl job postings from lagou.com.

    Pipeline: home page -> job sub-category -> city -> district ->
    paginated AJAX search results -> job detail page. Each finished
    record is upserted into MongoDB by ``on_result``.

    NOTE(review): without a working proxy the site bans the crawling IP
    quickly -- keep ``proxy`` pointed at a live proxy service.
    """

    crawl_config = {
        "headers": {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
        },
        "proxy": "http://localhost:6666"  # locally hosted proxy service
    }

    url = 'https://www.lagou.com/'

    def format_date(self, date):
        """Parse a ``'YYYY-MM-DD'`` string into a ``datetime`` object."""
        return datetime.datetime.strptime(date, '%Y-%m-%d')

    def get_today(self):
        """Return today's date as a datetime with the time part zeroed."""
        return datetime.datetime.strptime(
            datetime.datetime.now().strftime('%Y-%m-%d'), '%Y-%m-%d')

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: schedule the home page once a day."""
        self.crawl(self.url, callback=self.index_page)

    @config(age=60)
    def index_page(self, response):
        """Walk the category menu on the home page; schedule one crawl per
        sub-category, carrying the sub-category name along in ``save``."""
        page = response.etree
        # Top-level category groups in the navigation menu.
        cat_list = page.xpath(
            "//div[@class='mainNavs']/div[@class='menu_box']/div[@class='menu_sub dn']/dl")
        for each in cat_list:
            # Big-category title (logged only).
            b_title = each.xpath("./dt/span/text()")[0]
            print('-----------', b_title, '------------')
            # Sub-category links under this group.
            sub_list = each.xpath("./dd/a")
            for sub in sub_list:
                sub_title = sub.xpath("./text()")[0]
                link_url = sub.xpath("./@href")[0]
                print(sub_title, ' ', link_url)
                save = {"belonging": sub_title}
                self.crawl(link_url, callback=self.parse_categories, save=save)

    @config(age=60)
    def parse_categories(self, response):
        """For one sub-category page, schedule one listing crawl per city."""
        page = response.etree
        base_url = "https://www.lagou.com/jobs/list_"
        # City links; drop the leading "nationwide" entry and the trailing
        # "more" entry.
        city_list = page.xpath(
            "//div[@class='details']/div[@class='has-more']/div[@class='more more-positions workPosition']/li/a")[1:-1]
        for each in city_list:
            city = each.xpath("./text()")[0]
            print(city)
            link_url = base_url + response.save["belonging"]
            params = {"px": "default",
                      "city": city}
            save = {"belonging": response.save["belonging"], "city": city}
            self.crawl(link_url, callback=self.parse_city,
                       params=params, save=save)

    @config(age=60)
    def parse_city(self, response):
        """For one city listing page, schedule one crawl per district."""
        page = response.etree
        # District links; skip the first entry ("no preference").
        district_list = page.xpath(
            "//div[@class='contents' and @data-type='district']/a")[1:]
        print(response.url)
        for num, each in enumerate(district_list):
            district = each.xpath("./text()")[0]
            print(district)
            params = {
                "district": district
            }
            # The fragment makes every task URL unique so pyspider does not
            # de-duplicate requests that differ only in ``params``.
            link_url = response.url + "#%s" % num
            save = {"belonging": response.save["belonging"],
                    "city": response.save["city"],
                    "district": district}
            self.crawl(link_url, callback=self.parse_district,
                       params=params, save=save)

    @config(age=60)
    def parse_district(self, response):
        """Read the pager for one district and schedule every page of the
        position-search AJAX endpoint."""
        page = response.etree
        # Without these headers the AJAX endpoint returns no data.
        headers = {"Host": "www.lagou.com",
                   "Origin": "https://www.lagou.com",
                   "Referer": response.url,
                   "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
                   }
        base_url = 'https://www.lagou.com/jobs/positionAjax.json'
        # Total page count lives in the pager widget; if it is absent or
        # malformed there is nothing to crawl for this district.
        try:
            page_num = int(page.xpath(
                "//div[@class='page-number']/span[@class='span totalNum']/text()")[0])
            print(page_num)
        except (IndexError, ValueError):  # was a bare ``except:``
            return
        for each in range(1, page_num + 1):
            data = {"first": "false",
                    "pn": each,
                    "kd": response.save["belonging"]
                    }
            params = {"px": "default",
                      "city": response.save["city"],
                      "district": response.save["district"],
                      "needAddtionalResult": "false"
                      }
            # Unique fragment per page so pyspider schedules every page.
            link_url = base_url + "#%s" % each
            self.crawl(link_url, callback=self.parse_page, params=params,
                       method='POST', data=data, save=response.save,
                       headers=headers)

    @config(age=60)
    def parse_page(self, response):
        """Parse one JSON page of search results; schedule the detail page
        of every position, carrying all listing fields in ``save``."""
        page = response.json
        base_url = 'https://www.lagou.com/jobs/{}.html'
        # Result list for this page.
        contents = page["content"]["positionResult"]["result"]
        print(contents)
        for each in contents:
            # Position title.
            position_name = each["positionName"]
            print(position_name)
            # Publication time arrives in relative forms; normalise each of
            # them to 'YYYY-MM-DD'.
            public_time = each["formatCreateTime"]
            print(public_time)
            if re.findall(r'\d+:\d+', public_time):
                # "HH:MM" means posted today.
                public_time = datetime.datetime.now().strftime('%Y-%m-%d')
                print(public_time)
            if re.findall(r'(\d+)天前', public_time):
                # "N days ago".
                delta = int(re.findall(r'(\d+)天前', public_time)[0])
                public_time = (datetime.datetime.now()
                               + datetime.timedelta(days=-delta)).strftime('%Y-%m-%d')
                print(public_time)
            if re.findall(u'昨天', public_time):
                # "yesterday".
                public_time = (datetime.datetime.now()
                               + datetime.timedelta(days=-1)).strftime('%Y-%m-%d')
                print(public_time)
            # Pay range.
            salary = each["salary"]
            print(salary)
            # Required years of experience.
            experience = each["workYear"]
            print(experience)
            # Required education level.
            education = each["education"]
            print(education)
            # Hiring company.
            company = each["companyFullName"]
            print(company)
            # Company's industry.
            company_belong = each["industryField"]
            print(company_belong)
            # Funding round.
            rounds = each["financeStage"]
            print(rounds)
            # Perks; ``or []`` guards against a null label list in the JSON.
            welfare = '-'.join(each["companyLabelList"] or [])
            print(welfare)
            print('----------------------------------------')
            save = {"belonging": response.save["belonging"],
                    "city": response.save["city"],
                    "district": response.save["district"],
                    "position_name": position_name,
                    "public_time": public_time,
                    "salary": salary,
                    "experience": experience,
                    "education": education,
                    "company": company,
                    "company_belong": company_belong,
                    "rounds": rounds,
                    "welfare": welfare
                    }
            position_id = each["positionId"]
            link_url = base_url.format(position_id)
            self.crawl(link_url, callback=self.parse_detail, save=save)

    def parse_detail(self, response):
        """Scrape HR responsiveness stats from a job detail page and yield
        the combined record for ``on_result``."""
        page = response.etree
        try:
            # HR info panel: chat willingness (response time), resume
            # handling rate, active hours.
            hr_info = page.xpath(
                "//dd[@class='jd_publisher']/div/div[@class='publisher_data']")[0]
            chat_will = hr_info.xpath("./div[1]/span[@class='data']/text()")[0]
            print(chat_will)
            resume_processing = hr_info.xpath("./div[2]/span[@class='data']/text()")[0]
            print(resume_processing)
            active_time = hr_info.xpath("./div[3]/span[@class='data']/text()")[0]
            print(active_time)
        except IndexError:  # panel missing -> leave all three fields blank
            chat_will = ''
            resume_processing = ''
            active_time = ''
        result = {"belonging": response.save["belonging"],
                  "city": response.save["city"],
                  "district": response.save["district"],
                  "position_name": response.save["position_name"],
                  "public_time": self.format_date(response.save["public_time"]),
                  "salary": response.save["salary"],
                  "experience": response.save["experience"],
                  "education": response.save["education"],
                  "company": response.save["company"],
                  "company_belong": response.save["company_belong"],
                  "rounds": response.save["rounds"],
                  "welfare": response.save["welfare"],
                  "chat_will": chat_will,
                  "resume_processing": resume_processing,
                  "active_time": active_time,
                  "update_time": datetime.datetime.now(),
                  "date": self.get_today()
                  }
        yield result

    def on_result(self, result):
        """Upsert each crawled record into MongoDB, keyed on the fields
        that identify a unique posting."""
        if result is None:
            return
        update_key = {
            'position_name': result['position_name'],
            'public_time': result['public_time'],
            'city': result['city'],
            'district': result['district'],
            'company': result['company'],
            'belonging': result['belonging']
        }
        # ``Collection.update`` is deprecated in pymongo 3.x; ``update_one``
        # with the same filter/$set/upsert is the exact replacement.
        col.update_one(update_key, {'$set': result}, upsert=True)