
Scraping Boss Zhipin job postings with the pyspider crawler framework

Requirements

1. Traverse every job category on the home page.
2. Follow each category into its listing page and, region by region, scrape the position name, monthly salary, experience requirement, education requirement, hiring company, industry, funding round, headcount (company size), and publish time (each step passes its fields down the crawl chain; see the sketch after this list).
3. Follow each listing into the job detail page and scrape the position's skill tags.
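
Each level of the crawl (category → city → district → area → listing page → job detail) needs the fields scraped at the levels above it. pyspider handles this with the save argument to self.crawl(): whatever dict you pass reappears on the next response as response.save. A minimal sketch of that pattern, with a hypothetical URL and field names rather than Boss Zhipin's real page structure:

from pyspider.libs.base_handler import *

class SaveChainDemo(BaseHandler):
    def on_start(self):
        # hypothetical start URL, for illustration only
        self.crawl('https://example.com/categories', callback=self.parse_category,
                   save={'category': 'IT'})

    def parse_category(self, response):
        # response.save is the dict passed above; extend it and hand it on
        save = dict(response.save, city='Beijing')
        self.crawl('https://example.com/listings', callback=self.parse_listing,
                   save=save)

    def parse_listing(self, response):
        # by the last callback the record has accumulated every level's fields
        print(response.save['category'], response.save['city'])

The real spider below does exactly this, threading belonging, city, district, and area all the way down to parse_body.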

Code

The code is commented throughout. Use it with caution if you do not have a proxy.

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-06 10:40:07
# Project: boss_recruit

from pyspider.libs.base_handler import *
import re
import datetime
from pymongo import MongoClient

# Connect to the offline MongoDB instance.
# The account lives in the admin database: connect, authenticate, then switch DB.
# NOTE: the original post omits the connection details; the host/port below are
# placeholders -- adjust them (and add authentication) for your own setup.
client = MongoClient('localhost', 27017)

DB_NAME = 'research'
DB_COL = 'boss_recruit'
db = client[DB_NAME]
col = db[DB_COL]


def get_proxy():
    # The original post never shows the definition of get_proxy(); this is a
    # minimal stand-in that returns the same static proxy as crawl_config.
    return 'localhost:6666'


class Handler(BaseHandler):
    crawl_config = {
        "headers": {
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
        },
        "proxy": "http://localhost:6666"
    }

    url = 'https://www.zhipin.com/?ka=header-home'

    def format_date(self, date):
        return datetime.datetime.strptime(date, '%Y%m%d')

    @every(minutes=24 * 60)
    def on_start(self):
        print(get_proxy())
        self.crawl(self.url, callback=self.index_page, proxy=get_proxy())

    @config(age=60)
    def index_page(self, response):
        page = response.etree
        base_url = 'https://www.zhipin.com'
        # all job categories on the home page
        vocation_list = page.xpath("//div[@class='job-menu']//div[@class='menu-sub']/ul/li")
        for each in vocation_list:
            belong = each.xpath("./h4/text()")[0]
            detail_list = each.xpath("./div[@class='text']/a")
            print(belong)
            for detail in detail_list:
                detail_title = detail.xpath("./text()")[0]
                detail_url = base_url + detail.xpath("./@href")[0]
                # save = {"belonging": [belong, detail_title]}
                save = {"belonging": detail_title}
                print(detail_title, detail_url)
                self.crawl(detail_url, callback=self.detail_page, save=save, proxy=get_proxy())

    @config(age=60)
    def detail_page(self, response):
        page = response.etree
        base_url = 'https://www.zhipin.com'
        # city list; [1:] skips the "全國" (nationwide) entry
        city_list = page.xpath("//div[@class='condition-box']/dl[@class='condition-city show-condition-district']/dd/a[@ka]")[1:]
        for each in city_list:
            city_name = each.xpath("./text()")[0]
            city_url = base_url + each.xpath("./@href")[0]
            params = {"ka": each.xpath("./@ka")[0]}
            save = {"city": city_name, "belonging": response.save["belonging"]}
            self.crawl(city_url, callback=self.parse_city, params=params, save=save, proxy=get_proxy())

    @config(age=60)
    def parse_city(self, response):
        page = response.etree
        base_url = 'https://www.zhipin.com'
        # districts within this city
        district_list = page.xpath("//div[@class='condition-box']/dl[@class='condition-district show-condition-district']/dd/a[position()>1]")
        for each in district_list:
            district_name = each.xpath("./text()")[0]
            print(district_name)
            district_url = base_url + each.xpath("./@href")[0]
            params = {"ka": each.xpath("./@ka")[0]}
            save = {"district": district_name,
                    "city": response.save["city"],
                    "belonging": response.save["belonging"]}
            self.crawl(district_url, callback=self.parse_district, params=params, save=save, proxy=get_proxy())

    @config(age=60)
    def parse_district(self, response):
        page = response.etree
        base_url = 'https://www.zhipin.com'
        # business areas within this district
        area_list = page.xpath("//div[@class='condition-box']/dl[@class='condition-area show-condition-area']/dd/a[position()>1]")
        for each in area_list:
            area_name = each.xpath("./text()")[0]
            print(area_name)
            area_url = base_url + each.xpath("./@href")[0]
            params = {"ka": each.xpath("./@ka")[0]}
            save = {"area": area_name,
                    "district": response.save["district"],
                    "city": response.save["city"],
                    "belonging": response.save["belonging"],
                    "base_url": area_url,
                    "page_num": 1}
            self.crawl(area_url, callback=self.parse_content, params=params, save=save, proxy=get_proxy())

    @config(age=60)
    def parse_page(self, response):
        # pagination helper; apparently unused in the original post --
        # parse_content schedules the next page itself (see below)
        page = response.etree
        page_url = response.save.pop("base_url")
        page_num = 10
        print(page_url)
        for each in range(1, page_num + 1):
            ka = 'page-{}'.format(each)
            params = {"page": each, "ka": ka}
            self.crawl(page_url, callback=self.parse_content, params=params, save=response.save)

    @config(age=60)
    def parse_content(self, response):
        page = response.etree
        base_url = 'https://www.zhipin.com'
        page_url = response.save.get("base_url")
        # job listing entries
        content_list = page.xpath("//div[@class='job-list']/ul/li")
        # stop if the page has no listings
        if not content_list:
            return
        for each in content_list:
            # position name
            position_name = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/h3[@class='name']/a/div[@class='job-title']/text()")[0]
            # salary
            salary = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/h3[@class='name']/a/span/text()")[0]
            # experience requirement
            experience = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/p//text()")[1]
            # education requirement
            education = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/p//text()")[2]
            # company
            company = each.xpath("./div[@class='job-primary']/div[@class='info-company']/div[@class='company-text']/h3[@class='name']/a/text()")[0]
            # funding round and company size: the <p> holds either
            # [industry, round, size] or just [industry, size]
            company_info = each.xpath("./div[@class='job-primary']/div[@class='info-company']/div[@class='company-text']/p//text()")
            if len(company_info) == 3:
                rounds = company_info[1]
                scale = company_info[2]
            else:
                rounds = ''
                scale = company_info[1]
            # publish time, normalized to %Y%m%d
            public_time = each.xpath("./div[@class='job-primary']/div[@class='info-publis']/p/text()")[0]
            if ''.join(re.findall(u'昨天', public_time)):
                # "昨天" means yesterday
                public_time = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y%m%d')
            elif ''.join(re.findall(r'\d+:\d+', public_time)):
                # a bare clock time means today
                public_time = datetime.datetime.now().strftime('%Y%m%d')
            else:
                # "X月X日"; zero-pad month and day so strptime('%Y%m%d') can parse
                # the result (the year is hard-coded to 2018 in the original)
                month, day = re.findall(u'(\d+)月(\d+)日', public_time)[0]
                public_time = '2018' + month.zfill(2) + day.zfill(2)
            print(public_time)
            # link to the job detail page
            position_url = base_url + each.xpath("./div[@class='job-primary']/div[@class='info-primary']/h3[@class='name']/a/@href")[0]
            print(position_url)
            save = {"area": response.save["area"],
                    "district": response.save["district"],
                    "city": response.save["city"],
                    "belonging": response.save["belonging"],
                    "position_name": position_name,
                    "salary": salary,
                    "experience": experience,
                    "education": education,
                    "company": company,
                    "rounds": rounds,
                    "scale": scale,
                    "public_time": public_time}
            # crawl the job detail page
            self.crawl(position_url, callback=self.parse_body, save=save, proxy=get_proxy())
        # pagination: schedule the next page (up to 10 pages per area)
        page_num = response.save.get('page_num')
        print(page_num)
        page_num += 1
        if page_num <= 10:
            ka = 'page-{}'.format(page_num)
            params = {"page": page_num, "ka": ka}
            response.save.update({"page_num": page_num})
            self.crawl(page_url, callback=self.parse_content, params=params, save=response.save, proxy=get_proxy())

    def parse_body(self, response):
        page = response.etree
        print(response.save["public_time"])
        # skill requirements from the job description
        skill = ''.join(page.xpath("//div[@class='detail-content']/div[@class='job-sec'][1]//text()")).strip()
        print(skill)
        result = {"skill": skill,
                  "area": response.save["area"],
                  "district": response.save["district"],
                  "city": response.save["city"],
                  "belonging": response.save["belonging"],
                  "position_name": response.save["position_name"],
                  "salary": response.save["salary"],
                  "experience": response.save["experience"],
                  "education": response.save["education"],
                  "company": response.save["company"],
                  "rounds": response.save["rounds"],
                  "scale": response.save["scale"],
                  "public_time": self.format_date(response.save["public_time"]),
                  "update_time": datetime.datetime.now()}
        yield result

    def on_result(self, result):
        if result is None:
            return
        update_key = {
            'position_name': result['position_name'],
            'public_time': result['public_time'],
            'city': result['city'],
            'district': result['district'],
            'area': result['area'],
            'company': result['company']
        }
        # update_one replaces the deprecated collection.update from the original
        col.update_one(update_key, {'$set': result}, upsert=True)