前程無憂
阿新 • Published: 2018-09-03
# -*- coding: utf-8 -*-
import scrapy
import re

from zhaopin_project.items import LagouItem


class QianchengwuyouSpider(scrapy.Spider):
    name = 'qianchengwuyou'
    allowed_domains = ['51job.com']
    start_urls = ['http://51job.com/']

    def parse(self, response):
        # Generate the search-result pages (pages 1 to 1619 of the Beijing listings)
        for i in range(1, 1620):
            base_url = 'https://search.51job.com/list/010000,000000,0000,32,9,99,%2B,2,{}.html'.format(i)
            yield scrapy.Request(base_url, callback=self.parse_detail)

    def parse_detail(self, response):
        # Collect the link of every job posting on the listing page
        html_str = response.xpath('//div[@class="el"]/p/span/a/@href').extract()
        for html_list in html_str:
            yield scrapy.Request(html_list, callback=self.parse_list)

    def parse_list(self, response):
        try:
            # Job title
            title = response.xpath('//div[@class="cn"]/h1/text()').extract_first()
            # Monthly salary
            salary = response.xpath('//div[@class="cn"]/strong/text()').extract_first()
            # The title attribute of the "msg ltype" paragraph holds
            # location | experience | education | ... | posting date
            p = re.findall(r'<p class="msg ltype" title="(.*)">', response.text)[0]
            ss = p.split(' | ')
            # Location
            position = ss[0]
            # Experience requirement
            jingyan = ss[1]
            # Education requirement (some postings omit it, leaving only four fields)
            if len(ss) == 4:
                xueli = '學歷不限'  # "no education requirement"
            else:
                xueli = ss[2]
            # Posting date
            shijian = ss[-1]
            # Source site
            fabu = '前程無憂'
            # Job description
            job_bt = response.xpath('//div[@class="tBorderTop_box"]/div/p/text()').extract()
            job_bt = ''.join(job_bt)

            item = LagouItem()
            item['title'] = title
            item['salary'] = salary
            item['position'] = position
            item['jingyan'] = jingyan
            item['xueli'] = xueli
            item['shijian'] = shijian
            item['fabu'] = fabu
            item['job_bt'] = job_bt
            yield item
        except:
            pass
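The spider imports LagouItem from zhaopin_project.items, but the item class itself is not shown in the post. A minimal sketch of what that items.py would need to contain for the field assignments above to work is given below; the field names are inferred from the spider, and the comments are assumptions based on how each field is filled.

# zhaopin_project/items.py -- minimal sketch, not the author's original file;
# only the field names can be inferred from the spider above.
import scrapy


class LagouItem(scrapy.Item):
    title = scrapy.Field()     # job title
    salary = scrapy.Field()    # monthly salary
    position = scrapy.Field()  # location
    jingyan = scrapy.Field()   # experience requirement
    xueli = scrapy.Field()     # education requirement
    shijian = scrapy.Field()   # posting date
    fabu = scrapy.Field()      # source site (前程無憂)
    job_bt = scrapy.Field()    # job description

With the project set up, the spider can be run with the standard Scrapy CLI, for example scrapy crawl qianchengwuyou -o jobs.csv to export the scraped items to a CSV file.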