Crawler --------- Scrapy ------ crawling via the browser
阿新 • Published: 2018-11-17
# -*- coding: utf-8 -*-
import scrapy
# from scrapy.linkextractors import LinkExtractor
# from scrapy.spiders import CrawlSpider, Rule
from Zhilian.items import ZhilianItem


class ZhilianSpider(scrapy.Spider):
    name = 'zhilian'
    allowed_domains = ['zhaopin.com']
    # build one search-result URL per page in the range entered at startup
    start_urls = ["https://sou.zhaopin.com/?pageSize=60&jl=北京" + "&kw=python" + "&kt=3&p=" + str(i)
                  for i in range(int(input("start: ")), int(input("end: ")))]

    # rules = (
    #     Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    # )

    def parse(self, response):
        # each child div of #listContent is one job posting on the list page
        job_list = response.xpath("//div[@id='listContent']/div")
        # print(job_list)
        for job in job_list:
            item = ZhilianItem()
            item["name"] = job.xpath(".//span/@title").extract_first()
            item["salary"] = job.xpath(".//p/text()").extract_first()
            item["fuli"] = job.xpath(".//div[contains(@class,'welfare')]/text()").extract()
            item["address"] = job.xpath(".//ul/li[1]/text()").extract_first()
            item["jingyan"] = job.xpath(".//li[contains(@class,'demand')][2]/text()").extract_first()
            item["company"] = job.xpath(".//div/a/@title").extract_first()
            # grab the link to the job's detail page
            next_url = job.xpath(".//div[contains(@class,'jobName')]//a/@href").extract_first()
            # yield item
            # hand the half-filled item to the detail-page callback via meta
            yield scrapy.Request(url=next_url, callback=self.parse_next, meta={"item": item})

    def parse_next(self, response):
        item = response.meta["item"]
        # fill in the two detail-page fields, joining paragraphs with newlines
        item["job_info"] = "\n".join(response.xpath("//div[@class='pos-ul']/p").extract())
        item["company_info"] = "\n".join(response.xpath("//div[@class='intro-content']/p/text()").extract())
        yield item
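The spider imports ZhilianItem from Zhilian.items, but the item class itself is not shown in the post. A minimal sketch of what that items.py could look like, assuming it declares exactly the eight fields the spider assigns (name, salary, fuli, address, jingyan, company, job_info, company_info) and nothing else:

# Zhilian/items.py -- hypothetical sketch; field names taken from the spider above
import scrapy


class ZhilianItem(scrapy.Item):
    name = scrapy.Field()          # job title
    salary = scrapy.Field()        # salary text
    fuli = scrapy.Field()          # welfare / benefits tags
    address = scrapy.Field()       # work location
    jingyan = scrapy.Field()       # required experience
    company = scrapy.Field()       # company name
    job_info = scrapy.Field()      # job description from the detail page
    company_info = scrapy.Field()  # company introduction from the detail page

With the item defined, the spider can be run from the project root in the usual way, for example scrapy crawl zhilian -o jobs.json, which writes every yielded item to a JSON file.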