Scraping Zhilian (智聯) job listings with the Scrapy framework plus a simulated browser
Posted by 阿新 on 2018-12-25
Zhilian's pages are rendered dynamically with JavaScript, so an ordinary fetch only returns the page as it is before the scripts run. To get the fully loaded page, we drive a simulated browser and take the HTML it has rendered.
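A quick way to see the difference for yourself (a minimal sketch: it assumes the requests package is installed, geckodriver is on the PATH, and that the 'contentpile' class used by the job list later in this post is still present in the live markup):

import requests
from selenium import webdriver

url = 'https://sou.zhaopin.com/?jl=489&kw=python&kt=3'

# html as delivered, before any javascript runs
raw_html = requests.get(url).text

# html after the browser has executed the scripts
browser = webdriver.Firefox()
browser.get(url)
rendered_html = browser.page_source
browser.quit()

# the job list markup only shows up in the rendered version
print('contentpile' in raw_html)       # expected: False
print('contentpile' in rendered_html)  # expected: True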
The code below is only a simple implementation covering part of the Zhilian page; extend it yourself as needed.
Middleware (middlewares.py) code:
from scrapy.http import HtmlResponse
from selenium import webdriver
import time
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.options import Options

class SeleniumMiddleware(object):
    def __init__(self):
        self.options = Options()
        # self.options.add_argument('-headless')
        # self.browser = webdriver.Chrome(executable_path=r"D:\python_others\Spider\code\day06\tools\chromedriver.exe", chrome_options=self.options)
        self.browser = webdriver.Firefox(executable_path=r"D:\python_others\Spider\code\day06\tools\geckodriver.exe", firefox_options=self.options)

    def process_request(self, request, spider):
        if int(request.meta['page']) == 2:
            # run javascript to scroll the browser to the bottom of the page
            self.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(3)
            div = self.browser.find_element_by_css_selector(".soupager")
            # find_elements (plural) is needed here: we want the second button, "next page"
            next_page = div.find_elements_by_tag_name("button")
            next_page[1].click()
            # page = self.browser.find_element_by_xpath('//*[@id="pagination_content"]/div/button[2]')
            # page.click()
            # time.sleep(10)
        elif int(request.meta['page']) == 0:
            try:
                print("url is ::::", request.url)
                self.browser.get(request.url)
            except TimeoutError:
                print("request timed out")
        time.sleep(5)
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf-8", request=request)

    # If you still want some requests to go through Scrapy's normal downloader while
    # using the simulated browser, just have the middleware return None for them
    # instead of returning a response.
    # If a page keeps showing the loading spinner forever, stopping the load is enough
    # to make the content appear; do it with:
    # self.browser.execute_script('window.stop()')
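Scrapy only uses this middleware if it is enabled in the project's settings.py. A minimal sketch; the package path your_project.middlewares is an assumption, so replace it to match your actual project layout:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # 'your_project' is a placeholder for the real package name
    'your_project.middlewares.SeleniumMiddleware': 543,
}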
Spider file (spider.py) code:
# -*- coding: utf-8 -*-
import time

import scrapy
import lxml.html
from scrapy import Request

class JobDes(object):
    def __init__(self):
        self.detail_url = ""
        self.title = ""

def parse_lxml_zhilian(html_str):
    tree = lxml.html.fromstring(html_str)
    job_url = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@href')
    job_name = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@title')
    print(job_url)
    print(job_name)
    return job_url

# global counter used to judge whether paging runs far ahead of the slower
# downloading of detail pages
count = 0

class ZhaopinSpider(scrapy.Spider):
    name = 'zhaopin'
    # allowed_domains = ['ts.zhaopin.com']
    # start_urls = ['http://ts.zhaopin.com/']

    def start_requests(self):
        url_str = 'https://sou.zhaopin.com/?jl=489&kw=python&kt=3'
        yield Request(url=url_str, callback=self.parse, meta={"page": "0"})

    def parse(self, response):
        # use the simulated browser to page through the ajax-loaded results and
        # scrape the pages it brings up
        # the tags are not set in stone: be careful with numbered css selectors
        # such as nth-child(1); debug selectors against a simple page first
        # (selenium can also be used for automated testing)
        rs = response.css('#listContent > div:nth-child(1)')
        page_next = response.xpath('//*[@id="pagination_content"]/div/button[2]')
        print("rs is :::::", rs)
        print("page_next is :::::", page_next)
        # selectors seen while inspecting the page:
        # listContent > div:nth-child(1)
        # pagination_content > div > button:nth-child(7)
        # button.btn:nth-child(8)

        # each result page adds 60 download tasks
        global count
        count += 60
        for r in rs:
            # parse_lxml_zhilian expects an html string, so pass the selector's markup
            for job_url in parse_lxml_zhilian(r.extract()):
                yield Request(url=job_url, callback=self.parse_detail, meta={"page": "3"}, dont_filter=True)
        if len(page_next) > 0:
            # with more than 300 tasks pending, pause paging until downloads catch up
            # (note: time.sleep blocks Scrapy's reactor, so this crude wait also pauses
            # the downloads it is waiting for; count must be decremented elsewhere,
            # e.g. in parse_detail, for the loop to exit)
            while count > 300:
                time.sleep(0.5)
            yield Request(url=response.url, callback=self.parse, meta={"page": "2"}, dont_filter=True)

    def parse_detail(self, response):
        pass
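parse_detail is deliberately left empty; filling it in is part of the "implement the rest yourself" above. As a rough sketch of what it might look like (the h1 selector is a hypothetical placeholder, so inspect the real detail page and adjust it; decrementing count here is what the pause logic in parse is waiting for):

    def parse_detail(self, response):
        global count
        count -= 1  # one pending detail download finished, the pager may resume
        # hypothetical selector: inspect the real detail page and adjust it
        title = response.css('h1::text').extract_first()
        yield {"title": title, "url": response.url}

Once the middleware is registered in settings.py as sketched above, the spider runs like any other Scrapy project: scrapy crawl zhaopin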