
[Python Crawler] Crawling Dynamic Pages with Scrapy, Selenium and PhantomJS

In the Python world, Scrapy has long been a mature solution for web crawling. JavaScript, however, is used more and more widely, and an increasing number of sites generate their page content dynamically with it, which breaks crawlers that only parse static HTML. There are several ways to deal with such dynamic sites; Selenium combined with PhantomJS is one of the simpler and more stable ones.

Selenium is a browser automation and testing tool, which we drive here through its Python WebDriver bindings. PhantomJS can be thought of as a headless browser built on the WebKit engine. By plugging PhantomJS into Selenium's WebDriver, we let PhantomJS execute the JavaScript and render the dynamic pages for us.
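To make the idea concrete, here is a minimal, self-contained sketch of that pattern before the real spider (the executable path and URL are simply the ones used later in this article; adapt them to your setup):

# -*- coding: utf-8 -*-
# Minimal sketch: let PhantomJS execute the page's JavaScript, then hand the
# rendered HTML to Scrapy's selector machinery.
from selenium import webdriver
from scrapy.http import HtmlResponse

driver = webdriver.PhantomJS(executable_path='/root/phantomjs/bin/phantomjs')
driver.get('http://finance.ccb.com/cn/finance/product.html')   # content is filled in by JS
body = driver.page_source.encode('utf-8')                       # HTML *after* the scripts have run
resp = HtmlResponse(driver.current_url, body=body, encoding='utf-8')
print(resp.css('title::text').extract())                        # parse it like any Scrapy response
driver.quit()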

This article uses China Construction Bank's (CCB) wealth-management products as the example.

Below is a screenshot of CCB's wealth-management page. The product data marked by the red box is loaded later by JavaScript; it is not present in the initial HTML.
[Figure: wealth-management product page]

I also found that simply loading the page with PhantomJS does not trigger the scripts that fetch the product list. After analysing the page, it turns out that if you first select a region and then load the page a second time, the corresponding product list is returned.
[Figure: wealth-management product list]

Without further ado, here is the complete code.

The spider file:

# -*- coding: utf-8 -*-
import scrapy, urlparse, re
from selenium import webdriver
from scrapy.http import HtmlResponse, Request
from scrapy.loader.processors import MapCompose
from robot.items import FinanceItem
from w3lib.html import remove_tags
from datetime import datetime
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
from robot.db.modules import FinanceInfo
from robot.util import FinanceLoader


class CcbSpider(scrapy.Spider):
    '''
    China Construction Bank (中國建設銀行) spider
    '''
    name = "ccb"
    allowed_domains = ["ccb.com"]
    module = FinanceInfo

    def __init__(self, *args, **kwargs):
        try:
            PHANTOMJS_PATH = kwargs['PHANTOMJS_PATH']
            self.driver = webdriver.PhantomJS(
                executable_path=PHANTOMJS_PATH,
                service_args=["--ssl-protocol=any", "--ignore-ssl-errors=true",
                              "--load-images=false", "--disk-cache=true"])
        except Exception as e:
            self.logger.error(e, exc_info=True)
            exit(-2)
        super(CcbSpider, self).__init__(*args, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        kwargs['PHANTOMJS_PATH'] = crawler.settings['PHANTOMJS_PATH']
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def start_requests(self):
        url = 'http://finance.ccb.com/cn/finance/product.html'
        self.driver.get(url)
        # Click the element with id "txt"
        self.driver.find_element_by_id("txt").click()
        wait = WebDriverWait(self.driver, 2)
        # Wait until the element with class "select_hide" becomes visible
        wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'select_hide')))
        # Click the element with id "500000" (Chongqing)
        self.driver.find_element_by_id("500000").click()
        self.driver.get(url)

        # Iterate over the product-category tabs via a CSS selector
        for element in self.driver.find_elements_by_css_selector(".life_tab>a"):
            element.click()
            sleep(1)
            while True:
                content = self.driver.page_source.encode('utf-8')
                url = self.driver.current_url.encode('utf-8')
                resp = HtmlResponse(url, encoding='utf-8', status=200, body=content)
                div = resp.css(".insurance_tab_detail[style*='display: block']")
                hrefs = div.css("td[class='list_title'] a::attr(href)").extract()
                for href in hrefs:
                    req = Request(url=urlparse.urljoin(url, href), callback=self.parse)
                    req.meta['parse'] = True
                    yield req

                if self.driver.find_element_by_id("pageDiv").is_displayed():
                    current, total = resp.css("#pageNum").xpath("./text()").extract()[0].split("/", 1)
                    if int(current) == int(total):
                        break
                    else:
                        self.driver.find_element_by_id("next").click()
                else:
                    break

    def parse(self, response):
        self.logger.info("Start to parse the url %s \n", response.url)
        load = FinanceLoader(item=FinanceItem(), response=response)
        load.add_value('updatetime', datetime.now())
        load.add_css('name', "#name", MapCompose(remove_tags))
        load.add_css('id', "#pdId", MapCompose(remove_tags))
        load.add_value('type', u"理財")
        expected_annual_return = response.css("#yieldRate2").xpath("./text()").extract()
        if len(expected_annual_return) > 0:
            expected_annual_return = expected_annual_return[0]
            tmp = re.compile(u"\d+.\d+%").findall(expected_annual_return)
            if len(tmp) == 0:
                load.add_value("expected_annual_return", expected_annual_return)
            else:
                load.add_value("expected_annual_return", u",".join(tmp))
        invest_duration = response.css("#investPeriod2").xpath("./text()").extract()
        if len(invest_duration) > 0:
            invest_duration = invest_duration[0]
            tmp = re.compile(u"(\d+)天").findall(invest_duration)
            if len(tmp) == 0:
                load.add_value("invest_duration", invest_duration)
            else:
                load.add_value("invest_duration", u",".join(tmp))
        load.add_css("currency", "#currencyType", MapCompose(remove_tags))
        load.add_css("launch_area", "#saleCitys", MapCompose(remove_tags))
        load.add_css("subtype", "#yieldSpec", MapCompose(remove_tags))
        load.add_css("risk_level", "#riskLevel", MapCompose(remove_tags))
        load.add_css("redeem", "#proMode", MapCompose(remove_tags))
        detail = response.css("#instructionUrl a::attr(href)").extract()
        if len(detail) > 0:
            detail = detail[0]
            if not detail.strip().startswith("http"):
                detail = urlparse.urljoin("http://finance.ccb.com", detail)
            load.add_value("detail", detail)
        minimum_amount = response.css("#purFloorAmt2").xpath("./text()").extract()
        if len(minimum_amount) > 0:
            minimum_amount = minimum_amount[0]
            try:
                tmp = re.compile(u"(\d+)萬").search(minimum_amount).group(1)
                tmp = str(int(tmp) * 10000)
            except AttributeError:
                tmp = '0'
            load.add_value('minimum_amount', tmp)
        start_date = response.css("#collBgnDate3").xpath("./text()").extract()
        if len(start_date) > 0:
            start_date = start_date[0].strip()
            try:
                start_date = datetime.strptime(start_date, "%Y.%m.%d %H:%M").date()
                load.add_value("start_date", start_date)
            except Exception:
                pass
        end_date = response.css("#collEndDate3").xpath("./text()").extract()
        if len(end_date) > 0:
            end_date = end_date[0].strip()
            try:
                end_date = datetime.strptime(end_date, "%Y.%m.%d %H:%M").date()
                load.add_value("end_date", end_date)
            except Exception:
                pass
        item = load.load_item()
        self.logger.debug("ID: %s", load.get_value(response.css("#pdId").extract()[0], MapCompose(remove_tags))[0])
        self.logger.debug("item: %s", str(item))
        return item

    def closed(self, reason):
        self.driver.quit()

    def __str__(self):
        return "CcbSpider"

The Scrapy settings file:

# -*- coding: utf-8 -*-
BOT_NAME = 'robot'

SPIDER_MODULES = ['robot.spiders']
NEWSPIDER_MODULE = 'robot.spiders'

# Logging Setting
# LOG_FILE = os.path.normpath(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "log/spider.log"))
LOG_LEVEL = "INFO"
LOG_STDOUT = False
LOG_FORMAT = '%(asctime)s %(filename)s[line:%(lineno)d] [%(name)s] %(levelname)s: %(message)s'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY=1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
COOKIES_ENABLED=True


# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
   'robot.middlewares.PhantomJSMiddleware': 1000,
}


# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'robot.pipelines.DBPipeline': 1000,
}

PHANTOMJS_PATH = r'/root/phantomjs/bin/phantomjs'
DB_PATH = r'mysql+pymysql://robot:[email protected]:3306/robot'
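The settings also register robot.middlewares.PhantomJSMiddleware and robot.pipelines.DBPipeline; neither file is shown in this article. Purely as an illustration (the class below is an assumption, not the project's actual middleware), a PhantomJS downloader middleware usually follows this pattern:

# -*- coding: utf-8 -*-
# Illustrative sketch only -- the article does not include robot/middlewares.py.
from scrapy.http import HtmlResponse
from selenium import webdriver


class PhantomJSMiddleware(object):
    def __init__(self, phantomjs_path):
        self.driver = webdriver.PhantomJS(executable_path=phantomjs_path)

    @classmethod
    def from_crawler(cls, crawler):
        # Read the PhantomJS location from settings.py
        return cls(crawler.settings['PHANTOMJS_PATH'])

    def process_request(self, request, spider):
        # Render only requests that explicitly opt in (assumed meta key)
        if not request.meta.get('phantomjs'):
            return None
        self.driver.get(request.url)
        body = self.driver.page_source.encode('utf-8')
        return HtmlResponse(self.driver.current_url, body=body,
                            encoding='utf-8', request=request)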

Code walkthrough

First, add the PhantomJS installation path to Scrapy's settings.py, i.e. the line PHANTOMJS_PATH = r'/root/phantomjs/bin/phantomjs' shown above.
Next, look at the spider file. In the spider's __init__ we start Selenium's WebDriver and tell it to use PhantomJS as the browser: self.driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH, service_args=["--ssl-protocol=any", "--ignore-ssl-errors=true", "--load-images=false", "--disk-cache=true"]). The service_args mean the following (an annotated version of the call is shown after the list):

  • --ssl-protocol=any and --ignore-ssl-errors=true configure SSL handling: accept any SSL/TLS protocol and ignore certificate errors
  • --load-images=false tells PhantomJS not to load images, which speeds it up
  • --disk-cache=true enables the local disk cache, which also speeds PhantomJS up
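
For reference, the same constructor call with each flag commented; the page-load timeout at the end is an optional extra, not part of the original spider:

driver = webdriver.PhantomJS(
    executable_path=PHANTOMJS_PATH,
    service_args=[
        "--ssl-protocol=any",        # accept whatever SSL/TLS protocol the server offers
        "--ignore-ssl-errors=true",  # don't abort on certificate errors
        "--load-images=false",       # skip images to speed up page loads
        "--disk-cache=true",         # keep a local disk cache
    ])
driver.set_page_load_timeout(30)     # optional: give up on pages that take longer than 30 s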

Then, in start_requests:

  1. First simulate a click on the element with id txt: self.driver.find_element_by_id("txt").click(), as shown in the figure: [element with id txt]
  2. Use WebDriver's wait helper to wait until the pop-up (class select_hide) becomes visible: wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'select_hide')))
  3. Inside the pop-up, simulate a click on Chongqing (id 500000): self.driver.find_element_by_id("500000").click(), as shown in the figure: [Chongqing]
  4. Load the page again: self.driver.get(url)

    PS: My guess is that the click stores the region information in PhantomJS's local cache/cookies, so the second load no longer needs to set the region and the site immediately returns the matching product list (see the cookie-inspection sketch after this list).

  5. Iterate over the product-category tabs: for element in self.driver.find_elements_by_css_selector(".life_tab>a"): element.click()
    [Figure: product categories]

  6. Iterate over the product list on the current page and check whether there is a next page; if there is, simulate a click on the next-page button:
 if self.driver.find_element_by_id("pageDiv").is_displayed():
     current, total = resp.css("#pageNum").xpath("./text()").extract()[0].split("/", 1)
     if int(current) == int(total):
         break
     else:
         self.driver.find_element_by_id("next").click()
 else:
     break
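
One way to check the guess from step 4 (this snippet is not part of the original spider) is to dump whatever cookies PhantomJS is holding right after the region click; if the selected region shows up there, the second self.driver.get(url) is simply reusing it:

self.driver.find_element_by_id("500000").click()
# Inspect the session state PhantomJS accumulated from the click
for cookie in self.driver.get_cookies():
    self.logger.debug("cookie %s=%s", cookie['name'], cookie['value'])
self.driver.get(url)   # reload; the site now knows the selected region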

That, in a nutshell, is how Selenium and PhantomJS work together here.

PS: Make sure the spider class defines def closed(self, reason): and explicitly quits PhantomJS inside it; otherwise the PhantomJS processes stay alive after the crawl. When you deploy with scrapyd, this leak means that after only a few spider runs the machine fills up with stale processes and everything hangs.
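
For reference, the relevant method from the spider above:

def closed(self, reason):
    # Called by Scrapy when the spider finishes; quitting here makes sure
    # no phantomjs process is left behind.
    self.driver.quit()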