
Scraping Taobao with Selenium + Scrapy

Long time no see! Today I'd like to share how to crawl Taobao with the browser-automation tool Selenium and the Scrapy framework.

Pitfalls I hit while crawling the site

My first idea was to open the Taobao home page and let Selenium click through it step by step until it reached the data I wanted. To my surprise, the automation could drive the keyword search just fine, yet the result data simply would not come down, so I had to change the entry URL instead.
By inspecting the search page I arrived at a URL of the form `'https://s.taobao.com/search?q={q}'.format(q=QUESTION)`, where QUESTION is the keyword to search for. Enough talk, on to the code.
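One small aside: if the keyword contains non-ASCII characters, it should be percent-encoded before being interpolated into the URL. A minimal sketch (the `quote` call is my addition, not in the original code):

    from urllib.parse import quote

    QUESTION = 'Python'  # the search keyword from settings.py
    # Percent-encode the keyword so non-ASCII searches also build a valid URL
    # (an extra safety step; the original interpolates QUESTION directly).
    start_urls = ['https://s.taobao.com/search?q={q}'.format(q=quote(QUESTION))]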

Writing the spider

To speed up page loads, we tell the browser not to load images.

    # Configure Chrome not to load images
    chrome_opt = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_opt.add_experimental_option("prefs", prefs)
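If you don't need to watch the browser work, Chrome can also run headless. This is an optional tweak of mine, not part of the original setup:

    # Optional (my addition, not in the original): run Chrome without a
    # visible window, which is faster and works on display-less servers.
    chrome_opt.add_argument('--headless')
    chrome_opt.add_argument('--disable-gpu')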

Launch the chosen browser, connect the relevant signals, and so on:

    def __init__(self):
        super(TaobaoSpider, self).__init__()

        if DEFAULT_BROWSER == 'Chrome':
            self.browser = webdriver.Chrome(chrome_options=self.chrome_opt)
        elif DEFAULT_BROWSER == 'PhantomJS':
            self.browser = webdriver.PhantomJS()
        self.browser.maximize_window()
        self.wait = WebDriverWait(self.browser, 5)
        # Close the browser when the spider shuts down
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_closed(self):
        self.browser.close()

Parsing the product list page

    # Parse the product list page
    def parse(self, response):
        goods = response.css('div.item.J_MouserOnverReq')
        for good in goods:
            title = good.css('div.row.row-2.title a.J_ClickStat::text').extract()
            if isinstance(title, list):
                title = ''.join(title).strip()
            price = good.css('div.price.g_price.g_price-highlight strong::text').extract_first()
            free_shipping = 'Yes' if good.css('div.ship.icon-service-free') else 'No'
            # Monthly sales count; guard against a missing deal-cnt element
            month_sale = good.css('div.deal-cnt::text').extract_first()
            m = re.match(r'\d+', month_sale or '')
            month_sale = m.group(0) if m else None
            goods_url = good.css('div.row.row-2.title a.J_ClickStat::attr(href)').extract_first()

            shop = good.xpath('//div[@class="shop"]/a/span[2]/text()').extract_first()
            shop_type = '天貓' if good.css('span.icon-service-tianmao') else '淘寶'
            addr = good.css('div.location::text').extract_first()
            data = {
                'title': title,
                'price': price,
                'free_shipping': free_shipping,
                'month_sale' : month_sale,
                'goods_url': goods_url,
                'shop': shop,
                'shop_type': shop_type,
                'addr': addr
            }

            yield scrapy.Request(urljoin('https:', goods_url), meta={'data': data}, callback=self.parse_grade)
        # Build the next-page link from the data-key/data-value attributes;
        # stop paginating when there is no "next" link
        next_key = response.css('li.next a::attr(data-key)').extract_first()
        next_value = response.css('li.next a::attr(data-value)').extract_first()
        if next_key and next_value:
            next_url = self.start_urls[0] + '&' + next_key + '=' + next_value
            yield scrapy.Request(next_url, callback=self.parse)
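For reference, the `deal-cnt` text on the list page reads something like '1234人付款', and the regex above keeps only the leading digit run. A quick standalone check (the sample string is a hypothetical value):

    import re

    sample = '1234人付款'              # hypothetical deal-cnt text
    m = re.match(r'\d+', sample)       # match the leading digits only
    print(m.group(0) if m else None)   # -> '1234'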

Parsing the product detail page

    def parse_grade(self, response):
        item = TaobaospiderItem()
        data = response.meta['data']
        item['title'] = data['title']
        item['price'] = data['price']
        item['free_shipping'] = data['free_shipping']
        item['month_sale'] = data['month_sale']
        item['goods_url'] = data['goods_url']
        item['shop'] = data['shop']
        item['shop_type'] = data['shop_type']
        item['addr'] = data['addr']

        # The three seller ratings: item-as-described, service, shipping
        same_grade = response.css('div.tb-shop-rate a::text').extract()
        if len(same_grade) == 3:
            item['same_grade'] = float(same_grade[0].strip())
            item['service_grade'] = float(same_grade[1].strip())
            item['shipping_grade'] = float(same_grade[2].strip())
        else:
            # Without defaults the SQL step below would raise a KeyError
            item['same_grade'] = item['service_grade'] = item['shipping_grade'] = None

        yield item
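As a side note, the field-by-field copy above works but is verbose; an equivalent loop keeps it shorter:

    # Equivalent to the eight assignments above (sketch):
    for key in ('title', 'price', 'free_shipping', 'month_sale',
                'goods_url', 'shop', 'shop_type', 'addr'):
        item[key] = data[key]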

Imports

import scrapy
from selenium import webdriver
from ..settings import QUESTION, DEFAULT_BROWSER
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from ..items import TaobaospiderItem
import re
from urllib.parse import urljoin
from selenium.webdriver.support.ui import WebDriverWait
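One caveat: `scrapy.xlib.pydispatch` only exists in older Scrapy releases and has since been removed. On a current Scrapy, the supported way to hook the same signal is `from_crawler`; a sketch of the replacement (drop the `dispatcher` import and the `dispatcher.connect` call in `__init__`):

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(TaobaoSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Same effect as dispatcher.connect, without the removed module
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider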

items.py

import scrapy

class TaobaospiderItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()
    free_shipping = scrapy.Field()
    month_sale = scrapy.Field()
    goods_url = scrapy.Field()
    shop = scrapy.Field()
    shop_type = scrapy.Field()
    addr = scrapy.Field()
    same_grade = scrapy.Field()
    service_grade = scrapy.Field()
    shipping_grade = scrapy.Field()

    def get_article_info_insert_sql(self):
        insert_sql = """
            insert into info(
            title,
            price,
            free_shipping,
            month_sale,
            goods_url,
            shop,
            shop_type,
            addr,
            same_grade,
            service_grade,
            shipping_grade
            )
            values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE title = VALUES(title) 
        """
        params = (self['title'],
                  self['price'],
                  self['free_shipping'],
                  self['month_sale'],
                  self['goods_url'],
                  self['shop'],
                  self['shop_type'],
                  self['addr'],
                  self['same_grade'],
                  self['service_grade'],
                  self['shipping_grade'],
                  )
        return insert_sql, params
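Note that `ON DUPLICATE KEY UPDATE` only takes effect if the table has a UNIQUE or PRIMARY key. The post doesn't show the schema, but a table like the following would work with the insert above (column names come from the SQL; the types and the unique key on goods_url are my assumptions):

    # Hypothetical DDL for the `info` table (types and the unique key on
    # goods_url are assumptions; ON DUPLICATE KEY UPDATE requires such a key):
    CREATE_INFO_TABLE = """
        CREATE TABLE IF NOT EXISTS info (
            title VARCHAR(255),
            price VARCHAR(32),
            free_shipping VARCHAR(8),
            month_sale VARCHAR(32),
            goods_url VARCHAR(512) NOT NULL,
            shop VARCHAR(255),
            shop_type VARCHAR(16),
            addr VARCHAR(64),
            same_grade FLOAT,
            service_grade FLOAT,
            shipping_grade FLOAT,
            UNIQUE KEY uk_goods_url (goods_url(191))
        ) DEFAULT CHARSET=utf8
    """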

middlewares.py

from scrapy.http import HtmlResponse
import logging

class ChromeMiddleware(object):

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def process_request(self, request, spider):
        browser = spider.browser
        browser.get(request.url)
        # Simulate scrolling to the bottom so lazily-loaded content renders
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight);var leftOfPage = document.body.scrollHeight;return leftOfPage;')
        self.logger.debug('getting ' + request.url)
        return HtmlResponse(url=request.url, body=browser.page_source, request=request, encoding='utf-8')
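One thing worth noting: the spider builds `self.wait` (a WebDriverWait) but the middleware never uses it, so `page_source` may be grabbed before the goods finish rendering. A sketch of an explicit wait that could go right before the HtmlResponse is returned (the selector is the one `parse()` uses; treating a timeout as "return whatever rendered" is my choice):

    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC

    try:
        # Block until at least one goods item is present in the DOM
        spider.wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.item.J_MouserOnverReq')))
    except TimeoutException:
        pass  # fall through and return whatever has rendered so far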

pipelines.py

import logging
from twisted.enterprise import adbapi

class MysqlTwistedPipeline(object):
    def __init__(self, params):
        self.dbpool = adbapi.ConnectionPool('pymysql', **params)
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_settings(cls, settings):
        return cls(settings.get('MYSQL_PARAMS'))

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert_article_info, item)
        query.addErrback(self.handle_error, item, spider)
        # A pipeline must return the item so later pipelines can see it
        return item

    def do_insert_article_info(self, cursor, item):
        insert_sql, params = item.get_article_info_insert_sql()
        # Let exceptions propagate so they reach handle_error via the errback
        cursor.execute(insert_sql, params)

    def handle_error(self, failure, item, spider):
        self.logger.debug(failure)

settings.py

DOWNLOADER_MIDDLEWARES = {
    'taobaospider.middlewares.ChromeMiddleware': 543,
}

ITEM_PIPELINES = {
    'taobaospider.pipelines.MysqlTwistedPipeline': 300,
}

QUESTION = 'Python'
# Default browser to use
DEFAULT_BROWSER = 'Chrome'

MYSQL_PARAMS = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': 'root',
    'db': '1',
    'charset': 'utf8'
}
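Two settings the post doesn't show but that this setup usually also needs (both are my assumptions about the project, not from the original):

    ROBOTSTXT_OBEY = False    # Taobao's robots.txt disallows the search pages
    CONCURRENT_REQUESTS = 1   # one shared browser can only load one page at a time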

Crawl results


Feel free to follow my personal WeChat official account.