Selenium+Scrapy爬取淘寶
阿新 • 發佈:2018-12-25
好久不見,今天給大家分享如何用自動化工具selenium和scrapy框架來爬取淘寶。
爬取網站時候的坑!
剛開始爬的時候,就想著直接進入淘寶主頁,然後用selenium工具自動一步步執行然後爬取到自己想得到的資料,然而!令我沒想到的是,利用自動化工具可以對關鍵詞進行抓取之類,但是很奇怪的是資料抓不下來,於是不得不對進入的連結進行修正。
通過觀察得到了這樣的網址['https://s.taobao.com/search?q={q}'.format(q=QUESTION)]
其中 QUESTION 是要搜尋的關鍵詞。廢話不多說,直接上程式碼吧。
spider的編寫
為了網頁的載入速度,我們一般不載入網頁的圖片。
# Disable image loading in Chrome to speed up page rendering.
chrome_opt = webdriver.ChromeOptions()
chrome_opt.add_experimental_option(
    "prefs",
    {"profile.managed_default_content_settings.images": 2},
)
載入不同的瀏覽器,接受相應的訊號等
def __init__(self):
    """Create the Selenium browser driver and register the shutdown hook."""
    super(TaobaoSpider, self).__init__()
    if DEFAULT_BROWSER == 'Chrome':
        self.browser = webdriver.Chrome(chrome_options=self.chrome_opt)
    elif DEFAULT_BROWSER == 'PhantomJS':
        self.browser = webdriver.PhantomJS()
    else:
        # Fail fast: the original fell through and crashed with an
        # opaque AttributeError on self.browser below.
        raise ValueError('Unsupported DEFAULT_BROWSER: %r' % (DEFAULT_BROWSER,))
    self.browser.maximize_window()
    # Explicit wait, up to 5 seconds, for elements to appear.
    self.wait = WebDriverWait(self.browser, 5)
    # Close the browser when the spider finishes.
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def spider_closed(self):
    """Shut down the Selenium driver when the spider closes.

    Uses quit() rather than close(): close() only closes the current
    window and leaks the driver process; quit() terminates everything.
    """
    self.browser.quit()
解析商品列表頁資訊
# Parse goods listing page info.
def parse(self, response):
    """Parse one search-result page.

    Yields one Request per product (detail page handled by parse_grade,
    with the listing data passed along in meta) and one Request for the
    next result page when it exists.
    """
    goods = response.css('div.item.J_MouserOnverReq')
    for good in goods:
        # Title text is split across several text nodes; join them.
        title = good.css('div.row.row-2.title a.J_ClickStat::text').extract()
        if isinstance(title, list):
            title = ''.join(title).strip()
        price = good.css('div.price.g_price.g_price-highlight strong::text').extract_first()
        free_shipping = 'Yes' if good.css('div.ship.icon-service-free') else 'No'
        # Monthly sales count: keep only the leading digits.
        # Guard against a missing/non-numeric counter — the original
        # crashed with AttributeError when re.match returned None.
        month_sale = good.css('div.deal-cnt::text').extract_first()
        sale_match = re.match(r'\d+', month_sale) if month_sale else None
        month_sale = sale_match.group(0) if sale_match else None
        goods_url = good.css('div.row.row-2.title a.J_ClickStat::attr(href)').extract_first()
        # Relative XPath (.//): the original absolute '//div' matched the
        # first shop on the whole page for every single item.
        shop = good.xpath('.//div[@class="shop"]/a/span[2]/text()').extract_first()
        shop_type = '天貓' if good.css('span.icon-service-tianmao') else '淘寶'
        addr = good.css('div.location::text').extract_first()
        data = {
            'title': title,
            'price': price,
            'free_shipping': free_shipping,
            'month_sale': month_sale,
            'goods_url': goods_url,
            'shop': shop,
            'shop_type': shop_type,
            'addr': addr,
        }
        if goods_url:
            yield scrapy.Request(urljoin('https:', goods_url),
                                 meta={'data': data},
                                 callback=self.parse_grade)
    # Next-page link: li.next carries the extra query parameter in
    # data-key / data-value attributes; both are absent on the last page,
    # where the original raised TypeError concatenating None.
    next_key = response.css('li.next a::attr(data-key)').extract_first()
    next_value = response.css('li.next a::attr(data-value)').extract_first()
    if next_key and next_value:
        next_url = self.start_urls[0] + '&' + next_key + '=' + next_value
        yield scrapy.Request(next_url, callback=self.parse)
解析商品詳情頁資訊
def parse_grade(self, response):
    """Parse the product detail page.

    Copies the listing data collected in parse() into the item and adds
    the three shop rating scores when the page shows them.
    """
    item = TaobaospiderItem()
    data = response.meta['data']
    for key in ('title', 'price', 'free_shipping', 'month_sale',
                'goods_url', 'shop', 'shop_type', 'addr'):
        item[key] = data[key]
    # Default the ratings so downstream consumers never hit a KeyError
    # when the shop-rate widget is missing from the page.
    item['same_grade'] = None
    item['service_grade'] = None
    item['shipping_grade'] = None
    grades = response.css('div.tb-shop-rate a::text').extract()
    if len(grades) == 3:
        try:
            scores = [float(g.strip()) for g in grades]
        except ValueError:
            scores = None  # non-numeric rating text; keep the None defaults
        if scores:
            item['same_grade'], item['service_grade'], item['shipping_grade'] = scores
    yield item
引用的類庫
import scrapy
from selenium import webdriver
from ..settings import QUESTION, DEFAULT_BROWSER
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from ..items import TaobaospiderItem
import re
from urllib.parse import urljoin
from selenium.webdriver.support.ui import WebDriverWait
items.py
import scrapy
class TaobaospiderItem(scrapy.Item):
    """One Taobao product row, plus its MySQL UPSERT statement builder."""

    title = scrapy.Field()
    price = scrapy.Field()
    free_shipping = scrapy.Field()   # 'Yes' / 'No'
    month_sale = scrapy.Field()      # monthly sales count (digit string)
    goods_url = scrapy.Field()
    shop = scrapy.Field()
    shop_type = scrapy.Field()       # '天貓' or '淘寶'
    addr = scrapy.Field()
    # The three shop rating scores; may be unset when the detail page
    # does not show ratings.
    same_grade = scrapy.Field()
    service_grade = scrapy.Field()
    shipping_grade = scrapy.Field()

    def get_article_info_insert_sql(self):
        """Return an (insert_sql, params) pair for writing this item.

        The statement is an INSERT ... ON DUPLICATE KEY UPDATE so
        re-crawled products do not abort the pipeline.
        """
        insert_sql = """
            insert into info(
                title,
                price,
                free_shipping,
                month_sale,
                goods_url,
                shop,
                shop_type,
                addr,
                same_grade,
                service_grade,
                shipping_grade
            )
            values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE title = VALUES(title)
        """
        # Grade fields may legitimately be absent (page without ratings);
        # .get() yields None -> SQL NULL instead of raising KeyError.
        params = (self['title'],
                  self['price'],
                  self['free_shipping'],
                  self['month_sale'],
                  self['goods_url'],
                  self['shop'],
                  self['shop_type'],
                  self['addr'],
                  self.get('same_grade'),
                  self.get('service_grade'),
                  self.get('shipping_grade'),
                  )
        return insert_sql, params
middlewares.py
from scrapy.http import HtmlResponse
import logging
class ChromeMiddleware(object):
    """Downloader middleware that fetches pages with the spider's Selenium browser."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def process_request(self, request, spider):
        """Load the URL in the shared browser, scroll to the bottom so
        lazily-loaded content renders, and return the rendered HTML so
        Scrapy skips its own download.
        """
        driver = spider.browser
        driver.get(request.url)
        # Simulate scrolling to the page bottom to trigger lazy loading.
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight);var leftOfPage = document.body.scrollHeight;return leftOfPage;')
        self.logger.debug('getting ' + request.url)
        return HtmlResponse(
            url=request.url,
            body=driver.page_source,
            request=request,
            encoding='utf-8',
        )
pipelines.py
import logging
from twisted.enterprise import adbapi
class MysqlTwistedPipeline(object):
    """Write items to MySQL asynchronously via Twisted's adbapi thread pool."""

    def __init__(self, params):
        self.dbpool = adbapi.ConnectionPool('pymysql', **params)
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_settings(cls, settings):
        # Connection parameters come from the MYSQL_PARAMS settings dict.
        return cls(settings.get('MYSQL_PARAMS'))

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert_article_info, item)
        query.addErrback(self.handle_error, item, spider)
        # Return the item so later pipelines still receive it; the
        # original returned None, silently dropping it downstream.
        return item

    def do_insert_article_info(self, cursor, item):
        """Execute the item's INSERT inside an adbapi interaction thread."""
        insert_sql, params = item.get_article_info_insert_sql()
        try:
            cursor.execute(insert_sql, params)
        except Exception:
            # Log with traceback instead of print() so failures land in
            # the Scrapy log rather than on stdout.
            self.logger.exception('failed to insert item')

    def handle_error(self, failure, item, spider):
        self.logger.debug(failure)
settings.py
# Route all downloads through the Selenium-driven middleware.
DOWNLOADER_MIDDLEWARES = {
'taobaospider.middlewares.ChromeMiddleware': 543,
}
# Persist scraped items to MySQL.
ITEM_PIPELINES = {
'taobaospider.pipelines.MysqlTwistedPipeline': 300,
}
# Search keyword used to build the spider's start URL.
QUESTION = 'Python'
# Default browser for the Selenium driver ('Chrome' or 'PhantomJS').
DEFAULT_BROWSER = 'Chrome'
# MySQL connection parameters for MysqlTwistedPipeline.
# NOTE(review): credentials are hard-coded; move them to environment
# variables before deploying anywhere shared.
MYSQL_PARAMS = {
'host': 'localhost',
'port': 3306,
'user': 'root',
'password': 'root',
'db': '1',
'charset': 'utf8'
}
爬取結果
歡迎關注我的個人公眾號。