
Crawler project: scraping JD.com product data

Spider code:

# -*- coding: utf-8 -*-
import re
import urllib.parse
import urllib.request

import scrapy
from scrapy.http import Request
from jingdong.items import JingdongItem


class JdSpider(scrapy.Spider):
    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['https://jd.com/']

    def parse(self, response):
        # URL-encode the Chinese keyword ("laptop") so the search URL is valid
        key = urllib.parse.quote("筆記本")
        search_url = "https://search.jd.com/Search?keyword=" + key + "&enc=utf-8&wq=" + key
        # JD splits each visible results page into two internal pages,
        # so the first 100 pages are page=1,3,5,...,199
        for i in range(1, 101):
            page_url = search_url + "&page=" + str(i * 2 - 1)
            yield Request(url=page_url, callback=self.parse_list)

    def parse_list(self, response):
        # Each product <li> in the result grid carries its SKU id in data-sku
        sku_ids = response.xpath('//ul[@class="gl-warp clearfix"]/li/@data-sku').extract()
        for sku in sku_ids:
            item_url = "https://item.jd.com/" + str(sku) + ".html"
            yield Request(url=item_url, callback=self.parse_item)

    def parse_item(self, response):
        item = JingdongItem()
        # Strip JD's boilerplate suffixes from the page <title>
        title = response.xpath('//head/title/text()').extract()[0]
        item['title'] = title.replace('【圖片 價格 品牌 報價】-京東', '') \
                             .replace('【行情 報價 價格 評測】-京東', '')
        item['link'] = response.url
        # The price is not in the page HTML; fetch it from the JSON
        # endpoint found by packet capture
        sku = re.findall(r'https://item\.jd\.com/(\d+)\.html', item['link'])[0]
        price_url = "https://p.3.cn/prices/mgets?skuIds=J_" + sku
        price_txt = urllib.request.urlopen(price_url).read().decode('utf-8', 'ignore')
        item['price'] = re.findall(r'"p":"(.*?)"', price_txt)[0]
        # The comment count likewise comes from its own JSON endpoint
        comment_url = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + sku
        comment_txt = urllib.request.urlopen(comment_url).read().decode('utf-8', 'ignore')
        item['comment'] = re.findall(r'"CommentCount":(.*?),"', comment_txt)[0]
        return item
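
The spider imports JingdongItem from jingdong.items, which the post does not show. Reconstructed from the four fields the spider fills in, it would look roughly like this:

# items.py -- reconstructed from the fields used by the spider above
import scrapy

class JingdongItem(scrapy.Item):
    title = scrapy.Field()    # product title, with JD's suffixes stripped
    link = scrapy.Field()     # product page URL
    price = scrapy.Field()    # price from the p.3.cn JSON endpoint
    comment = scrapy.Field()  # comment count from club.jd.com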

Pipeline code:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class JingdongPipeline(object):
    # Connect to MySQL and create the data table on startup
    def __init__(self):
        self.conn = pymysql.connect(host="127.0.0.1",
                                    user="root",
                                    passwd="",
                                    db="jd",
                                    charset='utf8mb4')  # utf8mb4 keeps Chinese text intact
        cur = self.conn.cursor()
        # IF NOT EXISTS lets repeated crawls reuse the existing table
        cur.execute(
            "CREATE TABLE IF NOT EXISTS computer("
            "title VARCHAR(100), link VARCHAR(50), price VARCHAR(50), "
            "comment VARCHAR(50)) DEFAULT CHARSET=utf8mb4")
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            cur = self.conn.cursor()
            cur.execute(
                "INSERT INTO computer(title, link, price, comment) VALUES (%s, %s, %s, %s)",
                (item['title'], item['link'], item['price'], item['comment']))
            self.conn.commit()
        except Exception:
            # Skip rows that fail to insert rather than aborting the crawl
            pass
        return item
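
As the template comment above notes, the pipeline only runs if it is registered in settings.py. A minimal sketch, assuming the default layout generated by scrapy startproject jingdong:

# settings.py -- register the pipeline so Scrapy actually calls it
ITEM_PIPELINES = {
    'jingdong.pipelines.JingdongPipeline': 300,
}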



Navicat was used as the client for interacting with MySQL.

Final result:

(screenshot of the scraped data in the MySQL table)
Some problems I ran into:

1. Installing MySQL: see my other blog post:

2. Packet capture: two of the fields I collect, price and comment, are not in the product page HTML and have to be fetched through captured requests. When hunting for the right request, look at its URL, which usually contains a telling keyword; the price request's URL, for example, also contains "price". A standalone test of both endpoints is sketched after this list.

3. When data was inserted, Navicat showed Chinese text as garbled '???'. Here I followed this reference: (link). The usual charset fix is sketched after this list.

4. After several rounds of debugging, the sheer volume of requests made JD start demanding CAPTCHAs. I am still learning how to solve CAPTCHAs and will come back and revise this once I have that down; in the meantime, throttling the crawl (sketched after this list) helps.
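
For point 2, here is a quick standalone check of the two captured endpoints, run outside Scrapy. The URLs are the ones used in the spider above; the SKU id is a made-up placeholder, and JD may have changed or restricted these endpoints since this was written:

# Standalone test of the price and comment JSON endpoints
import re
import urllib.request

sku = "123456"  # hypothetical SKU id; take a real one from an item.jd.com URL

price_url = "https://p.3.cn/prices/mgets?skuIds=J_" + sku
price_txt = urllib.request.urlopen(price_url).read().decode('utf-8', 'ignore')
print(re.findall(r'"p":"(.*?)"', price_txt))               # e.g. ['4999.00']

comment_url = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + sku
comment_txt = urllib.request.urlopen(comment_url).read().decode('utf-8', 'ignore')
print(re.findall(r'"CommentCount":(.*?),"', comment_txt))  # e.g. ['12000']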
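
For point 3, my assumption about what the linked fix amounts to: make the connection, the database, and the table all use a UTF-8 charset, since a latin1 default turns Chinese text into '???'. A minimal pymysql sketch for converting an already-created database and table:

# Convert the jd database and computer table to utf8mb4 so Chinese
# text survives the round trip (assumes the schema created above)
import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", passwd="", db="jd",
                       charset='utf8mb4')  # the connection charset matters too
cur = conn.cursor()
cur.execute("ALTER DATABASE jd CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci")
cur.execute("ALTER TABLE computer CONVERT TO CHARACTER SET utf8mb4")
conn.commit()
conn.close()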
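
For point 4, CAPTCHA solving aside, the usual mitigation is simply to slow the crawl down. These are standard Scrapy settings; the values here are guesses, not tuned against JD:

# settings.py -- throttle the crawl to trip fewer anti-bot checks
DOWNLOAD_DELAY = 2                  # seconds between requests to the same domain
CONCURRENT_REQUESTS_PER_DOMAIN = 4
AUTOTHROTTLE_ENABLED = True         # back off automatically under server pressure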