Crawler Project: Scraping JD.com Product Data
阿新 • Published 2019-01-25
Spider code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from jingdong.items import JingdongItem
import re
import urllib.request  # import the submodule explicitly; "import urllib" alone is not enough


class JdSpider(scrapy.Spider):
    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

    def parse(self, response):
        key = "筆記本"  # search keyword: "laptop"
        search_url = "https://search.jd.com/Search?keyword=" + key + "&enc=utf-8&wq=" + key
        # JD renders each result page in two halves, so the page parameter goes 1, 3, 5, ...
        for i in range(1, 101):
            page_url = search_url + "&page=" + str(i * 2 - 1)
            yield Request(url=page_url, callback=self.next)

    def next(self, response):
        # Each product <li> on the search page carries its SKU id in data-sku
        sku_ids = response.xpath('//ul[@class="gl-warp clearfix"]/li/@data-sku').extract()
        for sku in sku_ids:
            true_url = "https://item.jd.com/" + str(sku) + ".html"
            yield Request(url=true_url, callback=self.next2)

    def next2(self, response):
        item = JingdongItem()
        # Strip JD's boilerplate suffixes from the page title
        item['title'] = response.xpath('//head/title/text()').extract()[0] \
            .replace('【圖片 價格 品牌 報價】-京東', '') \
            .replace('【行情 報價 價格 評測】-京東', '')
        item['link'] = response.url
        # Price: not in the page HTML, fetched from a separate JSON interface
        true_id = re.findall(r'https://item\.jd\.com/(.*?)\.html', item['link'])[0]
        price_url = "https://p.3.cn/prices/mgets?skuIds=J_" + str(true_id)
        price_txt = urllib.request.urlopen(price_url).read().decode('utf-8', 'ignore')
        item['price'] = re.findall(r'"p":"(.*?)"', price_txt)[0]
        # Comment count: also fetched from a separate JSON interface
        comment_url = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + str(true_id)
        comment_txt = urllib.request.urlopen(comment_url).read().decode('utf-8', 'ignore')
        item['comment'] = re.findall(r'"CommentCount":(.*?),"', comment_txt)[0]
        return item
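The spider imports JingdongItem from jingdong.items, but the post does not show items.py. Judging from the four fields the spider fills in, a minimal version would look like the sketch below; the field names come straight from the code above, everything else is assumed.

# items.py -- minimal sketch; the original post omits this file.
# Field names are taken from the spider above; nothing else is implied.
import scrapy

class JingdongItem(scrapy.Item):
    title = scrapy.Field()    # cleaned <title> of the product page
    link = scrapy.Field()     # product detail URL
    price = scrapy.Field()    # price from the p.3.cn interface
    comment = scrapy.Field()  # comment count from the club.jd.com interface

With that file in place, the crawl is started with scrapy crawl jd (the name declared in the spider).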
Pipeline code:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql.cursors


class JingdongPipeline(object):
    # Connect to MySQL and create the target table
    def __init__(self):
        self.conn = pymysql.connect(host="127.0.0.1", user="root", passwd="",
                                    db="jd", charset='utf8')
        cur = self.conn.cursor()
        cur.execute("USE jd")
        # IF NOT EXISTS keeps the pipeline from crashing on a second run
        cur.execute("CREATE TABLE IF NOT EXISTS computer("
                    "title VARCHAR(100), link VARCHAR(50), "
                    "price VARCHAR(50), comment VARCHAR(50))")
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            title_1 = item['title']
            link_1 = item['link']
            price_1 = item['price']
            comment_1 = item['comment']
            cur = self.conn.cursor()
            cur.execute("INSERT INTO computer(title,link,price,comment) "
                        "VALUES (%s,%s,%s,%s)",
                        (title_1, link_1, price_1, comment_1))
            self.conn.commit()
        except Exception:
            # Swallow insert errors so one bad item doesn't stop the crawl
            pass
        return item
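As the boilerplate comment in the file reminds you, the pipeline only runs if it is registered in settings.py. Assuming the default Scrapy project layout (the project is named jingdong, judging by the imports), the entry would be roughly:

# settings.py -- enable the pipeline so Scrapy actually calls it
# (assuming the default project layout; 300 is an arbitrary mid-range priority)
ITEM_PIPELINES = {
    'jingdong.pipelines.JingdongPipeline': 300,
}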
I used Navicat as the MySQL client.
Final result:
Some problems I ran into:
1. Installing MySQL. For that, see my other blog post:
2. Packet capture: two of the fields I collect, price and comment, are not in the page HTML and have to be fetched from separate interfaces. When capturing, pay attention to the request URL; it usually contains a telltale keyword, e.g. the URL of the price request also contains "price" (see the first sketch after this list).
3. When I inserted data, Navicat displayed the Chinese text as '???' mojibake. I fixed it by following this reference: [link] (see the second sketch after this list for the gist).
4. After many debugging runs, the request volume got large enough that JD started asking me for a captcha. I am still learning how to decode captchas and will come back and update this once I have it working.
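For point 2, it helps to verify a captured interface in isolation before wiring it into the spider. Below is a minimal sketch using the two URLs from the spider above, as captured at the time of writing; the SKU id is a made-up placeholder, substitute a real data-sku value.

# check_apis.py -- probe the two captured JSON interfaces for one product.
import re
import urllib.request

sku = "123456"  # hypothetical SKU id for illustration only

# Price interface: note the URL itself contains the keyword "prices"
price_url = "https://p.3.cn/prices/mgets?skuIds=J_" + sku
price_txt = urllib.request.urlopen(price_url).read().decode('utf-8', 'ignore')
print(re.findall(r'"p":"(.*?)"', price_txt))  # list of matched price strings

# Comment-summary interface
comment_url = ("https://club.jd.com/comment/"
               "productCommentSummaries.action?referenceIds=" + sku)
comment_txt = urllib.request.urlopen(comment_url).read().decode('utf-8', 'ignore')
print(re.findall(r'"CommentCount":(.*?),"', comment_txt))  # comment counts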
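For point 3, the usual cause of '???' is a charset mismatch between the connection, the database, and the table. The following is a sketch of the common fix, not necessarily the exact steps from the post linked above; it assumes the jd database and computer table created by the pipeline.

# fix_charset.py -- align database, table, and connection charsets.
# A sketch of the common fix for '???' mojibake, under the assumptions above.
import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", passwd="",
                       db="jd", charset="utf8mb4")
cur = conn.cursor()
cur.execute("ALTER DATABASE jd CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci")
cur.execute("ALTER TABLE computer CONVERT TO CHARACTER SET utf8mb4")
conn.commit()
conn.close()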