1. 程式人生 > 其它 > 京東商品資訊爬取

京東商品資訊爬取

技術標籤:爬蟲從入門到放棄爬蟲

上一回寫到:在做課設的時候爬淘寶遇到了滑塊驗證碼,不會驗證碼,也沒有錢買代理,就只能換個網站來完成作業了
(此處原文有一張插圖,轉載時遺失)
哎,老三套,分析網站,提取資料,儲存資料,走你!傳送門
隨便搜尋個關鍵字,然後翻頁,分析url,這個比較簡單,直接上結果!我們在構造url的時候要新增三個引數,其中前兩個是我們要搜尋的關鍵字(用urllib.parse的quote() 進行編碼),然後就是page引數,其值為頁碼數 x 2 - 1。OK,然後再看看資料在哪裡,通過分析頁面和檢視網頁原始碼,我們可以看到資料就在HTML中,好辦了,直接上xpath!然後儲存檔案我們用csv格式,此外,該網站沒有啥反爬,但是不知道爬多了會咋樣(所以你們要慢點!)話不多說,這次直接上程式碼!

import requests
from lxml import etree
import csv
from urllib.parse import quote
import os
import time


def get_info(url, filename):
    """Fetch one JD search-result page and append each product to *filename*.

    Args:
        url: Fully built search URL (keyword and page already substituted).
        filename: Path of the CSV file rows are appended to via ``tocsv``.
    """
    headers = {
        # Session cookie copied from a browser; JD only serves the full
        # result list when cookie/referer/UA look like a real browser.
        'cookie': '__jdu=16071443968301455206652; areaId=4; PCSYCityID=CN_500000_500100_500113; shshshfpa=dd18dbea-499d-54a4-7ea7-359ad2fa7730-1608267723; shshshfpb=x5dT6XzEE0aeElsKysLcC%2Fw%3D%3D; ipLoc-djd=4-48202-52490-0; unpl=V2_ZzNtbUQDRh0hDBRTeEoMBWIKQVwRBRMcfV0WVy8bCQEwB0VeclRCFnQURldnGl4UZwQZWUVcQRxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHseXQNnBhdfSlBKF3QIQ1B%2fGl0CYAcRbXJQcyVFAEdTfxFsBFcCIh8WC0sXdgtOXTYZWwRhAxdYQF9EHHcJRlF%2fHV8EYAQWXnJWcxY%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_7e49e5c62ba148b0bca89da2e3d5f5f2|1608713799888; __jda=122270672.16071443968301455206652.1607144397.1608636741.1608713800.7; __jdc=122270672; shshshfp=0c8100930c2187d3e1b5bb395da22c3c; 3AB9D23F7A4B3C9B=DZW22QB3YTOIWTOXQAKS74XHJWB6BSADTIGSBPETJIB7VM46K744NIYGMTQLV5VNXUPNBPINNKV2QHBBK2TO233XMI; __jdb=122270672.9.16071443968301455206652|7.1608713800; shshshsID=891e85485bd9a1afbe9aea50e3359ffa_9_1608714178510',
        'referer': 'https://search.jd.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'

    }
    # timeout keeps one stalled connection from hanging the whole crawl
    page_text = requests.get(url, headers=headers, timeout=10).content.decode('utf-8')
    for item in _parse_page(page_text):
        tocsv(item, filename)


def _parse_page(page_text):
    """Parse a JD search-result HTML page into [title, shop, price] rows.

    Rows missing a shop name or price are printed and skipped. This replaces
    the original bare ``except:`` (which silently swallowed *every* error,
    including bugs in the CSV writer) with an explicit completeness check.
    """
    html = etree.HTML(page_text)
    rows = []
    for li in html.xpath('//ul[@class="gl-warp clearfix"]/li'):
        title = ''.join(li.xpath('.//div[@class="p-name p-name-type-2"]/a/em//text()'))
        title = title.replace(' ', '').replace('\n', '')
        nick = li.xpath('.//div[@class="p-shop"]/span/a//text()')
        price = li.xpath('.//div[@class="p-price"]/strong/i//text()')
        if nick and price:
            rows.append([title, nick[0], price[0]])
        else:
            # incomplete listing (e.g. no shop link) — surface the raw xpath
            # result, matching the original diagnostic print
            print(nick)
    return rows



def tocsv(file, filename):
    """Append one [title, shop, price] row to *filename* as CSV.

    Writes the header row first when the file does not exist or is empty.

    Args:
        file: Sequence whose first three items are title, shop and price.
        filename: Path of the CSV file to append to.
    """
    # Check emptiness via the file size instead of re-reading the whole
    # file on every single row (the original did f.seek(0); f.read(),
    # which is O(file size) per appended row).
    need_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    with open(filename, 'a', encoding='utf-8', newline='') as f:
        write = csv.writer(f)
        if need_header:
            write.writerow(('標題', '店鋪', '價格'))
        write.writerow((file[0], file[1], file[2]))


if __name__ == '__main__':
    # Crawl the first 20 result pages for the keyword "手套" (gloves).
    keyword = quote('手套')
    total_pages = 20
    out_file = 'jd.csv'
    base_url = 'https://search.jd.com/Search?keyword={}&qrst=1&wq={}&stock=1&page={}'
    # Start from a clean file so the CSV header gets written again.
    if os.path.exists(out_file):
        os.remove(out_file)
    for page_no in range(1, total_pages + 1):
        # JD numbers pages oddly: page N of the UI is page 2N-1 in the URL.
        page_url = base_url.format(keyword, keyword, 2 * page_no - 1)
        print(page_url)
        get_info(page_url, out_file)
        time.sleep(3)  # throttle requests to stay polite