1. 程式人生 > 實用技巧 > 爬取京東商城商品資訊

爬取京東商城商品資訊

0x01 基於chrome+selenium爬取京東商城8G記憶體條

from selenium import webdriver
from selenium.webdriver import ActionChains #獲取屬性
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pymongo
import csv
import time
from pyecharts.charts import Bar
from pyecharts import options as opts
import pandas as pd

# Connect to the local MongoDB server; scraped records go into the "JD" database.
client=pymongo.MongoClient(host='localhost',port=27017)
db=client.JD

# Accumulates every scraped product record in memory; later dumped to CSV by write_data().
datalist=[]


def spider_data():
    """Search JD.com for '8G記憶體條' with Selenium and scrape the result page.

    For each product card, collects name/price/commit/shop_name, appends the
    record to the module-level ``datalist`` and inserts it into the MongoDB
    ``JD.datas`` collection.  Currently only the first result page is crawled
    (``count == 1`` break below).  No return value; side effects only.
    """
    browser = webdriver.Chrome()
    browser.get('https://www.jd.com/')
    search_box = browser.find_element_by_id('key')
    search_box.send_keys('8G記憶體條')
    search_box.send_keys(Keys.ENTER)

    count = 0
    while True:
        try:
            count += 1
            # Explicit wait until at least one product card is present.
            WebDriverWait(browser, 1000).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'gl-item')))
            # Scroll to the bottom so lazily loaded items render, then back up.
            browser.execute_script('document.documentElement.scrollTop=10000')
            time.sleep(3)
            browser.execute_script('document.documentElement.scrollTop=0')
            for li in browser.find_elements_by_class_name('gl-item'):
                datas = {
                    'name': li.find_element_by_xpath('.//div[@class="p-name p-name-type-2"]//em').text,
                    'price': li.find_element_by_xpath('.//div[@class="p-price"]//i').text,
                    'commit': li.find_element_by_xpath('.//div[@class="p-commit"]//a').text,
                    'shop_name': li.find_element_by_xpath('.//div[@class="p-shop"]//a').text,
                }
                datalist.append(datas)
                # insert_one(): Collection.insert() was deprecated in pymongo 3.x
                # and removed in 4.x.  Note insert_one mutates `datas`, adding an
                # '_id' key — clear_data() drops that column later.
                db.datas.insert_one(datas)
                print(datas)
        except Exception as e:
            # Narrowed from a bare `except:` so Ctrl-C still works; report the
            # actual failure instead of a bare 'ERROR'.
            print('ERROR:', e)

        if count == 1:
            # Only the first page is crawled for now; raise this limit to paginate.
            break

        # Crawl the next result page.
        browser.find_element_by_css_selector('a.pn-next').click()
    print("資料爬取完成")

#寫入資料
def write_data():

    with open('E:/data_csv.csv','w',encoding='utf-8',newline='') as f:
        try:
            title=datalist[0].keys()
            writer=csv.DictWriter(f,title)
            writer.writeheader()
            writer.writerows(datalist)
        except:
            print('Error')
    print('檔案寫入完成')


# 資料清洗
def clear_data():

    data = pd.read_csv('E:\data_csv.csv')
    # 刪除_id列
    data.drop('_id', axis=1, inplace=True)
    # 刪除'去看二手'行
    data.drop(data[data['commit'].str.contains('去看二手')].index, inplace=True)
    def convert_data(var):
        # 將+,萬去除
        new_value = var.replace('+', '').replace('萬', '')
        return float(new_value)
    # 寫入cvs檔案中
    data['commit'] = data['commit'].apply(convert_data)
    # 清除commit數大於100的行
    data.drop(data[data['commit'] >= 100].index, inplace=True)
    # 儲存為csv檔案
    data.to_csv('E:\clear_data.csv')

def group_data():
    """Split the cleaned data into per-brand CSVs (Kingston and 威剛/ADATA).

    Each pass re-reads E:/clear_data.csv, drops every row whose product name
    matches a competing brand, and saves what remains.
    """
    def _keep_brand(exclude_pattern, out_path):
        # Keep one brand by dropping rows matching all the others.
        frame = pd.read_csv('E:/clear_data.csv')
        frame.drop(frame[frame['name'].str.contains(exclude_pattern)].index,
                   inplace=True)
        frame.to_csv(out_path)

    # Kingston (金士頓): exclude every other brand.
    _keep_brand('十銓|宇瞻|光威|美商海盜船|威剛|芝奇|三星|金百達|英睿達|聯想|佰微',
                'E:/Kingston.csv')
    # 威剛 (ADATA): exclude everything else, including Kingston.
    _keep_brand('金士頓|十銓|宇瞻|光威|美商海盜船|芝奇|三星|金百達|英睿達|聯想|佰微',
                'E:/weigang.csv')
    print('資料清洗完成')

#資料視覺化
def show_data():
    """Render pyecharts bar charts (price + commit per product) for the full,
    Kingston, and 威剛 datasets.

    The three original copy-pasted chart passes are factored into one helper;
    each pass reads a CSV and writes a standalone HTML chart file.
    """
    def _render_bar(csv_path, html_path):
        # One chart: x axis = product names, two series = price and commit.
        frame = pd.read_csv(csv_path)
        chart = Bar()
        chart.add_xaxis(frame['name'].tolist())
        chart.add_yaxis('價格', frame['price'].tolist())
        chart.add_yaxis('評論', frame['commit'].tolist())
        chart.set_global_opts(title_opts=opts.TitleOpts(title="商品價格"))
        chart.render(html_path)

    _render_bar('E:/clear_data.csv', 'all_data.html')
    _render_bar('E:/Kingston.csv', 'Kingston.html')
    _render_bar('E:/weigang.csv', 'weigang.html')


if __name__=='__main__':

    # Full pipeline: scrape -> raw CSV -> clean -> per-brand split -> charts.
    spider_data()
    write_data()
    clear_data()
    group_data()
    show_data()


0x02 結果展示


參考連結:
https://blog.csdn.net/weixin_44024393/article/details/89289694