爬取京東商城商品資訊
阿新 • • 發佈:2020-07-21
0x01 基於chrome+selenium爬取京東商城8G記憶體條
"""Scrape 8G memory-module listings from JD.com, clean them and chart them.

Pipeline (run in this order by the ``__main__`` guard):

1. ``spider_data``  - drive Chrome through a JD search and collect products.
2. ``write_data``   - dump the collected products to a CSV file.
3. ``clear_data``   - normalise the comment-count column and filter rows.
4. ``group_data``   - split the cleaned data into per-brand CSV files.
5. ``show_data``    - render bar charts (HTML) with pyecharts.
"""
import csv
import time

import pandas as pd
import pymongo
from pyecharts import options as opts
from pyecharts.charts import Bar
from pymongo.errors import PyMongoError
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Connect to the local MongoDB instance; scraped products go to db.datas.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.JD

# Every product scraped in this run, one dict per product.
datalist = []

JD_HOME_URL = 'https://www.jd.com/'
# NOTE(review): keyword and brand names below are kept exactly as in the
# original script; confirm they match the script actually shown on the site.
SEARCH_KEYWORD = '8G記憶體條'
# Seconds to wait for the first product tile to appear (original value).
PAGE_LOAD_TIMEOUT = 1000

# Forward slashes avoid the invalid escape sequences ('\d', '\c', '\K', '\w')
# that the backslash spellings produced; Windows accepts both separators.
RAW_CSV_PATH = 'E:/data_csv.csv'
CLEAN_CSV_PATH = 'E:/clear_data.csv'
KINGSTON_CSV_PATH = 'E:/Kingston.csv'
WEIGANG_CSV_PATH = 'E:/weigang.csv'

# Field name -> XPath (relative to one 'gl-item' product tile).
_FIELD_XPATHS = {
    'name': './/div[@class="p-name p-name-type-2"]//em',
    'price': './/div[@class="p-price"]//i',
    'commit': './/div[@class="p-commit"]//a',
    'shop_name': './/div[@class="p-shop"]//a',
}

# Brands removed so that only Kingston / only ADATA products remain.
_NOT_KINGSTON = '十銓|宇瞻|光威|美商海盜船|威剛|芝奇|三星|金百達|英睿達|聯想|佰微'
_NOT_WEIGANG = '金士頓|十銓|宇瞻|光威|美商海盜船|芝奇|三星|金百達|英睿達|聯想|佰微'


def _scrape_current_page(browser, collection):
    """Collect every product tile on the page currently shown in *browser*.

    Each product is appended to ``datalist``, stored in *collection* and
    printed. Tiles missing one of the expected fields are skipped.
    """
    # Explicit wait until at least one product tile is present.
    WebDriverWait(browser, PAGE_LOAD_TIMEOUT).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'gl-item')))
    # Scroll down and back up; presumably this triggers lazy loading of the
    # remaining tiles - the pause gives the page time to react.
    browser.execute_script('document.documentElement.scrollTop=10000')
    time.sleep(3)
    browser.execute_script('document.documentElement.scrollTop=0')

    for item in browser.find_elements(By.CLASS_NAME, 'gl-item'):
        try:
            record = {
                field: item.find_element(By.XPATH, xpath).text
                for field, xpath in _FIELD_XPATHS.items()
            }
        except NoSuchElementException:
            # One incomplete tile must not abort the rest of the page.
            continue
        datalist.append(record)
        # insert_one adds an '_id' key to the dict it receives; insert a copy
        # so the rows later written to CSV keep a uniform set of columns.
        collection.insert_one(dict(record))
        print(record)


def spider_data(max_pages=1):
    """Search JD for ``SEARCH_KEYWORD`` and scrape up to *max_pages* pages.

    Results accumulate in the module-level ``datalist`` and in the MongoDB
    collection ``db.datas``. The browser is always closed on exit.

    Args:
        max_pages: Number of result pages to scrape (default 1, matching the
            original single-page behaviour).
    """
    browser = webdriver.Chrome()
    try:
        browser.get(JD_HOME_URL)
        search_box = browser.find_element(By.ID, 'key')
        search_box.send_keys(SEARCH_KEYWORD)
        search_box.send_keys(Keys.ENTER)

        collection = db.datas
        for page in range(1, max_pages + 1):
            try:
                _scrape_current_page(browser, collection)
            except (WebDriverException, PyMongoError) as exc:
                # Best effort: report the failed page and carry on.
                print('ERROR', exc)
            if page == max_pages:
                break
            # Move on to the next result page, if there is one.
            try:
                next_link = browser.find_element(By.CSS_SELECTOR, 'a.pn-next')
            except NoSuchElementException:
                break
            next_link.click()
    finally:
        browser.quit()
    print("資料爬取完成")


def write_data():
    """Write ``datalist`` to ``RAW_CSV_PATH`` using the first row's keys as header.

    Nothing is written (and no file is created) when ``datalist`` is empty.
    """
    if not datalist:
        print('Error: no data to write')
        return
    with open(RAW_CSV_PATH, 'w', encoding='utf-8', newline='') as f:
        try:
            writer = csv.DictWriter(f, list(datalist[0]))
            writer.writeheader()
            writer.writerows(datalist)
        except (csv.Error, ValueError) as exc:
            print('Error', exc)
            return
    print('檔案寫入完成')


def _convert_commit(var):
    """Turn a comment-count label such as ``'2萬+'`` into a float.

    Only the '+' sign and the ten-thousand suffix are removed; the value is
    NOT multiplied by 10000 (original behaviour).
    """
    # NOTE(review): the simplified form '万' is stripped as well, in case the
    # page renders it; otherwise float() would raise on such labels.
    new_value = var.replace('+', '').replace('萬', '').replace('万', '')
    return float(new_value)


def clear_data():
    """Clean ``RAW_CSV_PATH`` and save the result to ``CLEAN_CSV_PATH``."""
    # Read 'commit' as text so the string operations below always apply.
    data = pd.read_csv(RAW_CSV_PATH, dtype={'commit': str})
    # Drop the MongoDB '_id' column when an older CSV still contains it.
    data = data.drop(columns='_id', errors='ignore')
    # Rows without a comment label cannot be converted to a number.
    data = data.dropna(subset=['commit'])
    # Remove the 'see second-hand' rows, which carry no comment count.
    data = data[~data['commit'].str.contains('去看二手')].copy()
    data['commit'] = data['commit'].apply(_convert_commit)
    # Remove rows whose converted value is 100 or more.
    # NOTE(review): because the ten-thousand suffix is stripped rather than
    # scaled, '2萬+' becomes 2.0 while '5000+' becomes 5000.0 - confirm that
    # filtering on this mixed scale is intended.
    data = data[data['commit'] < 100]
    # index=False keeps a stray index column out of the file.
    data.to_csv(CLEAN_CSV_PATH, index=False)


def _save_without_brands(data, brand_pattern, csv_path):
    """Write *data* to *csv_path*, minus rows whose name matches *brand_pattern*."""
    unwanted = data['name'].str.contains(brand_pattern, na=False)
    data[~unwanted].to_csv(csv_path, index=False)


def group_data():
    """Split the cleaned data into a Kingston file and an ADATA (威剛) file."""
    data = pd.read_csv(CLEAN_CSV_PATH)
    # Drop the other brands, keeping Kingston.
    _save_without_brands(data, _NOT_KINGSTON, KINGSTON_CSV_PATH)
    # Drop the other brands, keeping ADATA.
    _save_without_brands(data, _NOT_WEIGANG, WEIGANG_CSV_PATH)
    print('資料清洗完成')


def _render_bar(csv_path, html_path):
    """Render price and comment-count bars for *csv_path* into *html_path*."""
    frame = pd.read_csv(csv_path)
    bar = Bar()
    bar.add_xaxis(frame['name'].tolist())
    bar.add_yaxis('價格', frame['price'].tolist())
    bar.add_yaxis('評論', frame['commit'].tolist())
    bar.set_global_opts(title_opts=opts.TitleOpts(title="商品價格"))
    bar.render(html_path)


def show_data():
    """Render one bar chart each for all products, Kingston and ADATA."""
    _render_bar(CLEAN_CSV_PATH, 'all_data.html')
    _render_bar(KINGSTON_CSV_PATH, 'Kingston.html')
    _render_bar(WEIGANG_CSV_PATH, 'weigang.html')


if __name__ == '__main__':
    spider_data()
    if datalist:
        write_data()
        clear_data()
        group_data()
        show_data()
    else:
        # Without scraped rows there is no CSV for the later steps to read.
        print('Error: no data scraped')
0x02 結果展示
參考連結:
https://blog.csdn.net/weixin_44024393/article/details/89289694