1. 程式人生 > 通過selenium實現的京東商品爬取

通過selenium實現的京東商品爬取

comm path python header end 查找 drive sna exec

# Scrape JD.com search results for one keyword with Selenium and save them
# as CSV.
#
# The script drives a Chrome session: it types the keyword into the search box
# on the JD home page, then walks the result pages. Each page is parsed with
# lxml and one row per product (name, price, shop) is appended to
# "<keyword>.csv" in the current working directory.

import csv
import re
import sys
import time

import requests
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

# NOTE: `requests`, `re` and `time` are not used by the code below; they are
# kept because the original script imported them.

# Column order of the CSV output.
FIELDNAMES = ['name', 'price', 'shop']


def _close_extra_windows(driver, main_handle):
    """Close every window except *main_handle*, then refocus the main window.

    After ``driver.close()`` the driver still points at the closed window, so
    the explicit switch back is required before issuing further commands.
    """
    for handle in driver.window_handles:
        if handle != main_handle:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_handle)


def _parse_products(page_source):
    """Extract product rows from the HTML of one result page.

    Returns a ``(rows, is_last_page)`` tuple. ``rows`` is a list of dicts
    keyed by ``FIELDNAMES``; ``is_last_page`` is True when the page contains
    a "next" link carrying the ``pn-next disabled`` class.
    """
    html = etree.HTML(page_source)
    is_last_page = bool(
        html.xpath('//a[@class="pn-next disabled"]/em//text()')
    )

    rows = []
    for shop in html.xpath('//div[contains(@class,"gl-i-wrap")]'):
        name = "".join(
            shop.xpath('.//div[contains(@class,"p-name")]//em//text()')
        )
        price = "".join(
            shop.xpath('.//div[contains(@class,"p-price")]//i//text()')
        )
        sname = "".join(
            shop.xpath('.//div[contains(@class,"p-shop")]//a//@title')
        )
        # Items without a shop title are labelled with this fallback name.
        if not sname.strip():
            sname = "京東自營"
        rows.append({'name': name, 'price': price, 'shop': sname})
    return rows, is_last_page


# Product name to search for.
shopname = "Python設計模式"

# Start the browser session.
browser = webdriver.Chrome()
try:
    browser.get("https://www.jd.com")
    # Remember the main window so popups can be closed without losing focus.
    main_window = browser.current_window_handle

    # Locate the search box, type the keyword and submit the search.
    inputtext = browser.find_element(By.CLASS_NAME, 'text')
    inputtext.send_keys(shopname)
    btn = browser.find_element(By.CLASS_NAME, 'button')
    btn.click()

    # Explicit wait: the result page is ready once its title shows the keyword.
    wait = WebDriverWait(browser, 10)
    wait.until(ec.title_contains(shopname))

    # newline='' is what the csv module requires to avoid blank lines between
    # rows on Windows; the explicit encoding keeps non-ASCII names writable
    # regardless of the platform default.
    with open(shopname + ".csv", 'a', newline='', encoding='utf-8') as f:
        wr = csv.DictWriter(f, FIELDNAMES)
        # The file is opened in append mode, so only write the header when the
        # file is still empty; otherwise every run would add another header.
        if f.tell() == 0:
            wr.writeheader()

        while True:
            # Extra windows are treated as anti-crawler popups and closed.
            if len(browser.window_handles) > 1:
                _close_extra_windows(browser, main_window)

            # Scroll to the bottom, then wait for the pager to be present.
            browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight)"
            )
            wait.until(
                ec.presence_of_element_located((By.CLASS_NAME, 'pn-next'))
            )

            # Parse the rendered page and store its products.
            rows, is_last_page = _parse_products(browser.page_source)
            wr.writerows(rows)

            if is_last_page:
                break

            # Go to the next page. If the button cannot be found or clicked,
            # stop instead of retrying: a retry would re-scrape the same page
            # and append duplicate rows indefinitely.
            # NOTE(review): nothing here waits for the next page's content to
            # replace the current one; confirm the presence wait above is
            # sufficient for this site.
            try:
                browser.find_element(By.CLASS_NAME, "pn-next").click()
            except WebDriverException as err:
                print(
                    "Stopping: could not open the next page: %s" % err,
                    file=sys.stderr,
                )
                break
finally:
    # quit() ends the whole driver session, even when a step above failed.
    browser.quit()

通過selenium實現的京東商品爬取