I Want to Write Crawlers (11): Scraping Taobao Product Info with Selenium
阿新 • Published: 2019-02-02
The idea: drive a browser with selenium, open Taobao, type a keyword into the search box, submit the query, parse the target fields out of the page source with pyquery, turn the page, and save each item to MongoDB.
Three functions are defined:
1 Open the browser, run the initial search, and page through the results
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_page(page):
    # Instantiate an explicit wait with a 2-second timeout.
    wait = WebDriverWait(browser, 2)
    # Wait for the search box to appear, then type the keyword.
    search_input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
    search_input.send_keys('足球')
    # Explicit wait with a condition; EC offers many conditions, here "clickable".
    # The By strategy decides how the node is matched, here by XPath.
    enter = wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="J_SearchForm"]/div/div[1]/button')))
    enter.click()
    for i in range(page):
        # '>' selects a direct child node; check that the highlighted pager
        # number equals the expected page before scraping.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(i + 1)))
        # Wait until the item list has loaded.
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
        print(i + 1)
        for index, item in enumerate(crawl()):
            save_to_mongo(item)
            print(index, item)
        # Done with this page: type the next page number into the jump box and confirm.
        next_page = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.form .input.J_Input')))
        next_page.clear()
        next_page.send_keys(str(i + 2))
        confirm = browser.find_element_by_xpath(
            '//*[@id="mainsrp-pager"]/div/div/div/div[2]/span[3]')
        confirm.click()
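Taobao's pager can load slowly, so the jump-and-confirm step is the natural place for a retry. Here is a minimal sketch of that idea, assuming the same browser global, wait setup, and selectors as above; the turn_page helper is my own name, not from the post.

from selenium.common.exceptions import TimeoutException

def turn_page(page_number):
    # Hypothetical retry wrapper around the page-jump box: on a wait
    # timeout, refresh the page and attempt the same jump once more.
    try:
        wait = WebDriverWait(browser, 2)
        box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.form .input.J_Input')))
        box.clear()
        box.send_keys(str(page_number))
        browser.find_element_by_xpath(
            '//*[@id="mainsrp-pager"]/div/div/div/div[2]/span[3]').click()
    except TimeoutException:
        browser.refresh()
        turn_page(page_number)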
2 Parse the page source and pick out the target fields
from pyquery import PyQuery as pq

def crawl():
    # Feed the current page source to pyquery.
    source = pq(browser.page_source)
    # .items() turns the matched set into a generator of PyQuery objects.
    items = source.find('#mainsrp-itemlist .items .item').items()
    for item in items:
        body = {}
        body['image'] = item.find('.pic .img').attr('data-src')
        body['price'] = item('.price').text()[2:]           # strip the leading currency marker
        body['person_buy'] = item('.deal-cnt').text()[:-3]  # strip the trailing '人付款'
        body['name'] = item.find('.J_ClickStat').text()
        body['store'] = item('.shopname').text()
        body['location'] = item('.location').text()
        yield body
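If the pyquery calls are unfamiliar, here is a self-contained sketch of the same selector methods on a made-up snippet; the markup and values are invented for illustration, not taken from Taobao.

from pyquery import PyQuery as pq

html = '''
<div class="item">
  <div class="pic"><img class="img" data-src="//img.example.com/ball.jpg"></div>
  <div class="price">¥ 59.00</div>
  <div class="deal-cnt">1200人付款</div>
</div>
'''
doc = pq(html)
for item in doc('.item').items():               # .items() yields each match as a PyQuery object
    print(item.find('.img').attr('data-src'))   # //img.example.com/ball.jpg
    print(item('.price').text()[2:])            # 59.00
    print(item('.deal-cnt').text()[:-3])        # 1200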
3 Save to MongoDB
from pymongo import MongoClient

mongo = MongoClient()
db = mongo['Taobao']
goods = db['goods']

def save_to_mongo(data):
    try:
        goods.insert_one(data)
    except Exception:
        print('save failed')
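To check what actually landed in the collection, a quick sketch like the one below works, assuming the same local MongoDB instance and the Taobao/goods names used above.

from pymongo import MongoClient

mongo = MongoClient()
goods = mongo['Taobao']['goods']
print(goods.count_documents({}))    # total items stored so far
for doc in goods.find().limit(3):   # peek at a few saved documents
    print(doc['name'], doc['price'])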
There is also a mode that never opens a browser window: just pass a chrome_options argument when creating the driver.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')   # run Chrome without a visible window
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get('http://s.taobao.com')
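The post stops short of showing the call that starts the crawl; under the definitions above, a minimal driver might look like this (the page count of 10 is an arbitrary choice, not from the post).

if __name__ == '__main__':
    try:
        get_page(10)     # search for the keyword, then walk the first 10 result pages
    finally:
        browser.quit()   # always shut down the ChromeDriver process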
Results: the crawler prints each page number and item to the console, and the saved documents appear in the Taobao.goods collection in mongo.