
Python Crawler Notes 10: Scraping Taobao food listings with selenium + PyQuery and saving them to MongoDB (hands-on project 3)


Using selenium + PyQuery to collect Taobao food listings and store them in MongoDB

  • Target site analysis

  • Process framework — the whole spider is driven by Selenium explicit waits; a minimal sketch of that pattern follows this list

  • Hands-on crawler
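
The spider is built around Selenium's explicit waits: every interaction first waits for the target element to be present (or clickable) instead of sleeping for a fixed time. Below is a minimal, self-contained sketch of that pattern; the URL and the #search selector are placeholders for illustration only, not the ones used in the project.

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    browser = webdriver.Chrome()       # any WebDriver works; Chrome is just an example
    wait = WebDriverWait(browser, 10)  # give up after 10 seconds

    browser.get('https://example.com/')  # placeholder URL
    box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#search'))  # placeholder selector
    )
    box.send_keys('some keyword')
    browser.quit()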

  1. The spider script

    import pymongo
    import re
    
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    from selenium.common.exceptions import TimeoutException

    from pyquery import PyQuery as pq
    from config import *

    client = pymongo.MongoClient(MONGO_URL)
    db = client[MONGO_DB]

    # browser = webdriver.Chrome()
    browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)  # create the PhantomJS browser
    wait = WebDriverWait(browser, 10)
    browser.set_window_size(1400, 900)


    def search():  # submit the search request
        print('Searching...')
        try:
            browser.get('https://world.taobao.com/')  # open the Taobao home page
            input = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#mq'))
            )
            submit = wait.until(EC.element_to_be_clickable((
                By.CSS_SELECTOR,
                '#J_PopSearch > div.sb-search > div > form > input[type="submit"]:nth-child(2)')))
            input.send_keys(KEYWORD)
            submit.click()
            total = wait.until(EC.presence_of_element_located((
                By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
            get_products()
            return total.text
        except TimeoutException:
            return search()  # retry on timeout


    def next_page(page_number):  # jump to a given page of results
        print('Turning to page', page_number)
        try:
            input = wait.until(EC.presence_of_element_located((
                By.CSS_SELECTOR,
                '#mainsrp-pager > div > div > div > div.form > input')))  # wait for the page-number input box
            submit = wait.until(EC.element_to_be_clickable((
                By.CSS_SELECTOR,
                '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))  # wait for the submit button
            input.clear()
            input.send_keys(page_number)
            submit.click()
            wait.until(EC.text_to_be_present_in_element((
                By.CSS_SELECTOR,
                '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_number)))  # the highlighted page number should match the one we asked for
            get_products()
        except TimeoutException:
            next_page(page_number)  # retry the same page on timeout


    def get_products():  # parse one page of results
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
        html = browser.page_source  # HTML of the rendered result page
        doc = pq(html)  # build a PyQuery object
        items = doc('#mainsrp-itemlist .items .item').items()  # select every item with a CSS selector
        for item in items:
            product = {
                'title': item.find('.title').text(),
                'location': item.find('.location').text(),
                'price': item.find('.price').text(),
                'deal': item.find('.deal-cnt').text()[:-3],
                'shop': item.find('.shop').text(),
                'image': item.find('.pic .img').attr('src'),
            }
            print(product)
            save_to_mongo(product)


    def save_to_mongo(result):
        try:
            if db[MONGO_TABLE].insert_one(result):
                print('Saved to MongoDB:', result)
        except Exception:
            print('Failed to save:', result)


    def main():
        try:
            total = search()
            total = int(re.compile(r'(\d+)').search(total).group(1))
            for i in range(2, total + 1):  # page 1 was already parsed by search()
                next_page(i)
        except Exception:
            print('Something went wrong')
        finally:
            browser.close()


    if __name__ == '__main__':
        main()
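
Note that PhantomJS is no longer maintained and recent Selenium releases have dropped support for it, which is why the commented-out webdriver.Chrome() line above is worth keeping in mind. The snippet below is a minimal sketch of swapping in headless Chrome instead, assuming chromedriver is installed and a Selenium version that accepts the options keyword; the image-disabling flag is only an approximation of PhantomJS's --load-images=false.

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support.ui import WebDriverWait

    chrome_options = Options()
    chrome_options.add_argument('--headless')     # run without a visible window, like PhantomJS
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--blink-settings=imagesEnabled=false')  # roughly equivalent to --load-images=false

    browser = webdriver.Chrome(options=chrome_options)
    wait = WebDriverWait(browser, 10)
    browser.set_window_size(1400, 900)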
  2. The config file (config.py)

    MONGO_URL = 'localhost'
    MONGO_DB = 'taobao'
    MONGO_TABLE = 'taobao'

    SERVICE_ARGS = ['--load-images=false', '--disk-cache=false']  # PhantomJS: skip images, disable disk cache

    KEYWORD = '美食'  # the search keyword ("food")
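
After a run, you can spot-check what was written by querying the collection directly with the same config values. A minimal sketch, assuming the spider has already stored some documents and pymongo 3.7+ (for count_documents):

    import pymongo
    from config import MONGO_URL, MONGO_DB, MONGO_TABLE

    client = pymongo.MongoClient(MONGO_URL)
    collection = client[MONGO_DB][MONGO_TABLE]

    print('documents stored:', collection.count_documents({}))  # total number of saved products
    for doc in collection.find().limit(3):  # peek at a few records
        print(doc)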
