selenium和pyquery抓取非同步載入資料
阿新 • • 發佈:2018-12-13
"""Scrape asynchronously loaded book listings from jd.com.

Selenium drives a real browser (search, scroll, paginate) so that the
lazily loaded product list is rendered; pyquery then parses the rendered
page source and the book details are printed to stdout.
"""
import time

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Machine-specific location of the chromedriver executable.
CHROMEDRIVER_PATH = "C:/Users/net/PycharmProjects/untitled/venv/Scripts/chromedriver.exe"


def openBrower(brower_type):
    """Open and return a WebDriver instance for the requested browser.

    Args:
        brower_type: One of 'chrome', 'firefox', 'safari' or 'PhantomJS'.
            Any other value falls back to Internet Explorer.

    Returns:
        A newly started selenium WebDriver.
    """
    if brower_type == 'chrome':
        # NOTE(review): positional driver path and PhantomJS below are the
        # Selenium 3 style API this script was written against.
        return webdriver.Chrome(CHROMEDRIVER_PATH)
    if brower_type == 'firefox':
        return webdriver.Firefox()
    if brower_type == 'safari':
        return webdriver.Safari()
    if brower_type == 'PhantomJS':
        return webdriver.PhantomJS()
    return webdriver.Ie()


def _scroll_to_bottom(browser, times, pause):
    """Scroll to the bottom of the page `times` times, sleeping `pause` s after each."""
    for _ in range(times):
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)


def _page_html(browser):
    """Return the current page source with every 'xmlns' renamed."""
    # Presumably a workaround so pyquery does not treat the document as
    # namespaced XHTML (which would stop plain CSS selectors from matching)
    # -- TODO confirm.
    return browser.page_source.replace('xmlns', 'another_attr')


def parse_website():
    """Search jd.com for computer books and print every result page.

    Opens Chrome, submits the search, parses page 1, then walks through the
    remaining pages via `parse_next_page`. The browser is always closed on
    exit, even if one of the waits times out.
    """
    # Open a Chrome browser.
    browser = openBrower('chrome')
    try:
        # Visit the JD website.
        browser.get("https://www.jd.com")
        # Every explicit wait below gives up after at most 50 seconds.
        wait = WebDriverWait(browser, 50)
        # Wait until the search box (id "key") is present in the DOM.
        search_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#key'))
        )
        # Type the search keywords into the search box.
        search_box.send_keys('計算機書籍')
        # Wait until the search button is clickable, then click it.
        submit_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '.button'))
        )
        submit_button.click()
        # Scroll to the bottom a few times to trigger lazy loading.
        _scroll_to_bottom(browser, 3, 3)
        # Total number of result pages, shown in the bottom pager.
        total = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
            )
        )
        # Read the count now: the element belongs to page 1 and is not
        # touched again after navigating away.
        total_pages = int(total.text)
        parse_book(1, _page_html(browser))
        for page_num in range(2, total_pages + 1):
            print('當前第' + str(page_num) + '頁')
            parse_next_page(page_num, browser, wait)
    finally:
        # Release the browser window and the driver process.
        browser.quit()


def parse_next_page(page_num, browser, wait):
    """Click through to the next result page and print its books.

    Args:
        page_num: The 1-based number of the page being navigated to.
        browser: The WebDriver currently showing the previous result page.
        wait: The WebDriverWait bound to `browser`.
    """
    next_page_button = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.pn-next > em')
        )
    )
    next_page_button.click()
    # Scroll to the bottom of the page so the remaining items get loaded.
    _scroll_to_bottom(browser, 3, 10)
    # A page shows 60 items; waiting for the 60th <li> ensures that all of
    # them have been loaded.
    wait.until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")
        )
    )
    # The page turn succeeded once the bottom pager highlights this page number.
    wait.until(
        EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr"), str(page_num)
        )
    )
    parse_book(page_num, _page_html(browser))


def parse_book(page, html):
    """Print image URL, title, price, review count and shop for each item.

    Args:
        page: Page number, used only in the printed header.
        html: Rendered HTML of one search-result page.
    """
    doc = pq(html)
    print('-------------------第' + str(page) + '頁的圖書資訊---------------------')
    for item in doc('.gl-item').items():
        img = item.find('img')
        book_img_url = img.attr('data-lazy-img')
        # Use `src` when the lazy attribute is missing or already marked
        # "done"; a missing attribute yields None, which would otherwise
        # break the string concatenation below.
        if not book_img_url or book_img_url == "done":
            book_img_url = img.attr('src')
        print('圖片地址:' + (book_img_url or ''))
        # Drop the <font> nodes nested in the title before reading its text.
        item('.p-name').find('font').remove()
        book_name = item('.p-name').find('em').text()
        print('書名:' + book_name)
        price_node = item('.p-price')
        price = price_node.find('em').text() + str(price_node.find('i').text())
        print('價格:' + price)
        commit = item('.p-commit').find('strong').text()
        print('評價數量:' + commit)
        shopnum = item('.p-shopnum').find('a').text()
        print('出版社:' + shopnum)
        print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')


def main():
    """Script entry point: run the whole scrape."""
    parse_website()


if __name__ == "__main__":
    main()