第十章 採集 JavaScript：使用 Selenium 庫進行獲取
阿新 • • 發佈:2018-12-19
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""Scraping JavaScript-driven pages with Selenium and PhantomJS.

The two commented-out examples show (1) a fixed sleep and (2) an explicit
wait on an element id.  The live code handles a client-side redirect by
polling a DOM element until it goes stale.
"""

# Example 1: wait 3 seconds, then read the target content.
# Selenium emits a warning that headless PhantomJS is no longer supported.
# from selenium import webdriver
# import time
# driver = webdriver.PhantomJS(executable_path='D:/pycharm/phantomjs-2.1.1-windows/bin/phantomjs')
# driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
# time.sleep(3)
# print(driver.find_element_by_id('content').text)
# driver.close()

# Example 2: use an element id to check whether the page has fully loaded.
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.ui import WebDriverWait
#
# driver = webdriver.PhantomJS(executable_path='D:/pycharm/phantomjs-2.1.1-windows/bin/phantomjs')
# driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
# try:
#     # The locator must be passed as a single (by, value) tuple.
#     element = WebDriverWait(driver, 10).until(
#         EC.presence_of_element_located((By.ID, "loadedButton")))
# finally:
#     print(driver.find_element_by_id("content").text)
#     driver.close()

# Example 3: handling client-side redirects.
# Grab a DOM element when the page starts loading and keep probing it until
# Selenium raises StaleElementReferenceException, which means the page has
# navigated away.  The <html> element is checked every half second, with a
# 10 second time limit.
import time

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException


def waitForLoad(driver, timeout=10, poll_interval=.5):
    """Block until the current page is replaced or the timeout elapses.

    Args:
        driver: Selenium WebDriver whose current page is being watched.
        timeout: Maximum number of seconds to wait (default 10).
        poll_interval: Seconds to sleep between checks (default 0.5).

    Returns:
        None.  Returns as soon as the original ``<html>`` element is
        reported stale; otherwise prints a timeout message and returns.
    """
    elem = driver.find_element_by_tag_name("html")
    max_polls = int(timeout / poll_interval)
    for _ in range(max_polls):
        time.sleep(poll_interval)
        try:
            # Issue a real WebDriver command against the *old* element so a
            # stale reference is reported.  A bare ``elem == new_elem``
            # comparison does not raise on bindings where WebElement.__eq__
            # only compares ids locally.
            elem.is_enabled()
        except StaleElementReferenceException:
            return
    print("Timing out after {} seconds and returning".format(timeout))


# ``webdriver.PhantomJS`` is the driver class; ``webdriver.phantomjs`` is the
# submodule and is not callable.
driver = webdriver.PhantomJS(executable_path='D:/pycharm/phantomjs-2.1.1-windows/bin/phantomjs')
try:
    driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
    waitForLoad(driver)
    print(driver.page_source)
finally:
    # Always shut the browser process down, even if loading fails.
    driver.quit()