session和selenium+PhantomJS模擬登陸v2ex
阿新 • • 發佈:2018-12-15
1. session方式登入
def parse_url(self, url):
    """Send a GET request to ``url`` with ``self.headers`` and return the response body."""
    response = requests.get(url=url, headers=self.headers)
    return response.content


# ---------------------------------------------------------------------------
# Session-based login to v2ex.
#
# The sign-in form's input names change between page loads, so the page is
# fetched first and the names are scraped from it. The captcha image is tied
# to the session cookie through the one-time ``once`` token, so every request
# below goes through the same ``requests`` session.
# ---------------------------------------------------------------------------
start = time.time()
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'referer': 'https://www.v2ex.com/signin'}
url = r'https://www.v2ex.com/signin'
session = requests.session()

# Load the sign-in page and scrape the form field names from it.
resp_content = session.get(url, headers=headers).content
resp_html = html.fromstring(resp_content)
names = resp_html.xpath('//*[@id="Main"]/div[2]/div[2]/form/table[1]//input[1]//@name')
if len(names) < 3:
    # Three names are indexed below (names[0], names[1], names[2]).
    raise RuntimeError(
        "expected at least 3 input names in the sign-in form, got %d" % len(names))

# The captcha is referenced from an inline style attribute that embeds the
# ``once`` token in the image URL.
styles = resp_html.xpath(r'//*[@id="Main"]/div[2]/div[2]/form/table[1]//tr//@style')
if not styles:
    raise RuntimeError("no style attribute found in the sign-in form; page layout may have changed")
once_url = styles[0]
# Require at least one digit so an empty token is reported here instead of
# producing a broken captcha URL.
once_match = re.match(r".*once=([0-9]+)", once_url)
if once_match is None:
    raise RuntimeError("could not find the once token in: %r" % once_url)
once_value = once_match.group(1)

# Fetch the captcha through the session so its cookies are sent automatically
# (and any cookie set by the captcha response is kept for the login POST).
verifyUrl = "https://www.v2ex.com/_captcha?once=" + once_value
resp = session.get(verifyUrl, headers=headers)
verifyBytes = resp.content

verify_start = time.time()
dama = indetifyCode(verifyBytes)
verify_end = time.time()
print("識別驗證碼耗時:", verify_end - verify_start)

# The recognised captcha is converted from bytes to str before being posted.
encode_verify = str(dama, encoding="utf-8")
data = {
    names[0]: 'xxx',
    names[1]: 'xxx',
    names[2]: encode_verify,
    'once': once_value,
    'next': r'/'
}
login_url = 'https://www.v2ex.com/signin'
# Send the same headers (referer, user-agent) as the earlier requests; the
# original POST went out with the library's default headers only.
final_resp = session.post(url=login_url, data=data, headers=headers)
print(final_resp.status_code)
end = time.time()
print("總耗時: ", end - start)
v2ex中,由於賬號、密碼輸入欄的name屬性是變化的,所以只能先載入得到html,再通過xpath匹配出這些欄位名稱,作為post請求時所需填充資料的鍵。另外,驗證碼是通過once引數獲取、並且和cookie繫結的,所以請求驗證碼時一定要帶上同一份cookie。
2. selenium
# Selenium-based login to v2ex: drive a real browser to fill in the form, and
# fetch the captcha image separately with the browser's cookies.
browser = webdriver.Chrome("D:/ChromeDownload/chromedriver_win32/chromedriver.exe")
url = r'https://www.v2ex.com/signin'
browser.get(url=url)

username = browser.find_element_by_xpath(r'//input[@placeholder="使用者名稱或電子郵箱地址"]')
username.send_keys('xxx')
pwd = browser.find_element_by_xpath(r'//tbody[1]/tr[2]/td[2]/input')
pwd.send_keys('xxx')

# Captcha: the image URL (with its one-time ``once`` token) is embedded in the
# inline style of the captcha cell.
style = browser.find_element_by_xpath(r'//tbody[1]/tr[3]/td[2]/div[1]').get_attribute("style")
# Raw string: the original non-raw literal contained the invalid escape "\)".
# The compiled pattern is unchanged.
once_match = re.match(r'.*once=([0-9]*)"\);(.*)', style or "")
if once_match is None:
    raise RuntimeError("could not find the once token in the captcha style: %r" % style)
once = once_match.group(1)
verifyUrl = "https://www.v2ex.com/_captcha?once=" + once

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'referer': 'https://www.v2ex.com/signin'}
# Reuse the browser's cookies so the captcha request belongs to the same
# session as the sign-in page loaded above.
cookies = {i["name"]: i["value"] for i in browser.get_cookies()}
resp = requests.get(verifyUrl, headers=headers, cookies=cookies)
verifyBytes = resp.content
print(resp.status_code)
dama = indetifyCode(verifyBytes)

verify = browser.find_element_by_xpath(r'//input[@placeholder="請輸入上圖中的驗證碼"]')
# verify.send_keys(dama) cannot be used directly: the types do not match.
# send_keys expects str, but the recognised captcha comes back as bytes.
# Error seen: TypeError: sequence item 0: expected str instance, int found
verify.send_keys(str(dama, encoding="utf-8"))
browser.find_element_by_xpath(r'//input[@value="登入"]').click()
以上是完整的程式碼,註釋中記錄了部分自己遇到的坑。驗證碼識別用的是yundama平臺,其中的api需要改一部分:官網給的打碼api需要傳入圖片路徑,呼叫的是YDM_DecodeByPath,而我們這裡可以直接通過請求獲取到驗證碼圖片的二進位制流,無需儲存到本地,直接把其中呼叫的api改成YDM_DecodeByBytes。
通過以上程式碼測試,發現直接用selenium+PhantomJS確實會比session方式耗時多很多:測試過程中(除去驗證碼識別階段),session登入整體流程耗時2秒左右,而selenium則需要接近7秒。