爬蟲day 04(通過登錄去爬蟲 解決django的csrf_token)
阿新 • • 發佈:2017-11-11
ica lin urllib uil encode links 爬蟲 roc htm
"""Crawl a Django site that requires a login, handling its CSRF token.

A valid user name and password are required.  The flow is:

1. GET the login page through a cookie-aware opener, so that any cookies
   the server sets are kept and sent back on later requests.
2. Read the hidden ``csrfmiddlewaretoken`` field out of the login form.
3. POST the credentials together with that token and print the response.
"""
import http.cookiejar
import urllib.parse
import urllib.request

# Browser-like headers attached to every request made through the opener.
head = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
}


def makeMyOpener(head):
    """Build a URL opener that keeps cookies and sends the given headers.

    Args:
        head: Mapping of HTTP header name to header value.

    Returns:
        A ``urllib.request.OpenerDirector`` backed by a fresh ``CookieJar``
        whose ``addheaders`` is the list of ``(name, value)`` pairs taken
        from ``head``.
    """
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    # addheaders expects a list of (name, value) tuples, not a dict.
    opener.addheaders = list(head.items())
    return opener


def _extract_csrf_token(html):
    """Return the ``csrfmiddlewaretoken`` value found in the login form.

    Every matching value is printed; when several match, the last one is
    returned.

    Raises:
        ValueError: If the page has no ``csrfmiddlewaretoken`` input that is
            a direct child of a ``<form>``.
    """
    # Imported here because only this step needs lxml; the rest of the
    # module stays importable without it.
    from lxml import etree

    selector = etree.HTML(html)
    links = selector.xpath('//form/input[@name="csrfmiddlewaretoken"]/@value')
    if not links:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError('csrfmiddlewaretoken not found in the login page')
    for link in links:
        print(link)
    return links[-1]


def main():
    """Log in to the local site and print the page returned after login."""
    # Crawl our own page: fetch the login form first.
    oper = makeMyOpener(head)
    with oper.open('http://127.0.0.1:8000/index/loginHtml/', timeout=1000) as uop:
        html = uop.read().decode()

    # Extract csrfmiddlewaretoken with lxml.
    csrfmiddlewaretoken = _extract_csrf_token(html)

    url = 'http://127.0.0.1:8000/index/login/'
    datas = {'csrfmiddlewaretoken': csrfmiddlewaretoken, 'email': 'aa', 'pwd': 'aa'}
    # The form body must be sent as bytes, not as a str.
    data_encoded = urllib.parse.urlencode(datas).encode(encoding='utf-8')
    # Same opener as before, so cookies from the first request are reused.
    with oper.open(url, data_encoded) as response:
        html = response.read().decode()
    print(html)


if __name__ == '__main__':
    main()
爬蟲day 04(通過登錄去爬蟲 解決django的csrf_token)