第一個爬蟲小程式(攜帶登入後的cookie)
阿新 • • 發佈:2019-01-02
import requests


class TiebaSpider:
    """Download a fixed range of pages from one Baidu Tieba forum to local HTML files."""

    def __init__(self, tieba_name):
        """Store the forum name and build the URL template and request headers.

        Args:
            tieba_name: Name of the forum; used as the ``kw`` query value and
                as the prefix of every saved file name.
        """
        self.tieba_name = tieba_name
        # Fixed: the original concatenated the undefined name
        # ``tieba_name_crawl`` here, raising NameError on construction.
        self.url_base = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        self.headers = {"User-Agent": "WSF"}

    def make_url_lists(self):
        """Return the URLs to download, with ``pn`` running from 1 to 10.

        NOTE(review): the site may interpret ``pn`` as a post offset rather
        than a page index - confirm before relying on these being 10
        distinct pages.
        """
        return [self.url_base.format(i) for i in range(1, 11)]

    def download_url(self, url_str):
        """Fetch ``url_str`` with an HTTP GET and return the raw body as bytes.

        Args:
            url_str: Fully formatted page URL.

        Returns:
            The response content (bytes).

        Raises:
            requests.exceptions.RequestException: On connection errors or
                when the server does not answer within the timeout.
        """
        # A timeout keeps the crawler from hanging forever on a stalled server.
        result = requests.get(url_str, headers=self.headers, timeout=10)
        return result.content

    def save(self, result, page_num):
        """Write one downloaded page to ``<tieba_name>-第<page_num>頁.html``.

        Args:
            result: Page body as bytes.
            page_num: 1-based page number used in the file name.
        """
        file_path = "{}-第{}頁.html".format(self.tieba_name, page_num)
        with open(file_path, "wb") as f:
            f.write(result)

    def run(self):
        """Download every URL from ``make_url_lists`` and save each page in order."""
        # enumerate yields the 1-based page number directly instead of
        # re-searching the list with ``index`` on every iteration.
        for page_num, url_str in enumerate(self.make_url_lists(), start=1):
            page_bytes = self.download_url(url_str)
            # Fixed: the original called ``self.save_result``, which does not exist.
            self.save(page_bytes, page_num)


if __name__ == '__main__':
    tieba_spider = TiebaSpider("薛之謙")
    tieba_spider.run()
理解 session 和 cookie
session:當使用者訪問 HTTP 伺服器時,伺服器會為其生成一個 sessionID(唯一標識),在一定的有效期內可用;瀏覽器會把這個 sessionID 儲存在 cookie 中,下次訪問時隨請求一併送回,伺服器便能據此找到該使用者先前的記錄.
session 是伺服器端為某個使用者生成並儲存的一個字串(唯一標識),用來唯一標識客戶端的訪問(如健身中心會員卡).
cookie 是儲存在客戶端的資料,其中含有 sessionID;隨請求傳送給伺服器後,即可表明使用者身份.
import re

import lxml.html
import requests

# Login page of the example site; the same URL serves the form (GET) and
# accepts the submitted credentials (POST).
LOGIN_URL = 'http://example.webscraping.com/places/default/user/login?_next=/places/default/index'


def parse_form(html):
    """Collect the named ``<input>`` fields found inside forms in ``html``.

    Args:
        html: HTML source of a page.

    Returns:
        A dict mapping each input's ``name`` attribute to its ``value``
        attribute (``None`` when the input has no ``value``). Inputs
        without a name are skipped.
    """
    tree = lxml.html.fromstring(html)
    return {
        e.get('name'): e.get('value')
        for e in tree.cssselect('form input')
        if e.get('name')
    }


def get_cookie(email='[email protected]', password='2336517498'):
    """Log in to the example site and save a page fetched with the session.

    The login form is fetched first so that every pre-filled input it
    contains is submitted back together with the credentials.

    Args:
        email: Account e-mail. The default is the placeholder left in the
            published source, where the real address was redacted; pass a
            real address to log in.
        password: Account password.

    Returns:
        The session cookies after the login attempt, as a dict.

    Raises:
        requests.exceptions.RequestException: On connection errors or
            when the server does not answer within the timeout.
    """
    # The context manager closes the session's connections when done.
    with requests.Session() as s:
        result = s.get(LOGIN_URL, timeout=10)
        post_data = parse_form(result.text)
        # Cookies set by the server before authentication.
        print(s.cookies.get_dict())

        post_data['email'] = email
        post_data['password'] = password
        s.post(LOGIN_URL, data=post_data, timeout=10)

        # Request the page again with the session's cookies; the original
        # issued an empty POST to the login form here.
        rs = s.get(LOGIN_URL, timeout=10)
        # Explicit encoding so non-ASCII page text does not depend on the
        # platform's default codec.
        with open('login1.html', 'w', encoding='utf-8') as f:
            f.write(rs.text)

        return s.cookies.get_dict()


if __name__ == '__main__':
    get_cookie()