
Basic usage of the requests library | Web scraping


# proxies: using a proxy
# 1. Usage
import requests

# Build a url
# url = 'http://www.baidu.com'
# Build the request headers
# headers = {
#     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
# }
# Build the proxies
# Free proxies can be found online
# proxies = {
#     "http": "http://61.135.217.7:80",
#     "https": "https://61.135.217.7:80",
# }

# Authenticated ("private") proxies
# Ordinary free proxies have mostly been blocked by the big sites already;
# proxies = {
#     "http": "http://username:password@IP:PORT",
#     "https": "http://username:password@IP:PORT",
# }
# Send the request
# response = requests.get(url, headers=headers, proxies=proxies)
# How do you verify that a proxy works?
# As long as the request does not raise an error, the proxy is usable;
# filter out the ones that are too slow and re-check them regularly (see the sketch below).
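The proxy check above is only described in words; a minimal sketch of it, assuming http://httpbin.org/ip as a test page that echoes the caller's IP (the proxy address is the free one listed above and may no longer be alive), could look like this:

import requests

proxies = {"http": "http://61.135.217.7:80"}  # free proxy from the notes above; may well be dead by now

try:
    # a short timeout also filters out proxies that are too slow
    response = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=3)
    print("proxy works, exit IP:", response.json())
except requests.exceptions.RequestException as err:
    print("proxy failed:", err)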
# 2. Cookies and sessions
# cookie: one kind is persisted on disk, another lives temporarily in the cache;
#   not secure - a cookie stored locally can be read by others and used to impersonate you.
# session: stored on the server side - in memory, in files, or in a database;
#   a session has an expiry time, and as traffic grows it consumes server resources;
#   the session id is stored in a cookie, so if cookies are disabled you need URL rewriting instead;
#   cookie data is limited in size - it must not exceed 4 KB.
# ① Benefits of sending cookies / a session:
#   you can request pages that require a login.
# ② Drawbacks of sending cookies / a session:
#   one set of cookies / one session corresponds to one user;
#   too many requests sent too quickly will get you identified as a crawler;
#   a cookie pool or an account pool helps spread the load (a sketch follows below);
#   avoid sending cookies when you do not need them,
#   but to request pages behind a login we have to send cookies.
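A minimal sketch of the cookie-pool idea mentioned above, assuming cookie_pool is a list of cookie dicts collected from several logged-in accounts (the values here are placeholders):

import random
import requests

# Hypothetical pool of cookie dicts, one per logged-in account
cookie_pool = [
    {"sessionid": "account-1-session"},
    {"sessionid": "account-2-session"},
]

# Pick a different identity for each request to spread the load
cookies = random.choice(cookie_pool)
response = requests.get("http://www.renren.com/910033035", cookies=cookies)
print(response.status_code)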
# How do you handle cookies and sessions with requests?
# 1. Handling cookies
#    either add the cookie to the request headers,
#    or pass the cookies parameter (a dict)
# Page we want to visit (requires login): http://www.renren.com/910033035
# cookie:
# Method 1: put the cookie string in the request headers
# """anonymid=ja66ma6ma1ay1i; depovince=GW; _r01_=1; jebe_key=4f2064ba-bdf0-4120-a73b-40054296547e%7C849ce3a2a3b19cb6be746727b6746f3b%7C1511060907946%7C1%7C1511060908130; JSESSIONID=abcXkOHShmoGs_4isqs-v; __utmt=1; ick=bf8207d8-aadc-4e53-bb04-1d11c600b917; __utma=151146938.2109560930.1511061038.1511061038.1511061038.1; __utmb=151146938.4.10.1511061038; __utmc=151146938; __utmz=151146938.1511061038.1.1.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/; jebecookies=1c38aa9d-1d50-4e9e-bb25-887c2fb6bc4f|||||; ick_login=a7d3eed9-9d2f-420b-b773-80eac19fcbd4; _de=CA265D35DCCFFBBB070BF98752FC884D; p=e7d112ba7e8cf29163d032a0ed0523ab5; first_login_flag=1; ln_uact=18868271201; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=7b2cd7c3519060139fd32d514cbd82955; societyguester=7b2cd7c3519060139fd32d514cbd82955; id=910033035; xnsid=2dbb196a; ch_id=10016; ver=7.0; loginfrom=null; wp_fold=0"""
# Build the url
import re

# url = 'http://www.renren.com/910033035'
# Build the request headers
# headers = {
#     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
#     'Cookie': "anonymid=ja66ma6ma1ay1i; depovince=GW; _r01_=1; jebe_key=4f2064ba-bdf0-4120-a73b-40054296547e%7C849ce3a2a3b19cb6be746727b6746f3b%7C1511060907946%7C1%7C1511060908130; JSESSIONID=abcXkOHShmoGs_4isqs-v; __utmt=1; ick=bf8207d8-aadc-4e53-bb04-1d11c600b917; __utma=151146938.2109560930.1511061038.1511061038.1511061038.1; __utmb=151146938.4.10.1511061038; __utmc=151146938; __utmz=151146938.1511061038.1.1.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/; jebecookies=1c38aa9d-1d50-4e9e-bb25-887c2fb6bc4f|||||; ick_login=a7d3eed9-9d2f-420b-b773-80eac19fcbd4; _de=CA265D35DCCFFBBB070BF98752FC884D; p=e7d112ba7e8cf29163d032a0ed0523ab5; first_login_flag=1; ln_uact=18868271201; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=7b2cd7c3519060139fd32d514cbd82955; societyguester=7b2cd7c3519060139fd32d514cbd82955; id=910033035; xnsid=2dbb196a; ch_id=10016; ver=7.0; loginfrom=null; wp_fold=0"
# }
# Send the request and get the response
# response = requests.get(url, headers=headers)
# Verify the login:
# 1. check the response url to see whether the login succeeded
# 2. save the page to a file and inspect it
# 3. use a regex to look for the logged-in username
# print(re.findall(r'新用戶oF0z', response.content.decode()))
# with open('renren.html', 'w') as f:
#     f.write(response.content.decode())
# Method 2: pass the cookies as a parameter
# url = 'http://www.renren.com/910033035'
# headers = {
#     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
# }
#
# # Build the cookies dict
# temp = "anonymid=ja66ma6ma1ay1i; depovince=GW; _r01_=1; jebe_key=4f2064ba-bdf0-4120-a73b-40054296547e%7C849ce3a2a3b19cb6be746727b6746f3b%7C1511060907946%7C1%7C1511060908130; JSESSIONID=abcXkOHShmoGs_4isqs-v; __utmt=1; ick=bf8207d8-aadc-4e53-bb04-1d11c600b917; __utma=151146938.2109560930.1511061038.1511061038.1511061038.1; __utmb=151146938.4.10.1511061038; __utmc=151146938; __utmz=151146938.1511061038.1.1.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/; jebecookies=1c38aa9d-1d50-4e9e-bb25-887c2fb6bc4f|||||; ick_login=a7d3eed9-9d2f-420b-b773-80eac19fcbd4; _de=CA265D35DCCFFBBB070BF98752FC884D; p=e7d112ba7e8cf29163d032a0ed0523ab5; first_login_flag=1; ln_uact=18868271201; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=7b2cd7c3519060139fd32d514cbd82955; societyguester=7b2cd7c3519060139fd32d514cbd82955; id=910033035; xnsid=2dbb196a; ch_id=10016; ver=7.0; loginfrom=null; wp_fold=0"
# cookies = dict()
# # Split the cookie string and store the pairs in a dict
# for i in temp.split('; '):
#     key = i.split('=', 1)[0]
#     value = i.split('=', 1)[1]  # split on the first '=' only, in case a value contains one
#     cookies[key] = value
# print(cookies)
#
# # Send the request
# response = requests.get(url, headers=headers, cookies=cookies)
#
# # Verify that it worked
# print(re.findall(r'新用戶oF0z', response.content.decode()))

# 2. session
# Handling sessions
# requests provides a Session class that keeps the conversation between client and server alive:
# 1. instantiate a session object
# 2. use the session object to send get or post requests
#    session.get(url)
#    session.post(url, data=data)
# # Build the url - the url the login form posts to
# url = 'http://www.renren.com/PLogin.do'
# # Build the request headers
# headers = {
#     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
# }
# # Build the login data
# post_data = {
#     'email': '18868271201',
#     'password': 'laaimeng2011'
# }
# # Create a session object
# session = requests.session()
# # Send the request
# response = session.post(url, headers=headers, data=post_data)
# print(response.url)
#
# # Verify:
# # we can visit other pages without submitting the login data again;
# # the session keeps us logged in
# response1 = session.get('http://www.renren.com/910033035')
# print(response1.url)

# 3. Tips
# response = requests.get('http://www.baidu.com')
#
# # cookie-related helpers
# # Get the cookies from the response
# cook = response.cookies
# print(cook)
# # What gets printed is a CookieJar object
# print(type(cook))
#
# # Convert the CookieJar into a dict
# dict_cook = requests.utils.dict_from_cookiejar(cook)
# print(dict_cook)
# print(type(dict_cook))
#
# # Convert it back
# jar = requests.utils.cookiejar_from_dict(dict_cook)
# print(jar)
# print(type(jar))

# Disabling SSL certificate verification
# Some sites use certificates that fail verification; pass verify=False to skip the check
# response = requests.get('https://www.12306.cn/mormhwed/', verify=False)
# # A warning is printed, but the page source can still be read
# print(response.content.decode())

# Timeouts
# url = 'http://www.youtube.com'
# # Without a timeout a request to an unreachable site can hang for minutes,
# # so set a short timeout=3; this is also handy for checking proxies.
# # If the crawler uses multiple threads, long waits hurt throughput.
# response = requests.get(url, timeout=3)
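Two small follow-ups to the tips above, sketched with the same URLs used in the notes: silencing the warning that verify=False produces, and catching the exception a short timeout raises so one slow site does not crash the crawler.

import requests
import urllib3

# 1. verify=False triggers an InsecureRequestWarning from urllib3; disable it explicitly
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
response = requests.get('https://www.12306.cn/mormhwed/', verify=False)
print(response.status_code)

# 2. A request that exceeds its timeout raises requests.exceptions.Timeout
try:
    response = requests.get('http://www.youtube.com', timeout=3)
    print(response.status_code)
except requests.exceptions.Timeout:
    print('request timed out after 3 seconds')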
