Crawler 1: urllib, Request, opener, proxy
阿新 · Published: 2018-12-11
1. urllib (requesting a URL directly)
from urllib import request

with request.urlopen('http://www.runoob.com') as f:
    if f.status == 200:  # f.status is the response status code; f.reason is the reason phrase, e.g. OK
        data = f.read()  # read the response body; it comes back as bytes
        # print(data.decode())
        # print(f.getheaders())  # response headers, as a list of tuples
        # for k, v in f.getheaders():
        #     print(k, v)
        try:
            # dump the fetched page into a file
            with open('first.html', 'w+') as fp:  # the with block closes the file, no fp.close() needed
                fp.write(data.decode())
        except Exception as ex:
            print(ex)
2. Request (imitating different browsers via different request headers)
To simulate a browser sending a GET request, we need a Request object.
By adding HTTP headers to the Request object, we can disguise the request as coming from a browser.
Different browsers send different User-Agent headers with their requests.
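Why this matters: by default urllib announces itself in the User-Agent header, which many sites block outright. A quick sketch of what the default opener sends (the exact version suffix depends on your Python):

from urllib import request

# the default opener identifies itself as Python-urllib/3.x
opener = request.build_opener()
print(opener.addheaders)  # e.g. [('User-agent', 'Python-urllib/3.7')]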
from urllib import request, parse
import random

url = 'http://www.runoob.com'
query_obj = {"s": "js"}
query_string = parse.urlencode(query_obj)  # data sent via GET must be url-encoded
url = url + "/?" + query_string
# print(url)  # http://www.runoob.com/?s=js

req = request.Request(url)

# User-Agent headers from different browsers
ua_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]
user_agent = random.choice(ua_list)  # pick a random entry from the list
req.add_header('User-Agent', user_agent)  # impersonate a random browser
# print(dir(req))
# print(req.full_url)               # the full request URL
# print(req.headers['User-agent'])  # read a request header back
req.get_header('User-agent')

with request.urlopen(req) as f:
    data = f.read()
    print(data.decode())
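As a variant, the headers can also be passed straight to the Request constructor instead of calling add_header afterwards; a minimal equivalent sketch:

from urllib import request

# equivalent to add_header: supply headers when building the Request
req = request.Request(
    'http://www.runoob.com/?s=js',
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'}
)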
3. opener (sending requests out through different exits)
An opener is an instance of urllib.request.OpenerDirector.
The urlopen we have been using all along is itself a special opener (one that the module builds for us).
However, the basic urlopen() does not support proxies, cookies, or other advanced HTTP/HTTPS features.
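For instance, cookie handling, one of the features plain urlopen() lacks, only takes a custom opener; a minimal sketch using http.cookiejar (URL reused from section 1):

from urllib import request
from http import cookiejar

cj = cookiejar.CookieJar()  # collects cookies set by responses
opener = request.build_opener(request.HTTPCookieProcessor(cj))
with opener.open('http://www.runoob.com') as f:
    f.read()
for cookie in cj:  # cookies are now stored and will be re-sent by this opener
    print(cookie.name, cookie.value)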
from urllib import request, parse, error
import random
import ssl

# work around certificate-verification failures on some HTTPS sites
ssl._create_default_https_context = ssl._create_unverified_context

url = "https://www.meishij.net/chufang/diy/wancan/?&"
qs = {
    "page": 2
}
url = url + parse.urlencode(qs)
req = request.Request(url)

ua_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]
# pick a random User-Agent
user_agent = random.choice(ua_list)
req.add_header('User-Agent', user_agent)

# build an HTTPSHandler object; debuglevel=1 prints the HTTP traffic for debugging
http_handler = request.HTTPSHandler(debuglevel=1)
# request.build_opener() creates an opener that routes requests through this handler
opener = request.build_opener(http_handler)

try:
    with opener.open(req) as f:  # previously this was request.urlopen(req)
        data = f.read()  # data is bytes here
        with open('ttt.json', 'wb') as fp:  # with 'w+' and encoding='utf-8', write data.decode() instead
            fp.write(data)
except error.HTTPError as err:  # HTTP-level error
    pass
except error.URLError as err:  # URL-level error
    pass
except Exception as err:
    pass
4. proxy (proxy IPs)
ProxyHandler (proxy configuration)
Using proxy IPs is the second big weapon in the crawler vs. anti-crawler fight, and usually the most effective one.
Many sites track how often a given IP visits within a time window (via traffic statistics, system logs, and so on); if the visit count looks inhuman, they block that IP.
So we can set up several proxy servers and switch to a fresh one every so often; even if one IP gets banned, we can swap in another and keep crawling.
Proxies add latency, though, so a fast network connection helps.
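The code below simply picks one proxy per run; the rotation described above could look like this sketch, which retries with a fresh proxy on failure (the proxy addresses are the sample ones from this post and are almost certainly dead by now):

from urllib import request, error
import random

proxy_list = [
    {"https": "116.192.167.32:32267"},
    {"https": "14.117.176.252:808"},
    {"https": "121.31.140.130:8123"},
]

def fetch_with_rotation(url, retries=3):
    """Try up to `retries` proxies, switching on any failure."""
    proxies = random.sample(proxy_list, k=min(retries, len(proxy_list)))
    for proxy in proxies:
        opener = request.build_opener(request.ProxyHandler(proxy))
        try:
            with opener.open(url, timeout=10) as f:
                return f.read()
        except OSError as err:  # URLError (and HTTPError) are OSError subclasses
            print("proxy", proxy, "failed:", err)
    return None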
from urllib import request, parse, error
import random
import ssl

# work around certificate-verification failures on some HTTPS sites
ssl._create_default_https_context = ssl._create_unverified_context

url = "https://www.meishij.net/chufang/diy/wancan/?&"
qs = {
    "page": 2
}
url = url + parse.urlencode(qs)
req = request.Request(url)
ua_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]
# pick a random User-Agent and attach it to the request
user_agent = random.choice(ua_list)
req.add_header('User-Agent', user_agent)
proxy_list = [
{"https" : "116.192.167.32:32267"},
{"https" : "14.117.176.252:808"},
{"https" : "121.31.140.130:8123"}
]
# pick a random proxy
proxy = random.choice(proxy_list)
# ProxyHandler sends the request out through the chosen proxy
proxy_handler = request.ProxyHandler(proxy)
opener = request.build_opener(proxy_handler)
try:
    with opener.open(req) as f:
        data = f.read()  # response body as bytes
        with open('a.json', 'wb') as fp:  # the with block closes the file for us
            fp.write(data)
except error.HTTPError as err:  # HTTP-level error (bad status code)
    print(err)
except error.URLError as err:  # URL-level error (e.g. unreachable host)
    print(err)
except Exception as err:
    print(err)
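If every request in the program should go through the proxy, the opener can also be installed globally so that plain request.urlopen() uses it from then on; a short sketch reusing one of the sample proxies above:

from urllib import request

proxy_handler = request.ProxyHandler({"https": "116.192.167.32:32267"})
opener = request.build_opener(proxy_handler)
request.install_opener(opener)  # plain request.urlopen() now routes through the proxy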