1. 程式人生 > python反爬之使用者代理

python反爬之使用者代理

# requests是第三方庫,需要安裝 pip install requests
import requests
import random
# Many sites inspect the User-Agent request header, so crawler code usually
# sends one explicitly instead of relying on the HTTP library's default.
url = 'http://www.zhihu.com'

# Sending too many requests with one and the same User-Agent may also get
# detected, so keep a pool of User-Agent strings and draw one at random for
# each request.
user_list = [
    # Chrome on Windows
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    # Gecko-based browsers (Minefield / Firefox) on Windows
    "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
]

# Build the request headers with one User-Agent drawn at random from the
# pool, so that separate runs do not always present the same client string.
headers = {
    'User-Agent': random.choice(user_list),
}
# Print the headers to see which User-Agent was selected this time.
print(headers)

# Always pass a timeout: requests does not time out by default, so a server
# that stops responding would otherwise make the script hang indefinitely.
# With the timeout set, requests raises requests.exceptions.Timeout instead.
r = requests.get(url=url, headers=headers, timeout=10)
# As an experiment, drop `headers=` and check whether the site still returns
# a successful status code without a browser-like User-Agent.
print(r.status_code)
# The bare triple-quoted string below is used as a block comment: it is a
# quick-reference table of common HTTP status codes (200, 301, 302, 304, 400,
# 401, 403, 404, 500, 501) and their meanings, for interpreting the value
# printed from r.status_code. The string is evaluated and discarded, so it has
# no runtime effect.
'''
常見的狀態碼及含義:
    200 OK     一切正常
    301 Moved Permanently     重定向到新的URL,永久性
    302 Found     重定向到臨時的URL,非永久性
    304 Not Modified     請求的資源未更新
    400 Bad Request     非法請求
    401 Unauthorized     請求未經授權
    403 Forbidden     禁止訪問
    404 Not Found     沒有找到對應頁面
    500 Internal Server Error     伺服器內部出現錯誤
    501 Not Implemented     伺服器不支援實現請求所需要的功能
'''