Python之網頁爬蟲request模組
阿新 • • 發佈:2018-12-15
#########網頁爬蟲#########
## requests模組
- 對requests模組的理解
http/1.1請求的封裝, 可以輕鬆實現cookie, IP代理, 登入驗證等操作;
Requests 使用的是 urllib3,因此繼承了它的所有特性。Requests 支援 HTTP 連線保持和連線池,支援使用 cookie 保持會話,支援檔案上傳,支援自動確定響應內容的編碼,支援國際化的 URL 和 POST 資料自動編碼。現代、國際化、人性化。
- 對requests模組的使用 get方法
import requests

# Fetch a page with a plain GET request.
url = 'http://www.baidu.com'
response = requests.get(url)

# Inspect the Response object and a few of its attributes.
print(response)               # the Response object itself, e.g. <Response [200]>
print(response.status_code)   # HTTP status code
print(response.cookies)       # cookies the server set
print(response.text)          # body decoded to str
print(type(response.text))    # -> <class 'str'>
import requests

# requests.post(url, data=None, json=None, **kwargs) sends the payload in
# the request body: `data` is form-encoded, `json` is JSON-encoded.
response = requests.post('http://httpbin.org/post',
                         data={'name': 'kobe', 'age': 40})
print(response.text)

# BUG FIX: httpbin only answers DELETE on /delete — sending DELETE to
# /post returns 405 Method Not Allowed in the original snippet.
response1 = requests.delete('http://httpbin.org/delete', data={'name': 'kobe'})
print(response1.text)
## 帶引數的get請求
import requests

# Query-string parameters go in a dict passed via `params`; requests
# URL-encodes them and appends them to the URL.  Hand-written equivalent:
# https://movie.douban.com/subject/4864908/comments?start=20&limit=20&sort=new_score&status=P
payload = {
    'start': 40,
    'limit': 80,
    'sort': 'new_score',
    'status': 'P',
}
url = 'https://movie.douban.com/subject/4864908/comments'
response = requests.get(url, params=payload)
print(response.url)  # shows the fully assembled URL
## 解析json格式
# Look up geo information for an IP and parse the JSON response.
ip = '8.8.8.8'
url = "http://ip.taobao.com/service/getIpInfo.php"
# IMPROVED: let requests build and encode the query string via `params`
# instead of %-formatting the value into the URL by hand.
response = requests.get(url, params={'ip': ip})
content = response.json()  # decode the JSON body into a Python dict
print(content)
print(type(content))
## 獲取二進位制資料
值得注意的是:
# response.text : 返回字串的頁面資訊
# response.content : 返回bytes的頁面資訊
# 獲取二進位制資料
# Download an image: binary payloads come from response.content (bytes),
# not response.text (str).
url = 'https://gss0.bdstatic.com/-4o3dSag_xI4khGkpoWK1HF6hhy/baike/w%3D268%3Bg%3D0/sign=4f7bf38ac3fc1e17fdbf8b3772ab913e/d4628535e5dde7119c3d076aabefce1b9c1661ba.jpg'
response = requests.get(url)
with open('picture.png', 'wb') as img_file:
    img_file.write(response.content)
## 下載視訊
# Download a video.  As with images, write response.content (raw bytes);
# response.text would be the body decoded to str and corrupt the file.
video_url = "http://gslb.miaopai.com/stream/sJvqGN6gdTP-sWKjALzuItr7mWMiva-zduKwuw__.mp4"
resp = requests.get(video_url)
with open('/tmp/learn.mp4', 'wb') as video_file:
    video_file.write(resp.content)
## 新增headers資訊
# Send a browser-like User-Agent header so the server does not reject
# the scraper as a bot.
url = 'http://www.cbrc.gov.cn/chinese/jrjg/index.html'
user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"
headers = {'User-Agent': user_agent}
response = requests.get(url, headers=headers)
print(response.status_code)
## 狀態碼的判斷 (checking the status code)
# BUG FIX: the original one-liner had an unterminated string literal
# ("請求成功 without a closing quote), which is a SyntaxError.  Rewritten
# as an explicit guard clause instead of a conditional expression used
# only for its side effects.
response = requests.get(url, headers=headers)
if response.status_code != 200:
    exit()
print("請求成功")
## 高階設定_上傳檔案
# Upload a file via multipart/form-data (`files=` keyword).
# FIX: open the handle in a with-block so it is closed after the request
# completes — the original opened the file and never closed it.
with open('picture.png', 'rb') as picture:
    response = requests.post('http://httpbin.org/post',
                             files={'file': picture})
print(response.text)
## 獲取cookie資訊
# Cookies set by the server are exposed as a dict-like RequestsCookieJar.
response = requests.get('http://www.csdn.net')
print(response.cookies)
for name, value in response.cookies.items():
    print(name + ':' + value)
## 讀取已經存在的cookie資訊訪問網址內容(會話維持)
# Set a cookie (name=houzeyu) on the first request and show that the
# Session object carries it over to the second request.
# FIX: requests.Session() is the documented class; requests.session()
# is a deprecated lowercase alias kept only for backwards compatibility.
s = requests.Session()
response1 = s.get('http://httpbin.org/cookies/set/name/houzeyu')
response2 = s.get('http://httpbin.org/cookies')
print(response2.text)  # should echo back {"cookies": {"name": "houzeyu"}}
## 忽略證書驗證
# Skip TLS certificate verification with verify=False (useful for sites
# with self-signed certificates; urllib3 will emit an
# InsecureRequestWarning).
target = 'https://www.12306.cn'
response = requests.get(target, verify=False)
print(response.status_code)
print(response.text)
## 代理設定/設定超時間
# Route the request through per-scheme proxies and abort if no response
# arrives within 10 seconds (timeout=10).
proxy = {
    'https': '171.221.239.11:808',
    'http': '218.14.115.211:3128',
}
response = requests.get('http://httpbin.org/get',
                        proxies=proxy,
                        timeout=10)
print(response.text)