Using the urllib Module
By 阿新 • Published 2017-11-30
Basic urllib2 operations
1. Opening a page (urlopen)
Open a web page:
```python
import urllib2

response = urllib2.urlopen('http://www.baidu.com')
html = response.read()
print html
```
urlopen is commonly called with three parameters:

```python
urllib2.urlopen(url, data, timeout)  # in Python 3: urllib.request.urlopen(url, data, timeout)
```
Using the data parameter (GET)
```python
import urllib
import urllib2

uri = 'http://www.example.com/login'  # placeholder; uri is left undefined in the original
data = {'email': 'myemail', 'password': 'password'}
params = urllib.urlencode(data)                      # URL-encode the parameters
response = urllib2.urlopen("%s?%s" % (uri, params))  # GET: parameters go in the URL
code = response.getcode()
```
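Besides read() and getcode(), the response object returned by urlopen exposes a few other useful accessors; a minimal sketch:

```python
import urllib2

response = urllib2.urlopen('http://www.baidu.com')
print response.getcode()  # HTTP status code, e.g. 200
print response.geturl()   # final URL, after any redirects
print response.info()     # the response headers
```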
Using the data parameter (POST)
```python
import urllib
import urllib2

uri = 'http://www.example.com/login'  # placeholder; uri is left undefined in the original
data = {'email': 'myemail', 'password': 'password'}
params = urllib.urlencode(data)
response = urllib2.urlopen(uri, params)  # POST: parameters go in the request body
code = response.getcode()
```
So when the data argument is supplied, the request is sent as a POST; without it, the request is a GET, as the sketch below confirms.
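A quick way to verify this is Request.get_method(), which reports the HTTP verb that will be sent; a minimal sketch:

```python
import urllib2

req_get = urllib2.Request('http://www.baidu.com')
req_post = urllib2.Request('http://www.baidu.com', data='q=1')
print req_get.get_method()   # 'GET'  (no data supplied)
print req_post.get_method()  # 'POST' (data supplied)
```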
Using the timeout parameter
When the network is poor or the server is misbehaving, requests can be slow to return, so set a timeout on the request:
```python
import urllib2

response = urllib2.urlopen('http://www.baidu.com', timeout=1)
print(response.read())
```
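When the timeout fires, urllib2 typically raises URLError with a socket.timeout as its reason; a minimal sketch of catching it (the 0.01-second timeout is chosen just to force the error):

```python
import socket
import urllib2

try:
    response = urllib2.urlopen('http://www.baidu.com', timeout=0.01)
    print(response.read())
except urllib2.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('request timed out')
    else:
        raise
```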
2. Opening a page (Request)
Open a web page (this example uses the Python 3 urllib.request API; in Python 2, use urllib2.Request as in the other examples):
```python
import urllib.request

request = urllib.request.Request('https://www.baidu.com')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
```
Specifying request headers
```python
import urllib2

url = 'http://www.baidu.com'  # placeholder; url is left undefined in the original
# Specify the request headers
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"}
# Wrap the URL and headers in a Request
request = urllib2.Request(url=url, headers=headers)
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')
print content
```
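Headers can also be attached one at a time after the Request is built, via add_header(); a minimal sketch:

```python
import urllib2

request = urllib2.Request('http://www.baidu.com')
# add_header takes a header name and its value
request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64)')
response = urllib2.urlopen(request)
print response.getcode()
```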
3. Advanced usage
Adding a proxy
```python
import urllib2

url = 'http://www.dianping.com'  # placeholder; url is left undefined in the original
# Custom headers
headers = {
    'Host': 'www.dianping.com',
    'Cookie': 'JSESSIONID=F1C38C2F1A7F7BF3BCB0C4E3CCDBE245; aburl=1; cy=2;',
    'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 "
                  "(KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
}
# Route HTTP traffic through a proxy
proxy_handler = urllib2.ProxyHandler({'http': 'http://host:port'})
opener = urllib2.build_opener(proxy_handler)
urllib2.install_opener(opener)  # make this opener the global default
request = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')
```
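Note that install_opener() changes the global default used by every later urllib2.urlopen() call. If the proxy should apply only to some requests, an alternative is to call the opener directly and skip the install step; a minimal sketch (host:port is a placeholder, as above):

```python
import urllib2

# build the same proxy opener, but use it locally instead of installing it globally
proxy_handler = urllib2.ProxyHandler({'http': 'http://host:port'})
opener = urllib2.build_opener(proxy_handler)
response = opener.open('http://www.dianping.com')
content = response.read().decode('utf-8')
```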
Working with cookies
```python
import urllib2
import cookielib

cookie = cookielib.CookieJar()
cookie_s = urllib2.HTTPCookieProcessor(cookie)  # create the cookie handler
opener = urllib2.build_opener(cookie_s)         # build the opener
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.dianping.com').read()  # fetch the page
print response  # the page HTML
# Inspect the cookies collected during the request
print cookie, type(cookie)
for item in cookie:
    print 'name:' + item.name + '-value:' + item.value
```
Saving cookies to a file
```python
import cookielib
import urllib2

def saveCookie():
    # File to save the cookies in
    filename = 'cookie.txt'
    # Use a MozillaCookieJar so the cookies can be written to a file later
    cookie = cookielib.MozillaCookieJar(filename)
    # Create the cookie handler
    handler = urllib2.HTTPCookieProcessor(cookie)
    # Build the opener
    opener = urllib2.build_opener(handler)
    # Make a request
    res = opener.open('http://www.baidu.com')
    # Save the cookies to the file
    # ignore_discard: save cookies even if they are marked to be discarded
    # ignore_expires: write cookies even if they already exist in the file
    cookie.save(ignore_discard=True, ignore_expires=True)
```
Loading cookies from a file
```python
import cookielib
import urllib2

def getCookie():
    # Create a MozillaCookieJar object
    cookie = cookielib.MozillaCookieJar()
    # Load the cookies from the file into the jar
    cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    # Print the cookies to confirm they were loaded
    for item in cookie:
        print 'name:' + item.name + '-value:' + item.value
    # Build an opener that uses the loaded cookies
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener = urllib2.build_opener(handler)
    res = opener.open('http://www.baidu.com')
    print res.read()
```
A complete example
```python
import json
import cookielib
import urllib2

def my_cookie_test():
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Cookie': 'cy=2; _lxsdk_cuid=16000a1a16cc8-0629d2ca3b9f7-40544230-100200-16000a1a16dc8; _lxsdk=16000a1a16cc8-0629d2ca3b9f7-40544230-100200-16000a1a16dc8; _lxsdk_s=16000a1a16f-c56-870-2aa%7C%7C23; _hc.v=44792549-7147-7394-ac0a-eefed1fa19a2.1511839081; s_ViewType=10',
        'Host': 'www.dianping.com',
        'Referer': 'http://www.dianping.com/shop',
        'Upgrade-Insecure-Requests': 1
    }
    # Cookie jar to capture cookies set by the response
    cj_a = cookielib.CookieJar()
    cj_s = urllib2.HTTPCookieProcessor(cj_a)
    proxy_s = urllib2.ProxyHandler({'http': '0.0.0.0:8080'})
    opener = urllib2.build_opener(proxy_s, cj_s)
    urllib2.install_opener(opener)
    try:
        request = urllib2.Request("http://www.dianping.com/shop/000000/", headers=headers)
        response = urllib2.urlopen(request)
        content = response.read().decode('utf-8')  # the page HTML
        print content
        cookie_data = {}
        for item in cj_a:
            # print 'after the request: name:' + item.name + '-value:' + item.value
            cookie_data[item.name] = item.value
        cookie_str = json.dumps(cookie_data)
        with open('cookie.txt', 'w') as f:
            f.write(cookie_str)
        print("cookie info saved locally")
    except Exception as e:
        print e
```
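The cookie.txt written here is plain JSON, not the Mozilla format that MozillaCookieJar.load() expects, so to reuse these cookies you can rebuild a Cookie header by hand. A sketch under that assumption (the file name and target URL are the ones used above):

```python
import json
import urllib2

# read the JSON cookie dump written by my_cookie_test()
with open('cookie.txt', 'r') as f:
    cookie_data = json.load(f)
# rebuild a 'name=value; name=value' Cookie header string
cookie_str = '; '.join('%s=%s' % (k, v) for k, v in cookie_data.items())
request = urllib2.Request('http://www.dianping.com/shop/000000/')
request.add_header('Cookie', cookie_str)
response = urllib2.urlopen(request)
print response.getcode()
```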
Extracting information from web pages... coming in the next post...