Python爬蟲之Urllib庫的基本使用
阿新 • • 發佈:2018-11-27
狀態碼 chrom 異常處理 false 基本 sta col thead kit
# Basic usage of urllib: sending requests, inspecting responses, custom
# headers, proxies, cookies and error handling.
# NOTE: every snippet below performs real network I/O when executed.
import http.cookiejar
import socket
import urllib.error
import urllib.parse
import urllib.request
from urllib import error, parse, request

# A real browser User-Agent string.  The header *value* must not repeat the
# header name ("user-agent: ..."), otherwise the server receives a bogus UA.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
)

# --- GET request ---
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.read().decode('utf-8'))

# --- POST request ---
# The form payload has to be URL-encoded and passed as bytes via ``data``.
data = bytes(urllib.parse.urlencode({"word": "hello"}), encoding='utf8')
with urllib.request.urlopen('http://httpbin.org/post', data=data) as response:
    print(response.read())

# --- Request with a timeout (seconds) ---
with urllib.request.urlopen('http://httpbin.org/get', timeout=1) as response:
    print(response.read())

# --- Detecting a timeout ---
# A very small timeout is used on purpose so the request is likely to fail.
try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
else:
    response.close()

# --- Response type ---
with urllib.request.urlopen('http://www.python.org') as response:
    print(type(response))

# --- Status code and response headers ---
with urllib.request.urlopen('http://www.python.org') as response:
    print(response.status)
    print(response.getheaders())
    print(response.getheader('server'))

# --- Request object ---
# Named ``req`` (not ``request``) so the ``urllib.request`` module alias
# imported above is not shadowed.
req = urllib.request.Request('http://python.org')
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

# --- Request with headers passed to the constructor ---
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': USER_AGENT,
    'Host': 'httpbin.org',
}
form = {
    'name': 'Germey',
}
data = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

# --- Request with a header added afterwards via add_header() ---
url = 'http://httpbin.org/post'
form = {
    'name': 'Germey',
}
data = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', USER_AGENT)
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

# --- Proxy ---
# Requires a proxy listening on 127.0.0.1:9743.
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743',
})
opener = urllib.request.build_opener(proxy_handler)
with opener.open('http://httpbin.org/get') as response:
    print(response.read())

# --- Cookies ---
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# The response body is not needed; the request only populates the jar.
opener.open('http://www.baidu.com').close()
for item in cookie:
    print(item.name + " = " + item.value)

# --- Save cookies to 1.txt (Mozilla cookies.txt format) ---
filename = '1.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
opener.open('http://www.baidu.com').close()
cookie.save(ignore_discard=True, ignore_expires=True)

# --- Alternative: save cookies in LWP format (overwrites 1.txt) ---
filename = '1.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
opener.open('http://www.baidu.com').close()
cookie.save(ignore_discard=True, ignore_expires=True)

# --- Load cookies ---
# The jar class must match the format the file was last saved in (LWP).
cookie = http.cookiejar.LWPCookieJar()
cookie.load('1.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
with opener.open('http://www.baidu.com') as response:
    print(response.read().decode('utf-8'))

# --- Error handling ---
try:
    response = request.urlopen('http://lidonghao.com')
except error.URLError as e:
    print(e.reason)
else:
    response.close()

# HTTPError is a subclass of URLError, so it has to be caught first.
try:
    response = request.urlopen('http://www.baidu.com/101')
except error.HTTPError as e:
    print(e.reason, e.code, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
    response.close()

# Inspect the type of the failure reason to recognise a timeout.
try:
    response = urllib.request.urlopen("https://www.baidu.com", timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")
else:
    response.close()
# Parsing URLs with urlparse.
from urllib.parse import urlparse

# Split a full URL into scheme, netloc, path, params, query and fragment.
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)

# ``scheme`` supplies a default scheme for a URL that does not carry one.
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme="https")
print(result)

# The default is ignored when the URL already specifies its own scheme.
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme="https")
print(result)

# With allow_fragments=False the "#comment" part is not split off as a
# fragment; it stays attached to the preceding component (here the query).
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)

# Without params or query, the "#comment" part stays in the path instead.
result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
# Building URLs with urlunparse, urljoin and urlencode.
from urllib.parse import urlencode, urljoin, urlunparse

# --- urlunparse ---
# Takes exactly six components: scheme, netloc, path, params, query, fragment.
# The path is 'index.html' (the original listing had the typo 'index,html').
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))

# --- urljoin ---
# Resolves the second argument against the base URL given as the first.
print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'http://cuiqingcai.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))

# --- urlencode ---
params = {
    'name': 'germey',
    'age': 22,
}
base_url = 'http://www.baidu.com'
# The '?' separator is required; concatenating the query string directly
# onto the host would yield 'http://www.baidu.comname=germey&age=22'.
url = base_url + '?' + urlencode(params)
print(url)
Python爬蟲之Urllib庫的基本使用