Python爬蟲之Urllib庫的基本使用

阿新 • • 發佈：2018-11-27

狀態碼 chrom 異常處理 false 基本 sta col thead kit

# GET request: fetch a page and print its body decoded as UTF-8.
import urllib.request
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode('utf-8'))

# POST request: supplying `data` makes urlopen send a POST instead of a GET.
import urllib.parse
import urllib.request
# The request body must be bytes, so the urlencoded form string is encoded.
data = bytes(urllib.parse.urlencode({"word": "hello"}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())

# GET request with a timeout (in seconds) for the blocking network operation.
import urllib.request
response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())

# Detect a timeout: a very small timeout is used so the request is likely to
# fail, then the wrapped reason is checked to see whether it was a timeout.
import socket
import urllib.request
import urllib.error
try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    # URLError.reason carries the underlying exception.
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

# Response type: show the class of the object returned by urlopen.
import urllib.request
response = urllib.request.urlopen('http://www.python.org')
print(type(response))

# Status code and response headers.
import urllib.request
response = urllib.request.urlopen('http://www.python.org')
print(response.status)               # HTTP status code
print(response.getheaders())         # all headers as (name, value) pairs
print(response.getheader('server'))  # a single header looked up by name

# Request: build a Request object first, then hand it to urlopen.
import urllib.request
request = urllib.request.Request('http://python.org')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

# POST with custom headers passed to the Request constructor.
from urllib import request, parse
url = 'http://httpbin.org/post'
headers = {
    # The value is only the agent string; the header name is the dict key.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
    'Host': 'httpbin.org',
}
# Named `form` rather than `dict` so the builtin is not shadowed.
form = {
    'name': 'Germey',
}
data = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

# POST with a header attached afterwards via Request.add_header().
from urllib import request, parse
url = 'http://httpbin.org/post'
# Named `form` rather than `dict` so the builtin is not shadowed.
form = {
    'name': 'Germey',
}
data = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=data, method='POST')
# The value is only the agent string; the header name is the first argument.
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

# Proxy: route requests through a local proxy using ProxyHandler.
import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read())

# Cookies: collect the cookies set by a site into an in-memory CookieJar.
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + " = " + item.value)

# Save cookies to 1.txt using MozillaCookieJar.
import http.cookiejar, urllib.request
filename = '1.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# Keep session (discardable) and expired cookies in the saved file as well.
cookie.save(ignore_discard=True, ignore_expires=True)

# Another way to save cookies: LWPCookieJar.
import http.cookiejar, urllib.request
filename = '1.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# Keep session (discardable) and expired cookies in the saved file as well.
cookie.save(ignore_discard=True, ignore_expires=True)

# Load cookies from 1.txt and send them with a request.
# The jar class used to load must match the one that wrote the file
# (LWPCookieJar here).
import http.cookiejar, urllib.request
cookie = http.cookiejar.LWPCookieJar()
cookie.load('1.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

# Exception handling: catch URLError and print why the request failed.
from urllib import request, error
try:
    response = request.urlopen('http://lidonghao.com')
except error.URLError as e:
    print(e.reason)

# Catch HTTPError before URLError: HTTPError is the subclass, so the more
# specific handler has to come first.
from urllib import request, error
try:
    response = request.urlopen('http://www.baidu.com/101')
except error.HTTPError as e:
    print(e.reason, e.code, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')

# Inspect the reason behind a URLError; the very small timeout is expected
# to make the request fail.
import socket
import urllib.error
import urllib.request

target = "https://www.baidu.com"
try:
    response = urllib.request.urlopen(target, timeout=0.01)
except urllib.error.URLError as err:
    cause = err.reason
    print(type(cause))
    if isinstance(cause, socket.timeout):
        print("TIME OUT")

 1 # 解析URL
 2 # urlparse
 3 from urllib.parse import urlparse
 4 result = urlparse(‘http://www.baidu.com/index.html;user?id=5#comment‘)
 5 print(type(result), result)
 6 
 7 from urllib.parse import urlparse
 8 result = urlparse(‘www.baidu.com/index.html;user?id=5#comment‘, scheme = "https")
 9 print(result)
10 
11 from urllib.parse import urlparse
12 result = urlparse(‘http://www.baidu.com/index.html;user?id=5#comment‘, scheme = "https")
13 print(result)
14 
15 from urllib.parse import urlparse
16 result = urlparse(‘http://www.baidu.com/index.html;user?id=5#comment‘, allow_fragments = False)
17 print(result)
18 
19 from urllib.parse import urlparse
20 result = urlparse(‘http://www.baidu.com/index.html#comment‘, allow_fragments = False)
21 print(result)

 1 # urlunparse
 2 from urllib.parse import urlunparse
 3 data = [‘http‘, ‘www.baidu.com‘, ‘index,html‘, ‘user‘, ‘a=6‘, ‘comment‘]
 4 print(urlunparse(data))
 5 
 6 # urljoin
 7 from urllib.parse import urljoin
 8 print(urljoin(‘http://www.baidu.com‘, ‘FAQ.html‘))
 9 print(urljoin(‘http://www.baidu.com‘, ‘https://cuiqingcai.com/FAQ.html‘))
10 print(urljoin(‘http://www.baidu.com/about.html‘, ‘https://cuiqingcai.com/FAQ.html‘))
11 print(urljoin(‘http://www.baidu.com/about.html‘, ‘http://cuiqingcai.com/FAQ.html?question=2‘))
12 print(urljoin(‘http://www.baidu.com?wd=abc‘, ‘https://cuiqingcai.com/index.php‘))
13 print(urljoin(‘http://www.baidu.com‘, ‘?category=2#comment‘))
14 print(urljoin(‘www.baidu.com‘, ‘?category=2#comment‘))
15 print(urljoin(‘www.baidu.com#comment‘, ‘?category=2‘))
16 
# urlencode: turn a dict into a query string and append it to a base URL.
from urllib.parse import urlencode
params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com'
# The '?' separator is required; without it the query runs into the host name.
url = base_url + '?' + urlencode(params)
print(url)

Python爬蟲之Urllib庫的基本使用

狀態碼 chrom 異常處理 false 基本 sta col thead kit # get請求 import urllib.request response = urllib.request.urlopen("http://www.baidu.com") print(

python 爬蟲之BeautifulSoup 庫的基本使用

rip data lin value 訪問 pytho 輕松 register tex import urllib2url = ‘http://www.someserver.com/cgi-bin/register.cgi‘values = {}values[‘name‘]

Python爬蟲之Requests庫的基本使用

1 import requests 2 response = requests.get('http://www.baidu.com/') 3 print(type(response)) 4 print(response.status_code) 5 print(type(respon

python3爬蟲之Urllib庫（二）

cau python err 發送請求 split 完成構造服務器 inf 在上一篇文章中，我們大概講了一下urllib庫中最重要的兩個請求方法：urlopen() 和 Request() 但是僅僅憑借那兩個方法無法執行一些更高級的請求，如Cookies處理，代

Python爬蟲系列-Urllib庫詳解

Urllib庫詳解 Python內建的Http請求庫: * urllib.request 請求模組 * urllib.error 異常處理模組 * urllib.parse url解析模組 * urllib.robotparser robots.txt解析模組 #### 相比在python2基礎上的變化

python爬蟲之xpath的基本使用 python爬蟲之xpath的基本使用

python爬蟲之xpath的基本使用一、簡介　　XPath 是一門在 XML 文件中查詢資訊的語言。XPath 可用來在 XML 文件中對元素和屬性進行遍歷。XPath 是 W3C XSLT 標準的主要元素，並且 XQuery 和 XPointer 都構建於

python爬蟲之urllib(一)

Python 3 中的 urllib 庫有四個模組，分別是urllib.request，urllib.error，urllib.parse，urllib.robotparser。接下來我們對這四個模組做詳細介紹參考：https://docs.python.org

python爬蟲之urllib(二)

urllib.error可以接收urllib.request產生的異常，urllib.error有三個方法，如下： URLError是OSError的一個子類，HTTPError是URLError的一個子類，伺服器上HTTP的響應會返回一個狀態碼，根據這個HTTP狀態碼

Python爬蟲之selenium庫使用詳解

Python爬蟲之selenium庫使用詳解什麼是Selenium selenium 是一套完整的web應用程式測試系統，包含了測試的錄製（selenium IDE）,編寫及執行（Selenium Remote Control）和測試的並行處理（Selenium Grid）。Seleni

python爬蟲之requests庫詳解（一，如何通過requests來獲得頁面資訊）

前言：爬蟲的基礎是與網頁建立聯絡，而我們可以通過get和post兩種方式來建立連線，而我們可以通過引入urllib庫[在python3的環境下匯入的是urllib；而python2的環境下是urllib和urllib2]或者requests庫來實現,從程式的複雜度和可讀性

python爬蟲之requests的基本使用

簡介 Requests是用python語言基於urllib編寫的，採用的是Apache2 Licensed開源協議的HTTP庫，Requests它會比urllib更加方便，可以節約我們大量的工作。一、安裝 pip快速安裝pip install r

Python爬蟲之requests庫(三)：傳送表單資料和JSON資料

import requests 一、傳送表單資料要傳送表單資料，只需要將一個字典傳遞給引數data payload = {'key1': 'value1', 'key2': 'value

python爬蟲入門urllib庫的使用

urllib庫的使用，非常簡單。 import urllib2 response = urllib2.urlopen("http://www.baidu.com") print response.read() 只要幾句程式碼就可以把一個網站的原始碼下載下來。官方文件：https://d

Python爬蟲之requests庫(五)：Cookie、超時、重定向和請求歷史

import requests 一、Cookie 獲取伺服器響應中的cookie資訊 url = 'http://example.com/some/cookie/setting/url'

python學習之turtle庫基本操作

目錄一段執行繪製蟒蛇的程式碼示例 import turtle turtle.setup(650,350,200,200) turtle.penup() turtle.fd(-250) turtle.pendown() turtle.pensize(25) t

Python爬蟲之urllib簡單使用

1.什麼是Urllib庫 Urllib是一個Python提供的用於操作URL的模組 2.簡單網頁爬取（1）匯入urllib庫（2）使用urllib.request.urlopen開啟並爬去一個網頁

python爬蟲之xpath的基本使用

result pip ack highlight query mage lpad add 必須一、簡介　　XPath 是一門在 XML 文檔中查找信息的語言。XPath 可用來在 XML 文檔中對元素和屬性進行遍歷。XPath 是 W3C XSLT 標準的主要元素，並

Python爬蟲之BeautifulSoup庫

1. BeautifulSoup 1.1 解析庫 1）Python標準庫 # 使用方法 BeautifulSoup(markup, "html.parser") # 優勢 Python的內建標準庫，執行速度適中，文件容錯能力強 # 劣勢 Python2.7.3 或者 python3.2.2 前的版本容錯

Python爬蟲入門三之Urllib庫的基本使用

res 瀏覽器中必須答案文件的網頁 one .com 屏幕截圖 1.分分鐘扒一個網頁下來怎樣扒網頁呢？其實就是根據URL來獲取它的網頁信息，雖然我們在瀏覽器中看到的是一幅幅優美的畫面，但是其實是由瀏覽器解釋才呈現出來的，實質它是一段HTML代碼，加 JS、CSS

python爬蟲 urllib庫基本使用

afa 識別 urllib spa response aid gen odin pos 以下內容均為python3.6.*代碼學習爬蟲，首先有學會使用urllib庫，這個庫可以方便的使我們解析網頁的內容，本篇講一下它的基本用法解析網頁 #導入urllib from u

Python爬蟲之Urllib庫的基本使用

相關推薦