Python3爬蟲學習筆記一 (get,post,cookie,proxy,agent)
阿新 • 發佈:2018-12-26
No.1 第一個python爬蟲練習
from urllib import request,parse
import chardet

if __name__ == '__main__':
    url = 'https://blog.csdn.net/m0_37355951/article/details/80457159'
    rsp = request.urlopen(url)
    raw = rsp.read()

    # Detect the page encoding from the raw bytes.
    detected = chardet.detect(raw)
    print(detected)
    # e.g. {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}

    # Decode with the detected encoding, defaulting to utf-8.
    html = raw.decode(detected.get("encoding", 'utf-8'))

    # Inspect the response object itself.
    print(rsp)
    print(rsp.geturl())
    print(rsp.info())
    print(rsp.getcode())  # 200 on success

    # Finally, the decoded page content.
    print(html)
No.2 模擬Get請求
from urllib import request,parse

if __name__ == '__main__':
    base_url = 'http://www.baidu.com/s?'
    keyword = input('Input your keyword')
    # URL-encode the query-string parameters.
    query = parse.urlencode({"wd": keyword})
    # Append the encoded query to the base URL and fetch it.
    rsp = request.urlopen(base_url + query)
    print(rsp.read().decode())
No.3 模擬post請求
'''
Simulate a POST request with the parse module.
Steps observed in the browser:
1. Open DevTools (F12).
2. Type a single letter "g" into the translator.
3. Under Network -> All -> Headers, the FormData payload is kw:g.
'''
from urllib import request,parse
import json

'''
Build the request body with `data`, open the URL with urlopen,
and get a JSON result back — it should contain translations of "girl".
'''
baseurl = 'https://fanyi.baidu.com/sug'

# Form data as a dict, urlencoded and encoded to bytes for the POST body.
payload = parse.urlencode({'kw': 'girl'}).encode()

rsp = request.urlopen(baseurl, data=payload)
# Read and decode the response body (utf-8 by default).
json_text = rsp.read().decode()
print(json_text)

# Turn the JSON string into a dict and list the suggestions.
parsed = json.loads(json_text)
print(parsed)
for item in parsed['data']:
    print(item['k'], '---', item['v'])
No.4 UrlError的使用
'''
Using urllib.error to inspect request failures.
'''
from urllib import request,error

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    try:
        req = request.Request(url)
        rsp = request.urlopen(req)
        print(rsp.read().decode())
    except error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be caught first.
        print(e)
    except error.URLError as e:
        print(e)
    except Exception as e:
        print(e)
No.5 更改自己的agent
常用的agent:
https://blog.csdn.net/rookie_is_me/article/details/81634048
兩種方式:
1.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
req = request.Request(url= url,headers=headers)
2.req = request.Request(url)
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
'''
Fetch a URL while presenting a custom User-Agent header.
'''
from urllib import request,error

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    try:
        # Two equivalent ways to set the header:
        #   1. request.Request(url, headers={'User-Agent': ...})
        #   2. build the Request first, then add_header (used here).
        req = request.Request(url)
        req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
        rsp = request.urlopen(req)
        html = rsp.read().decode()
        print(html)
    except error.URLError as e:
        # URLError also covers HTTPError (its subclass).
        print(e)
    except Exception as e:
        print(e)
No.6 代理伺服器
'''
Route requests through a proxy server.
Free proxy lists:
    www.xicidaili.com
    www.goubanjia.com
Steps:
1. define the proxy address
2. build a ProxyHandler
3. build an opener from the handler
4. install the opener globally
'''
from urllib import request,error,parse

if __name__ == '__main__':
    url = 'http://www.baidu.com'

    # Steps 1-4: address -> handler -> opener -> install.
    proxy_handler = request.ProxyHandler({'http': '117.169.104.102:80'})
    opener = request.build_opener(proxy_handler)
    request.install_opener(opener)

    try:
        # urlopen now goes through the installed proxy opener.
        rsp = request.urlopen(url)
        print(rsp.read().decode())
    except error.URLError as e:
        print(e)
No.7 使用cookie登入網站
'''
Log in to renren.com by replaying a Cookie header
copied from the browser after a manual login.
'''
from urllib import request

if __name__ == '__main__':
    url = 'http://www.renren.com/894245278/profile'
    # Paste your own session cookie into the value below.
    headers = {'Cookie':' 自己的cookie '}
    req = request.Request(url=url, headers=headers)
    rsp = request.urlopen(req)
    print(rsp.read().decode())
No.8 自動配置cookie (自動登入)訪問資料
'''
Automatic cookie handling while scraping.

CookieJar        keeps cookies in memory and attaches them to outgoing
                 HTTP requests; cookies are lost when the instance is reclaimed.
FileCookieJar    persists cookies to a file.
MozillaCookieJar a FileCookieJar compatible with Mozilla's cookies.txt format.
LwpCookieJar     another file-based variant.
'''
# Use a CookieJar to access renren.com:
# open the login endpoint and sign in with username/password,
# then reuse the captured cookie to open a private page.
from urllib import request,error,parse
from http import cookiejar

# The jar and its processor; the processor stores response cookies
# and attaches them to later requests made through the opener.
cookie = cookiejar.CookieJar()
cookie_handler = request.HTTPCookieProcessor(cookie)
# Plain http/https handlers plus the cookie processor form the opener.
http_handler = request.HTTPHandler()
https_handler = request.HTTPSHandler()
opener = request.build_opener(http_handler, https_handler, cookie_handler)


def login():
    # POST the credentials; the session cookie from the response
    # ends up in the shared jar automatically.
    url = 'http://www.renren.com/PLogin.do'
    credentials = {
        'email':'賬號',
        'password':'密碼'
    }
    # urlencode then encode to bytes for the POST body.
    body = parse.urlencode(credentials).encode()
    req = request.Request(url, data=body)
    rsp = opener.open(req)


def getHomePage():
    # Once login() has run, the opener already carries the session cookie.
    url = 'http://www.renren.com/894245278/profile'
    rsp = opener.open(url)
    print(rsp.read().decode())


if __name__ == '__main__':
    login()
    getHomePage()