程式人生 > Python3爬蟲學習筆記一 (get,post,cookie,proxy,agent)

Python3爬蟲學習筆記一 (get,post,cookie,proxy,agent)

No.1 第一個python爬蟲練習

from urllib import request,parse
import chardet

if __name__ == '__main__':
    url = 'https://blog.csdn.net/m0_37355951/article/details/80457159'
    # Fetch the page and keep the raw bytes.
    response = request.urlopen(url)
    raw = response.read()

    # Sniff the page encoding from the raw bytes.
    detected = chardet.detect(raw)
    print(detected)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}

    # Decode with the detected encoding, falling back to utf-8.
    page = raw.decode(detected.get("encoding", 'utf-8'))

    # Dump the response metadata.
    print(response)
    print(response.geturl())
    print(response.info())
    print(response.getcode())  # 200 on success

    # The decoded page content.
    print(page)

No.2 模擬Get請求

from urllib import request,parse

if __name__ == '__main__':
    base_url = 'http://www.baidu.com/s?'
    keyword = input('Input your keyword')

    # URL-encode the query parameters and append them to the base URL.
    query = parse.urlencode({"wd": keyword})

    response = request.urlopen(base_url + query)
    print(response.read().decode())

No.3 模擬post請求

'''
利用parse 模組模擬post請求
    1.開啟F12
    2.輸入一個g
    3.利用NetWork-All-Headers 檢視 發現 FormData 的值是kw:g
'''

from urllib import request,parse
import json

'''
Build a POST body with `data`, then open the URL with urlopen.
The server returns a JSON result: the suggestions for "girl".
'''
baseurl = 'https://fanyi.baidu.com/sug'

# URL-encode the form fields, then convert to bytes: passing a
# bytes `data` argument makes urlopen issue a POST request.
data = parse.urlencode({'kw': 'girl'}).encode()

rsp = request.urlopen(baseurl, data=data)

# Read and decode the response body (defaults to utf-8).
json_data = rsp.read().decode()
print(json_data)

# Parse the JSON string into a dict.
json_data = json.loads(json_data)
print(json_data)

# Each entry pairs a keyword ('k') with its translation ('v').
for item in json_data['data']:
    print(item['k'], '---', item['v'])


No.4 UrlError的使用

'''
UrlEror的使用
檢視 訪問錯誤
'''

from urllib import request,error

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    try:
        req = request.Request(url)
        html = request.urlopen(req).read().decode()
        print(html)
    except error.HTTPError as e:
        # Server answered with an error status code.
        print(e)
    except error.URLError as e:
        # Network-level failure (DNS, refused connection, ...).
        print(e)
    except Exception as e:
        print(e)

No.5 更改自己的agent

常用的agent:
https://blog.csdn.net/rookie_is_me/article/details/81634048

兩種方式:

  1. headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
     req = request.Request(url=url, headers=headers)
  2. req = request.Request(url)
     req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
'''
訪問一個網址 更改自己的agent

'''
from urllib import request,error
if __name__ == '__main__':
    url = 'http://www.baidu.com'

    try:
        # Spoof a desktop-browser User-Agent so the server does not treat
        # the request as coming from the default Python urllib agent.
        # (The original built a `headers` dict that was never used because
        # the Request taking it was commented out, and duplicated the UA
        # string in add_header; one path is enough — same header is sent.)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
        }
        req = request.Request(url=url, headers=headers)
        rsp = request.urlopen(req)
        html = rsp.read().decode()
        print(html)
    except error.URLError as e:
        # URLError also covers HTTPError (its subclass).
        print(e)
    except Exception as e:
        print(e)

No.6 代理伺服器

'''
代理伺服器
www.xicidaili.com
www.goubanjia.com
使用步驟:
    1.設定代理地址
    2.建立ProxyHandle
    3.建立Opener
    4.安裝 Opener
'''
from urllib import request,error,parse

if __name__ == '__main__':
    url = 'http://www.baidu.com'

    # Steps: 1) proxy address, 2) ProxyHandler, 3) build the opener,
    # 4) install it globally.
    proxy_handler = request.ProxyHandler({'http': '117.169.104.102:80'})
    opener = request.build_opener(proxy_handler)
    # After install_opener, every request.urlopen call is routed
    # through the proxy.
    request.install_opener(opener)

    try:
        page = request.urlopen(url).read().decode()
        print(page)
    except error.URLError as e:
        print(e)

No.7 使用cookie登入網站

'''
使用cookie 登入人人網
複製登入後的cookie
'''

from urllib import request

if __name__ == '__main__':
    url = 'http://www.renren.com/894245278/profile'
    # Reuse a logged-in browser session by sending its Cookie header
    # verbatim (paste your own cookie value below).
    headers = {'Cookie': ' 自己的cookie '}
    req = request.Request(url=url, headers=headers)
    page = request.urlopen(req).read().decode()
    print(page)

No.8 自動配置cookie (自動登入)訪問資料

'''
自動配置cookie爬取資料
    CookieJar 管理儲存cookie 向傳出的http請求新增cookie
              cookie儲存在記憶體中 CookieJar例項回收後,cookie消失
        FileCookieJar 使用檔案儲存cookie
            MozillaCookieJar 建立與Mozilla瀏覽器cookie.txt相容的FileCookieJar
            LwpCookieJar
'''
#利用cookieJar訪問人人網
    #開啟登入介面 自動通過使用者名稱密碼登入
    #利用提取的cookie登入隱私頁面

from urllib import request,error,parse
from http import cookiejar

# Create the CookieJar instance. Cookies are held in memory only and
# are lost once the instance is garbage-collected.
cookie = cookiejar.CookieJar()

# Cookie manager: stores received cookies and attaches them to
# subsequent outgoing requests.
cookie_handler = request.HTTPCookieProcessor(cookie)

# Handler for plain HTTP requests.
http_handler = request.HTTPHandler()

# Handler for HTTPS requests.
https_handler = request.HTTPSHandler()

# Opener combining all three handlers: any request made through
# `opener` automatically carries the accumulated cookies.
opener = request.build_opener(http_handler,https_handler,cookie_handler)


def login():
    """Log in to renren via the PLogin endpoint.

    The session cookie returned by the server is captured by the
    shared `opener`'s CookieJar, so later requests are authenticated.
    """
    url = 'http://www.renren.com/PLogin.do'
    # Login form fields (fill in real credentials).
    credentials = {
        'email': '賬號',
        'password': '密碼'
    }
    # URL-encode and convert to bytes so the request is a POST.
    body = parse.urlencode(credentials).encode()
    opener.open(request.Request(url, data=body))

def getHomePage():
    """Fetch the (cookie-protected) profile page and print it.

    Works only after login() has run, since the shared `opener` then
    carries the session cookie automatically.
    """
    url = 'http://www.renren.com/894245278/profile'
    page = opener.open(url)
    print(page.read().decode())

if __name__ == '__main__':
    # Log in first so the cookie jar holds the session cookie,
    # then fetch the cookie-protected profile page.
    login()
    getHomePage()