python之路 -- 爬蟲二篇 -- 常用模塊
1.requests
Requests 是用Python語言編寫,基於 urllib,采用 Apache2 Licensed 開源協議的 HTTP 庫。它比 urllib 更加方便,可以節約我們大量的工作,完全滿足 HTTP 測試需求。
requests模塊的參數
1.1 get #發送get請求
requests.get( )的參數有:url、params、headers、cookies
# GET with query-string parameters, headers and cookies.
requests.get(
    url="http://www.oldboyedu.com",
    # The URL actually requested becomes:
    # http://www.oldboyedu.com?nid=1&name=xx  (params are appended to the URL)
    params={"nid": 1, "name": "xx"},
    headers={...},
    cookies={...},
)
1.2 post #發送post請求
requests.post( )的參數有:url、params、headers、data、cookies
post中的參數用法和get中的一樣,就不一一贅述了。
1.3 proxies
proxies --代理
# Upload a file with a custom remote filename (multipart/form-data).
# file_dict = {
#     'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)
1.4 json
當請求中提交的不是Form Data作為數據,而是payload時使用,導入json模塊,用json.dumps(data)序列化數據
post發送json數據
import requests
import json

# Send a JSON payload: serialize the dict yourself and pass it as `data`.
r = requests.post('https://api.github.com/some/endpoint',
                  data=json.dumps({'some': 'data'}))
print(r.json())
1.5 auth
做基本的認證
1.6 timeout
#超時時間 timeout=(m,n) #表示 建立連接最多等待m秒;接收響應最多等待n秒(元組順序為 (連接超時, 讀取超時))
1.7 allow_redirects
是否支持重定向,默認為True
1.8 stream
下載大文件時使用,一點一點的下載
from contextlib import closing

# Stream the body instead of loading it into memory at once.
ret = requests.get('http://127.0.0.1:8000/test/', stream=True)
# BUG FIX: the original iterated an undefined name `r`; the response
# was bound to `ret`.
for chunk in ret.iter_content():
    print(chunk)

# closing() guarantees the connection is released when the block exits.
with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
    # Process the response chunk by chunk here.
    for chunk in r.iter_content():
        print(chunk)
1.9 cert: 證書
1.10 verify: 是否驗證服務器的 SSL 證書(默認為True)
2.BeautifulSoup
Beautiful Soup 是一個可以從 HTML 或 XML 文件中提取數據的 Python 庫.它能夠通過你喜歡的轉換器實現慣用的文檔導航,查找,修改文檔的方式.Beautiful Soup 會幫你節省數小時甚至數天的工作時間.
2.1 bs4的安裝
pip install BeautifulSoup4
2.2 解析
import requests
from bs4 import BeautifulSoup

# Fetch a page and parse it with the stdlib html.parser backend.
ret = requests.get("http://www.baidu.com")
soup = BeautifulSoup(ret.text, 'html.parser')
print(soup)  # print the parsed HTML
2.3 find和find_all 方法
# Find the single <div> whose id is "content-list"; returns that tag
# (with everything nested inside it), or None if absent.
div = soup.find(name="div", attrs={"id": "content-list"})
# Find ALL <div> tags with class "item" inside it; returns a list of tags.
items = div.find_all(name="div", attrs={"class": "item"})
一大波練習這兩個爬蟲最常用模塊的實例:
1.自動登錄抽屜並批量點贊
import requests
from bs4 import BeautifulSoup

# Shared request headers (the original duplicated this dict inline twice).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
}

# Walk the hot-news listing pages.
for page in range(5, 6):
    page_url = "https://dig.chouti.com/all/hot/recent/%s" % page

    # First anonymous GET: fetch the page and keep its cookies — the site
    # authorizes the later vote requests against these first-visit cookies.
    response = requests.get(url=page_url, headers=HEADERS)
    cookie1_dict = response.cookies.get_dict()

    # Log in with the first-visit cookies so they become authorized.
    login_data = {
        "phone": "********",
        "password": "*******",
        "oneMonth": 1,
    }
    requests.post(
        url="https://dig.chouti.com/login",
        data=login_data,
        headers=HEADERS,
        cookies=cookie1_dict,
    )

    # Extract every news item's share-linkid from the listing page.
    soup = BeautifulSoup(response.text, "html.parser")
    div = soup.find(name="div", attrs={"id": "content-list"})
    items = div.find_all(name="div", attrs={"class": "item"})
    for item in items:
        # renamed from `id` — don't shadow the builtin
        link_id = item.find(name="div", attrs={"class": "part2"}).get("share-linkid")

        # Up-vote the item using the authorized first-visit cookies.
        response2 = requests.post(
            url="https://dig.chouti.com/link/vote?linksId=%s" % link_id,
            headers=HEADERS,
            cookies=cookie1_dict,
        )
        print(response2.text)
2.自動登錄GitHub並獲取個人信息
import requests
from bs4 import BeautifulSoup

# Step 1: GET the login page to harvest the CSRF token and initial cookies.
res = requests.get(url="https://github.com/login")
soup1 = BeautifulSoup(res.text, "html.parser")
tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
authenticity_token = tag.get('value')
cookie1 = res.cookies.get_dict()

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
}

# Step 2: POST credentials together with the CSRF token and first cookies.
res_login = requests.post(
    url="https://github.com/session",
    headers=header,
    data={
        "commit": "Sign in",
        "utf8": "?",
        "authenticity_token": authenticity_token,
        "login": "******",
        "password": "**********",
    },
    cookies=cookie1,
)
cookie2 = res_login.cookies.get_dict()

# Step 3: fetch the profile page with the logged-in cookies.
res_message = requests.get(
    url="https://github.com/Aberwang",
    headers=header,
    cookies=cookie2,
)
soup2 = BeautifulSoup(res_message.text, "html.parser")
div = soup2.find(name="div", attrs={"id": "js-pjax-container"})
h1 = div.find(name="h1", attrs={"class": "vcard-names"})
span = h1.find(name="span", attrs={"class": "p-nickname vcard-username d-block"})
username = span.get_text()
print("獲取到的用戶名為:", username)
a = div.find(name="a", attrs={"class": "u-photo d-block tooltipped tooltipped-s"})
img = a.find(name="img", attrs={"class": "avatar width-full rounded-2"})
src = img.get("src")
print("獲取到的用戶頭像地址為:", src)
3.汽車之家新聞抓取
import requests
from bs4 import BeautifulSoup

# Fetch the news listing page; the site serves GBK-encoded HTML,
# so set the encoding explicitly before reading .text.
res = requests.get("https://www.autohome.com.cn/news/")
res.encoding = "gbk"

soup = BeautifulSoup(res.text, "html.parser")
# Every news entry is an <li> under the lazy-load article container.
li_list = soup.find(id="auto-channel-lazyload-article").find_all(name="li")
for li in li_list:
    title = li.find("h3")
    if not title:  # skip <li> separators that carry no article
        continue
    summary = li.find("p")
    url = li.find("a").get("href")
    img = li.find('img').get('src')
    print(title.text, url, summary.text, img)
4.自動登錄碼雲並獲取個人信息
import requests
from bs4 import BeautifulSoup

# Step 1: GET the login page to extract the CSRF token.
r1 = requests.get("https://gitee.com/login")
r1.encoding = "utf-8"
soup = BeautifulSoup(r1.text, "html.parser")
token = soup.find(name="input", attrs={"name": "authenticity_token"}).get("value")

# Step 2: POST username/password/token to the server.
# renamed from `date` — this is the request body, i.e. `data`
data = {
    "utf8": "?",
    "authenticity_token": token,
    "redirect_to_url": "",
    "user[login]": "***賬號****",
    'user[password]': "***密碼***.",
    "captcha": "",
    "user[remember_me]": "0",
    "commit": "登錄",
}
r2 = requests.post("https://gitee.com/login", data)

# Step 3: fetch the profile page with the logged-in cookies.
cookie_dict = r2.cookies.get_dict()
# BUG FIX: the second positional argument of requests.get is `params`,
# so the cookies must be passed with the `cookies` keyword.
r3 = requests.get("https://gitee.com/aberwang/projects", cookies=cookie_dict)
print(r3.text)
python之路 -- 爬蟲二篇 -- 常用模塊