Python爬蟲的一些操作
阿新 • 發佈:2018-10-17
add 一次 設置 app new 下載圖片 afa 練手 json
1.先來個不反爬的
"""Crawl the Autohome news page and download each article's cover image.

This site sets no anti-scraping measures, which makes it the best practice
target. (Original note: 這個不設置反爬措施,練手最好用)
"""
import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.autohome.com.cn/news/")
# The site serves GBK-encoded pages; decode accordingly before parsing.
response.encoding = 'gbk'
# Wrap the HTML in a soup object.
soup = BeautifulSoup(response.text, 'html.parser')
# Locate the first matching container div holding the article list.
# (The blog-extraction artifact "View Code" that was fused into this id
# string has been removed.)
div = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
# All list items under that div.
li_list = div.find_all(name='li')
# Iterate and pull out the data for each article card.
for li in li_list:
    title = li.find(name='h3')
    if not title:
        # Skip <li> entries that are not article cards (no headline).
        continue
    p = li.find(name='p')
    a = li.find(name='a')
    print(title.text)
    print(a.attrs.get('href'))
    print(p.text)
    img = li.find(name='img')
    src = img.get('src')
    # Image URLs are protocol-relative ("//..."); prepend the scheme.
    src = "https:" + src
    print(type(src))
    print(type(title.text))
    # Issue a second request to download the image to the local directory,
    # named after the last path segment of its URL.
    file_name = src.rsplit('/', maxsplit=1)[1]
    ret = requests.get(src)
    with open(file_name, 'wb') as f:
        f.write(ret.content)
2.來個獲取數據的
"""Crawl the jandan.net front page and print the text of each post.

(Original title: 進階爬蟲1 — "intermediate crawler 1". The extraction
artifact "View Code" that was fused between the import statements has
been removed.)
"""
import requests
from bs4 import BeautifulSoup

res = requests.get(
    url="http://jandan.net/",
)
soup = BeautifulSoup(res.text, "html.parser")
# The main content column.
div = soup.find(name="div", attrs={"id": "content"})
# Every post card inside it.
div_list = div.find_all(name="div", attrs={"class": "post f list-post"})
for div in div_list:
    print(div.text.strip())  # dump the full text of the post
    # Alternative: grab just the image of each post.
    # img = div.find(name="img")
    # src = img.get("src")
    # if not src:
    #     continue
    # src = "https:" + src
    # print(src)
    # Alternative: grab just the title of each post.
    # h = div.find(name="h2")
    # a = h.find(name="a")
    # print(a.text)
3.來個有點難度的
"""Log in to dig.chouti.com and upvote a link.

(Original title: 爬蟲進階2 — "crawler advanced 2".)

Flow: 1) GET the home page to obtain the session cookies; 2) POST the
credentials, carrying those cookies (the site binds the login to the
cookies of the first visit); 3) POST the vote, again with the first
visit's cookies.
"""
import requests

# One shared browser-like User-Agent header; the site rejects the default
# python-requests UA.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

# 1. Visit the home page (acquires the anonymous session cookies).
r1 = requests.get(
    url='https://dig.chouti.com/',
    headers=HEADERS,
)

# 2. Submit the phone number and password, bound to r1's cookies.
r2 = requests.post(
    url='https://dig.chouti.com/login',
    headers=HEADERS,
    data={
        'phone': '86你的賬號',       # "86" country prefix + your account
        'password': '你的密碼',      # your password
        'oneMonth': 1,               # "remember me for one month"
    },
    cookies=r1.cookies.get_dict(),
)

# 3. Upvote a specific link; authorization rides on r1's (now logged-in)
# cookies, not r2's.
r3 = requests.post(
    url='https://dig.chouti.com/link/vote?linksId=20435396',
    headers=HEADERS,
    cookies=r1.cookies.get_dict(),
)
print(r3.text)
4.來個再難一點的
"""Log in to lagou.com (which uses dynamic anti-CSRF tokens) and list
Python job postings.

(Original title: 進階爬取3 — "advanced crawl 3".)

The login page embeds two one-time values, X_Anti_Forge_Token and
X_Anti_Forge_Code, that must be echoed back as request headers — this is
the site's defence against forged / replayed login posts (the "small
trap" the original author notes).
"""
import requests
import re
from bs4 import BeautifulSoup

UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360"

# First load the login page itself: it carries both the anti-forgery
# values and the cookies the login POST must present.
res = requests.get(
    url="https://passport.lagou.com/login/login.html",
    headers={"User-Agent": UA},
)

# Extract the two tokens from the inline script with a regex.
# NOTE: the patterns must use straight ASCII quotes — the page source
# contains 'X_Anti_Forge_Token = ...'; the blog's curly quotes would
# never match.
X_Anti_Forge_Token = re.findall(r"X_Anti_Forge_Token = '(.*?)'", res.text, re.S)[0]
X_Anti_Forge_Code = re.findall(r"X_Anti_Forge_Code = '(.*?)'", res.text, re.S)[0]

ret = requests.post(
    url="https://passport.lagou.com/login/login.json",
    headers={
        "User-Agent": UA,
        # NOTE(review): lagou's own header names misspell "Anti" as
        # "Anit"; the original post mixed a hyphenated and an
        # underscored spelling — HTTP custom headers are hyphenated,
        # so both are sent hyphenated here. Confirm against the site's
        # current JS.
        "X-Anit-Forge-Token": X_Anti_Forge_Token,
        "X-Anit-Forge-Code": X_Anti_Forge_Code,
        # Referer must be the page the form was served from (small trap).
        "Referer": "https://passport.lagou.com/login/login.html",
    },
    data={
        "isValidate": True,
        "username": "你的賬號",   # your account (was an unquoted bare name in the post)
        "password": "你的密碼",   # your password
        "request_form_verifyCode": "",
        "submit": "",
        "challenge": "c87407cd89add055d8f1b54ad579cec8",
    },
    # Carry the login page's cookies to be accepted (small trap).
    cookies=res.cookies.get_dict(),
)

# Now fetch the Python job listing page with the logged-in cookies.
r1 = requests.get(
    url="https://www.lagou.com/zhaopin/Python/?labelWords=label",
    headers={
        "User-Agent": UA,
        # Previous page in the navigation chain; could be re-matched
        # dynamically if the site starts checking it strictly.
        "Referer": "https://www.lagou.com/",
    },
    cookies=ret.cookies.get_dict(),
)

soup = BeautifulSoup(r1.text, "html.parser")
div = soup.find(name="div", attrs={"id": "s_position_list"})
li_list = div.find_all(name="li")
for li in li_list:
    title = li.find(name="h3")
    if not title:
        # Not a job card; skip.
        continue
    money = li.find(name="span")
    div = li.find(name="div", attrs={"class": "li_b_l"})
    a = li.find(name="a")
    print(title.text)
    print(money.text)
    print(div.text)
    print(a.text)
5.來個github的
"""Log in to GitHub with a CSRF token scraped from the session page.

(Original title: 進階爬取4 — "advanced crawl 4".)

Trap noted by the author: the page that hands out the cookies and the
authenticity_token is /session, while the credentials are POSTed to
/login.
"""
import requests
from bs4 import BeautifulSoup

UA = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

# 1. GET the session page: yields the anonymous cookies and the hidden
#    CSRF input named "authenticity_token".
r1 = requests.get(
    url="https://github.com/session",
    headers={'User-Agent': UA},
)
soup = BeautifulSoup(r1.text, "html.parser")
inp = soup.find(name="input", attrs={"name": "authenticity_token"})
cookies = r1.cookies.get_dict()
token = inp.get("value")

# 2. POST the login form, echoing back the token and cookies.
r2 = requests.post(
    url="https://github.com/login",
    headers={'User-Agent': UA},
    data={
        "commit": "Sign in",
        # GitHub's form posts the literal checkmark here; the blog's "?"
        # was a mojibake of it. TODO confirm against the live form.
        "utf8": "✓",
        "authenticity_token": token,
        "login": "你的賬號",      # your account
        "password": "你的密碼",   # your password
    },
    cookies=cookies,
)
# From here on, scrape whatever you need with the authenticated session.
print(r2.text)
Python爬蟲的一些操作