
Some Python web-scraping operations


1. First, a site with no anti-scraping measures

"""這個不設置反爬措施,練手最好用"""
import requests
from bs4 import BeautifulSoup


response = requests.get("https://www.autohome.com.cn/news/")
# 轉換編碼
response.encoding = gbk
# 封裝html到soup
soup = BeautifulSoup(response.text, html.parser)
# 找到匹配的第一個div
div = soup.find(name=div, attrs={id: auto-channel-lazyload-article
}) # 找到此div下所有li li_list = div.find_all(name=li) # 循環獲取數據 for li in li_list: title = li.find(name=h3) if not title: continue p = li.find(name=p) a = li.find(name=a) print(title.text) print(a.attrs.get(href)) print(p.text) img = li.find(name=img) src
= img.get(src) src = "https:" + src print(type(src)) print(type(title.text)) # 再次發起請求,下載圖片到本地 file_name = src.rsplit(/, maxsplit=1)[1] ret = requests.get(src) with open(file_name, wb) as f: f.write(ret.content)
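If the images are large, the download step can be streamed to disk instead of buffering the whole body in memory. Here is a minimal sketch of that variation; the download_image helper is a hypothetical name, and it reuses the same src and file_name logic as the loop above.

import requests

def download_image(src, file_name):
    # stream=True defers the body download; iter_content then
    # writes it to disk chunk by chunk.
    with requests.get(src, stream=True) as ret:
        ret.raise_for_status()
        with open(file_name, "wb") as f:
            for chunk in ret.iter_content(chunk_size=8192):
                f.write(chunk)

# usage, with src built the same way as above:
# download_image(src, src.rsplit("/", maxsplit=1)[1])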

2. Now one that pulls out data

"""進階爬蟲1"""
import requests
from bs4 import BeautifulSoup res = requests.get( url="http://jandan.net/", ) soup = BeautifulSoup(res.text, "html.parser") div = soup.find(name="div", attrs={"id": "content"}) div_list = div.find_all(name="div", attrs={"class": "post f list-post"}) for div in div_list: print(div.text.strip()) # 獲取所有文本 # img = div.find(name="img") # src = img.get("src") # if not src: # continue # src = "https:" + src # print(src) 獲取圖片 # h = div.find(name="h2") # a = h.find(name="a") # print(a.text) 獲取標題
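The nested find/find_all pair can also be expressed as one CSS selector with BeautifulSoup's select(); a small sketch of the equivalent query:

import requests
from bs4 import BeautifulSoup

res = requests.get(url="http://jandan.net/")
soup = BeautifulSoup(res.text, "html.parser")
# One selector combines the id lookup and the three-class match
for div in soup.select("div#content div.post.f.list-post"):
    print(div.text.strip())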

3. One with a bit of difficulty

"""爬蟲進階2"""
import requests
# 1. 查看首頁
r1 = requests.get(
    url=https://dig.chouti.com/,
    headers={
        user-agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36
    }
)

# 2. 提交用戶名和密碼
r2 = requests.post(
    url=https://dig.chouti.com/login,
    headers={
        user-agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36
    },
    data={
        phone:86你的賬號,
        password:你的密碼,
        oneMonth:1
    },
    cookies=r1.cookies.get_dict()
)


# 3. 點贊
r3 = requests.post(
    url=https://dig.chouti.com/link/vote?linksId=20435396,
    headers={
        user-agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36
    },
    cookies=r1.cookies.get_dict()
)
print(r3.text)
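Passing cookies=r1.cookies.get_dict() into every call works, but a requests.Session carries cookies across requests automatically. A minimal sketch of the same three-step flow on that assumption:

import requests

session = requests.session()
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
# 1. Front page: the session stores the cookies for us
session.get(url="https://dig.chouti.com/", headers=headers)
# 2. Log in: cookies from step 1 are sent automatically
session.post(
    url="https://dig.chouti.com/login",
    headers=headers,
    data={"phone": "86your-phone-number", "password": "your-password", "oneMonth": 1},
)
# 3. Upvote: same session, same cookies
r3 = session.post(
    url="https://dig.chouti.com/link/vote?linksId=20435396",
    headers=headers,
)
print(r3.text)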

4. One that's a bit harder still

"""進階爬取3"""
import requests
import re
from bs4 import BeautifulSoup

# First, fake a request to the login page
res = requests.get(
    url="https://passport.lagou.com/login/login.html",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360"
    }
)
# print(res.text)   In the page's own words: dynamic token, guards against forged requests and repeat submissions (small gotcha)
# have a laugh
# Grab the tokens (regex match)
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", res.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", res.text, re.S)[0]

ret = requests.post(
    url="https://passport.lagou.com/login/login.json",      # 登錄網址發送前發個錯的獲取登錄url
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360",
        "X-Anit-Forge-Token": X_Anti_Forge_Token,
        "X_Anti_Forge_Code": X_Anti_Forge_Code,
        "Referer": "https://passport.lagou.com/login/login.html",     # 上一次提交地址(小坑)
    },
    data={           # the POST payload
        "isValidate": True,
        "username": "your-account",
        "password": "你的密碼",
        "request_form_verifyCode": "",
        "submit": "",
        "challenge": "c87407cd89add055d8f1b54ad579cec8",
    },
    cookies=res.cookies.get_dict(),     # carry the login page's cookies to get access (small gotcha)
)

r1 = requests.get(
    url="https://www.lagou.com/zhaopin/Python/?labelWords=label",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360",
        "Referer": "https://www.lagou.com/",    # 上一次的登錄網址(可以re匹配一下動態獲取)
    },
    cookies=ret.cookies.get_dict(),
)

soup = BeautifulSoup(r1.text, "html.parser")
div = soup.find(name="div", attrs={"id": "s_position_list"})
li_list = div.find_all(name="li")
for li in li_list:
    title = li.find(name="h3")
    if not title:
        continue
    money = li.find(name="span")
    div = li.find(name="div", attrs={"class": "li_b_l"})
    a = li.find(name="a")
    print(title.text)
    print(money.text)
    print(div.text)
    print(a.text)
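Since both tokens are pulled with near-identical regexes, the extraction could be factored into one small helper; a sketch, where extract_page_var is a hypothetical name and the pattern assumes the page embeds the variables exactly as matched above:

import re

def extract_page_var(html, name):
    # Pulls X_Anti_Forge_*-style variables embedded in the
    # login page's inline JavaScript.
    match = re.search(re.escape(name) + r" = '(.*?)'", html, re.S)
    if match is None:
        raise ValueError("variable %s not found in page" % name)
    return match.group(1)

# usage, assuming res is the login-page response from above:
# X_Anti_Forge_Token = extract_page_var(res.text, "X_Anti_Forge_Token")
# X_Anti_Forge_Code = extract_page_var(res.text, "X_Anti_Forge_Code")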

5. And one for GitHub

"""進階爬取4"""
import requests
from bs4 import BeautifulSoup


r1 = requests.get(
    url="https://github.com/session",     # 這點註意url,登錄是login獲取cookies是session(小坑)
    headers={
        User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36,
    }
)
soup = BeautifulSoup(r1.text, "html.parser")
inp = soup.find(name="input", attrs={"name": "authenticity_token"})
cookies = r1.cookies.get_dict()
token = inp.get("value")
# Log in
r2 = requests.post(
    url="https://github.com/login",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    },
    data={
        "commit": "Sign in",
        "utf8": "?",
        "authenticity_token": token,
        "login": "你的賬號",
        "password": "你的密碼",
    },
    cookies=cookies
)
# From here, grab whatever you like
print(r2.text)
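One way to sanity-check that the login worked is to fetch a page only an authenticated user can see and inspect the response. A sketch, assuming r2's cookies hold the session (GitHub may rotate cookies across redirects, so this is not guaranteed):

# Hypothetical follow-up: confirm the login by requesting a
# page that requires authentication.
r3 = requests.get(
    url="https://github.com/settings/profile",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    },
    cookies=r2.cookies.get_dict(),
)
# A 200 here (rather than a redirect back to the login page)
# suggests the session cookies are valid.
print(r3.status_code)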
