1. 程式人生 > >requests-爬取美女圖片原始碼

requests-爬取美女圖片原始碼

爬取思路:
1.分析ajax請求,找到存放圖片地址的json
2.解析json資料,提取其中的圖片url
3.再次請求圖片url,通過open()和write()方法將圖片儲存至本地。

廢話少說,直接上程式碼:
前提條件是在當前.py檔案同級目錄下新建一個beauty360的資料夾用來儲存圖片

import requests
import time
import re

# Endpoint that get_json() queries for the image listing.
base_url = "https://image.so.com/z?"

# Running image counter; get_pic() increments it and uses it as the file name.
num = 1

# Headers sent by get_json() with the listing (AJAX) request.
headers = {
    "Host": "image.so.com",
    "Referer": "https://image.so.com/zv?ch=beauty",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/71.0.3578.98 Safari/537.36"
    ),
    "X-Requested-With": "XMLHttpRequest",
    "Cookie": (
        "__guid=16527278.4407656107534301000.1546852761488.196; "
        "__guid=100021698.456336978600101800.1546852883449.8489; "
        "count=2; tracker=; lightbox_thumb_visible=1; "
        "_S=ab9f5ecb680ae35247705feda8f5bda4; test_cookie_enable=null"
    ),
}

# Headers sent by get_pic() when downloading each image file.
header1 = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "upgrade-insecure-requests": "1",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/71.0.3578.98 Safari/537.36"
    ),
}
# Fetch the JSON payload that holds the concrete image URLs.
def get_json(page):
    """Request one page of the image listing and return the decoded JSON.

    Args:
        page: Value sent as the ``i`` query parameter of the listing request.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` when the request fails, the status is not 200, or the
        body cannot be decoded as JSON.
    """
    paras = {
        "ch": "beauty",
        "a": "jsonpViewScroll",
        "i": page,
        "count": 30
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(
            base_url, params=paras, headers=headers, timeout=10
        )
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as exc:
        # RequestException covers network-level failures; ValueError covers a
        # body that is not valid JSON. Anything else (e.g. KeyboardInterrupt)
        # is deliberately left to propagate instead of being swallowed.
        print("wrong url.", exc)
    return None
        
# Parse the image URLs out of the payload and save each image to local disk.
def get_pic(json):
    """Download every image referenced by a listing payload into ./beauty360/.

    The payload is read as ``json["data"][i]["groupdata"][j]["qhimg_url"]``.
    Entries that lack any of these keys are skipped. Each successfully
    downloaded image is written as ``./beauty360/<num>.jpg`` where ``num`` is
    the module-level counter.

    Args:
        json: Decoded listing payload (a dict). ``None`` or any non-dict
            value is ignored, so a failed fetch can be passed straight in.

    Returns:
        None.
    """
    import os  # local import: only needed to make sure the target folder exists

    global num

    if not isinstance(json, dict):
        return

    for item in json.get("data") or []:
        for group in item.get("groupdata") or []:
            picurl = (group.get("qhimg_url") or "").strip()
            if not picurl:
                continue
            try:
                response = requests.get(picurl, headers=header1, timeout=10)
            except requests.RequestException as exc:
                # One unreachable image must not abort the rest of the crawl.
                print("download failed: " + picurl, exc)
                continue
            if response.status_code != 200:
                continue
            # NOTE: the counter is incremented before it is used, so with the
            # module-level initial value of 1 the first file is "2.jpg".
            num = num + 1
            print(str(num)+": "+picurl)
            os.makedirs("./beauty360", exist_ok=True)
            with open(r"./beauty360/"+str(num)+".jpg","wb") as fp:
                fp.write(response.content)

if __name__=="__main__":
    # Walk listing pages 1..599, downloading the images of each page in turn.
    for page in range(1,600):
        payload=get_json(page)
        # get_json() yields None when the request fails or is not HTTP 200;
        # skip such pages instead of handing None to get_pic().
        if payload is not None:
            get_pic(payload)
        # Pause between pages (presumably to throttle requests to the server).
        time.sleep(3)