1. 程式人生 > >抓取今日頭條的街拍美女圖片

抓取今日頭條的街拍美女圖片

由於今日頭條的反爬機制的更新,利用多程序(程序池),將圖片儲存在資料夾中,將圖片路徑(連結)儲存在 MongoDB 中
import codecs
import pymongo
import requests
import json
import re
import os
from hashlib import md5
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from requests import RequestException
from confug import *
from multiprocessing import Pool

# Shared MongoDB handle used by save_to_mongo(); MONGO_URL and MONGO_DB are
# expected to come from the star-imported settings module.
# connect=False defers opening the connection until the first operation, so
# the worker processes started by the Pool in the __main__ section do not
# inherit an already-connected client from the parent process.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

def get_page_index(offset, keyword):
    """Fetch one page of the Toutiao search index as raw JSON text.

    Args:
        offset: Result offset, sent as the ``offset`` query parameter.
        keyword: Search term, sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers with status 200;
        ``None`` for any other status or when the request itself fails.
    """
    # The original hard-coded offset=0 and keyword='街拍', so every worker
    # fetched the same first page regardless of its arguments.
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # A timeout keeps a stalled connection from blocking the worker forever.
        response = requests.get(url, timeout=10)
    except RequestException:
        print('請求索引頁出錯')
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page_index(html):
    """Yield the article URLs found in a search-index JSON response.

    Args:
        html: JSON text of the index response, or ``None``/empty when the
            index request failed.

    Yields:
        Each non-empty ``article_url`` value from the entries of the
        top-level ``data`` list. Nothing is yielded when *html* is empty,
        is not valid JSON, or carries no usable ``data`` list.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return
    if not isinstance(payload, dict):
        return
    # ``data`` may be present but null, hence the ``or []``.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article URL cannot be fetched; skip them.
        if article_url:
            yield article_url

def get_data_detail(url):
    """Download an article detail page.

    Args:
        url: Address of the detail page.

    Returns:
        The page body as text on a 200 response; ``None`` for any other
        status or when the request raises ``RequestException``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    try:
        # A timeout keeps a stalled connection from blocking the worker
        # forever; requests' Timeout is a RequestException and is handled below.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print('請求詳情頁出錯')
        return None
    if response.status_code == 200:
        return response.text
    return None

def parsee_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Every image URL found is also passed to ``donwload_image``.

    Args:
        html: Detail page markup, or ``None``/empty if the download failed.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with the keys ``title``, ``url`` and ``images``, or ``None``
        when the page is empty or contains no parsable gallery data.
    """
    if not html:
        return None

    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Fall back to an empty title instead of raising IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    # The gallery data is embedded in the page as: gallery: JSON.parse("...")
    match = re.search(r'gallery: JSON.parse\("(.*?)"\)', html, re.S)
    if match is None:
        return None

    # The captured text is an escaped string literal; undo the escapes to
    # recover the JSON document.
    data_str = codecs.getdecoder('unicode_escape')(match.group(1))[0]
    try:
        data_json = json.loads(data_str)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(data_json, dict):
        return None

    # ``sub_images`` may be missing or null; entries without a URL are skipped.
    sub_images = data_json.get('sub_images') or []
    images = [item.get('url') for item in sub_images
              if isinstance(item, dict) and item.get('url')]
    for image in images:
        donwload_image(image)
    return {
        'title': title,
        'url': url,
        'images': images
    }
def save_to_mongo(result):
    """Insert one parsed gallery record into the configured collection.

    Args:
        result: Document (dict) to store in ``db[MONGO_TABLE]``.

    Returns:
        ``True`` when the insert succeeded, ``False`` when the database
        reported an error.
    """
    try:
        # Collection.insert() is deprecated and was removed in PyMongo 4;
        # insert_one() is its replacement for single documents.
        db[MONGO_TABLE].insert_one(result)
    except pymongo.errors.PyMongoError as err:
        print('儲存失敗', err)
        return False
    print('成功儲存')
    return True

def donwload_image(url):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image.

    Returns:
        ``response.text`` on a 200 response; ``None`` for any other status
        or when the request raises ``RequestException``.
    """
    print('正在下載', url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    try:
        # A timeout keeps a stalled download from blocking the worker forever.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        # The original message was copied from the detail-page fetcher and
        # wrongly reported a detail-page failure.
        print('請求圖片出錯', url)
        return None
    if response.status_code == 200:
        # response.content holds the raw binary payload of the image.
        save_image(response.content)
        # NOTE(review): returning the binary body decoded as text is kept only
        # so the return type stays unchanged for any caller that uses it.
        return response.text
    return None
def save_image(content, directory=r'G:\pic'):
    """Write image bytes to ``<directory>/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so saving the same
    image twice does not create a duplicate file.

    Args:
        content: Raw image bytes.
        directory: Folder to store the image in; created if it is missing.
            Defaults to the folder the script originally hard-coded.
    """
    # Create the target folder up front; open() would fail if it is missing.
    os.makedirs(directory, exist_ok=True)
    file_name = os.path.join(directory, '{0}.{1}'.format(md5(content).hexdigest(), 'jpg'))
    if not os.path.exists(file_name):
        # ``with`` closes the file even if the write raises.
        with open(file_name, 'wb') as f:
            f.write(content)

def main(offset):
    """Crawl one index page: fetch it, then parse, download and store each gallery.

    Args:
        offset: Index-page offset passed to ``get_page_index`` together with
            the configured ``KEYWORD``.
    """
    html = get_page_index(offset, KEYWORD)
    if not html:
        # The index request failed; there is nothing to parse for this offset.
        return
    for url in parse_page_index(html):
        if not url:
            # Index entries without an article URL cannot be fetched.
            continue
        detail_html = get_data_detail(url)
        if not detail_html:
            # Skip pages that could not be downloaded instead of crashing
            # the whole worker inside the parser.
            continue
        result = parsee_page_detail(detail_html, url)
        if result:
            save_to_mongo(result)
        print(result)

if __name__ == '__main__':
    # One offset per index page; get_page_index requests 20 results per page.
    group = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # Create the process pool; map() blocks until every offset has been
    # processed, after which the context manager shuts the pool down.
    with Pool() as pool:
        pool.map(main, group)