抓取今日頭條的街拍美女圖片
阿新 • • 發佈:2018-11-07
"""Scrape Toutiao street-snap ("街拍") galleries.

Written after Toutiao updated its anti-scraping mechanism.  Search-result
pages are processed in parallel by a ``multiprocessing`` pool, every gallery
image is saved into a local folder, and one record per gallery (title, page
URL, image URLs) is stored in MongoDB.
"""
import codecs
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests import RequestException

# Supplies MONGO_URL, MONGO_DB, MONGO_TABLE, KEYWORD, GROUP_START, GROUP_END.
from confug import *  # noqa: F401,F403

# Browser-like headers shared by the detail-page and image requests.
_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/55.0.2883.87 Safari/537.36'),
}
# Seconds to wait on any HTTP request so a stalled socket cannot hang a worker.
_REQUEST_TIMEOUT = 10
# Default folder that downloaded images are written to.
_IMAGE_DIR = r'G:\pic'
# Captures the escaped JSON payload embedded in a gallery detail page.
_GALLERY_PATTERN = re.compile(r'gallery: JSON.parse\("(.*?)"\)', re.S)

# connect=False defers the connection until first use, i.e. until after the
# worker processes have been forked (MongoClient is not fork-safe).
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Fetch one page of gallery search results as raw JSON text.

    Args:
        offset: Result offset to request.
        keyword: Search keyword.

    Returns:
        The response body on HTTP 200, otherwise ``None`` (also when the
        request itself fails).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except RequestException:
        print('請求索引頁出錯')
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_index(html):
    """Yield the article URLs found in a search-result JSON document.

    Args:
        html: JSON text returned by ``get_page_index``; may be ``None``.

    Yields:
        Each non-empty ``article_url`` in the document's ``data`` list.
        Nothing is yielded for missing or malformed input.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        return
    if not isinstance(data, dict):
        return
    for item in data.get('data') or []:
        article_url = item.get('article_url')
        if article_url:
            yield article_url


def get_data_detail(url):
    """Fetch a gallery detail page.

    Args:
        url: Detail page URL.

    Returns:
        The page HTML on HTTP 200, otherwise ``None``.
    """
    try:
        response = requests.get(url, headers=_HEADERS,
                                timeout=_REQUEST_TIMEOUT)
    except RequestException:
        print('請求詳情頁出錯')
        return None
    if response.status_code == 200:
        return response.text
    return None


def parsee_page_detail(html, url):
    """Extract the gallery from a detail page and download its images.

    Args:
        html: Detail page HTML.
        url: URL the page was fetched from; copied into the result.

    Returns:
        ``{'title': ..., 'url': ..., 'images': [...]}`` when the page embeds
        gallery JSON, otherwise ``None``.
    """
    if not html:
        return None
    match = _GALLERY_PATTERN.search(html)
    if match is None:
        return None

    # The payload is a JavaScript string literal, so undo its escaping
    # before parsing it as JSON.
    data_str = codecs.getdecoder('unicode_escape')(match.group(1))[0]
    try:
        data_json = json.loads(data_str)
    except ValueError:
        return None

    sub_images = data_json.get('sub_images') or []
    images = [item.get('url') for item in sub_images if item.get('url')]
    for image in images:
        donwload_image(image)

    title_tags = BeautifulSoup(html, 'lxml').select('title')
    title = title_tags[0].get_text() if title_tags else ''
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def save_to_mongo(result):
    """Insert one gallery record into the ``MONGO_TABLE`` collection.

    Args:
        result: Document to store.

    Returns:
        ``True`` when the insert produced an id, otherwise ``False``.
    """
    # insert_one replaces Collection.insert, which PyMongo 4 removed.
    inserted = db[MONGO_TABLE].insert_one(result)
    if inserted.inserted_id is not None:
        print('成功儲存')
        return True
    return False


def donwload_image(url):
    """Download one image and write it to disk via ``save_image``.

    Args:
        url: Image URL.

    Returns:
        ``response.text`` on HTTP 200, otherwise ``None``.
    """
    print('正在下載', url)
    try:
        response = requests.get(url, headers=_HEADERS,
                                timeout=_REQUEST_TIMEOUT)
    except RequestException:
        print('下載圖片出錯', url)
        return None
    if response.status_code == 200:
        # response.content is the raw binary body.
        save_image(response.content)
        # NOTE(review): the return value is kept for backward compatibility;
        # no caller in this file uses it.
        return response.text
    return None


def save_image(content, directory=_IMAGE_DIR):
    """Write image bytes to ``<directory>/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so an image that was
    already saved is not written again.

    Args:
        content: Raw image bytes.
        directory: Target folder; created if it does not exist.
    """
    os.makedirs(directory, exist_ok=True)
    file_name = os.path.join(
        directory, '{0}.{1}'.format(md5(content).hexdigest(), 'jpg'))
    if not os.path.exists(file_name):
        with open(file_name, 'wb') as f:
            f.write(content)


def main(offset):
    """Process one search-result page: fetch, parse, download and store.

    Args:
        offset: Result offset of the search page to process.
    """
    html = get_page_index(offset, KEYWORD)
    for url in parse_page_index(html):
        detail_html = get_data_detail(url)
        if not detail_html:
            # The detail page could not be fetched; move on to the next one.
            continue
        result = parsee_page_detail(detail_html, url)
        if result:
            save_to_mongo(result)
        print(result)


if __name__ == '__main__':
    group = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # One worker process per CPU; each handles whole search pages.
    with Pool() as pool:
        pool.map(main, group)