Crawling Baidu and Flickr Images
阿新 · Published: 2020-08-21
Two small scripts for collecting image data: the first crawls Baidu image search results, the second downloads photos through the Flickr API.
Baidu image crawler: it queries the `acjson` endpoint of Baidu image search, pulls the thumbnail URLs out of the response with a regular expression, and downloads each page of results in its own thread.

```python
import hashlib
import os
import re
import time
from threading import Thread

import requests


class BaiDu:
    """Crawl Baidu image search results."""

    def __init__(self, name, page):
        self.start_time = time.time()
        self.name = name
        self.page = page
        # self.url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&rn=60&'
        self.url = 'https://image.baidu.com/search/acjson'
        # Replace with your own browser's User-Agent if requests get blocked.
        self.header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) '
                                     'AppleWebKit/537.36 (KHTML, like Gecko) '
                                     'Chrome/84.0.4147.105 Safari/537.36'}
        self.num = 0          # images actually written to disk
        self.all_num = 0      # images requested
        self.thread_all = []  # download threads, one per result page

    def queryset(self):
        """Build the query parameters for each result page and send the request."""
        for i in range(int(self.page)):
            pn = 60 * i  # offset of this page (60 results per page)
            params = {'word': self.name, 'pn': pn, 'tn': 'resultjson_com', 'ipn': 'rj', 'rn': 60}
            self.all_num += 60
            self.getrequest(i, self.url, params)

    def getrequest(self, index, url, data):
        """Send the search request and extract thumbnail URLs from the response."""
        print('[INFO]: sending request: ' + url)
        ret = requests.get(url, headers=self.header, params=data)
        if ret.status_code == 200:
            print('[INFO]: request 200 ok: ' + ret.url)
        else:
            print('[INFO]: request {}, {}'.format(ret.status_code, ret.url))
        response = ret.content.decode()
        img_links = re.findall(r'thumbURL.*?\.jpg', response)
        # Strip the leading 'thumbURL":"' (11 characters) to keep the bare URL.
        links = [link[11:] for link in img_links]
        self.build_thread(index, links)

    def saveimage(self, links):
        """Download every link and save it under an MD5 file name."""
        for link in links:
            if not link:
                continue
            m = hashlib.md5()
            m.update(link.encode())
            name = m.hexdigest()
            try:
                ret = requests.get(link, headers=self.header)
                filename = './images/' + name + '.jpg'
                with open(filename, 'wb') as f:
                    f.write(ret.content)
                self.num += 1  # count only successful downloads
            except Exception:
                pass

    def build_thread(self, i, links):
        """Create one download thread per page of links."""
        self.thread_all.append(Thread(target=self.saveimage, args=(links,)))

    def run(self):
        for thread in self.thread_all:
            thread.start()
        for thread in self.thread_all:
            thread.join()

    def __del__(self):
        end_time = time.time()
        print('request total images: {}, actual download images: {}, time cost {} second'.format(
            self.all_num, self.num, end_time - self.start_time))


def main():
    os.makedirs('./images', exist_ok=True)
    keywords = ['人臉', 'head', 'arm']  # '人臉' = 'face'
    for name in keywords:
        page = 10  # number of result pages to crawl (60 images per page)
        baidu = BaiDu(name, page)
        baidu.queryset()
        baidu.run()


if __name__ == '__main__':
    main()
```
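One caveat in the Baidu crawler: several download threads update `self.num` concurrently, and a plain `+=` on a shared attribute is not guaranteed to be atomic. A minimal sketch of a lock-protected counter that could replace it (the `SafeCounter` class is a hypothetical addition, not part of the original script):

```python
from threading import Lock, Thread


class SafeCounter:
    """Thread-safe counter for tracking successful downloads."""

    def __init__(self):
        self._lock = Lock()
        self.value = 0

    def increment(self):
        # Only one thread at a time may update the count.
        with self._lock:
            self.value += 1


# Usage sketch: create one SafeCounter in BaiDu.__init__ and call
# counter.increment() in saveimage() after each successful write.
if __name__ == '__main__':
    counter = SafeCounter()
    workers = [Thread(target=lambda: [counter.increment() for _ in range(1000)])
               for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(counter.value)  # always 4000
```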
Flickr crawler: it uses the `flickrapi` package to walk the search results for a text query, collects the `url_c` (medium-size) links, and splits them across several download threads. You need your own Flickr API key and secret.

```python
# coding:utf-8
import os
import urllib.request
from threading import Thread

import flickrapi


class CrawlFlickr:
    def __init__(self, API_KEY="", API_SECRET="", SavePath="", PerPage=10,
                 Text="", Tags="", ThreadNum=4, MaxCounter=10):
        self.urls = []
        self.ThreadNum = ThreadNum
        self.SavePath = SavePath
        self.Thread_All = []
        self.MaxCounter = MaxCounter
        flickr = flickrapi.FlickrAPI(API_KEY, API_SECRET, cache=True)
        self.photos = flickr.walk(text=Text,
                                  tag_mode='all',
                                  tags=Tags,
                                  extras='url_c',
                                  per_page=PerPage,  # you can try different page sizes
                                  sort='relevance')
        self.get_url()
        self.build_thread()

    def get_url(self):
        """Collect up to MaxCounter image URLs from the search results."""
        for i, photo in enumerate(self.photos):
            url = photo.get('url_c')
            if url is None:
                continue
            self.urls.append(url)
            if i >= self.MaxCounter:
                break
            if i % 200 == 0:
                print('get {} url, max {}\n'.format(len(self.urls), self.MaxCounter))
        print('\nget {} url finish.....\n'.format(len(self.urls)))

    def build_thread(self):
        """Split the URL list into ThreadNum chunks, one download thread each."""
        if self.ThreadNum >= len(self.urls):
            raise ValueError(f"Input thread number is large: {self.ThreadNum}, "
                             f"while data is small: {len(self.urls)}")
        part = len(self.urls) // self.ThreadNum
        for i in range(self.ThreadNum)[::-1]:
            self.Thread_All.append(Thread(target=self.get_img, args=(self.urls[i * part:],)))
            self.urls = self.urls[:i * part]
        print('build thread finish...\n')

    def run(self):
        for thread in self.Thread_All:
            thread.start()
        for thread in self.Thread_All:
            thread.join()
        print('download image finish...\n')

    def get_img(self, urls):
        """Download every URL in this thread's share of the list."""
        for url in urls:
            img_name = url.split('/')[-1]
            if '.jpg' in img_name or '.png' in img_name:
                urllib.request.urlretrieve(url, os.path.join(self.SavePath, img_name))
                print('download {}\n'.format(os.path.join(self.SavePath, img_name)))


if __name__ == "__main__":
    param = dict(
        API_KEY="",        # fill in your own Flickr API key
        API_SECRET="",     # and secret
        SavePath="./images",
        PerPage=10,
        Text="human pose",
        Tags="",
        ThreadNum=8,
        MaxCounter=500
    )
    os.makedirs(param["SavePath"], exist_ok=True)
    crawl_flickr = CrawlFlickr(**param)
    crawl_flickr.run()
```
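If per-thread progress reporting is wanted, `tqdm` can wrap the URL loop. A minimal standalone sketch, where `download_with_progress` is a hypothetical helper that mirrors `CrawlFlickr.get_img` rather than part of the original script:

```python
import os
import urllib.request

from tqdm import tqdm


def download_with_progress(urls, save_path="./images"):
    """Download each image URL while showing a tqdm progress bar."""
    os.makedirs(save_path, exist_ok=True)
    # tqdm wraps the iterable and prints a live progress bar for this batch.
    for url in tqdm(urls, desc="flickr download"):
        img_name = url.split('/')[-1]
        if img_name.endswith(('.jpg', '.png')):
            urllib.request.urlretrieve(url, os.path.join(save_path, img_name))


# Usage sketch: pass one thread's share of URLs, e.g.
# download_with_progress(urls_chunk, "./images")
```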