Python批量爬取堆糖網圖片
阿新 • • 發佈:2018-12-20
# Crawl Duitang search-result pages for a keyword and list the thumbnail
# images found on each page (alt text and image URL).
#
# Saving the images to disk is implemented by download_image() but is switched
# off by default (DOWNLOAD_IMAGES), because the download step was disabled in
# the original script as well.
import json
import os
import re
import urllib
import urllib.parse
import urllib.request

import jsonpath  # third-party library for extracting values from JSON
import requests  # third-party HTTP request library
from bs4 import BeautifulSoup

# Search keyword, percent-encoded so it can be placed in the query string.
label = 'AI'
label = urllib.parse.quote(label)

# JSON endpoint behind the search page, kept for reference:
#https://www.duitang.com/napi/blog/list/by_search/?kw=%E6%A0%A1%E8%8A%B1&type=feed&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id&_type=&start=24&_=1541772636388
# Placeholders: keyword, then result offset ("start=").
url = 'https://www.duitang.com/search/?kw={}&start={}'

SAVE_DIR = 'D:/Python/AI'   # target folder used when downloading is enabled
PAGE_SIZE = 24              # step between consecutive "start" offsets
END_OFFSET = 240            # paging stops before this offset
REQUEST_TIMEOUT = 30        # seconds; avoids hanging forever on a dead socket
DOWNLOAD_IMAGES = False     # set to True to actually save the images

# Thumbnail URLs look like
#   https://b-ssl.duitang.com/uploads/item/<path>.thumb.224_0.<suffix>
# Group 1 is <path> (may contain '/'), group 2 is the file suffix.
THUMB_PATTERN = re.compile(
    r'https://b-ssl\.duitang\.com/uploads/item/(.*?)\.thumb\.224_0\.(\w+)'
)


def parse_thumbnail_src(src):
    """Split a thumbnail URL into ``(name, suffix)``.

    Args:
        src: Value of an ``<img src>`` attribute; may be ``None`` or empty.

    Returns:
        A ``(name, suffix)`` tuple of strings, or ``None`` when *src* is
        missing or does not match ``THUMB_PATTERN``.
    """
    if not src:
        return None
    match = THUMB_PATTERN.search(src)
    if match is None:
        return None
    return match.group(1), match.group(2)


def download_image(img_url, file_path, file_name, file_suffix):
    """Download *img_url* into *file_path* and return the saved file's path.

    Args:
        img_url: Absolute URL of the image.
        file_path: Destination directory; created if it does not exist.
        file_name: File name without suffix.
        file_suffix: File suffix including the leading dot, e.g. ``'.jpeg'``.

    Raises:
        OSError: If the request fails or the file cannot be written
            (``urllib.error.URLError`` is a subclass of ``OSError``).
    """
    os.makedirs(file_path, exist_ok=True)
    filename = os.path.join(file_path, file_name + file_suffix)
    with urllib.request.urlopen(img_url, timeout=REQUEST_TIMEOUT) as response, \
            open(filename, 'wb') as f_save:
        f_save.write(response.read())
    return filename


def main():
    """Fetch every result page and print the images found on it."""
    # Iterate over the result pages: offsets 0, 24, 48, ... below END_OFFSET.
    for start in range(0, END_OFFSET, PAGE_SIZE):
        page_url = url.format(label, start)
        r = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        print(len(r.text))
        print(r.text)
        print(r.encoding)

        soup = BeautifulSoup(r.text, 'html.parser')
        print(len(soup))
        anchors = soup.find_all('a', class_='a')
        print(anchors)

        for anchor in anchors:
            image = anchor.img
            if image is None:
                # Anchor without an <img> child: nothing to list.
                continue
            src = image.get('src')
            print(image.get('alt'), src)

            parsed = parse_thumbnail_src(src)
            if parsed is None:
                # Not a Duitang thumbnail URL; skip instead of crashing.
                continue
            file_name, file_suffix = parsed
            print(type(file_name))

            if DOWNLOAD_IMAGES:
                # The captured name contains '/' separators; flatten them so
                # every image lands directly inside SAVE_DIR.
                saved = download_image(
                    src,
                    SAVE_DIR,
                    file_name.replace('/', '_'),
                    '.' + file_suffix,
                )
                print(saved)


if __name__ == '__main__':
    main()