python_爬取【搜狗圖片】
阿新 • • 發佈:2019-02-03
1.利用python抓取網站上的圖片,對於學習python及對網頁資料分析處理很有幫助,也可以學習一些web方面的知識,我嘗試使用【搜狗圖片】搜尋到的圖片作為抓取物件,抓取【搜狗圖片】主頁各個標題欄的圖片,以及【其他】輸入圖片型別的圖片,使用tkinter完成了一個簡單的UI介面。
2.一般抓取網頁圖片,需要先訪問頁面,然後提取原始碼,依次解析各個圖片URL,然後直接下載即可,這些網上的教程很多,在此不再贅述。但是對於一些圖片較多的頁面,往往使用動態載入的方式呈現圖片,也就是我們抓取的頁面原始碼中,並沒有各個圖片的URL,這就需要分析頁面結構,找到頁面圖片真正的URL資源地址,才能完成下載。
3.例如在【搜狗圖片】搜尋美女,然後點進圖片,查詢該圖片的URL地址:
【請求URL】:
【頁面原始碼】:找不到對應的圖片URL。
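發現【標題類】的圖片都集中在如下URL:http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category=<分類名稱>&tag=%E5%85%A8%E9%83%A8&start=0&len=<下載個數>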
而通過【搜尋】得到的圖片URL集中在:http://pic.sogou.com/pics?query=<搜尋關鍵字>&did=1&mode=1&start=0&len=<下載個數>&reqType=ajax
這樣,就可以很清楚的得到各個圖片URL的地址,爬取圖片了。
4.原始碼:
# Article caption that was fused onto the listing's first line: "執行結果:圖片".
# -*- encoding=UTF-8 -*-
"""Small tkinter front end that downloads pictures from Sogou Pictures.

The script scrapes the category tabs from the Sogou Pictures home page,
shows one checkbutton per category plus a free-text "其他" (other) search
box, and downloads the requested number of pictures for every selected
category / keyword into ``download_pics_path``.
"""
import urllib.request
import urllib.parse
import socket
import re
import sys
import os
from urllib.request import urlopen
import time
from tkinter import *
import webbrowser
from bs4 import BeautifulSoup
import requests
import json
import urllib

############################## constants ##############################
sougou_url = "http://pic.sogou.com/"
# URL pieces and download settings
download_pics_path = "C:/bz2018/"
download_pics_num = 10
download_success = ""
# JSON fields that may hold a picture URL, tried in this order.
sougou_pics_tag = ["pic_url", "thumbUrl", "bthumbUrl", "ori_pic_url"]
sougou_url_pics_start = "http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category="
sougou_url_pics_mid = "&tag=%E5%85%A8%E9%83%A8&start=0&len="
sougou_url_pics_start_other = "http://pic.sogou.com/pics?query="
sougou_url_pics_mid_other = "&did=1&mode=1&start=0&len="
sougou_url_pics_stop_other = "&reqType=ajax"
# Seconds to wait for the JSON listing before giving up.
request_timeout = 10
# Markers used to cut the category titles out of the home page HTML.
title_key_start = "a class=\"nav-tab\" href=\"/pics/"
title_key_stop = "<"
# tkinter settings
window_name = "搜狗圖片下載器"
window_size = "500x500"
frm_bg = "white"
real_columnspan = 4
label_type_str = "---------------------------------------------圖片型別---------------------------------------------"
############################## constants ##############################

# Categories / keywords selected for the next download run.
real_url_arr = []


############################## functions ##############################
def get_title(url):
    """Return the list of nav-tab titles found in the page at *url*.

    The page is fetched with ``urlopen`` and parsed with BeautifulSoup;
    each top-level node is rendered back to text and split on
    ``title_key_start`` / ``title_key_stop`` to extract the tab captions.
    """
    # Close the HTTP response deterministically and name the parser so the
    # result does not depend on which optional parsers are installed.
    with urlopen(url) as response:
        sougou_html = BeautifulSoup(response.read(), "html.parser")
    title_key = []
    for node in sougou_html:
        pieces = str(node).split(title_key_start)
        # pieces[0] is the text *before* the first nav-tab marker, so it can
        # never contain a title; only the following pieces are inspected.
        for piece in pieces[1:]:
            parts = piece.split(title_key_stop)[0].split("\">")
            if len(parts) == 2:
                title_key.append(parts[1])
    return title_key


def _safe_file_name(title):
    """Return *title* with characters that are illegal in file names replaced."""
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", title).strip()
    return cleaned or "untitled"


def _fetch_items(url, key):
    """Fetch the JSON document at *url* and return the list stored under *key*."""
    response = requests.get(url, timeout=request_timeout)
    return json.loads(response.text)[key]


def _download_first_available(item, path, file_title):
    """Download one picture, trying each URL field of *item* in turn.

    Returns True as soon as one candidate URL has been saved to
    ``<path>/<file_title>.jpg`` and False when every candidate failed.
    """
    target = os.path.join(path, _safe_file_name(file_title) + '.jpg')
    for itag in sougou_pics_tag:
        pic_url = item.get(itag, "")
        if not pic_url:
            continue
        try:
            urllib.request.urlretrieve(pic_url, target)
        except Exception:
            # Best effort: any failure just moves on to the next candidate
            # URL. KeyboardInterrupt / SystemExit are deliberately not caught.
            print("download fail!")
            continue
        print(file_title + ": download complete!")
        return True
    return False


def get_pics(url, path):
    """Download every picture listed by a category URL; return the failure count.

    *url* must return JSON with an ``all_items`` list. Each picture is saved
    under its title, or under its position in the list when it has no title.
    """
    os.makedirs(path, exist_ok=True)
    fail_count = 0
    for index, item in enumerate(_fetch_items(url, 'all_items')):
        pic_title = str(item.get('title') or index)
        if not _download_first_available(item, path, pic_title):
            fail_count = fail_count + 1
    return fail_count


def get_pics_other(url, path):
    """Download every picture listed by a search URL; return the failure count.

    *url* must return JSON with an ``items`` list. The position in the list
    is appended to each title so that equal titles do not overwrite each other.
    """
    # The category downloader creates the directory too; doing it here as
    # well lets a search-only run work when the directory does not exist yet.
    os.makedirs(path, exist_ok=True)
    fail_count = 0
    for index, item in enumerate(_fetch_items(url, 'items')):
        pic_title = str(item.get('title') or "") + str(index)
        if not _download_first_available(item, path, pic_title):
            fail_count = fail_count + 1
    return fail_count


def url_get_othertype():
    """Add the free-text keyword (if any) to ``real_url_arr``."""
    global real_url_arr
    keyword = PhotoType.get().strip()
    if keyword != "":
        real_url_arr.append(keyword)
    # Drop duplicates while keeping the selection order stable.
    real_url_arr = list(dict.fromkeys(real_url_arr))


def url_get_phototype(all_type):
    """Rebuild ``real_url_arr`` from the keyword box and the ticked categories.

    *all_type* is the list of type names shown in the UI; it is not modified.
    """
    global real_url_arr
    real_url_arr = []
    url_get_othertype()
    # "其他" has no entry in CheckType, so it is skipped here; filtering into
    # a new list avoids mutating the caller's list.
    categories = [name for name in all_type if name != "其他"]
    for check_var, name in zip(CheckType, categories):
        if check_var.get() == 1:
            real_url_arr.append(typeBtn[name]['text'])
    real_url_arr = list(dict.fromkeys(real_url_arr))


def other_type():
    """Enable the keyword entry when "其他" is ticked, else disable and clear it."""
    if OtherType.get() == 1:
        other_entry["state"] = "normal"
    else:
        other_entry["state"] = "disabled"
        PhotoType.set("")


def get_full_url(all_type):
    """Download pictures for the current selection and report the result.

    Reads the requested count from the entry (keeping the previous value when
    the entry is empty), builds one listing URL per selected category or
    keyword, downloads the pictures and writes the outcome to ``down_result``.
    """
    global download_pics_num
    down_result["text"] = ""
    url_get_phototype(all_type)

    requested = download_num_str.get().strip()
    if requested != "":
        try:
            count = int(requested)
        except ValueError:
            count = 0
        if count <= 0:
            # Reject non-numeric and non-positive input instead of raising
            # inside the tkinter callback or sending a bad "len" to the site.
            down_result["text"] = "請輸入正整數的下載個數"
            return
        download_pics_num = count

    total = len(real_url_arr) * download_pics_num
    down_result["text"] = "準備下載: " + str(total) + "張照片"
    # The downloads below block the event loop; repaint first so the
    # "preparing" message is actually shown.
    root.update_idletasks()

    fail_num = 0
    for iurl in real_url_arr:
        # Quote the name so characters such as "&" or "#" cannot corrupt the
        # query string.
        quoted = urllib.parse.quote(iurl)
        try:
            if iurl in typeBtn:
                # A category scraped from the home page.
                tmp_url = sougou_url_pics_start + quoted + sougou_url_pics_mid + str(download_pics_num)
                fail_num = fail_num + get_pics(tmp_url, download_pics_path)
            else:
                # A free-text keyword: use the search endpoint.
                tmp_url = (sougou_url_pics_start_other + quoted + sougou_url_pics_mid_other
                           + str(download_pics_num) + sougou_url_pics_stop_other)
                time.sleep(1)
                fail_num = fail_num + get_pics_other(tmp_url, download_pics_path)
        except (requests.RequestException, OSError, ValueError, KeyError, TypeError) as exc:
            # The listing itself could not be fetched or parsed: count the
            # whole batch as failed and carry on with the next selection.
            print("download fail!", exc)
            fail_num = fail_num + download_pics_num
    down_result["text"] = "成功下載: " + str(total - fail_num) + "張照片"


def write_line(row, text="", column=0, columnspan=real_columnspan, bg=frm_bg):
    """Place a label spanning *columnspan* columns on *row* and return it."""
    label = Label(frm, text=text, bg=bg)
    label.grid(row=row, column=column, columnspan=columnspan)
    return label


def callback(url=sougou_url):
    """Open *url* (the Sogou Pictures home page by default) in the browser."""
    webbrowser.open_new(url)
############################## functions ##############################


############################## UI ##############################
root = Tk()  # main window
root.title(window_name)  # window title
root.geometry(window_size)
root.resizable(width=False, height=False)  # fixed window size
frm = Frame(root, bg=frm_bg)  # container frame
frm.pack(expand=YES, fill=BOTH)

# Index of the next free grid row.
real_row = 0

# blank row
write_line(real_row)
real_row = real_row + 1

# link to the official site
Button(frm, text="點選進入搜狗圖片官網", command=callback).grid(
    row=real_row, column=0, columnspan=real_columnspan, sticky=N)
real_row = real_row + 1

# blank row
write_line(real_row)
real_row = real_row + 1

# picture-type heading
write_line(real_row, label_type_str)
real_row = real_row + 1

# blank row
write_line(real_row)
real_row = real_row + 1

# category checkbuttons
try:
    photo_type = get_title(sougou_url)
except (OSError, ValueError) as exc:
    # Without the home page there are no categories, but the keyword search
    # below can still be used, so keep the UI alive.
    print("could not load categories:", exc)
    photo_type = []
photo_type.append("其他")
typeBtn = {}
CheckType = []
real_column = 0
for itype in photo_type:
    if itype == "其他":
        # The free-text row is built once, after the loop.
        continue
    CheckType.append(IntVar())
    typeBtn[itype] = Checkbutton(frm, text=itype, variable=CheckType[-1],
                                 command=lambda: url_get_phototype(photo_type))
    typeBtn[itype].grid(row=real_row, column=real_column)
    real_column = real_column + 1
    if real_column == 4:
        real_column = 0
        real_row = real_row + 1

# "其他" checkbutton plus keyword entry. They live in columns 1 and 2, so start
# a new row when the last category row already reaches column 1.
if real_column > 1:
    real_row = real_row + 1
OtherType = IntVar()
PhotoType = StringVar()
other_entry = Entry(frm, textvariable=PhotoType, width=9, state='disabled')  # keyword input
Checkbutton(frm, text="其他", variable=OtherType, onvalue=1, offvalue=2,
            command=other_type).grid(row=real_row, column=1)
other_entry.grid(row=real_row, column=2, columnspan=4, sticky=W, padx=40, ipadx=60)
real_row = real_row + 1

# blank row
write_line(real_row)
real_row = real_row + 1

# number of pictures to download
lab1 = Label(frm, text="下載個數:")
lab1.grid(row=real_row, column=0)
download_num_str = StringVar()
download_num = Entry(frm, width=10, textvariable=download_num_str)
download_num.grid(row=real_row, column=1, sticky=W)
real_row = real_row + 1

# blank row
write_line(real_row)
real_row = real_row + 1

# start button
Button(frm, text="獲取照片", command=lambda: get_full_url(photo_type)).grid(
    row=real_row, column=0, columnspan=4, sticky=N)
real_row = real_row + 1

# blank row
write_line(real_row)
real_row = real_row + 1

# result label
down_result = write_line(real_row)
real_row = real_row + 1

# blank row
write_line(real_row)
real_row = real_row + 1

Button(frm, text="退出程式", command=root.quit).grid(row=real_row, column=0, columnspan=4, sticky=N)
real_row = real_row + 1

mainloop()
############################## UI ##############################