Python3.4網頁爬蟲,提取圖片
阿新 • • 發佈:2019-01-01
網頁圖片爬蟲:
第一個爬蟲抓取bing主頁圖片,共24張
第二個抓取貼吧圖片
第三個抓取圖蟲圖片
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# -*- author:miko-*-
# Python 3: download the Bing homepage background images.
import urllib.request
import urllib.parse
import urllib
import re
import sys
import os

# Captures the value of the "url" field that precedes "urlbase" in the
# HPImageArchive JSON response, e.g.
# http://s.cn.bing.net/az/hprichbg/rb/LongJi_ZH-CN8658435963_1366x768.jpg
_IMAGE_URL_RE = re.compile(r'"url":"(.*?)","urlbase"', re.S)


def get_bing_backphoto(count=24, save_dir='img'):
    """Download Bing homepage background images into ``save_dir``.

    One archive request is made per index in ``range(count)``; every image
    URL found in the response is printed and saved under its own file name
    (the part of the URL after the last ``/``).

    Args:
        count: number of archive indexes (``idx`` values) to query.
        save_dir: directory the images are written to; created if missing.

    Exits the process with status -1 when the service answers ``null``.
    """
    os.makedirs(save_dir, exist_ok=True)
    for i in range(count):
        url = ('http://cn.bing.com/HPImageArchive.aspx?format=js&idx='
               + str(i) + '&n=1&nc=1361089515117&FORM=HYLH1')
        # Close the connection deterministically instead of leaking it.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('utf-8')
        # Compare after decoding: read() returns bytes, and bytes never
        # compare equal to the str 'null'.
        if html == 'null':
            print('open & read bing error!')
            sys.exit(-1)
        for imgurl in _IMAGE_URL_RE.findall(html):
            # Absolute URLs pass through unchanged; a host-relative value is
            # resolved against the request URL so it can still be fetched.
            imgurl = urllib.parse.urljoin(url, imgurl)
            name = imgurl[imgurl.rindex('/') + 1:]
            savepath = os.path.join(save_dir, name)
            print(imgurl)
            urllib.request.urlretrieve(imgurl, savepath)


if __name__ == '__main__':
    get_bing_backphoto()
#coding=utf-8
import urllib.request
import re
import urllib
import sys
import os

# Image tags of interest look like: src="<...>.jpg" pic_ext, e.g.
# http://imgsrc.baidu.com/forum/pic/item/16391f30e924b89915f86eb06f061d950b7bf677.jpg
_IMG_SRC_RE = re.compile(r'src="(.+?\.jpg)" pic_ext')


def getHtml(url):
    """Fetch ``url`` and return its body decoded as UTF-8.

    Exits the process with status -1 when the body is the literal ``null``.
    """
    # Close the connection deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8')
    # Compare after decoding: read() returns bytes, and bytes never compare
    # equal to the str 'null'.
    if html == 'null':
        print('open & read error: %s' % url)
        sys.exit(-1)
    return html


def getImg(html, save_dir='baidu'):
    """Download every matching .jpg referenced in ``html`` into ``save_dir``.

    Files are named ``0.jpg``, ``1.jpg``, ... in order of appearance, and each
    source URL is printed after it has been saved.

    Args:
        html: page source to scan.
        save_dir: directory the images are written to; created if missing.
    """
    os.makedirs(save_dir, exist_ok=True)
    for index, imgurl in enumerate(_IMG_SRC_RE.findall(html)):
        urllib.request.urlretrieve(
            imgurl, os.path.join(save_dir, '%s.jpg' % index))
        print(imgurl)


if __name__ == '__main__':
    html = getHtml("http://tieba.baidu.com/p/2460150866")
    getImg(html)
#-*- encoding: utf-8 -*-
'''
Created on 2015-7-30

@author: Miko
'''
import urllib.request
import urllib
import re
import sys
import os
import time
import uuid

# Links to second-level (gallery) pages. The lookbehind rejects host names
# ending in "photos", which is the image host rather than a gallery page.
_GALLERY_URL_RE = re.compile(
    r'http://tuchong\.com/\d+/\d+/|http://\w+(?<!photos)\.tuchong\.com/\d+/')

# Full-size .jpg links on the image host. The non-greedy, quote/space-free
# segments keep two URLs on the same line from being merged into one match.
_IMAGE_URL_RE = re.compile(
    r'http://photos\.tuchong\.com/[^\s"\'<>]+?/f/[^\s"\'<>]+?\.jpg')


def findUrl2(html):
    """Return the gallery page URLs found in ``html``.

    Duplicates are dropped and the order of first appearance is kept.
    """
    seen = set()
    unique_links = []
    for link in _GALLERY_URL_RE.findall(html):
        if link not in seen:
            seen.add(link)
            unique_links.append(link)
    return unique_links


def getHtml(url):
    """Fetch ``url`` and return its body decoded as UTF-8."""
    # Close the connection deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def download(html_page, pageNo, base_dir='D:\\TuChong'):
    """Download every image linked from ``html_page`` to the local disk.

    Images are stored in ``<base_dir>/<year>-<month>-<day>/<pageNo>/`` under
    a random UUID file name, pausing one second after each download.

    Args:
        html_page: source of a gallery page.
        pageNo: listing page number, used as the innermost folder name.
        base_dir: root folder for all downloads.

    Returns:
        The ``urlretrieve`` result of the last image, or ``None`` when the
        page contains no image links (no folder is created in that case).
    """
    imglist = _IMAGE_URL_RE.findall(html_page)
    print(imglist)
    if not imglist:
        return None

    # Folder name is today's date without zero padding, e.g. 2015-7-30.
    today = time.localtime()
    foldername = '%d-%d-%d' % (today.tm_year, today.tm_mon, today.tm_mday)
    picpath = os.path.join(base_dir, foldername, str(pageNo))
    os.makedirs(picpath, exist_ok=True)

    download_img = None
    for imgurl in imglist:
        target = os.path.join(picpath, '%s.jpg' % uuid.uuid1())
        print("The photos location is:" + target)
        download_img = urllib.request.urlretrieve(imgurl, target)
        time.sleep(1)  # pause between requests
        print(imgurl)
    return download_img


# Optional progress hook, kept for reference (not wired in):
# def callback(blocknum, blocksize, totalsize):
#     '''Progress callback.
#     @blocknum: number of blocks downloaded so far
#     @blocksize: size of one block
#     @totalsize: size of the remote file
#     '''
#     print(str(blocknum), str(blocksize), str(totalsize))
#     if blocknum * blocksize >= totalsize:
#         print('download finished')


def quitit():
    """Say goodbye and terminate the program with status 0."""
    print("Bye!")
    sys.exit(0)


if __name__ == '__main__':
    print('''
    *****************************************
    **    Welcome to Spider for TUCHONG    **
    **      Created on 2015-7-30           **
    **      @author: miko                  **
    *****************************************''')
    # Fixed page number; swap in the prompt below to ask the user instead:
    # input("Input the page number you want to scratch (1-100),please input 'quit' if you want to quit>")
    pageNo = '10'
    while not pageNo.isdigit() or int(pageNo) > 100:
        if pageNo == 'quit':
            quitit()
        print("Param is invalid , please try again.")
        # input() is the Python 3 name; raw_input() no longer exists.
        pageNo = input("Input the page number you want to scratch >")

    # Crawl the Tuchong "portrait" tag listing.
    html = getHtml("http://tuchong.com/tags/%E4%BA%BA%E5%83%8F/?page=" + str(pageNo))
    detllst = findUrl2(html)
    for detail in detllst:
        html2 = getHtml(detail)
        download(html2, pageNo)
    print("Finished.")