小爬靜態頁面圖片
阿新 • • 發佈:2017-08-15
靜態頁面 www class enter == cnblogs text lose nco
以慕課網http://www.imooc.com/首頁為例
源代碼:
#coding:utf-8 class Outputer: def __init__(self,directory): self.directory=directory def output(self,file,content): try: f=open(self.directory+file,‘wb‘) f.write(content) except Exception as err: print(err) finally: f.close() print(file+"has writed!") class SpiderMan: def __init__(self,url,directory): self.url=url self.outputer=Outputer(directory) def crawl(self): resp=request.urlopen(self.url) buff=resp.read() pattern=r‘http:.+\.jpg‘ jpgurls=re.findall(pattern,buff.decode(‘utf-8‘),flags=0) count=1 for jurl in jpgurls: try: content=request.urlopen(jurl).read() print(count) count=count+1 self.outputer.output(‘imooc_‘+str(count)+‘.jpg‘,content) except UnicodeEncodeError as err: print(err) if __name__ == ‘__main__‘: from urllib import request import re #http://www.imooc.com/ url=‘http://www.imooc.com/‘ spider=SpiderMan(url,‘./spider_imooc/‘) spider.crawl()
運行圖:
小爬靜態頁面圖片