python 爬取表情包——鬥圖啦
阿新 • • 發佈:2018-12-21
"""Download meme images from doutula.com.

The script fetches the home page, then walks the paginated article list
(``?page=N``) and saves every lazily-loaded image it finds to ``SAVE_DIR``.
"""
import os
import time

import requests
from lxml import etree

url = 'http://www.doutula.com/'
headers = {'Referer': 'http://www.doutula.com/',
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
next_link_base = 'http://www.doutula.com/article/list/?page='

# Raw string: the backslashes are literal path separators, not escapes.
SAVE_DIR = r'D:\Anaconda3\imgs'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10
# Highest listing page number the crawl will request.
MAX_PAGE = 4
# Pause between page requests, in seconds.
PAGE_DELAY = 0.2

# Markup the XPath queries below rely on (sample taken from the site):
#
#   <img class="gif" src="//static.doutula.com/img/gif.png" />
#   <img src="//static.doutula.com/img/loader_170_160.png"
#        data-original="https://ws2.sinaimg.cn/bmiddle/6af89bc8gw1f8smgrjzkug20af0afmyl.gif"
#        class="img-responsive lazy image_dta"
#        data-backup="http://img.doutula.com/production/uploads/image//2016/06/10/20160610526577_IvENsd.gif!dta">
#
# The real image URL lives in ``data-original``; ``src`` only holds a
# placeholder until the page's lazy loader runs.
#
# Example of a deep listing page URL noted by the original author:
#   http://www.doutula.com/article/list/?page=581


def download_img(src):
    """Download a single image and store it in ``SAVE_DIR``.

    The file name is the last path segment of ``src``. Failed downloads are
    reported and skipped so one bad image does not abort the crawl.

    Args:
        src: Image URL. A protocol-relative URL ("//host/path") is fetched
            over http.
    """
    if src.startswith('//'):
        # requests cannot fetch a URL without a scheme.
        src = 'http:' + src

    filename = src.split('/')[-1]
    if not filename:
        # URL ends with '/', so there is no usable file name.
        print('skip (no file name)', src)
        return

    try:
        img = requests.get(src, headers=headers, timeout=REQUEST_TIMEOUT)
        # Do not save an HTTP error page as if it were image data.
        img.raise_for_status()
    except requests.RequestException as exc:
        print('skip', src, exc)
        return

    os.makedirs(SAVE_DIR, exist_ok=True)
    with open(os.path.join(SAVE_DIR, filename), 'wb') as file:
        file.write(img.content)
    print(src, filename)


def get_page(url):
    """Download every lazily-loaded image on one page.

    Args:
        url: Address of the page to scrape.

    Returns:
        The list of ``href`` values of the page's ``<a rel="next">`` links;
        empty when the page has none (or could not be parsed).
    """
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    print(resp, url)

    html = etree.HTML(resp.text)
    if html is None:
        # lxml yields None for a body it cannot build a tree from.
        return []

    for src in html.xpath('.//img/@data-original'):
        download_img(src)

    # Return the links actually found so the caller can detect the last page.
    return html.xpath('.//a[@rel="next"]/@href')


def main():
    """Fetch the home page, then crawl listing pages 2..``MAX_PAGE``."""
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    print(resp.text)

    # Only the "next" link of the home page is needed to start the loop.
    html = etree.HTML(resp.text)
    next_link = html.xpath('.//a[@rel="next"]/@href') if html is not None else []

    current_num = 1
    while next_link and current_num < MAX_PAGE:
        time.sleep(PAGE_DELAY)
        current_num += 1
        next_link = get_page(next_link_base + str(current_num))


if __name__ == '__main__':
    main()