# 程式人生 > Python 爬取表情包——鬥圖啦
#
# Python scraper for meme images ("表情包") from doutula.com (鬥圖啦).

#import urllib
import requests
import time
from lxml import etree
# Site root: fetched once below and also used as the Referer value.
url = 'http://www.doutula.com/'

# Headers shared by the requests in this script: a Referer pointing at the
# site itself and a desktop-browser User-Agent string.
headers = {
    'Referer': 'http://www.doutula.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
}

# Fetch the home page and dump its raw HTML to stdout for inspection.
resp = requests.get(url, headers=headers)
print(resp.text)
'''
<img class="gif" style="min-height: inherit;left: 5px;top:5px" src="//static.doutula.com/img/gif.png" />
<img src="//static.doutula.com/img/loader_170_160.png" 
style="margin: 0 auto; min-height: inherit;"
 data-original="https://ws2.sinaimg.cn/bmiddle/6af89bc8gw1f8smgrjzkug20af0afmyl.gif"
 alt="總愛在我的生活裡指手畫腳,俗稱經驗婊和過來人婊" class="img-responsive lazy image_dta"
 data-backup="http://img.doutula.com/production/uploads/image//2016/06/10/20160610526577_IvENsd.gif!dta">
 '''

# Start parsing (earlier inline version, superseded by download_img()/get_page() below)

#html=etree.HTML(resp.text)
#srcs=html.xpath('.//img/@data-original')
#for src in srcs:
#    filename=src.split('/')[-1]
#    img=requests.get(src,headers=headers)
#    
#    with open('D:\Anaconda3\imgs/'+filename,'wb') as file:
#        file.write(img.content)  
#    print(src,filename)
#    
#print(len(src))



def download_img(src, save_dir=r'D:\Anaconda3\imgs/'):
    """Download one image and save it under ``save_dir``.

    The file name is the last ``/``-separated segment of ``src``. The
    response body is written in binary mode, then ``src`` and the file name
    are printed.

    Args:
        src: URL of the image to download.
        save_dir: Directory the image is written to. The default is the same
            path the script always used; it is now a raw string so the
            backslashes are no longer parsed as (invalid) escape sequences.
    """
    # Local import: the file's top-level import block does not import os.
    import os

    filename = src.split('/')[-1]
    img = requests.get(src, headers=headers)

    # Create the target directory on first use instead of failing in open().
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # For the default save_dir (which ends in '/') this yields exactly the
    # same path as the original string concatenation.
    with open(os.path.join(save_dir, filename), 'wb') as file:
        file.write(img.content)
    print(src, filename)



def get_page(url):
    """Download every lazy-loaded image on one page and find the next page.

    Fetches ``url``, prints the response object and the URL, passes each
    ``<img data-original="...">`` value to ``download_img``, and then looks
    for ``<a rel="next">`` links.

    Args:
        url: Address of the page to fetch.

    Returns:
        The list of ``href`` values of the ``<a rel="next">`` elements on the
        page. It is empty when the page has no such link, so callers can use
        its truthiness to decide whether to keep paging.
    """
    resp = requests.get(url, headers=headers)
    print(resp, url)
    html = etree.HTML(resp.text)

    for src in html.xpath('.//img/@data-original'):
        download_img(src)

    # Return the extracted links themselves. The previous code returned the
    # literal list ['next_link'], which is always truthy, so the caller could
    # never detect the last page.
    next_link = html.xpath('.//a[@rel="next"]/@href')
    return next_link


# Listing pages are addressed by appending the page number to this prefix.
next_link_base = 'http://www.doutula.com/article/list/?page='

# Seed the loop by processing the home page. The previous code evaluated
# html.xpath(...) here, but no module-level `html` exists, so the script
# stopped with a NameError before crawling anything.
next_link = get_page(url)
current_num = 1

# Walk pages 2, 3, 4 while the page just processed reports a next page.
while next_link:
    time.sleep(0.2)  # short pause between page requests
    current_num += 1
    next_link = get_page(next_link_base + str(current_num))
    if current_num >= 4:  # hard stop after page 4
        break
        
        

'''
http://www.doutula.com/article/list/?page=581
'''