1. 程式人生 > >妹子圖爬蟲..新手的爬蟲

妹子圖爬蟲..新手的爬蟲

import re
import urllib.request


def open_url(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    A desktop-browser ``User-Agent`` header is attached to the request
    (presumably because the target site rejects the default urllib agent --
    confirm before changing it).

    Args:
        url: Address of the page to download.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
    # Use the response as a context manager so the underlying connection is
    # closed even when read() or decode() raises.
    with urllib.request.urlopen(req) as page:
        return page.read().decode('utf-8')


def get_img(html):
    """Download the ``.jpg`` images referenced by ``<img src="...">`` tags in *html*.

    The last matched address is always discarded: the original author notes
    that it is unrelated to the wanted pictures and tended to fail when
    downloaded. Every remaining image is saved under the last path component
    of its URL (a relative file name, so it lands in the current working
    directory), and each URL is printed before it is fetched.

    Args:
        html: Page source to scan for image tags.

    Returns:
        None.

    Raises:
        urllib.error.URLError: If one of the downloads fails.
    """
    # Locate the image addresses.
    pattern = r'<img src="([^"]+\.jpg)'
    # Drop the trailing match by slicing rather than pop(), so a page without
    # any matching <img> tag yields an empty list instead of an IndexError.
    imglist = re.findall(pattern, html)[:-1]

    # Download the images.
    for each in imglist:
        # The original code assumed scheme-less ("//host/path.jpg") addresses;
        # only add the scheme when the address does not already carry one.
        if not each.startswith(('http://', 'https://')):
            each = 'https:' + each
        print(each)
        filename = each.split("/")[-1]
        urllib.request.urlretrieve(each, filename, None)
    print('圖片下載完成!!!!')
def get_Ye(html):
    """Return the current page number shown in *html*.

    The number is read from the first
    ``<span class="current-comment-page">[N]`` marker in the page.

    Args:
        html: Page source to search.

    Returns:
        The page number as a string of digits (callers convert it with
        ``int``).

    Raises:
        IndexError: If *html* contains no such marker.
    """
    # Capture digits only. The previous greedy ``(.+)]`` ran on to the LAST
    # ``]`` on the same line, which returned garbage whenever the marker (or
    # any other ``]``) appeared again later on that line.
    p = r'<span class="current-comment-page">\[\s*(\d+)\s*\]'
    pages = re.findall(p, html)
    return pages[0]
if __name__ == '__main__':
    # The landing page carries the number of the newest page.
    FirstUrl = 'http://jandan.net/ooxx'
    NowYe = int(get_Ye(open_url(FirstUrl)))
    print('當前頁數為:%d'%NowYe)

    # Keep asking until the user gives a page count in the range (0, NowYe].
    while True:
        try:
            Ywant = int(input('請輸入你想下載的頁數:'))
        except ValueError:
            # Non-numeric input is treated like an out-of-range answer so the
            # user is prompted again instead of the script crashing.
            Ywant = None

        if Ywant is not None and 0 < Ywant <= NowYe:
            # Walk backwards from the newest page, one page per iteration.
            for i in range(Ywant):
                url = 'http://jandan.net/ooxx/page-'+str(NowYe-i)+'#comments'
                get_img(open_url(url))
            break

        print('請重新輸入頁數:範圍在【%d,0)中'%NowYe)