妹子圖爬蟲..新手的爬蟲
阿新 • • 發佈:2018-10-31
import re
import urllib.request

# Browser-like User-Agent sent with every page request.
_USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')

# Captures the value of an <img src="..."> attribute that ends in ".jpg".
_IMG_SRC_RE = re.compile(r'<img src="([^"]+\.jpg)')

# Captures the digits inside the "[N]" current-page marker. Digits only, so
# several markers on one line of HTML cannot be merged into a single match.
_CURRENT_PAGE_RE = re.compile(r'<span class="current-comment-page">\[(\d+)\]')


def open_url(url: str) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    The request carries a browser-like ``User-Agent`` header. The response
    object is closed as soon as the body has been read.
    """
    req = urllib.request.Request(url, headers={'User-Agent': _USER_AGENT})
    with urllib.request.urlopen(req) as page:
        return page.read().decode('utf-8')


def get_img(html: str) -> None:
    """Download every ``.jpg`` image referenced by ``<img src=...>`` in *html*.

    Each file is saved in the current working directory under the last path
    component of its URL. The final match on the page is always skipped: the
    original author noted that it is unrelated to the pictures and that its
    address repeatedly failed to download.
    """
    # Slicing (rather than pop()) also copes with a page that has no matches.
    sources = _IMG_SRC_RE.findall(html)[:-1]

    for src in sources:
        # Only prefix a scheme when the src does not already carry one
        # (e.g. protocol-relative "//host/a.jpg").
        if src.startswith(('http://', 'https://')):
            img_url = src
        else:
            img_url = 'https:' + src
        print(img_url)
        filename = img_url.split('/')[-1]
        urllib.request.urlretrieve(img_url, filename)
    print('圖片下載完成!!!!')


def get_Ye(html: str) -> str:
    """Return the current page number found in *html*, as a string of digits.

    Raises:
        IndexError: if *html* contains no current-page marker.
    """
    match = _CURRENT_PAGE_RE.search(html)
    if match is None:
        raise IndexError('current-comment-page marker not found in html')
    return match.group(1)


def main() -> None:
    """Ask how many pages to download, then fetch them newest-first."""
    first_url = 'http://jandan.net/ooxx'
    newest_page = int(get_Ye(open_url(first_url)))
    print('當前頁數為:%d' % newest_page)

    while True:
        try:
            wanted = int(input('請輸入你想下載的頁數:'))
        except ValueError:
            # Non-numeric input is treated as out of range so we re-prompt.
            wanted = 0

        if 0 < wanted <= newest_page:
            # Walk backwards from the newest page.
            for offset in range(wanted):
                url = ('http://jandan.net/ooxx/page-'
                       + str(newest_page - offset) + '#comments')
                get_img(open_url(url))
            break

        print('請重新輸入頁數:範圍在【%d,0)中' % newest_page)


if __name__ == '__main__':
    main()