spider----校花圖的爬取
阿新 • • 發佈:2018-12-18
案例:
# Crawl the first six "xiaohua" listing pages of mm131.com and save every
# thumbnail found on them into a local directory, one file per image, named
# after the image's alt text.
import os
import re
import time
import urllib.request

LISTING_BASE_URL = 'http://www.mm131.com/xiaohua/'

# The site is sent a browser-like User-Agent and an on-site Referer with
# every request (both listing pages and image downloads).
LISTING_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'),
    'Referer': 'http://www.mm131.com/xiaohua/',
}

# Matches one listing entry and captures (img src, img alt), e.g.
#   <a target="_blank" href="http://www.mm131.com/xiaohua/634.html">
#   <img src="http://img1.mm131.me/pic/634/m634.jpg" alt="..." width="120"
#   height="160">...</a>
# Compiled once at module level instead of once per page.
LISTING_IMAGE_PATTERN = re.compile(
    r'<a target="_blank" href=".*?"><img src="(.*?)" alt="(.*?)" '
    r'width=".*?" height=".*?">.*?</a>',
    re.S,
)

# Characters that are path separators or are not allowed in file names on
# common file systems; they are replaced before the name touches the disk.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

SAVE_DIR = '校花圖'
FIRST_PAGE = 1
LAST_PAGE = 6
PAUSE_SECONDS = 2


def listing_url(page):
    """Return the URL of listing page *page* (1-based).

    Page 1 is the section root; later pages are ``list_2_<page>.html``.

    Raises:
        ValueError: if *page* is smaller than 1.
    """
    if page < 1:
        raise ValueError('page must be >= 1, got %r' % (page,))
    if page == 1:
        return LISTING_BASE_URL
    return LISTING_BASE_URL + 'list_2_{}.html'.format(page)


def extract_images(html):
    """Return a list of ``(src, alt)`` tuples for every listing entry in *html*."""
    return LISTING_IMAGE_PATTERN.findall(html)


def image_filename(alt, src):
    """Build a safe local file name from an image's alt text and URL.

    The name is ``<alt>.<text after the last dot of src>``. Both parts come
    from the remote page, so path separators and other reserved characters
    are replaced with ``_`` to keep the file inside the target directory.
    """
    extension = src.split('.')[-1]
    return UNSAFE_FILENAME_CHARS.sub('_', alt + '.' + extension)


def fetch(url):
    """Return the raw response body of *url*, closing the connection afterwards."""
    request = urllib.request.Request(url=url, headers=LISTING_HEADERS)
    with urllib.request.urlopen(request) as response:
        return response.read()


def crawl_listing_pages():
    """Download every thumbnail on listing pages FIRST_PAGE..LAST_PAGE.

    Network and decoding errors are not caught here; the first one aborts
    the run.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        print('這是第%s頁' % page)
        # The listing pages are decoded as GBK.
        content = fetch(listing_url(page)).decode('gbk')
        for src, alt in extract_images(content):
            print(src)
            filepath = os.path.join(SAVE_DIR, image_filename(alt, src))
            # Read the whole image first so a failed download never leaves
            # a half-written file behind.
            data = fetch(src)
            with open(filepath, 'wb') as fp:
                fp.write(data)
            # Pause between image downloads to throttle requests.
            time.sleep(PAUSE_SECONDS)
        # Pause between listing pages as well.
        time.sleep(PAUSE_SECONDS)


if __name__ == '__main__':
    crawl_listing_pages()
代理池案例(注意:以下程式碼並未設定代理,只是依圖集編號逐一批次下載圖片):
# Walk a fixed range of gallery ids on img1.mm131.me and save each gallery's
# numbered JPEGs under tupian/<gallery id>/<index>.jpg.
import os
import urllib.request

GALLERY_IMAGE_BASE = 'http://img1.mm131.me/pic/'
GALLERY_ROOT = 'tupian'
FIRST_GALLERY_ID = 4200
LAST_GALLERY_ID = 4460
MAX_IMAGES_PER_GALLERY = 60

# Built once instead of on every loop iteration; sent with every request.
GALLERY_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'),
    'Referer': 'https://www.sogou.com/link?url=DSOYnZeCC_o7btUgpK402wmc9YOcsOr4cOOT57O29F8',
}


def gallery_image_url(gallery_id, index):
    """Return the URL of image number *index* in gallery *gallery_id*."""
    return GALLERY_IMAGE_BASE + str(gallery_id) + '/' + str(index) + '.jpg'


def gallery_image_path(gallery_id, index):
    """Return the local path where image *index* of *gallery_id* is stored."""
    return os.path.join(GALLERY_ROOT, str(gallery_id), str(index) + '.jpg')


def download_galleries():
    """Download up to MAX_IMAGES_PER_GALLERY images for every gallery id.

    For each gallery the images are requested in index order; the first
    failure (of any kind) ends that gallery and the loop moves on to the
    next gallery id.
    """
    for gallery_id in range(FIRST_GALLERY_ID, LAST_GALLERY_ID + 1):
        # makedirs(exist_ok=True) creates the 'tupian' parent when it is
        # missing and tolerates re-runs; plain os.mkdir raised in both cases.
        os.makedirs(os.path.join(GALLERY_ROOT, str(gallery_id)), exist_ok=True)
        # NOTE(review): indices start at 0 as in the original script. If the
        # server numbers images from 1, the request for 0.jpg fails and the
        # whole gallery is skipped by the break below - confirm.
        for index in range(MAX_IMAGES_PER_GALLERY):
            url = gallery_image_url(gallery_id, index)
            print(url)
            try:
                request = urllib.request.Request(url=url, headers=GALLERY_HEADERS)
                with urllib.request.urlopen(request) as response:
                    # Read fully before opening the output file so a failed
                    # download does not leave an empty or partial file.
                    data = response.read()
                with open(gallery_image_path(gallery_id, index), 'wb') as fp:
                    fp.write(data)
            except Exception:
                # Best-effort crawl: report and give up on this gallery.
                print('下載失敗,下載下一條')
                break


if __name__ == '__main__':
    download_galleries()