Python3爬蟲實戰(urllib模組)
阿新 • 發佈:2018-12-31
import urllib.request
import os
import re
import time


def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    A browser-like User-Agent header is added to get past basic
    anti-scraping checks on the target site.
    """
    req = urllib.request.Request(url)
    # Implicit string concatenation keeps the header value free of the
    # stray indentation whitespace the original backslash continuation
    # embedded inside the string literal.
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    )
    return urllib.request.urlopen(req).read()


def find_imgs(url):
    """Return the list of .jpg image URLs found in the page at *url*.

    The page is decoded as UTF-8; if the target site is GBK-encoded,
    switch the codec to 'gbk'.
    """
    html = url_open(url).decode('utf-8')
    # Capture the src attribute of every <img> tag ending in .jpg.
    pattern = r'<img src="([^"]+\.jpg)"'
    return re.findall(pattern, html)


def download_mm(folder='OOXX'):
    """Crawl the first two listing pages and save every image into *folder*.

    Images are written as 0.jpg, 1.jpg, ... in crawl order; URLs that
    appear on more than one page are downloaded only once.

    NOTE(review): reads the module-global ``url`` set in the
    ``__main__`` guard, and the 'a/more_<n>.html' paging scheme is
    site-specific — confirm it still matches the target site.
    """
    # Bug fix: os.mkdir raised FileExistsError on every re-run;
    # makedirs with exist_ok=True makes the script idempotent here.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    page_num = 1    # first page to crawl (adjust as needed)
    x = 0           # running counter used for output filenames
    img_addrs = []  # collected URLs; also used to skip duplicates

    # Only the first two pages are crawled; raise the bound to crawl more.
    while page_num <= 2:
        page_url = url + 'a/more_' + str(page_num) + '.html'
        addrs = find_imgs(page_url)
        print(len(addrs))
        for addr in addrs:
            if addr not in img_addrs:
                img_addrs.append(addr)
        print(len(img_addrs))
        for each in img_addrs:
            print(each)
        page_num += 1
        # Bug fix: time.sleep() requires a seconds argument — the
        # original bare call raised TypeError. Pause 1s between pages
        # to be polite to the server.
        time.sleep(1)

    for each in img_addrs:
        # Name the file by counter, keeping the URL's extension.
        filename = str(x) + '.' + each.split('.')[-1]
        x += 1
        with open(filename, 'wb') as f:
            f.write(url_open(each))


if __name__ == '__main__':
    url = 'http://www.meizitu.com/'
    download_mm()