# python 爬蟲學習：多程序下載圖片 (Python crawler tutorial: multiprocess image download)
# Original post by 阿新, published 2019-01-23.
import requests
import urllib.request
from bs4 import BeautifulSoup
import os, re
import datetime
from multiprocessing import Pool

# Running total of gallery pages seen.  NOTE(review): each Pool worker gets
# its own copy of this global, so the parent process's value is never
# updated -- confirm whether a cross-process total is actually required
# (a multiprocessing.Value would be needed for that).
total_page = 0


class Spider:
    """Scrape model photo galleries from www.zbjuran.com and download images."""

    def __init__(self):
        # Base URL of the photo section; list/detail URLs are built from it.
        self.url = "http://www.zbjuran.com/mei"

    def get_image_urls(self):
        """Collect [gallery name, gallery URL] pairs from list pages 1-29.

        Returns:
            list[list[str]]: one ``[alt_text, absolute_detail_url]`` per
            ``.picbox`` element found across the paginated listing.
        """
        msgList = []
        for i in range(1, 30):
            url = self.url + "/qingchun/list_14_" + str(i) + ".html"
            html = requests.get(url).text
            soup = BeautifulSoup(html, 'lxml')
            # Each gallery entry on the list page sits in a ".picbox" node.
            for box in soup.find_all(class_="picbox"):
                img_name = box.find('img')['alt']
                # hrefs on the list page are site-relative; prefix the host.
                img_url = 'http://www.zbjuran.com' + box.find('a')['href']
                msgList.append([img_name, img_url])
        print(len(msgList))
        return msgList

    def createDirectory(self, filename):
        """Create (if missing) and return the download directory for *filename*."""
        path = "E:/爬蟲/圖片/" + filename
        if not os.path.exists(path):
            os.makedirs(path)
        return path

    def down_image(self, url, path):
        """Download every photo of one gallery into *path*.

        Args:
            url: ``[gallery_name, gallery_detail_url]`` pair as produced by
                :meth:`get_image_urls`.
            path: existing directory to save ``<name>_<i>.jpg`` files into.
        """
        myurl = url[1]
        html = requests.get(myurl)
        # The site serves gb2312-encoded pages; requests guesses wrong.
        html.encoding = 'gb2312'
        soup = BeautifulSoup(html.text, 'lxml')
        # The first pager entry reads like "共N頁"; strip non-digits to get N.
        # Raw string fixes the invalid '\D' escape in the original.
        page_num = int(re.sub(r'\D', '', soup.find(class_='page').li.a.text))
        global total_page
        total_page += page_num
        print("本頁面共有%s張照片" % page_num)
        for i in range(1, page_num + 1):
            # Page 1 is "x.html"; subsequent pages are "x_i.html".
            rp = '.html' if i == 1 else '_%s.html' % i
            urlSite = myurl.replace('.html', rp)
            resp = requests.get(urlSite)
            resp.encoding = 'gb2312'
            if resp.status_code != 200:
                continue
            soup = BeautifulSoup(resp.text, 'lxml')
            img = soup.find(class_='picbox').img
            if img is None:  # 'is None', not '== None'
                continue
            src = img['src']
            # Relative "uploads" paths need the site prefix; absolute URLs
            # are used as-is.  (Original duplicated the download call in
            # both branches -- consolidated, behavior unchanged.)
            if "http://www.zbjuran.com" not in src and 'uploads' in src:
                desrc = 'http://www.zbjuran.com' + src
            else:
                desrc = src
            print("正在下載%s的第%s張照片" % (url[0], i))
            urllib.request.urlretrieve(desrc, path + '/' + url[0] + '_%s.jpg' % i)


if __name__ == "__main__":
    spider = Spider()
    urls = spider.get_image_urls()
    print(urls)
    d1 = datetime.datetime.now()
    # 20 worker processes; each downloads one whole gallery.
    p = Pool(20)
    for j, item in enumerate(urls):
        # Suffix the index so galleries with duplicate names don't collide.
        path = spider.createDirectory(item[0] + str(j))
        p.apply_async(spider.down_image, args=(item, path))
    p.close()
    p.join()
    d2 = datetime.datetime.now()
    print(d2 - d1)