1. 程式人生 > python爬蟲學習多程序下載圖片

python爬蟲學習多程序下載圖片

import requests
import urllib.request
from  bs4 import BeautifulSoup
import os, re
import datetime
from multiprocessing import Pool

# Running total of the image counts reported by the album pages handled so
# far.  ``Spider.down_image`` updates it through ``global``; when that method
# runs inside ``multiprocessing`` workers, each worker process updates its own
# copy of this module-level name.
total_page = 0


class Spider:
    """Crawler for the "qingchun" album listings on www.zbjuran.com.

    It collects album links from the listing pages, creates one folder per
    album and downloads every image of an album into that folder.
    """

    # Scheme and host that site-relative links are resolved against.
    site_root = "http://www.zbjuran.com"
    # Default folder that album directories are created under.
    save_root = "E:/爬蟲/圖片/"
    # Seconds to wait for the server before a ``requests`` call gives up;
    # without a timeout a stalled connection would block its worker forever.
    request_timeout = 30

    def __init__(self, save_root=None):
        """Set the base URL of the site section to crawl.

        Args:
            save_root: Optional folder to create album directories in.  When
                omitted, the class default ``save_root`` is used.
        """
        self.url = "http://www.zbjuran.com/mei"
        if save_root is not None:
            self.save_root = save_root

    def get_image_urls(self, first_page=1, last_page=29):
        """Collect the title and link of every album on the listing pages.

        Args:
            first_page: Number of the first listing page to read.
            last_page: Number of the last listing page to read (inclusive).

        Returns:
            A list of ``[title, album_url]`` pairs, one per album entry that
            has both an ``<img alt=...>`` and an ``<a href=...>``.
        """
        albums = []
        for page_no in range(first_page, last_page + 1):
            list_url = self.url + "/qingchun/list_14_" + str(page_no) + ".html"
            # NOTE(review): down_image forces gb2312 before reading ``.text``
            # but this listing request does not -- confirm the titles decode
            # correctly with the encoding requests picks on its own.
            html = requests.get(list_url, timeout=self.request_timeout).text
            soup = BeautifulSoup(html, 'lxml')
            for box in soup.find_all(class_="picbox"):
                img = box.find('img')
                link = box.find('a')
                if img is None or link is None:
                    # Entry without a thumbnail or link: nothing to download.
                    continue
                title = img.get('alt')
                href = link.get('href')
                if title is None or href is None:
                    continue
                albums.append([title, self._absolute_url(href)])
        print(len(albums))
        return albums

    def createDirectory(self, filename):
        """Create the folder for one album (if needed) and return its path.

        Args:
            filename: Folder name, placed under ``self.save_root``.

        Returns:
            The path of the album folder.
        """
        path = os.path.join(self.save_root, filename)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path, exist_ok=True)
        return path

    def down_image(self, url, path):
        """Download every image of one album into ``path``.

        Args:
            url: A ``[title, album_url]`` pair as produced by
                ``get_image_urls``.
            path: Existing folder the images are written to.

        Returns:
            The number of images that were saved.
        """
        global total_page

        name, album_url = url[0], url[1]
        response = requests.get(album_url, timeout=self.request_timeout)
        # The album pages are decoded as gb2312 before parsing.
        response.encoding = 'gb2312'
        soup = BeautifulSoup(response.text, 'lxml')

        # The first link inside the pager holds the number of images; walk
        # down to it step by step so a page without a pager yields 0 instead
        # of raising AttributeError.
        pager = soup.find(class_='page')
        first_item = pager.li if pager is not None else None
        first_link = first_item.a if first_item is not None else None
        label = first_link.text if first_link is not None else ''
        page_count = self._count_from_text(label)

        total_page += page_count
        print("本頁面共有%s張照片" % page_count)

        saved = 0
        for index in range(1, page_count + 1):
            page = requests.get(self._page_url(album_url, index),
                                timeout=self.request_timeout)
            page.encoding = 'gb2312'
            if page.status_code != 200:
                continue
            box = BeautifulSoup(page.text, 'lxml').find(class_='picbox')
            img = box.img if box is not None else None
            src = img.get('src') if img is not None else None
            if not src:
                # No picture on this page; move on to the next one.
                continue
            print("正在下載%s的第%s張照片" % (name, index))
            urllib.request.urlretrieve(self._absolute_url(src),
                                       path + '/' + name + '_%s.jpg' % index)
            saved += 1
        return saved

    def _absolute_url(self, link):
        """Return ``link`` as an absolute URL, resolved against the site root."""
        from urllib.parse import urljoin

        # urljoin leaves absolute URLs (including other hosts) untouched and
        # prefixes the site root to relative ones.
        return urljoin(self.site_root + "/", link)

    @staticmethod
    def _page_url(album_url, index):
        """Return the URL of the ``index``-th page (1-based) of an album."""
        if index == 1:
            return album_url
        return album_url.replace('.html', '_%s.html' % index)

    @staticmethod
    def _count_from_text(text):
        """Return the digits contained in ``text`` as an int, or 0 if none."""
        digits = re.sub(r'\D', '', text)
        return int(digits) if digits else 0


if __name__ == "__main__":
    # Script entry point: list every album, then download the albums in
    # parallel with a pool of 20 worker processes and print the elapsed time.
    spider = Spider()
    urls = spider.get_image_urls()
    print(urls)
    started = datetime.datetime.now()
    pool = Pool(20)

    # Keep each AsyncResult next to its album: an exception raised inside a
    # worker only becomes visible when the result is fetched with get().
    pending = []
    for index, album in enumerate(urls):
        # The running index keeps folders of identically titled albums apart.
        path = spider.createDirectory(album[0] + str(index))
        pending.append(
            (album, pool.apply_async(spider.down_image, args=(album, path)))
        )
    pool.close()
    pool.join()
    finished = datetime.datetime.now()

    # Report albums whose worker failed instead of dropping the error; one
    # bad album must not stop the report for the others.
    for album, result in pending:
        try:
            result.get()
        except Exception as exc:
            print("download failed for %s (%s): %r" % (album[0], album[1], exc))
    print(finished - started)