Python basic crawler: downloading images with a single thread and with multiple threads

Tags: python

"Distressed in heart and weighed down in thought, and only then does one rise to act."
Today's learning goal: crawling web images with a single thread and with multiple threads.
Python single-threaded version:

# Given a start URL, crawl all of the image files on that page and save them
# to an "images" subfolder next to the program.
# The single-threaded crawler below stalls the whole run whenever one image
# downloads slowly. The multi-threaded version starts one thread per image,
# so a slow download only delays its own thread, not the rest of the crawl.

# Single-threaded image crawler

import os
import urllib.request

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit


def imageSpider(start_url):
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        # Let UnicodeDammit guess the page encoding, preferring utf-8 then gbk
        dammit = UnicodeDammit(data, ['utf-8', 'gbk'])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, 'lxml')
        images = soup.select('img')
        for image in images:
            try:
                src = image['src']
                # Resolve relative src attributes against the page URL
                url = urllib.request.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    download(url)
            except Exception as e:
                print(e)
    except Exception as e:
        print(e)


def download(url):
    global count
    try:
        count = count + 1
        # Keep the extension only if the URL ends in a three-letter one, e.g. ".jpg"
        if url[len(url) - 4] == '.':
            ext = url[len(url) - 4:]
        else:
            ext = ''
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open("images\\" + str(count) + ext, 'wb')
        fobj.write(data)
        fobj.close()
        print('downloaded ' + str(count) + ext)
    except Exception as e:
        print(e)


start_url = "http://www.weather.com.cn/weather/101280601.shtml"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.7.4313.400'}
count = 0
os.makedirs("images", exist_ok=True)  # the target folder must exist before writing
imageSpider(start_url)
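One fragile spot in this crawler (and in the multi-threaded one below) is the extension check url[len(url) - 4] == '.': it only recognizes three-letter extensions and misreads URLs that carry query strings. A more robust variant, my own sketch rather than part of the original program, leans on the standard library's URL and path helpers:

# Hypothetical helper, not in the original code: derive the extension
# from the URL path instead of slicing the last four characters.
import os.path
from urllib.parse import urlparse

def extension_of(url):
    path = urlparse(url).path          # strips query strings such as "?v=2"
    return os.path.splitext(path)[1]   # '' when the path has no extension

print(extension_of("http://example.com/pics/cat.jpeg?v=2"))  # -> .jpeg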

Python multi-threaded version:

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import os
import urllib.request
import threading


def imageSpider(start_url):
    global threads
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        # Let UnicodeDammit guess the page encoding, preferring utf-8 then gbk
        dammit = UnicodeDammit(data, ['utf-8', 'gbk'])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, 'lxml')
        images = soup.select('img')
        for image in images:
            try:
                src = image['src']
                # Resolve relative src attributes against the page URL
                url = urllib.request.urljoin(start_url, src)
                if url not in urls:
                    print(url)
                    count = count + 1
                    T = threading.Thread(target=download, args=(url, count))
                    T.daemon = False  # non-daemon, so downloads can run to completion
                    T.start()
                    threads.append(T)
            except Exception as e:
                print(e)
    except Exception as e:
        print(e)


def download(url, count):
    # count is handed in by imageSpider, so no global counter is touched here
    try:
        # Keep the extension only if the URL ends in a three-letter one, e.g. ".jpg"
        if url[len(url) - 4] == '.':
            ext = url[len(url) - 4:]
        else:
            ext = ''
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open("images\\" + str(count) + ext, 'wb')
        fobj.write(data)
        fobj.close()
        print('downloaded ' + str(count) + ext)
    except Exception as e:
        print(e)



start_url = "http://www.weather.com.cn/weather/101280601.shtml"
# start_url="http://www.szlit.edu.cn"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.7.4313.400'}
count = 0
threads = []
os.makedirs("images", exist_ok=True)  # the target folder must exist before writing
imageSpider(start_url)

# Wait for every download thread to finish before reporting completion
for t in threads:
    t.join()
print('the end')
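A closing note beyond the original post: starting one raw Thread per image is fine for a small page, but a page with hundreds of img tags would spawn hundreds of threads at once. The standard library's concurrent.futures pool caps the concurrency and joins the workers automatically. A minimal sketch, assuming the download(url, count) function and headers defined above:

# Sketch: the same fan-out with a bounded thread pool instead of raw Threads.
from concurrent.futures import ThreadPoolExecutor

def pooled_spider(image_urls):
    # At most 8 downloads run at the same time; worker threads are reused.
    with ThreadPoolExecutor(max_workers=8) as pool:
        for i, url in enumerate(image_urls, start=1):
            pool.submit(download, url, i)
    # Leaving the with-block waits for every submitted download to finish.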