Python Crawler Basics: Scraping Images with Single and Multiple Threads
阿新 • Published: 2021-02-06
Only after being troubled in mind and thwarted in thought does one rise to act.
Today's learning goal: scraping web page images with a single thread, then with multiple threads.
Python single-threaded version:
# Given a website, crawl all of the image files on it and save them into an
# "images" subfolder next to the script.
# First, a single-threaded crawler: one slow image download drags down the
# whole program.
# Then a multi-threaded crawler: if one image downloads slowly, only the
# thread fetching it is slow; the rest of the crawl is unaffected.

# Single-threaded image crawler
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.parse
import urllib.request
import os


def imageSpider(start_url):
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        # Let UnicodeDammit guess the page encoding (utf-8 or gbk)
        dammit = UnicodeDammit(data, ['utf-8', 'gbk'])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, 'lxml')
        images = soup.select('img')
        for image in images:
            try:
                src = image['src']
                # Resolve relative src attributes against the page URL
                url = urllib.parse.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    download(url)
            except Exception as e:
                print(e)
    except Exception as e:
        print(e)


def download(url):
    global count
    try:
        count = count + 1
        # Keep the original extension if the URL ends in ".xxx"
        if url[len(url) - 4] == '.':
            ext = url[len(url) - 4:]
        else:
            ext = ''
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open(os.path.join('images', str(count) + ext), 'wb')
        fobj.write(data)
        fobj.close()
        print('downloaded ' + str(count) + ext)
    except Exception as e:
        print(e)


start_url = "http://www.weather.com.cn/weather/101280601.shtml"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.7.4313.400'}
count = 0
os.makedirs('images', exist_ok=True)  # make sure the target folder exists
imageSpider(start_url)
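One caveat before moving on: the extension check above (url[len(url) - 4] == '.') only recognizes three-character extensions such as .jpg or .png, so a .jpeg URL is saved with no extension at all. A minimal sketch of a more general approach (my addition, not from the original post; the helper name guess_ext is hypothetical) using the standard library's os.path.splitext:

import os.path
import urllib.parse

def guess_ext(url):
    # Sketch (not part of the original script): drop any query string,
    # then take whatever extension the URL path carries.
    path = urllib.parse.urlparse(url).path
    return os.path.splitext(path)[1]  # returns '' when there is no extension

print(guess_ext("http://example.com/img/banner.jpeg?v=2"))  # prints: .jpeg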
Python multi-threaded version:
# Multi-threaded image crawler
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.parse
import urllib.request
import threading
import os


def imageSpider(start_url):
    global threads
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ['utf-8', 'gbk'])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, 'lxml')
        images = soup.select('img')
        for image in images:
            try:
                src = image['src']
                url = urllib.parse.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    count = count + 1
                    # Download each image in its own thread, so one slow
                    # image does not block the others
                    T = threading.Thread(target=download, args=(url, count))
                    T.daemon = False  # non-daemon, so we can join() them below
                    T.start()
                    threads.append(T)
            except Exception as e:
                print(e)
    except Exception as e:
        print(e)


def download(url, count):
    # count is passed in by the spider; incrementing a global here as well
    # would skip numbers and mismatch the printed URLs
    try:
        if url[len(url) - 4] == '.':
            ext = url[len(url) - 4:]
        else:
            ext = ''
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open(os.path.join('images', str(count) + ext), 'wb')
        fobj.write(data)
        fobj.close()
        print('downloaded ' + str(count) + ext)
    except Exception as e:
        print(e)


start_url = "http://www.weather.com.cn/weather/101280601.shtml"
# start_url="http://www.szlit.edu.cn"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.7.4313.400'}
count = 0
threads = []
os.makedirs('images', exist_ok=True)  # make sure the target folder exists
imageSpider(start_url)
# Wait for every download thread to finish before reporting completion
for t in threads:
    t.join()
print('the end')
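For comparison, the standard library's concurrent.futures.ThreadPoolExecutor can manage the thread creation and joining that the script above does by hand. This is a minimal sketch under my own assumptions (the download_all name and the max_workers value are not from the original post); it reuses the download(url, count) function defined above:

from concurrent.futures import ThreadPoolExecutor

def download_all(urls):
    # Sketch only: the pool caps concurrency at max_workers and joins all
    # worker threads automatically when the with-block exits.
    with ThreadPoolExecutor(max_workers=8) as pool:
        for i, url in enumerate(urls, start=1):
            pool.submit(download, url, i)

The with-block replaces both the explicit threads list and the final join loop, and the max_workers cap keeps the crawler from opening an unbounded number of connections to one site.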