Python Crawler: A Website Downloader

Sharing a website downloader I wrote in Python. It downloads a site's publicly accessible static resources, i.e. static files of every kind: html, js, css, jpg, png, gif, mp3, mp4, pdf, doc, xls and so on (see the code for the full list). By default it runs 8 worker threads and saves the downloaded files under the folder the program is run from, preserving the site's original directory structure. The local copy looks exactly like the online site, so the next time you have no network connection you can still browse it.

How do you use it? Copy the code and save it as a .py file; there is an example at the very bottom of the program. Instantiate the Manager class, passing it the URL of the website you want to download, then call the instance's start() method, and then... wait.
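For reference, a minimal usage sketch. It assumes the code was saved as downloader.py; the URL is only a placeholder for whatever site you want to mirror and, as in the example at the bottom of the program, should be a full root URL such as http://www.example.com/:

from downloader import Manager  # assumes the code was saved as downloader.py

# The placeholder URL stands in for the root URL of the site you want to mirror
manager = Manager('http://www.example.com/')
# start() launches 8 spider threads and returns once no new links
# have appeared for about 60 seconds
manager.start()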

"""
Website downloader
"""
__author__ = 'Stardust1001'

from urllib import request, error
from urllib.request import Request, urlretrieve
from urllib.parse import urljoin, urlparse
import os, shutil, re, time, threading, http
from http import cookiejar
from queue import Queue, Empty
import logging

import socket

# Global default timeout of 20 seconds for all network operations
socket.setdefaulttimeout(20)

import ssl
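# Disable HTTPS certificate verification globally so that sites with self-signed
# or otherwise invalid certificates can still be fetched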
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


def init_opener():
	cookie = cookiejar.CookieJar()
	cookie_support = request.HTTPCookieProcessor(cookie)
	return request.build_opener(cookie_support)

opener = init_opener()

def init_logger():
	logger = logging.getLogger()
	logger.setLevel(logging.INFO)
	console_handler = logging.StreamHandler()
	console_handler.setLevel(logging.INFO)
	file_handler = logging.FileHandler('log.log', mode='w', encoding='UTF-8')
	file_handler.setLevel(logging.NOTSET)
	formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
	console_handler.setFormatter(formatter)
	file_handler.setFormatter(formatter)
	logger.addHandler(console_handler)
	logger.addHandler(file_handler)
	return logger

logger = init_logger()


class Manager:
	"""
	Crawler manager running in the main thread.
	Collects new links from the spider threads, filters them and adds them to the link Queue.
	The spider threads take links from that Queue and crawl them.
	"""
	def __init__(self, home_url):
		# The site may span several subdomains of the same parent domain.
		# Root folder for the downloaded site: a higher-level "<name>-site" folder holding one
		# subfolder per domain, e.g. whsw-site/www.whsw.net. The path below assumes a home_url
		# of the form http://www.example.com/.
		home_dir = '{0}-site/{1}'.format(home_url.split('.')[1], home_url.split('/')[2])
		# home_dir = '/Users/liebeu/Desktop/localhost-site/localhost'

		if os.path.exists(home_dir):
			shutil.rmtree(os.path.dirname(home_dir))
		os.makedirs(home_dir)

		parsed_url = urlparse(home_url)
		scheme = parsed_url.scheme
		# Parent domain of the crawled site (host minus its first label), e.g. whsw.net
		top_domain = '.'.join(parsed_url.netloc.split('.')[1:])
		# Maximum number of attempts per request
		max_tries = 3

		# Queue of links waiting to be crawled
		self.link_queue = Queue()
		self.link_queue.put(home_url)
		# Set of all known links, used to deduplicate new links before they are queued
		self.links = set([home_url])
		# List of spider threads
		self.spiders = []
		# Start 8 spider threads by default
		for i in range(8):
			self.spiders.append(Spider(home_dir, home_url, self.link_queue, scheme, top_domain, max_tries))

	def start(self):
		"""
		Start the crawler manager in the main thread.
		"""
		for spider in self.spiders:
			spider.start()
		# Time of the last new link; if none arrive for 60 seconds the crawl is considered done
		last_new_time = time.time()
		# Collect new links from the spider threads and add them to the Queue
		while True:
			for spider in self.spiders:
				new_links = spider.get_links()
				if new_links:
					last_new_time = time.time()
				for link in new_links:
					# Strip URL fragments ('#...') before deduplicating and queueing
					sharp_index = link.find('#')
					if sharp_index > 0:
						link = link[0:sharp_index]
					if link not in self.links and len(link) < 250:
						self.links.add(link)
						self.link_queue.put(link, True)
			if time.time() - last_new_time >= 60:
				break
		# Ring the terminal bell to signal that the download has finished
		for i in range(10):
			print('\a')
			time.sleep(0.5)


class Spider(threading.Thread):
	"""
	Spider thread.
	Takes links from the queue provided by the main thread, parses html and css files for
	new links, and downloads all other file types directly.
	"""
	def __init__(self, home_dir, home_url, link_queue, scheme, top_domain, max_tries):
		threading.Thread.__init__(self)
		self.home_dir = home_dir
		self.home_url = home_url
		self.link_queue = link_queue
		self.scheme = scheme
		self.top_domain = top_domain
		self.max_tries = max_tries
		# File extensions that are downloaded directly (no link extraction)
		self.other_suffixes = set([
			'js', 'jpg', 'png', 'gif', 'svg', 'json', 'xml', 'ico', 'jpeg', 'ttf', 'mp3', 'mp4', 'wav',
			'doc', 'xls', 'pdf', 'docx', 'xlsx', 'eot', 'woff', 'csv', 'swf', 'tar', 'gz', 'zip', 'rar', 'txt',
			'exe', 'ppt', 'pptx', 'm3u8', 'avi', 'wsf'
		])
		# Large media/archive files that get a longer download timeout (see download())
		self.media_suffixes = set(['mp3', 'mp4', 'pdf', 'gz', 'tar', 'zip', 'rar', 'wav', 'm3u8', 'avi'])
		# Common top-level domain labels
		self.domain_names = set(['com', 'cn', 'net', 'org', 'gov', 'io'])
		# Pattern for links inside html content (href/src attributes)
		self.html_pat = re.compile(r'(href|src)=(\"|\')([^\"\']*)')
		# Pattern for links inside css content (url(...))
		self.css_pat = re.compile(r'url\((\"|\')([^\"\']*)')

		# New links found by this thread, harvested by the Manager via get_links()
		self.links = set()

	def run(self):
		logger.info('{0} start.'.format(threading.current_thread().name))
		# Take links from the main thread's queue; if none arrive within 60 seconds, the thread exits
		while True:
			try:
				link = self.link_queue.get(timeout=60)
				self.spide(link)
			except Empty:
				break
		logger.info('{0} end.'.format(threading.current_thread().name))

	def spide(self, link):
		# Crawl one link, dispatching on its file extension
		try:
			suffix = link.split('.')[-1].lower()
			if suffix == 'css':
				self.handle_css(link)
			elif suffix in self.other_suffixes:
				self.download(link)
			else:
				self.handle_html(link)
		except Exception:
			logger.error('[Unknown Error]\t{0}'.format(link))

	def handle_html(self, link):
		# Handle an html link: extract and rewrite its links, then save the page
		html = self.get_res(link)
		if html is None:
			return
		html_raw_links = set([ele[2] for ele in self.html_pat.findall(html)])
		html_raw_links = html_raw_links.union([ele[1] for ele in self.css_pat.findall(html)])
		if html_raw_links:
			# Keep only the valid links
			valid_links = list(filter(self.is_valid_link, html_raw_links))
			# Normalize the valid links (fix malformed schemes, protocol-relative and root-relative paths)
			handled_links = list(map(self.handle_valid_link, valid_links))
			# Store the absolute links in self.links so the Manager can pick them up
			self.links = self.links.union([urljoin(link, t_link) for t_link in handled_links])
			# Rewrite the links inside the html content as relative paths in the local site folder
			html = self.replace_links(html, valid_links, self.normalize_link(link))
		# Save the html file
		with open(self.make_filepath(self.normalize_link(link)), 'w') as f_w:
			f_w.write(html)
		logger.info('Handled\t{0}'.format(link))

	def handle_css(self, link):
		"""
		Handle a css link: extract and rewrite its url(...) links, then save the file.
		"""
		text = self.get_res(link)
		if text is None:
			return
		css_raw_links = set([ele[1] for ele in self.css_pat.findall(text)])
		if css_raw_links:
			css_raw_links = list(filter(self.is_valid_link, css_raw_links))
			self.links = self.links.union([urljoin(link, t_link) for t_link in css_raw_links])
			text = self.replace_links(text, css_raw_links, self.normalize_link(link))
		with open(self.make_filepath(self.normalize_link(link)), 'w') as f_w:
			f_w.write(text)
		logger.info('Handled\t{0}'.format(link))

	def is_valid_link(self, link):
		"""
		Check whether a link should be followed.
		Embedded data:image URIs, javascript: links and links containing '@' are not treated as new links.
		(os.path.relpath prepends an extra '.' that has to be stripped; see replace_links.)
		"""
		if link.find('javascript:') >= 0 or link.find('@') >= 0 or link.find('data:image') >= 0:
			return False
		if link.find('http') >= 0:
			netloc = urlparse(link).netloc
			if netloc:
				if netloc.find(':80') > 0:
					netloc = netloc.replace(':80', '')
				return netloc[netloc.find('.') + 1:] == self.top_domain
		return True

	def handle_valid_link(self, link):
		"""
		Fix links written with a malformed scheme, e.g.
		http:www.baidu.com and http:/www.baidu.com become http://www.baidu.com.
		"""
		if not link:
			return link
		if link[0:2] == '//':
			return self.scheme + link
		if link[0] == '/':
			return urljoin(self.home_url, link)
		if link.find('http') < 0 or link.find('http://') >= 0 or link.find('https://') >= 0:
			return link
		if link.find('http:/') >= 0 or link.find('https:/') >= 0:
			return link.replace(':/', '://')
		if link.find('http:') >= 0 or link.find('https:') >= 0:
			first_colon = link.find(':')
			link = link[0:first_colon] + '://' + link[first_colon + 1:]
			return link
		return link

	def get_res(self, link):
		"""
		Fetch and decode the response body for an html or css link.
		"""
		num_tries = 0
		# Retry the request up to max_tries times
		while num_tries < self.max_tries:
			try:
				res = opener.open(Request(link)).read()
				break
			except error.HTTPError:
				logger.error('[error.HTTPError]\t{0}'.format(link))
				return None
			except error.URLError:
				logger.error('[error.URLError]\t{0}'.format(link))
				return None
			except UnicodeEncodeError:
				logger.error('[UnicodeEncodeError]\t{0}'.format(link))
				return None
			except http.client.BadStatusLine:
				logger.error('[http.client.BadStatusLine]\t{0}'.format(link))
				return None
			except http.client.IncompleteRead:
				logger.error('[http.client.IncompleteRead]\t{0}'.format(link))
				return None
			except TimeoutError:
				logger.error('[TimeoutError]\t{0}'.format(link))
				num_tries += 1
			except socket.timeout:
				logger.error('[socket.timeout]\t{0}'.format(link))
				num_tries += 1
			except http.client.RemoteDisconnected:
				logger.error('[RemoteDisconnected]\t{0}'.format(link))
				num_tries += 1
			except ConnectionResetError:
				logger.error('[ConnectionResetError]\t{0}'.format(link))
				num_tries += 1
		if num_tries >= self.max_tries:
			logger.warning('[failed get]\t{0}'.format(link))
			return None
		# Decode the response body, trying utf-8, then gb2312, then gbk
		for encoding in ('utf-8', 'gb2312', 'gbk'):
			try:
				return res.decode(encoding)
			except UnicodeDecodeError:
				pass
		logger.error('[UnicodeDecodeError]\t{0}'.format(link))
		return None

	def download(self, link):
		"""
		Download any other file type directly via urlretrieve.
		"""
		# Give large media/archive files a 600 s timeout instead of the default 20 s
		# (note: setdefaulttimeout changes the process-wide default, affecting all threads)
		socket.setdefaulttimeout(20)
		if link.split('.')[-1].lower() in self.media_suffixes:
			socket.setdefaulttimeout(600)
		num_tries = 0
		# Retry the download up to max_tries times
		while num_tries < self.max_tries:
			try:
				urlretrieve(link, self.make_filepath(link))
				break
			except error.HTTPError:
				logger.error('[error.HTTPError]\t{0}'.format(link))
				break
			except error.URLError:
				logger.error('[error.URLError]\t{0}'.format(link))
				break
			except UnicodeEncodeError:
				logger.error('[UnicodeEncodeError]\t{0}'.format(link))
				break
			except http.client.BadStatusLine:
				logger.error('[http.client.BadStatusLine]\t{0}'.format(link))
				break
			except http.client.IncompleteRead:
				logger.error('[http.client.IncompleteRead]\t{0}'.format(link))
				break
			except TimeoutError:
				logger.error('[TimeoutError]\t{0}'.format(link))
				num_tries += 1
			except socket.timeout:
				logger.error('[socket.timeout]\t{0}'.format(link))
				num_tries += 1
			except http.client.RemoteDisconnected:
				logger.error('[RemoteDisconnected]\t{0}'.format(link))
				num_tries += 1
			except ConnectionResetError:
				logger.error('[ConnectionResetError]\t{0}'.format(link))
				num_tries += 1
		if num_tries >= self.max_tries:
			logger.warning('[failed download]\t{0}'.format(link))
			return
		logger.info('Downloaded\t{0}'.format(link))

	def make_filepath(self, link):
		"""
		Build the local absolute file path for a link, creating parent directories as needed.
		"""
		# Create the target directory if it does not exist yet
		abs_filepath = self.get_abs_filepath(link)
		dirname = os.path.dirname(abs_filepath)
		if not os.path.exists(dirname):
			try:
				os.makedirs(dirname)
			except FileExistsError:
				pass
			except NotADirectoryError:
				logger.error('[NotADirectoryError]\t{0}\t{1}'.format(link, abs_filepath))
		return abs_filepath

	def get_abs_filepath(self, link):
		"""
		Map a link to its absolute file path inside the local site folder.
		"""
		old_link = link

		if link[-1] == '/':
			link += 'index.html'
		elif link.split('.')[-1] in self.domain_names:
			link += '/index.html'
		rel_url = os.path.relpath(link, self.home_url)
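		# Pages with a query string get an extra .html suffix when saved locally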
		if rel_url.find('?') >= 0:
			rel_url += '.html'
		if rel_url.split('/')[-1].find('.') < 0 or rel_url == '.':
			rel_url += 'index.html'
		abs_filepath = os.path.join(self.home_dir, rel_url)
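		# Resolve a 'dir/..' segment left in the path so the file lands inside the site folder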
		if abs_filepath.find('..') > 0:
			parts = abs_filepath.split('..')
			abs_filepath = '/'.join(parts[0].split('/')[0:-2]) + parts[1]
		if os.path.isdir(abs_filepath):
			logger.warning('[isdir]\t{0}\t{1}'.format(old_link, abs_filepath))
			abs_filepath = os.path.join(abs_filepath, 'index.html')
		return abs_filepath

	def replace_links(self, content, links, cur_url):
		"""
		Rewrite the links inside html / css content as local relative paths.
		"""
		# Replace longer links first so a link that is a prefix of another link is not clobbered
		links.sort(key=lambda link: len(link), reverse=True)
		for link in set(links):
			link_abspath = self.get_abs_filepath(urljoin(cur_url, self.normalize_link(link)))
			cur_url_abspath = self.get_abs_filepath(cur_url)
			# relpath against a file path starts with an extra '.', so drop the first character;
			# '?' is escaped so the result works as a local file name
			rel_link = os.path.relpath(link_abspath, cur_url_abspath)[1:].replace('?', '%3F')
			replacement = '"{0}"'.format(rel_link)
			content = content.replace('"{0}"'.format(link), replacement)
			content = content.replace('\'{0}\''.format(link), replacement)
		return content

	def normalize_link(self, link):
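		"""
		Force an absolute link onto the site's scheme and strip an explicit ':80' port.
		Relative links (containing no 'http') are returned unchanged.
		"""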
		if link.find('http') < 0:
			return link
		if link.find(':80') > 0:
			link = link.replace(':80', '')
		first_colon = link.find(':')
		link = self.scheme + link[first_colon:]
		return link

	def get_links(self):
		"""
		The Manager in the main thread collects this spider's newly found links here.
		The set is then cleared, making room for the next batch of links.
		"""
		export_links = self.links.copy()
		self.links.clear()
		return export_links


if __name__ == '__main__':
	manager = Manager('http://www.whsw.net/')
	manager.start()