Python Crawler: A Website Downloader
阿新 • Published: 2018-12-10
This is a website downloader I wrote myself, in Python. It downloads the static resources a site exposes, that is, static files of every kind: html, js, css, jpg, png, gif, mp3, mp4, pdf, doc, xls and so on (see the code for the full list). By default the downloader starts 8 worker threads and saves the downloaded files under a folder next to the running script, preserving the site's original directory layout. The downloaded copy looks exactly like the live site, so the next time you have no network connection you can still browse it.
How do you use it? Copy the code and save it as a .py file; there is an example at the very bottom of the program. Instantiate the Manager class, pass it a link (the URL of the site you want to download), call the instance's start() method, and then... wait.
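If it helps to see the driver code in isolation, here is a minimal usage sketch. It mirrors the example at the bottom of the program; the module name site_downloader is only an illustration for whatever filename you saved the code under.

    # Minimal usage sketch. 'site_downloader' is a hypothetical filename for the
    # script below; save the code under that name (or adjust the import) first.
    from site_downloader import Manager

    manager = Manager('http://www.whsw.net/')   # URL of the site to download
    manager.start()                             # returns once no new link has appeared for ~60 seconds

start() launches the eight worker threads and keeps feeding them links until the crawl goes quiet for about a minute.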
""" 網站下載器 """ __author__ = 'Stardust1001' from urllib import request, error from urllib.request import Request, urlopen, urljoin, urlretrieve, urlparse import os, shutil, re, time, threading, http from http import cookiejar from queue import Queue, Empty import logging import socket socket.setdefaulttimeout(20) import ssl try: _create_unverified_https_context = ssl._create_unverified_context except AttributeError: pass else: ssl._create_default_https_context = _create_unverified_https_context def init_opener(): cookie = cookiejar.CookieJar() cookie_support = request.HTTPCookieProcessor(cookie) return request.build_opener(cookie_support) opener = init_opener() def init_logger(): logger = logging.getLogger() logger.setLevel(logging.INFO) console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) file_handler = logging.FileHandler('log.log', mode='w', encoding='UTF-8') file_handler.setLevel(logging.NOTSET) formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') console_handler.setFormatter(formatter) file_handler.setFormatter(formatter) logger.addHandler(console_handler) logger.addHandler(file_handler) return logger logger = init_logger() class Manager: """ 爬蟲主執行緒的管理器 從子執行緒裡獲取新的連結,處理後新增進要爬取的連結 Queue 佇列 子執行緒從主執行緒提供的連結 Queue 佇列獲取連結進行爬取 """ def __init__(self, home_url): # 爬取網站域名的各個子域名 # 下載的網站的根資料夾,網站可能有不同子域名,提供一個更高階的資料夾路徑 -site home_dir = '{0}-site/{1}'.format(home_url.split('.')[1], home_url.split('/')[2]) # home_dir = '/Users/liebeu/Desktop/localhost-site/localhost' if os.path.exists(home_dir): shutil.rmtree(os.path.dirname(home_dir)) os.makedirs(home_dir) parsed_url = urlparse(home_url) scheme = parsed_url.scheme # 爬取的網站的頂級域名 top_domain = '.'.join(parsed_url.netloc.split('.')[1:]) # 每個請求最大嘗試次數 max_tries = 3 # 要爬取的連結 Queue 佇列 self.link_queue = Queue() self.link_queue.put(home_url) # 連結 set ,對新連線進行唯一性判斷,然後新增進 Queue 佇列 self.links = set([home_url]) # 子執行緒爬蟲列表 self.spiders = [] # 預設開啟 8 個子執行緒 for i in range(8): self.spiders.append(Spider(home_dir, home_url, self.link_queue, scheme, top_domain, max_tries)) def start(self): """ 開啟主執行緒的爬蟲管理器 """ for spider in self.spiders: spider.start() # 上次有新連結的時間,預設延時 60 秒,超過時間就結束程式 last_new_time = time.time() # 從子執行緒獲取新連結,新增進 Queue 佇列 while True: for spider in self.spiders: new_links = spider.get_links() if new_links: last_new_time = time.time() for link in new_links: if not link in self.links and len(link) < 250: sharp_index = link.find('#') if sharp_index > 0: link = link[0:sharp_index] self.links.add(link) self.link_queue.put(link, True) if time.time() - last_new_time >= 60: break # 響鈴提醒下載完成 for i in range(10): print('\a') time.sleep(0.5) class Spider(threading.Thread): """ 爬蟲執行緒 從主執行緒獲取連結進行爬取,並處理 html 、css 檔案獲取新連結,以及直接下載其他檔案 """ def __init__(self, home_dir, home_url, link_queue, scheme, top_domain, max_tries): threading.Thread.__init__(self) self.home_dir = home_dir self.home_url = home_url self.link_queue = link_queue self.scheme = scheme self.top_domain = top_domain self.max_tries = max_tries # 直接下載的其他檔案格式 self.other_suffixes = set([ 'js', 'jpg', 'png', 'gif', 'svg', 'json', 'xml', 'ico', 'jpeg', 'ttf', 'mp3', 'mp4', 'wav', 'doc', 'xls', 'pdf', 'docx', 'xlsx', 'eot', 'woff', 'csv', 'swf', 'tar', 'gz', 'zip', 'rar', 'txt', 'exe', 'ppt', 'pptx', 'm3u8', 'avi', 'wsf' ]) self.media_suffixes = set(['mp3', 'mp4', 'pdf', 'gz', 'tar', 'zip', 'rar', 'wav', 'm3u8', 'avi']) # 域名名稱 self.domain_names = set(['com', 'cn', 'net', 'org', 'gov', 'io']) # html 內容裡的連結匹配 self.html_pat = re.compile(r'(href|src)=(\"|\')([^\"\']*)') # css 
內容裡的連結匹配 self.css_pat = re.compile(r'url\((\"|\')([^\"\']*)') self.links = set() def run(self): logger.info('{0} start.'.format(threading.current_thread().name)) # 嘗試從主執行緒的連結佇列獲取新連結,預設延時 60 秒結束執行緒 while True: try: link = self.link_queue.get(timeout=60) self.spide(link) except Empty: break logger.info('{0} end.'.format(threading.current_thread().name)) def spide(self, link): # 爬取連結,對不同連結不同處理 try: suffix = link.split('.')[-1].lower() if suffix == 'css': self.handle_css(link) elif suffix in self.other_suffixes: self.download(link) else: self.handle_html(link) except: logger.error('[Unknown Error]\t{0}'.format(link)) def handle_html(self, link): # 處理 html 連結 html = self.get_res(link) if html is None: return html_raw_links = set([ele[2] for ele in self.html_pat.findall(html)]) html_raw_links = html_raw_links.union([ele[1] for ele in self.css_pat.findall(html)]) if html_raw_links: # 提取有效的連結 valid_links = list(filter(self.is_valid_link, html_raw_links)) # 對有效的連結進行處理 handled_links = list(map(self.handle_valid_link, valid_links)) # 把有效的連結放入執行緒的 links ,供主執行緒爬蟲管理器獲取 self.links = self.links.union([urljoin(link, t_link) for t_link in handled_links]) # 替換 html 內容裡的連結為本地網站資料夾裡的相對路徑 html = self.replace_links(html, valid_links, self.normalize_link(link)) # 儲存 html 檔案 with open(self.make_filepath(self.normalize_link(link)), 'w') as f_w: f_w.write(html) logger.info('Handled\t{0}'.format(link)) def handle_css(self, link): """ 處理 css 連結 """ text = self.get_res(link) if text is None: return css_raw_links = set([ele[1] for ele in self.css_pat.findall(text)]) if css_raw_links: css_raw_links = list(filter(self.is_valid_link, css_raw_links)) self.links = self.links.union([urljoin(link, t_link) for t_link in css_raw_links]) text = self.replace_links(text, css_raw_links, self.normalize_link(link)) with open(self.make_filepath(self.normalize_link(link)), 'w') as f_w: f_w.write(text) logger.info('Handled\t{0}'.format(link)) def is_valid_link(self, link): """ 檢測有效連結 嵌入的 data:image 圖片不作為新連結 os.path.relpath 返回值最前面多一個 . 
需要刪掉 """ if link.find('javascript:') >= 0 or link.find('@') >= 0 or link.find('data:image') >= 0: return False if link.find('http') >= 0: netloc = urlparse(link).netloc if netloc: if netloc.find(':80') > 0: netloc = netloc.replace(':80', '') return netloc[netloc.find('.') + 1:] == self.top_domain return True def handle_valid_link(self, link): """ 處理連結的錯誤 協議 寫法 http:www.baidu.com http:/www.baidu.com 轉換為 http://www.baidu.com """ if not link: return link if link[0:2] == '//': return self.scheme + link if link[0] == '/': return urljoin(self.home_url, link) if link.find('http') < 0 or link.find('http://') >= 0 or link.find('https://') >= 0: return link if link.find('http:/') >= 0 or link.find('https:/') >= 0: return link.replace(':/', '://') if link.find('http:') >= 0 or link.find('https:') >= 0: first_colon = link.find(':') link = link[0:first_colon] + '://' + link[first_colon + 1:] return link return link def get_res(self, link): """ 獲取 html 、 css 連結的響應 """ num_tries = 0 # 多次嘗試獲取 while num_tries < self.max_tries: try: res = opener.open(Request(link)).read() break except error.HTTPError: logger.error('[error.HTTPError]\t{0}'.format(link)) return None except error.URLError: logger.error('[error.URLError]\t{0}'.format(link)) return None except UnicodeEncodeError: logger.error('[UnicodeEncodeError]\t{0}'.format(link)) return None except http.client.BadStatusLine: logger.error('[http.client.BadStatusLine]\t{0}'.format(link)) return None except http.client.IncompleteRead: logger.error('[http.client.IncompleteRead]\t{0}'.format(link)) return None except TimeoutError: logger.error('[TimeoutError]\t{0}'.format(link)) num_tries += 1 except socket.timeout: logger.error('[socket.timeout]\t{0}'.format(link)) num_tries += 1 except http.client.RemoteDisconnected: logger.error('[RemoteDisconnected]\t{0}'.format(link)) num_tries += 1 except ConnectionResetError: logger.error('[ConnectionResetError]\t{0}'.format(link)) num_tries += 1 if num_tries >= self.max_tries: logger.warning('[failed get]\t{0}'.format(link)) return None # 解碼響應內容 try: text = res.decode('utf-8') return text except UnicodeDecodeError: pass try: text = res.decode('gb2312') return text except UnicodeDecodeError: pass try: text = res.decode('gbk') return text except UnicodeDecodeError: pass logger.error('[UnicodeDecodeError]\t{0}'.format(link)) return None def download(self, link): """ 直接下載其他格式的檔案 """ socket.setdefaulttimeout(20) if link.split('.')[-1].lower() in self.media_suffixes: socket.setdefaulttimeout(600) num_tries = 0 # 多次嘗試下載 while num_tries < self.max_tries: try: urlretrieve(link, self.make_filepath(link)) break except error.HTTPError: logger.error('[error.HTTPError]\t{0}'.format(link)) break except error.URLError: logger.error('[error.URLError]\t{0}'.format(link)) break except UnicodeEncodeError: logger.error('[UnicodeEncodeError]\t{0}'.format(link)) break except http.client.BadStatusLine: logger.error('[http.client.BadStatusLine]\t{0}'.format(link)) break except http.client.IncompleteRead: logger.error('[http.client.IncompleteRead]\t{0}'.format(link)) break except TimeoutError: logger.error('[TimeoutError]\t{0}'.format(link)) num_tries += 1 except socket.timeout: logger.error('[socket.timeout]\t{0}'.format(link)) num_tries += 1 except http.client.RemoteDisconnected: logger.error('[RemoteDisconnected]\t{0}'.format(link)) num_tries += 1 except ConnectionResetError: logger.error('[ConnectionResetError]\t{0}'.format(link)) num_tries += 1 if num_tries >= self.max_tries: logger.warning('[failed download]\t{0}'.format(link)) 
logger.info('Downloaded\t{0}'.format(link)) def make_filepath(self, link): """ 把連結建立為本地網站資料夾的絕對路徑 """ # 需要的話建立新資料夾 abs_filepath = self.get_abs_filepath(link) dirname = os.path.dirname(abs_filepath) if not os.path.exists(dirname): try: os.makedirs(dirname) except FileExistsError: pass except NotADirectoryError: logger.error('[NotADirectoryError]\t{0}\t{1}'.format(link, abs_filepath)) return abs_filepath def get_abs_filepath(self, link): """ 把連結轉換為本地網站資料夾的絕對路徑 """ old_link = link if link[-1] == '/': link += 'index.html' elif link.split('.')[-1] in self.domain_names: link += '/index.html' rel_url = os.path.relpath(link, self.home_url) if rel_url.find('?') >= 0: rel_url += '.html' if rel_url.split('/')[-1].find('.') < 0 or rel_url == '.': rel_url += 'index.html' abs_filepath = os.path.join(self.home_dir, rel_url) if abs_filepath.find('..') > 0: parts = abs_filepath.split('..') abs_filepath = '/'.join(parts[0].split('/')[0:-2]) + parts[1] if os.path.isdir(abs_filepath): logger.warning('[isdir]\t{0}\t{1}'.format(old_link, abs_filepath)) abs_filepath = os.path.join(abs_filepath, 'index.html') return abs_filepath def replace_links(self, content, links, cur_url): """ 替換 html 、 css 內容裡的連結 """ links.sort(key=lambda link: len(link), reverse=True) for link in set(links): link_abspath = self.get_abs_filepath(urljoin(cur_url, self.normalize_link(link))) cur_url_abspath = self.get_abs_filepath(cur_url) rel_link = os.path.relpath(link_abspath, cur_url_abspath)[1:].replace('?', '%3F') replacement = '"{0}"'.format(rel_link) content = content.replace( '"{0}"'.format(link),replacement ).replace('\'{0}\''.format(link), replacement) return content def normalize_link(self, link): if link.find('http') < 0: return link if link.find(':80') > 0: link = link.replace(':80', '') first_colon = link.find(':') link = self.scheme + link[first_colon:] return link def get_links(self): """ 主執行緒爬蟲管理器從這裡獲取爬蟲子執行緒的新連結 獲取後子執行緒就刪除舊連結,為後面獲取的連結做準備 """ export_links = self.links.copy() self.links.clear() return export_links if __name__ == '__main__': manager = Manager('http://www.whsw.net/') manager.start()
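Because replace_links rewrites every URL to a relative path inside the site folder, the saved copy can be browsed offline. If you would rather serve it over HTTP than open the files directly, one option (not part of the script itself, and assuming Python 3.7+) is the standard library's static file server; the directory name below just follows the '<name>-site/<host>' pattern that Manager builds for home_dir and is only an example:

    # Serve the downloaded copy locally and browse it at http://localhost:8000/.
    # The directory name is an example derived from Manager's home_dir pattern.
    import functools
    import http.server

    handler = functools.partial(
        http.server.SimpleHTTPRequestHandler,
        directory='whsw-site/www.whsw.net',
    )
    http.server.ThreadingHTTPServer(('127.0.0.1', 8000), handler).serve_forever()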