A Crawler
1. Workflow Analysis
1.1 Analyzing how the target address is paginated
- Page 1: https://<domain>/<category>/index.html
- Page 2: https://<domain>/<category>/index_2.html
- Page 3: https://<domain>/<category>/index_3.html
From this we can derive the target's pagination pattern: page 1 is index.html, and page n (for n >= 2) is index_n.html.
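To make the rule concrete, here is a minimal sketch of how the page URLs can be generated. The function name and the example URL are illustrative placeholders, not part of the final script; the scraper in section 2 derives the same URLs inside get_page_list:

# Minimal sketch of the pagination rule above; names and the
# example URL are placeholders for illustration.
def build_page_urls(category_url, max_page):
    """Page 1 is index.html; page n (n >= 2) is index_n.html."""
    urls = [category_url + "index.html"]
    for n in range(2, max_page + 1):
        urls.append(category_url + "index_%d.html" % n)
    return urls

# build_page_urls("https://example.com/category/", 3)
# -> [".../index.html", ".../index_2.html", ".../index_3.html"]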
1.2 Analyzing the items on a single page
Inspection shows that each item's URL is stored in an a tag inside the li elements under the page's ul; following that URL leads to the addresses of all images in that item.
The a tag's alt attribute also stores a summary description of the item, which the snippet below extracts alongside the link.
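As a quick illustration, this snippet pulls the item links and alt descriptions with BeautifulSoup. The ul class name ('update_area_lists cl') is taken from the scraper in section 2; the HTML fragment itself is a made-up stand-in for the real page:

from bs4 import BeautifulSoup

# Made-up HTML mimicking the listing structure described above.
html = """
<ul class="update_area_lists cl">
  <li><a href="/category/item1.html" alt="first item summary">...</a></li>
  <li><a href="/category/item2.html" alt="second item summary">...</a></li>
</ul>
"""
soup = BeautifulSoup(html, "html.parser")
for a in soup.find("ul", class_="update_area_lists cl").find_all("a"):
    print(a.get("href"), a.get("alt"))  # item URL and its summary description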
1.3 Entering a target item to fetch its images
Given the steps above, to obtain the image URLs we need to visit the URL of every item on every page; that yields the addresses of all the images.
Once the image addresses are in hand, we can store them and then download the images with multiple threads, as sketched below.
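The download step can be sketched with a ThreadPoolExecutor, which is also what the full script in section 2 uses. Here image_urls and save are illustrative placeholders, not part of the real script:

from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlretrieve

def save(url):
    """Save one image, using the last path segment as the file name."""
    urlretrieve(url, url.rsplit("/", 1)[-1])

# Placeholder list; in the real script these come from the item pages.
image_urls = ["https://example.com/a.jpg", "https://example.com/b.jpg"]
with ThreadPoolExecutor(max_workers=8) as pool:
    for url in image_urls:
        pool.submit(save, url)  # one download task per image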
2. Code Implementation
2.1 Code
import os
import re
import logging
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup

logging.captureWarnings(True)  # ignore all warning messages (e.g. the InsecureRequestWarning triggered by verify=False)


class fuck_photo():
    xxoo = """
    1: Asian
    2: Cartoon
    3: Cute
    4: Secret
    5: Silk"""

    def __init__(self, server, base_url, dirname, thread_num, headers):
        self.server = server
        self.base_url = base_url
        self.dirname = dirname
        self.headers = headers
        self.set_path(dirname)
        self.thread_poll = ThreadPoolExecutor(thread_num)
        # maps the menu number to the category path on the site
        self.photo_type = {
            1: "yazhousetu/",
            2: "katong/",
            3: "mengmeizi/",
            4: "zipai/",
            5: "qingchun/",
        }

    def get_page_list(self, target, start_page, max_page):
        """Build the URLs of all listing pages in the requested range."""
        if start_page <= 1:
            page_list = [target]  # page 1 is index.html itself
            print("Crawling from page 1...")
            start_page += 1
        else:
            page_list = []
        for i in range(start_page, max_page + 1):
            # page n (n >= 2) is index_n.html
            page_link = target[:-5] + "_" + str(i) + ".html"
            page_list.append(page_link)
        print('\nAll images will be saved in the [%s] folder...\n' % self.dirname)
        return page_list

    def get_list(self, target):
        """Collect the URL of every item on one listing page."""
        per_page_link_list = []
        response = requests.get(url=target, headers=self.headers, verify=False)
        content = BeautifulSoup(response.text, features="html.parser")
        the_list = content.find('ul', class_='update_area_lists cl').find_all('a')
        for i in the_list:
            # hrefs are site-relative, so prepend the server address
            per_page_link_list.append(self.server[:-1] + i.get('href'))
        return per_page_link_list

    def get_photo_link_list(self, link_target):
        """Collect all image URLs (mapped to their alt descriptions) from one item page."""
        photo_link_dick = {}
        response = requests.get(url=link_target, headers=self.headers, verify=False)
        content = BeautifulSoup(response.text, features="html.parser")
        the_list = content.find('div', class_='content_left').find_all('img')
        for i in the_list:
            photo_link_dick[i.get('src')] = i.get('alt')
        return photo_link_dick

    def set_path(self, dirname):
        """Create the download directory under the current working directory."""
        self.dirpath = os.getcwd() + os.sep + dirname
        if not os.path.isdir(self.dirpath):
            os.mkdir(self.dirpath)

    def download_photo(self, photo_links, ret):
        """Download one image, naming it after its alt description plus its file name."""
        try:
            # keep at most the last 12 characters of the remote file name
            filename = re.findall(r'/(\w*?\.jpg)', photo_links)[0][-12:]
            path = self.dirpath + os.sep + ret[photo_links] + filename
            urlretrieve(photo_links, path)
        except Exception as e:
            print("sorry, an error occurred while downloading an image: %s \n[ignored...]" % e)

    def download_thread(self, i):
        """Submit every image of one item to the thread pool."""
        ret = self.get_photo_link_list(i)
        for item in ret:
            self.thread_poll.submit(self.download_photo, item, ret)

    def start_fuck(self):
        type_num = int(input(self.xxoo + '\n\n' + "Enter an image type: "))
        while type_num not in self.photo_type:
            print("Please enter one of the listed numbers!" + '\n\n' + self.xxoo)
            type_num = int(input("Enter an image type: "))
        target = self.base_url + self.photo_type.get(type_num) + 'index.html'
        startpage = int(input("Start from which page: "))
        maxpage = int(input("Stop at which page: "))
        # build the URLs of all listing pages in the requested range
        all_page_list = self.get_page_list(target, startpage, maxpage)
        for item in all_page_list:
            try:
                content_list = self.get_list(item)
                for i in content_list:
                    self.download_thread(i)
            except Exception as e:
                print("an error occurred: %s \n[ignored]" % e)


if __name__ == '__main__':
    photo = fuck_photo(
        "https://htx5.com/",  # target host
        "https://htx5.com/",  # base URL
        "精美桌布",            # folder to save images in; change this name when crawling a different type
        64,                   # number of threads
        {                     # request headers
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0',
            'Referer': "https://htu5.com/",
        }
    )
    photo.start_fuck()