[python][爬蟲]批量爬取【漫畫DB】的漫畫圖片
阿新 • • 發佈:2020-08-12
今天看漫畫的時候感覺用瀏覽器實在不爽,就寫個小程式爬了下來。順便安利一個漫畫軟體MComix,超級好用(Linux限定,Win可以用ComicViewer)
"""Bulk downloader for comic page images from www.manhuadb.com.

Accepts either a single-chapter URL (.../manhua/<id>/<chapter>_p<n>.html)
or a whole-book URL (.../manhua/<id>) and saves every page image as
numbered .jpg files inside per-chapter directories.
"""
import os

import requests
from bs4 import BeautifulSoup


class Manhuadb:
    """Scraper for manhuadb.com chapter/book pages."""

    # URL prefix that identifies a manhuadb comic page (length used for slicing).
    PREFIX = "https://www.manhuadb.com/manhua/"

    def __init__(self, url):
        """Store the target URL, build browser-like headers, and start scraping.

        :param url: chapter URL or book URL on www.manhuadb.com.
        """
        self.url = url
        # Browser-like request headers.  NOTE(review): the Cookie is a captured
        # session value and will expire; refresh it if requests start failing.
        self.header = {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'PHPSESSID=tcsqlbacj644lgdi9gf3ngj3sf; Hm_lvt_b09a6e73b4faec9edd5935dc45604b5b=1597234279,1597234684; Hm_lpvt_b09a6e73b4faec9edd5935dc45604b5b=1597236589; _ga=GA1.2.1851898722.1597234280; _gid=GA1.2.615465153.1597234280; __cfduid=df113e815d70b868f8f455ea0cd34d9271597234297',
            'Host': "www.manhuadb.com",
            # Fixed: header values must be strings — requests raises
            # InvalidHeader on an int once the dict is actually sent.
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0",
        }
        self.start()

    def judge_type(self):
        """Classify self.url, setting self.type_ to 'chapter', 'book', or None."""
        if self.PREFIX in self.url:
            loc = self.url[len(self.PREFIX):]
            # A chapter URL has an extra path component ending in .html.
            if '/' in loc and ".html" in loc:
                self.type_ = 'chapter'
            else:
                self.type_ = 'book'
        else:
            self.type_ = None

    def get_page(self, url):
        """Fetch one chapter page and return the URL of its comic image.

        Side effect: records the book title (h1) and chapter name (h2) on self.
        """
        # Fixed: actually send the browser headers built in __init__.
        html = requests.get(url, headers=self.header)
        # Fixed: name the parser explicitly so bs4 does not guess (and warn).
        soup = BeautifulSoup(html.text, "html.parser")
        self.title = soup.find_all('h1')[0].text
        self.chapter = soup.find_all('h2')[0].text
        return soup.find_all("img", class_="img-fluid show-pic")[0]['src']

    def get_page_list_from_page(self, url):
        """Return the URLs of every page of the chapter that *url* belongs to."""
        html = requests.get(url, headers=self.header)
        soup = BeautifulSoup(html.text, "html.parser")
        # The site stashes the total page count in a hidden data attribute.
        total = int(soup.find_all('div', class_="d-none vg-r-data")[0]['data-total'])
        comic_id = url.split('/')[4]
        # ".../1330_13291_p1.html" -> "1330_13291_" (keeps the separator so
        # "...{}p{}..." reassembles correctly).  NOTE(review): breaks if the
        # chapter id itself ever contains a 'p' — confirm against site URLs.
        page_id = url.split('/')[5][:-5].split('p')[0]
        self.title = soup.find_all('h1')[0].text
        self.chapter = soup.find_all('h2')[0].text
        return [
            "https://www.manhuadb.com/manhua/{}/{}p{}.html".format(comic_id, page_id, n + 1)
            for n in range(total)
        ]

    def get_chapter_list_from_book(self, url):
        """Return the URL of every chapter listed on a book page."""
        html = requests.get(url, headers=self.header)
        # Fixed: parse the response body, not the Response object itself.
        soup = BeautifulSoup(html.text, "html.parser")
        chapters = soup.find_all("li", class_="sort_div")
        self.title = soup.find_all('h1')[0].text
        self.chapter = soup.find_all('h2')[0].text
        # Fixed: find_all() returns a list and cannot be indexed with 'href';
        # find() yields the single <a> element inside each <li>.
        return [
            "https://www.manhuadb.com" + each.find('a')['href']
            for each in chapters
        ]

    def get_imgs(self, src, page):
        """Download image *src* and save it as '<page>.jpg' in the cwd."""
        img = requests.get(src, headers=self.header)
        with open('{}.jpg'.format(page), 'wb') as f:
            f.write(img.content)

    def get_chapter(self, url):
        """Download every page of one chapter into a directory named after it."""
        pages = self.get_page_list_from_page(url)
        os.mkdir(self.chapter)
        cwd = os.getcwd()
        os.chdir(self.chapter)
        try:
            # Fixed: enumerate() instead of list.index() inside the loop —
            # the original was O(n^2) and wrong if two page URLs were equal.
            for number, page_url in enumerate(pages):
                self.get_imgs(self.get_page(page_url), number)
        finally:
            # Always restore the working directory, even on a failed download.
            os.chdir(cwd)

    def start(self):
        """Dispatch on URL type: one chapter, a whole book, or report bad input."""
        self.judge_type()
        if self.type_ == "chapter":
            self.get_chapter(self.url)
        elif self.type_ == "book":
            chapter_list = self.get_chapter_list_from_book(self.url)
            os.mkdir(self.title)
            cwd = os.getcwd()
            os.chdir(self.title)
            try:
                for chapter_url in chapter_list:
                    self.get_chapter(chapter_url)
            finally:
                os.chdir(cwd)
        else:
            print('wrong input')


if __name__ == "__main__":
    Manhuadb("https://www.manhuadb.com/manhua/147/1330_13291_p1.html")
效果: