1. 程式人生 > 實用技巧 >[python][爬蟲]批量爬取【漫畫DB】的漫畫圖片

[python][爬蟲]批量爬取【漫畫DB】的漫畫圖片

今天看漫畫的時候感覺用瀏覽器實在不爽,就寫個小程式爬了下來。順便安利一個漫畫軟體 MComix,超級好用(Linux 限定,Win 可以用 ComicViewer)。

import requests
import os
from bs4 import BeautifulSoup
class Manhuadb:
    """Scraper for www.manhuadb.com that downloads comic page images.

    Accepts either a chapter URL (".../manhua/<id>/<chapter>pN.html") or a
    book URL (".../manhua/<id>") and saves every page image as
    "<index>.jpg" inside per-chapter directories under the current
    working directory.  Scraping starts immediately on construction.
    """

    # Common prefix of every supported URL; also used to split off the
    # path remainder in judge_type().
    _BASE = "https://www.manhuadb.com/manhua/"

    def __init__(self, url):
        """Store *url*, build browser-like request headers, and start scraping."""
        self.url = url
        # NOTE(review): Cookie / PHPSESSID values are session-specific and
        # will eventually expire — refresh them if requests start failing.
        self.header = {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'PHPSESSID=tcsqlbacj644lgdi9gf3ngj3sf; Hm_lvt_b09a6e73b4faec9edd5935dc45604b5b=1597234279,1597234684; Hm_lpvt_b09a6e73b4faec9edd5935dc45604b5b=1597236589; _ga=GA1.2.1851898722.1597234280; _gid=GA1.2.615465153.1597234280; __cfduid=df113e815d70b868f8f455ea0cd34d9271597234297',
            'Host': "www.manhuadb.com",
            # Header values must be strings; the original int value would
            # make requests raise InvalidHeader once the dict is used.
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0",
        }
        self.start()

    def judge_type(self):
        """Classify self.url and store the result in self.type_.

        Sets self.type_ to 'chapter' for a page URL, 'book' for a
        book-overview URL, or None for anything off-site.
        """
        if self._BASE in self.url:
            loc = self.url[len(self._BASE):]
            # A chapter URL has a second path component ending in ".html".
            if '/' in loc and ".html" in loc:
                self.type_ = 'chapter'
            else:
                self.type_ = 'book'
        else:
            self.type_ = None

    def get_page(self, url):
        """Fetch one reader page and return the URL of its comic image.

        Side effect: refreshes self.title (h1) and self.chapter (h2).
        """
        html = requests.get(url, headers=self.header)
        soup = BeautifulSoup(html.text, 'html.parser')
        self.title = soup.find_all('h1')[0].text
        self.chapter = soup.find_all('h2')[0].text
        return soup.find_all("img", class_="img-fluid show-pic")[0]['src']

    def get_page_list_from_page(self, url):
        """Return the URLs of every page in the chapter containing *url*.

        Side effect: refreshes self.title and self.chapter.
        """
        html = requests.get(url, headers=self.header)
        soup = BeautifulSoup(html.text, 'html.parser')
        # The total page count is published in a hidden data element.
        total = int(soup.find_all('div', class_="d-none vg-r-data")[0]['data-total'])
        comic_id = url.split('/')[4]
        # ".../<chapter>p<page>.html" -> strip ".html", keep the chapter part.
        chapter_id = url.split('/')[5][:-5].split('p')[0]
        self.title = soup.find_all('h1')[0].text
        self.chapter = soup.find_all('h2')[0].text
        return [
            "https://www.manhuadb.com/manhua/{}/{}p{}.html".format(comic_id, chapter_id, page + 1)
            for page in range(total)
        ]

    def get_chapter_list_from_book(self, url):
        """Return the URL of every chapter listed on a book page.

        Side effect: refreshes self.title and self.chapter.
        """
        html = requests.get(url, headers=self.header)
        # Bug fix: the original passed the Response object itself; pass the
        # decoded text and name the parser explicitly.
        soup = BeautifulSoup(html.text, 'html.parser')
        li = soup.find_all("li", class_="sort_div")
        self.title = soup.find_all('h1')[0].text
        self.chapter = soup.find_all('h2')[0].text
        # Bug fix: find_all() returns a ResultSet, which cannot be indexed
        # by 'href'; use find() to get the single anchor element.
        return [
            "https://www.manhuadb.com" + each.find('a')['href']
            for each in li
        ]

    def get_imgs(self, src, page):
        """Download the image at *src* into '<page>.jpg' in the cwd.

        The image may live on a different host than the site pages, so the
        hard-coded Host header is deliberately NOT sent here.
        """
        img = requests.get(src)
        with open('{}.jpg'.format(page), 'wb') as f:
            f.write(img.content)

    def get_chapter(self, url):
        """Download every page of the chapter at *url* into its own directory."""
        pages = self.get_page_list_from_page(url)
        # exist_ok lets an interrupted run be restarted without crashing.
        os.makedirs(self.chapter, exist_ok=True)
        cwd = os.getcwd()
        os.chdir(self.chapter)
        try:
            for index, page_url in enumerate(pages):
                self.get_imgs(self.get_page(page_url), index)
        finally:
            # Always restore the working directory, even on a failed fetch.
            os.chdir(cwd)

    def start(self):
        """Dispatch on the URL type: one chapter, a whole book, or invalid."""
        self.judge_type()
        if self.type_ == "chapter":
            self.get_chapter(self.url)
        elif self.type_ == "book":
            chapter_list = self.get_chapter_list_from_book(self.url)
            os.makedirs(self.title, exist_ok=True)
            cwd = os.getcwd()
            os.chdir(self.title)
            try:
                for chapter_url in chapter_list:
                    self.get_chapter(chapter_url)
            finally:
                os.chdir(cwd)
        elif self.type_ is None:
            print('wrong input')

if __name__ == "__main__":
    # Example run: download a single chapter, given the URL of its first page.
    # Construction triggers the scrape immediately (network + disk I/O).
    Manhuadb("https://www.manhuadb.com/manhua/147/1330_13291_p1.html")

效果: