1. 程式人生 > 實用技巧 >python 使用selenium爬取進擊的巨人漫畫

Python 使用 Selenium 爬取進擊的巨人漫畫

import os
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
class Down_Cartoon():
    """Download every chapter of a comic from manhuabei.com.

    The chapter index is fetched with ``requests`` and parsed with
    BeautifulSoup. The individual page images are located by driving a real
    browser through Selenium, because the reader page swaps the visible
    ``<img>`` element with JavaScript.
    """

    # Seconds to wait for any single HTTP request before giving up; without a
    # timeout ``requests`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Seconds to wait for the next page image after clicking "next".
    PAGE_WAIT_SECONDS = 10

    # CSS selector for the image the reader page currently displays.
    _VISIBLE_IMG = "img[style='display: inline;']"

    def __init__(self):
        self.content_url = 'https://www.manhuabei.com/manhua/jinjidejuren/'
        self.base_url = 'https://www.manhuabei.com'
        # The header name must be "User-Agent"; a misspelled key is sent as an
        # unknown header and the default python-requests agent is used.
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/79.0.3945.88 Safari/537.36"
        }
        self.html_path = r'd:\進擊的巨人.txt'
        self.file_path = r'D:\OneDrive\漫畫\進擊的巨人'

    def get_url(self, url):
        """Fetch *url* and return the response text, or "" on a non-200 status."""
        r = requests.get(url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
        if r.status_code == 200:
            return r.text
        return ""

    def parse_html(self, html_content):
        """Parse the chapter index page.

        Returns a ``(chapter_url, title_name)`` pair of parallel lists: the
        relative URL of each chapter's first page and the chapter title.
        Both lists are empty when the chapter list is not found (for example
        when *html_content* is the empty string returned by a failed fetch).
        """
        soup = BeautifulSoup(html_content, 'lxml')
        main = soup.find('ul', class_="list_con_li autoHeight")
        if main is None:
            print("總章節:", 0)
            return [], []
        content = main.find_all('a')
        print("總章節:", len(content))
        chapter_url = [p['href'] for p in content]
        title_name = [p['title'] for p in content]
        return chapter_url, title_name

    def save_webxml(self, file_path, xml_content):
        """Write *xml_content* to *file_path* as UTF-8 (debugging helper)."""
        with open(file_path, 'w', encoding='UTF-8', errors='ignore') as write_blog:
            write_blog.write(xml_content)

    def download_one_page(self, href, dir_path, num):
        """Download the image at *href* into *dir_path* as ``<num+1>.jpg``.

        An already existing file is left untouched. Network and file-system
        errors are reported on stdout and do not propagate, so one bad page
        does not abort the whole chapter.
        """
        strpic = str(num + 1) + '.jpg'
        full_path = os.path.join(dir_path, strpic)
        if os.path.exists(full_path):
            print(strpic, '圖片已存在,無需下載')
            return
        try:
            r = requests.get(href, headers=self.header,
                             timeout=self.REQUEST_TIMEOUT)
            if r.status_code == 200:
                with open(full_path, 'wb') as img:
                    img.write(r.content)
                print(strpic, "success")
            else:
                print(full_path, '下載失敗', href)
        except (requests.RequestException, OSError):
            print('下載失敗', href)

    def mkdir(self, own_dir_name):
        """Ensure the chapter directory exists and return its full path.

        The directory is created below ``self.file_path``. The process working
        directory is deliberately left alone: every path used by this class is
        built with ``os.path.join``, and changing the cwd would corrupt a
        relative ``file_path`` on the next call.
        """
        full_path = os.path.join(self.file_path, own_dir_name.strip())
        os.makedirs(full_path, exist_ok=True)
        return full_path

    def run(self):
        """Fetch the chapter index and download every chapter."""
        content_list, title_list = self.parse_html(self.get_url(self.content_url))
        brower = webdriver.Chrome()
        try:
            self.download_content(brower, content_list, title_list)
        finally:
            # Always shut the browser down, even if a download raised.
            brower.quit()

    @staticmethod
    def _parse_total_pages(tag_string):
        """Return the page total from an indicator such as "(1/45)".

        The number following the first "/" is returned as an int, whatever its
        width. ``None`` is returned when no such number is present.
        """
        match = re.search(r'/\s*(\d+)', tag_string or "")
        return int(match.group(1)) if match else None

    def download_content(self, browerdrive, content_list, title_list):
        """Download all chapters using the Selenium driver *browerdrive*.

        *content_list* holds the relative chapter URLs and *title_list* the
        matching titles. Returns a list of dicts, one per chapter downloaded
        in this call, with keys ``'href'`` (list of image URLs) and
        ``'chapter_name'``. Chapters that fail to load or that are already
        complete on disk are skipped.
        """
        cartoon_href_list = []
        total_chapters = len(title_list)
        for chapter_path, title in zip(content_list, title_list):
            chapter_name = title.split(" ")[0]
            print("正在下載第%s,總共%s話" % (chapter_name, total_chapters))
            dir_path = self.mkdir(chapter_name)
            browerdrive.get(self.base_url + chapter_path)
            try:
                tag_string = browerdrive.find_element(
                    By.CLASS_NAME, "img_info").text
                init_page = browerdrive.find_element(
                    By.CSS_SELECTOR, self._VISIBLE_IMG).get_attribute('src')
            except WebDriverException:
                print("爬取失敗!")
                continue
            num = self._parse_total_pages(tag_string)
            if num is None:
                print("爬取失敗!")
                continue
            print("dir_path:", dir_path)
            # A finished chapter holds the first page plus one file per
            # "next" click, i.e. num + 1 files.
            # NOTE(review): this mirrors the original completeness check;
            # confirm against the site that the indicator total really is one
            # less than the number of images.
            if num + 1 == len(os.listdir(dir_path)):
                print("第%s已下載" % (chapter_name))
                continue
            self.download_one_page(init_page, dir_path, 0)
            chapter_href = self.download_chapter(browerdrive, dir_path, num)
            cartoon_href_list.append({
                'href': [init_page] + chapter_href,
                'chapter_name': chapter_name,
            })
        return cartoon_href_list

    def download_chapter(self, browerdrive, dir_path, max_num):
        """Click through the current chapter and download each following page.

        "Next" is clicked *max_num* times; after each click the visible image
        is saved into *dir_path*. Returns the list of image URLs that were
        found. A page whose image does not appear in time is reported and
        skipped; a failing click propagates to the caller.
        """
        img_url = []
        wait = WebDriverWait(browerdrive, self.PAGE_WAIT_SECONDS)
        locator = (By.CSS_SELECTOR, self._VISIBLE_IMG)
        for x in range(max_num):
            browerdrive.find_element(By.CLASS_NAME, "img_land_next").click()
            try:
                wait_element = wait.until(EC.element_to_be_clickable(locator))
                href = wait_element.get_attribute('src')
            except WebDriverException:
                print("wait失敗!")
                continue
            print("準備下載圖片:", x + 2)
            self.download_one_page(href, dir_path, x + 1)
            img_url.append(href)
        return img_url


if __name__ == '__main__':
    down_load = Down_Cartoon()
    down_load.run()