python 使用selenium爬取進擊的巨人漫畫

阿新 • • 發佈：2020-09-18

  1 import requests
  2 from bs4 import BeautifulSoup
  3 import os
  4 from selenium import webdriver
  5 from selenium.webdriver.firefox.webdriver import WebDriver
  6 from selenium.webdriver.support.wait import WebDriverWait
  7 from selenium.webdriver.support import expected_conditions as EC
  8 
 from selenium.webdriver.common.by import By
  9 class Down_Cartoon():
 10     def __init__(self):
 11         self.content_url='https://www.manhuabei.com/manhua/jinjidejuren/'
 12         self.base_url='https://www.manhuabei.com'
 13         self.header={"Use-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36 
" }
 14         self.html_path=r'd:\進擊的巨人.txt'
 15         self.file_path=r'D:\OneDrive\漫畫\進擊的巨人'
 16         
 17     def get_url(self,url):
 18         '''
 19         通用url請求
 20         '''
 21         r=requests.get(url,headers=self.header)
 22         if r.status_code==200:
 23             return 
 r.text
 24         else:
 25             return ""           
 26 
 27     def parse_html(self,html_content):
 28         '''
 29         BeautifulSoup解析網頁
 30         返回每個章節名稱列表和每個章節首頁列表
 31         '''
 32         soup=BeautifulSoup(html_content,'lxml')
 33         #self.save_webxml(self.html_path,soup.prettify())
 34         main=soup.find('ul',class_="list_con_li autoHeight")
 35         content=main.find_all('a')
 36         print("總章節:",len(content))
 37         chapter_url=[]
 38         title_name=[]
 39         for p in content:
 40             title_name.append(p['title'])
 41             chapter_url.append(p['href'])
 42         return chapter_url,title_name
 43 
 44 
 45     def save_webxml(self,file_path, xml_content):
 46         '''
 47         儲存html至本地
 48         '''
 49         with open(file_path,'w',encoding='UTF-8',errors='ignore') as write_blog:
 50             write_blog.write(xml_content)
 51 
 52     def download_one_page(self,href,dir_path,num):
 53         '''
 54         下載一個圖片並儲存
 55         '''
 56         strpic=str(num+1)+'.jpg'
 57         full_path=os.path.join(dir_path,strpic)
 58         if not os.path.exists(full_path):
 59             try:
 60                 r=requests.get(href,headers=self.header)
 61                 if r.status_code==200:
 62                      with open(full_path,'wb') as img:
 63                         img.write(r.content)
 64                         print(strpic,"success")                 
 65                 else:
 66                     print(full_path,'下載失敗',href)
 67             except:
 68                 print('下載失敗',href)
 69         else:
 70             print(strpic,'圖片已存在,無需下載')  
 71 
 72     def mkdir(self,own_dir_name):
 73         '''建立資料夾'''
 74         own_dir_name=own_dir_name.strip()
 75         full_path= os.path.join(self.file_path,own_dir_name)
 76         isExists=os.path.exists(full_path)
 77         if not isExists:
 78             #print("建立",own_dir_name,"資料夾")
 79             os.makedirs(full_path)
 80             os.chdir(full_path)
 81             return full_path
 82         else:
 83             #print(own_dir_name,'資料夾已存在')
 84             return full_path
 85 
 86     def run(self):
 87         content_list,title_list= self.parse_html(self.get_url(self.content_url))
 88         brower=webdriver.Chrome()
 89         self.download_content(brower,content_list,title_list)
 90         brower.quit()                
 91 
 92     def download_content(self,browerdrive,content_list,title_list):
 93         '''
 94         下載漫畫
 95         '''
 96         cartoon_href_list=[]
 97         for i,title in enumerate(title_list):
 98             chapter_name=title.split(" ")[0]
 99             print("正在下載第%s,總共%s話"%(chapter_name,len(title_list)))
100             dir_path=self.mkdir(chapter_name)
101             full_url=self.base_url+content_list[i]
102             browerdrive.get(full_url)
103             img_url_list=[]
104             chapter_info={}
105             try:
106                 img_info= browerdrive.find_element_by_class_name("img_info")
107             except:
108                 print("爬取失敗!")
109                 continue
110             tag_string=img_info.text
111             try:
112                 init_page=browerdrive.find_element_by_css_selector("img[style='display: inline;']").get_attribute('src')
113             except:
114                 print("爬取失敗!")
115                 continue         
116             img_url_list.append(init_page)
117             num=int(tag_string.split('/')[1][0:2])
118             print("dir_path：",dir_path)
119             #print(num+1,len(os.listdir(dir_path)))
120             if num+1==len(os.listdir(dir_path)):
121                 print("第%s已下載"%(chapter_name))
122                 continue
123             self.download_one_page(init_page,dir_path,0)
124             chapter_href=self.download_chapter(browerdrive,dir_path,num)
125             img_url_list.extend(chapter_href)
126             chapter_info['href']=img_url_list
127             chapter_info['chapter_name']=chapter_name
128             cartoon_href_list.append(chapter_info)            
129         return cartoon_href_list 
130 
131     def download_chapter(self,browerdrive,dir_path,max_num):
132         '''
133         下載一章節
134         '''
135         img_url=[]
136         for x in range(0,max_num):
137             browerdrive.find_element_by_class_name("img_land_next").click()
138             wait=WebDriverWait(browerdrive,10)
139             try:
140                 wait_element=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"img[style='display: inline;']")))
141                 #href=browerdrive.find_element_by_css_selector("img[style='display: inline;']").get_attribute('src')
142                 href=wait_element.get_attribute('src')
143                 print("準備下載圖片:",x+2)
144                 self.download_one_page(href,dir_path,x+1)
145                 img_url.append(href)
146             except:
147                 print("wait失敗!")
148                 continue
149                         
150         return img_url       
151 
if __name__ == '__main__':
    # Script entry point: build the downloader and start the full run.
    Down_Cartoon().run()
155

python 使用selenium爬取進擊的巨人漫畫

1 import requests 2 from bs4 import BeautifulSoup 3 import os 4 from selenium import webdriver 5 from selenium.webdriver.firefox.webdriver import WebDriver

Python selenium爬取微博資料程式碼例項

爬取某人的微博資料，把某人所有時間段的微博資料都爬下來。具體思路：建立driver-----get網頁----找到並提取資訊-----儲存csv----翻頁----get網頁（開始迴圈）----...----沒有“下一頁”就結束，

Python selenium爬取微信公眾號文章程式碼詳解

參照資料：selenium webdriver新增cookie: https://www.jb51.net/article/193102.html 需求：想閱讀微信公眾號歷史文章，但是每次找回看的地方不方便。

python+ selenium爬取房天下新房詳情

新房詳情 from selenium import webdriver from selenium.webdriver.chrome.options import Options from time import sleep

Python selenium 爬取cnvd(國家資訊保安漏洞共享平臺)

#coding = utf-8#@author :今夕#@Time :2021.08.06 16:09#@file :mian.py#@software :PyCharmimport timefrom selenium import webdriverfrom bs4 import BeautifulSoupimport reimport pymysqlimport random#應用漏

Python selenium 爬取cnvd(國家資訊保安漏洞共享平臺)剩餘部分

# coding = utf-8# @author :今夕# @Time :2021.08.10 09:22# @file :main2.py# @software :PyCharmimport timefrom selenium import webdriverfrom bs4 import BeautifulSoupimport reimport pymysqlimport random