1. 程式人生 > 其它 >爬蟲案例 下載某文庫付費文件 全格式

爬蟲案例 下載某文庫付費文件 全格式

技術標籤:爬蟲、爬蟲案例、python、xpath

由於版權原因,具體網站不再明述。

說一下爬取思路及遇到的問題

  • 爬取付費文件(大部分含文字)實際使用百度的請求頭對自家進行爬取,可以爬取成功,請求頭可以在robots.txt裡找到 參考連結

  • xpath抓取的值有空格換行符等問題:使用normalize-space()函式,如contents = html.xpath('//div[normalize-space(@class="bd doc-reader")]/text()')

  • python-pptx 實踐 :新增圖片 參考連結如下:https://www.cnblogs.com/shanger/p/13098799.html

  • python-pdf 和 python-docx 實踐參考

  • selenium click無效問題 之前寫過解決辦法,連結為https://blog.csdn.net/a12355556/article/details/108346202

注意

  • pptx和pdf我是用圖片轉換的,沒有圖片的文件可轉換不了哦
  • 執行時大家記得改一下chromedriver的路徑哦

爬取結果:
在這裡插入圖片描述在這裡插入圖片描述

程式碼

import requests,docx,os
from lxml import etree
from PIL import Image
from urllib import request
from selenium import webdriver

# Location of the chromedriver binary; change this to match the local machine.
driver_path = r'D:/chromedriver/chromedriver.exe'
# Request header sent with the initial page fetch.
header = {'User-agent': 'Baiduspider'}

# Formats that "all" expands to, in the order they are produced.
_ALL_TYPES = ('docx', 'txt', 'pdf', 'pptx')
# Folder (relative to the working directory) that holds the page images.
_IMG_DIR = 'img'


def _parse_types(typ):
    """Turn the user's format string into an ordered, de-duplicated list.

    Accepts ASCII or full-width commas as separators, expands ``all`` to
    every supported format and treats ``doc`` as ``docx`` (the prompt in
    ``main`` advertises ``doc``).
    """
    wanted = []
    for raw in typ.replace('\uff0c', ',').split(','):
        name = raw.strip().lower()
        if name == 'doc':
            name = 'docx'
        for item in (_ALL_TYPES if name == 'all' else (name,)):
            if item and item not in wanted:
                wanted.append(item)
    return wanted


def _image_urls(html):
    """Collect the page-image addresses from a parsed document page."""
    found = []
    for div in html.xpath('//div[@class="mod flow-ppt-mod"]/div/div'):
        # Prefer @src and fall back to @data-src when @src is absent.
        src = div.xpath('div/img/@src') or div.xpath('div/img/@data-src')
        if src:
            found.append(src[0])
    return found


def _numbered_pngs(folder):
    """Return paths of ``<number>.png`` files in *folder* in numeric order.

    Files whose stem is not a plain number are ignored instead of raising.
    """
    numbered = []
    for name in os.listdir(folder):
        stem, ext = os.path.splitext(name)
        if ext.lower() == '.png' and stem.isdecimal():
            numbered.append((int(stem), name))
    return [os.path.join(folder, name) for _, name in sorted(numbered)]


def _new_driver():
    """Start Chrome using the chromedriver at ``driver_path``."""
    try:
        return webdriver.Chrome(executable_path=driver_path)
    except TypeError:
        # Newer Selenium 4 releases no longer accept ``executable_path`` and
        # expect a Service object instead.
        from selenium.webdriver.chrome.service import Service
        return webdriver.Chrome(service=Service(driver_path))


def download_wenku(url, typ):
    """Fetch a document page and save it in every requested format.

    Args:
        url: Address of the document page.
        typ: One format name, several names separated by commas (ASCII or
            full-width), or ``all``. ``doc`` is accepted as ``docx``.

    Each requested format is written exactly once.
    """
    r = requests.get(url, headers=header, timeout=30)
    html = etree.HTML(r.text)
    titles = html.xpath('//title/text()')
    # The last five characters of the page title are dropped, as before.
    title = (titles[0][:-5] if titles else '') or 'wenku'
    # NOTE(review): normalize-space() wraps the boolean comparison here, so the
    # predicate evaluates to the string "true"/"false" and holds for every
    # div. The expression is left unchanged because the length filter below
    # depends on what it currently returns -- TODO confirm against the live
    # page before tightening it to normalize-space(@class)="bd doc-reader".
    contents = html.xpath('//div[normalize-space(@class="bd doc-reader")]/text()')
    # Keep the last text node longer than 50 characters; empty when none is.
    content = ''
    for c in contents:
        if len(c) > 50:
            content = c
    for fmt in _parse_types(typ):
        save_type(fmt, title, content, url)


def save_type(typ, title, content, url):
    """Write the document in one format.

    Args:
        typ: ``docx``, ``txt``, ``pdf`` or ``pptx``; anything else is ignored.
        title: Base name of the output file (extension is appended).
        content: Text written to the docx/txt output.
        url: Document page address, used for the image-based pdf/pptx output.
    """
    if typ not in _ALL_TYPES:
        return
    print("*"*30 + typ + "正在下載中" + "*"*30)
    if typ == 'docx':
        docu = docx.Document()
        docu.add_paragraph(content)
        docu.save(title + '.docx')
    elif typ == 'txt':
        # Context manager so the file is flushed and closed deterministically.
        with open(title + '.txt', 'w', encoding='utf-8') as f:
            f.write(content)
    elif typ == 'pdf':
        save_pdf(title, url)
    else:
        save_pptx(title, url)
    print("*"*30 + typ + "型別下載完成" + "*"*30)


def save_pptx(title, url):
    """Download the page images and place each one on its own slide.

    Does nothing when ``download_png`` reports that no images are available.
    """
    if download_png(url):
        return
    # Imported here so python-pptx is only needed when pptx output is requested.
    from pptx import Presentation
    prs = Presentation()
    # Layout 6 is the one the original code uses as the blank slide layout.
    blank_layout = prs.slide_layouts[6]
    for img_name in _numbered_pngs(_IMG_DIR):
        slide = prs.slides.add_slide(blank_layout)
        slide.shapes.add_picture(image_file=img_name, left=1, top=1)
    prs.save(title + '.pptx')


def download_png(url):
    """Render the document in Chrome and download its page images.

    Images are stored as ``img/1.png``, ``img/2.png``, ... in page order.
    Numbered images left over from an earlier run are removed first so they
    cannot leak into the current document.

    Returns:
        True when the images could not be obtained (including when the page
        exposes none); None on success.

    Raises:
        Whatever Selenium raises when Chrome itself cannot be started, so a
        wrong ``driver_path`` is reported as such.
    """
    driver = _new_driver()
    try:
        driver.get("https://wenku.baidu.com")
        driver.get(url)
        # find_element_by_xpath was removed in recent Selenium 4 releases;
        # find_element with the "xpath" strategy works on Selenium 3 and 4.
        element = driver.find_element("xpath", '//span[@class="read-all"]')
        # Click through JavaScript rather than element.click().
        driver.execute_script("arguments[0].click();", element)
        driver.execute_script("var q=document.documentElement.scrollTop=20000")
        image_urls = _image_urls(etree.HTML(driver.page_source))
        if not image_urls:
            raise ValueError("no page images found")
        os.makedirs(_IMG_DIR, exist_ok=True)
        for stale in _numbered_pngs(_IMG_DIR):
            os.remove(stale)
        for index, image_url in enumerate(image_urls, start=1):
            request.urlretrieve(image_url, os.path.join(_IMG_DIR, f'{index}.png'))
    except Exception as exc:
        # Best effort: report and let the caller skip the image-based format.
        print("該文件無圖片,不適合轉換成pptx和pdf")
        print(repr(exc))
        return True
    finally:
        driver.quit()
    return None


def save_pdf(title, url):
    """Download the page images and combine them into ``<title>.pdf``.

    Does nothing when ``download_png`` reports that no images are available.
    """
    if download_png(url):
        return
    pages = []
    for path in _numbered_pngs(_IMG_DIR):
        with Image.open(path) as img:
            # Every page is converted to RGB: the PDF writer rejects RGBA,
            # and a uniform mode keeps all pages at the same colour quality.
            pages.append(img.convert("RGB"))
    if not pages:
        return
    pages[0].save(f"./{title}.pdf", "PDF", save_all=True, append_images=pages[1:])


def main():
    """Ask for the document address and output formats, then download."""
    url = input("請輸入要下載的文章連結:").strip()
    print("*"*10+"文件都是圖片建議存為pptx,pdf,均為文字建議存為docx,txt"+"*"*10)
    typ = input("請輸入要儲存的型別(可供選擇的型別為doc,txt,pdf,pptx,下載多種格式請用逗號隔開,全部下載可使用all):")
    download_wenku(url, typ)


if __name__ == '__main__':
    main()