爬蟲案例 下載某文庫付費文件 全格式
阿新 • • 發佈:2021-01-27
由於版權原因,具體網站不再明述。
說一下爬取思路及遇到的問題:
- 爬取付費文件（大部分含文字）時，實際上是使用百度爬蟲的請求頭（User-agent: Baiduspider）對其自家網站進行爬取，可以爬取成功；該請求頭可以在 robots.txt 裡找到（參考連結）。
- xpath 抓取的值有空格、換行符等問題：使用 normalize-space() 函式，如 contents = html.xpath('//div[normalize-space(@class="bd doc-reader")]/text()')
- python-pptx 實踐：新增圖片，參考連結如下：https://www.cnblogs.com/shanger/p/13098799.html
-
python-pdf 和 python-docx 實踐參考
- selenium click 無效問題：之前寫過解決辦法，連結為 https://blog.csdn.net/a12355556/article/details/108346202
注意:
- pptx和pdf我是用圖片轉換的,沒有圖片的文件可轉換不了哦
- 執行時大家記得改一下chromedriver的路徑哦
爬取結果:
程式碼
import requests,docx,os
from lxml import etree
from PIL import Image
from urllib import request
from selenium import webdriver
# Location of the ChromeDriver executable handed to webdriver.Chrome();
# adjust it to the local installation before running.
driver_path = r'D:/chromedriver/chromedriver.exe'
# Headers sent with requests.get(); the User-agent is set to "Baiduspider".
header = {'User-agent': 'Baiduspider'}
def download_wenku(url, typ):
    """Fetch a document page and save it in every requested format.

    The page is requested with the module-level ``header`` and parsed with
    lxml. The title is the ``<title>`` text minus its last five characters.
    The body text is the last text node longer than 50 characters among the
    nodes matched by the content XPath; it stays empty when none qualifies
    (for example an image-only document), so pdf/pptx export can still run.

    Args:
        url: Address of the document page. It is forwarded unchanged to
            ``save_type`` for the image-based formats.
        typ: A single format name, several names separated by commas
            (ASCII ``,`` or full-width ``，``), or ``all`` for docx, txt,
            pdf and pptx.
    """
    r = requests.get(url, headers=header)
    html = etree.HTML(r.text)
    title = html.xpath('//title/text()')[0][:-5]
    # NOTE(review): normalize-space() is applied to the boolean result of the
    # comparison, so this predicate appears to be true for every <div>;
    # `normalize-space(@class)="bd doc-reader"` was probably intended. Left
    # unchanged because the page markup cannot be checked from here.
    contents = html.xpath('//div[normalize-space(@class="bd doc-reader")]/text()')

    # Default to an empty body so documents without a long text node do not
    # raise UnboundLocalError before the image-based formats are attempted.
    content = ''
    for c in contents:
        if len(c) > 50:
            content = c

    # Each requested format is saved exactly once, and always with the
    # original page address (the parameter is no longer reused as a loop
    # variable).
    for kind in _requested_types(typ):
        save_type(kind, title, content, url)


def _requested_types(typ):
    """Expand the user's format string into an ordered, duplicate-free list."""
    names = [part.strip() for part in typ.replace('，', ',').split(',')]
    names = [name for name in names if name]
    if 'all' in names:
        return ['docx', 'txt', 'pdf', 'pptx']
    # dict.fromkeys keeps first-seen order while dropping repeats.
    return list(dict.fromkeys(names))
def save_type(typ, title, content, url):
    """Save the document in a single output format.

    Args:
        typ: ``'docx'`` (``'doc'`` is accepted as an alias), ``'txt'``,
            ``'pdf'`` or ``'pptx'``. Any other value is ignored.
        title: Base name of the output file; the extension is appended here
            or by the pdf/pptx helpers.
        content: Text written into the docx/txt output.
        url: Page address handed to ``save_pdf`` / ``save_pptx``.
    """
    if typ in ('docx', 'doc'):
        print("*"*30+"docx正在下載中"+"*"*30)
        docu = docx.Document()
        docu.add_paragraph(content)
        docu.save(title+'.docx')
        print("*"*30+"docx型別下載完成"+"*"*30)
    elif typ == 'txt':
        print("*"*30+"txt正在下載中"+"*"*30)
        # The context manager closes the file even if the write fails.
        with open(title+'.txt', 'w', encoding='utf-8') as f:
            f.write(content)
        print("*"*30+"txt型別下載完成"+"*"*30)
    elif typ == 'pdf':
        print("*"*30+"pdf正在下載中"+"*"*30)
        save_pdf(title, url)
        print("*"*30+"pdf型別下載完成"+"*"*30)
    elif typ == 'pptx':
        print("*"*30+"pptx正在下載中"+"*"*30)
        save_pptx(title, url)
        print("*"*30+"pptx型別下載完成"+"*"*30)
def save_pptx(title, url):
    """Build ``<title>.pptx`` with one slide per downloaded page image.

    Images are fetched by ``download_png`` into the ``img`` folder of the
    current working directory. Nothing is written when the download step
    reports failure or when the folder holds no numbered PNG files.

    Args:
        title: Base name of the presentation file.
        url: Address of the document page to render.
    """
    # download_png returns True when no images could be obtained.
    if download_png(url):
        return

    # Imported lazily so the other formats work without python-pptx.
    from pptx import Presentation

    img_dir = os.path.join(os.getcwd(), 'img')
    # Count the images in the img folder itself (not the working directory)
    # and order them by page number rather than by string.
    page_files = sorted(
        (name for name in os.listdir(img_dir)
         if name.lower().endswith('.png') and name[:-4].isdigit()),
        key=lambda name: int(name[:-4]),
    )
    if not page_files:
        return

    prs = Presentation()
    blank_layout = prs.slide_layouts[6]
    for name in page_files:
        slide = prs.slides.add_slide(blank_layout)
        slide.shapes.add_picture(
            image_file=os.path.join(img_dir, name), left=1, top=1)
    prs.save(title+'.pptx')
def download_png(url):
    """Render the document in Chrome and download its page images.

    The page is opened with Selenium, the "read all" element is clicked via
    JavaScript, the page is scrolled down, and every image found in the
    ``flow-ppt-mod`` container is saved as ``img/<n>.png`` (numbered from 1).

    Args:
        url: Address of the document page.

    Returns:
        ``True`` when the images could not be obtained (an error occurred or
        the page contains no images); ``None`` on success.
    """
    driver = None
    try:
        # NOTE(review): executable_path and find_element_by_xpath are the
        # APIs the original script used; confirm the installed Selenium
        # version still provides them.
        driver = webdriver.Chrome(executable_path=driver_path)
        driver.get("https://wenku.baidu.com")
        driver.get(url)
        element = driver.find_element_by_xpath('//span[@class="read-all"]')
        driver.execute_script("arguments[0].click();", element)
        js = "var q=document.documentElement.scrollTop=20000"
        driver.execute_script(js)
        html = etree.HTML(driver.page_source)

        # Collect image addresses; fall back to data-src when src is absent.
        image_urls = []
        for div in html.xpath('//div[@class="mod flow-ppt-mod"]/div/div'):
            src = div.xpath('div/img/@src') or div.xpath('div/img/@data-src')
            if src:
                image_urls.append(src[0])

        # A page without images cannot be converted; report that instead of
        # letting the callers fail on an empty folder.
        if not image_urls:
            print("該文件無圖片,不適合轉換成pptx和pdf")
            return True

        os.makedirs('img', exist_ok=True)
        # Remove numbered pages left over from a previous document so they
        # are not mixed into this one.
        for name in os.listdir('img'):
            if name.lower().endswith('.png') and name[:-4].isdigit():
                os.remove(os.path.join('img', name))

        for index, image_url in enumerate(image_urls, start=1):
            request.urlretrieve(image_url, 'img/'+str(index)+'.png')
    except Exception as e:
        print("該文件無圖片,不適合轉換成pptx和pdf")
        # Show the underlying error too; the message above is also printed
        # for driver or network failures.
        print(repr(e))
        return True
    finally:
        # driver stays None when Chrome itself failed to start.
        if driver is not None:
            driver.quit()
def save_pdf(title, url):
    """Build ``<title>.pdf`` from the downloaded page images.

    Images are fetched by ``download_png`` into the ``img`` folder of the
    current working directory and appended in page-number order. Nothing is
    written when the download step reports failure or when the folder holds
    no numbered PNG files.

    Args:
        title: Base name of the PDF file.
        url: Address of the document page to render.
    """
    # download_png returns True when no images could be obtained.
    if download_png(url):
        return

    folder = os.path.join(os.getcwd(), 'img')
    # Only files named <number>.png are pages; sort them numerically so that
    # 10.png follows 9.png.
    page_files = sorted(
        (name for name in os.listdir(folder)
         if name.lower().endswith('.png') and name[:-4].isdigit()),
        key=lambda name: int(name[:-4]),
    )
    if not page_files:
        return

    pages = []
    try:
        for name in page_files:
            with Image.open(os.path.join(folder, name)) as img:
                # Convert every page, including the first, to RGB: a mode
                # the PDF writer accepts, without the colour loss of a
                # palette conversion.
                pages.append(img.convert("RGB"))
        pages[0].save(f"./{title}.pdf", "PDF", save_all=True,
                      append_images=pages[1:])
    finally:
        for page in pages:
            page.close()
def main():
    """Ask for a document address and output format(s), then download.

    Surrounding whitespace is removed from both answers before they are
    passed to ``download_wenku``.
    """
    url = input("請輸入要下載的文章連結:").strip()
    print("*"*10+"文件都是圖片建議存為pptx,pdf,均為文字建議存為docx,txt"+"*"*10)
    # The prompt lists "docx" (not "doc") so it matches the banner above and
    # the format names used when saving.
    typ = input("請輸入要儲存的型別(可供選擇的型別為docx,txt,pdf,pptx,下載多種格式請用逗號隔開,全部下載可使用all):").strip()
    download_wenku(url, typ)
# Start the interactive downloader only when this file is run as a script.
if __name__ == "__main__":
    main()