PDF File Batch Download Crawler
阿新 • Published: 2019-01-25
The reference article is here:
https://segmentfault.com/a/1190000010823538
This crawler downloads the technical reports from the smart car official website.
# _*_ coding:utf-8_*_
# author :nsy12
# date   :2018/2/25
# time   :11:20

import requests
import re, os
from bs4 import BeautifulSoup
import time
import random

FILE_DIR = r'E:\1smart Car\paper'  # directory to save the files in

url_datas = [
    'https://.cn',
    'https://.cn',
    'https://.cn',
    'https://.cn',
    'https://.cn',
    'https://.cn',
    'https://.cn'
]


# show the name of the document being downloaded
def showPdf(pdf_name):
    print(pdf_name + '...')


# save one document
def savePdf(url, pdf_name):
    response = requests.get(url, data=None, stream=True)
    if not os.path.exists(FILE_DIR):
        os.makedirs(FILE_DIR)
    with open(os.path.join(FILE_DIR, pdf_name), "wb") as pdf_file:
        for content in response.iter_content():
            pdf_file.write(content)


def downOne(url, pdf_name):
    # showPdf(pdf_name)
    savePdf(url, pdf_name)
    print(pdf_name + " has been downloaded!")


def get_urls(url):
    print("Please wait a moment ...")
    html = requests.get(url, data=None)
    # html.encoding = 'utf-8'  # set the page encoding (check the page source)
    # print(html.encoding)
    # print(html.status_code)
    # print(html.text)
    soup = BeautifulSoup(html.text, 'lxml')
    # all_a = soup.find('div', class_='cvideotitle').find_all('a')
    all_a = soup.find('div').find_all('a')
    for a in all_a:
        title = a.get_text()
        url_pdf = a['href']
        name = title[19:-18]
        print('------start saving:', name)
        downOne(url_pdf, str(name))
        # time.sleep(random.randint(1, 2))
        """
        # write the names into a text file
        # with open(r'D:jishubaogao\date.txt', 'a', encoding='gbk') as f:
        #     f.write(name + '\n')
        """


if __name__ == "__main__":
    for url_data in url_datas:
        get_urls(url_data)
        print("finished " + url_data)
    print("finished download")
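One note on savePdf: it calls response.iter_content() with no arguments, which streams the body in very small chunks and is slow for large PDFs. Below is a minimal variant; the 8192-byte chunk size is just an illustrative choice, and raise_for_status() is added so HTTP errors fail loudly instead of writing an error page to disk.

import os
import requests

FILE_DIR = r'E:\1smart Car\paper'  # same save directory as above

def savePdf(url, pdf_name):
    # stream the response so the whole PDF is never held in memory at once
    response = requests.get(url, stream=True)
    response.raise_for_status()  # fail early on HTTP errors
    if not os.path.exists(FILE_DIR):
        os.makedirs(FILE_DIR)
    with open(os.path.join(FILE_DIR, pdf_name), "wb") as pdf_file:
        # write the file in 8 KB chunks instead of byte by byte
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip keep-alive chunks
                pdf_file.write(chunk)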
The key part is here:
soup = BeautifulSoup(html.text, 'lxml')
all_a = soup.find('div').find_all('a')
for a in all_a:
    title = a.get_text()
    url_pdf = a['href']
Here, BeautifulSoup parses the div tag and pulls the name and link out of every a tag. The raw name looks like this:
\r\n \t\t\t001大連海事大學 電航phi2017.docx\r\n \t\t
So the title is sliced to strip that padding, otherwise opening a file with that name raises an error:

name = title[19:-18]
print('------start saving:', name)
downOne(url_pdf, str(name))
# time.sleep(random.randint(1, 2))

# write the names into a text file
with open(r'D:jishubaogao\date.txt', 'a', encoding='gbk') as f:
    f.write(name + '\n')
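The slice title[19:-18] only works as long as the page keeps exactly that much padding around every name. Since the padding in the sample above is plain whitespace (\r\n, spaces, tabs), a more robust sketch is to strip it instead; the raw string below is just the example name from above.

raw = '\r\n \t\t\t001大連海事大學 電航phi2017.docx\r\n \t\t'

# strip() removes all leading and trailing whitespace, however much padding the page adds
name = raw.strip()
print(name)  # 001大連海事大學 電航phi2017.docx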