1. 程式人生 > 實用技巧 >辦公自動化25-爬取CMB網站理財產品的投資報告並格式化輸出

辦公自動化25-爬取CMB網站理財產品的投資報告並格式化輸出

# -*- coding: utf-8 -*-
"""
Created on Aug 5 2020

@author: lizitingxue
"""

#基礎包
import numpy as np
import pandas as pd
import os
import time
##爬蟲
import re
import requests
from bs4 import BeautifulSoup
import urllib.request
from selenium import webdriver
##pdf處理
import winerror
from win32com import client as wc
from win32com.client.dynamic import Dispatch, ERRORS_BAD_CONTEXT
# Word document processing
from docx import Document
from openpyxl import Workbook


# Function definitions
def get_code(key_word):
    """Search the CMB wealth-management site and collect report page links.

    Opens the product search page in Chrome, searches for ``key_word``,
    records each product's name and code, then opens every "viewMore"
    detail page to pick up the first investment-report link.  The result
    is also written to ``<folder_path>/<key_word>/<key_word>產品程式碼和連結.xlsx``.

    Relies on the module-level ``folder_path`` for the output location.

    Args:
        key_word: Product series keyword typed into the search box.

    Returns:
        A list with one entry per product: the report page URL, or the
        string "未披露" when the detail page lists no report.
    """
    # Imported locally so the file's top-level import block stays untouched.
    # The find_element_by_* helpers used originally were removed in
    # Selenium 4.3; find_element(By..., ...) works on Selenium 3 and 4.
    from selenium.webdriver.common.by import By

    url = 'http://www.cmbchina.com/cfweb/Personal/Default.aspx'
    js_bottom = "var q=document.documentElement.scrollTop=10000"
    view_more_css = "[class='viewMore']"

    browser = webdriver.Chrome()
    try:
        browser.get(url)
        time.sleep(3)
        browser.find_element(
            By.XPATH,
            '//*[@id="tbCondition"]/table[1]/tbody/tr[9]/td[2]/ul/li[2]/input[1]',
        ).click()
        browser.find_element(By.XPATH, '//*[@id="inputPrdKey"]').clear()
        browser.find_element(By.XPATH, '//*[@id="inputPrdKey"]').send_keys(key_word)
        browser.find_element(
            By.XPATH, '//*[@id="column_content"]/div[8]/input[1]'
        ).click()
        time.sleep(2)
        # Scroll to the bottom twice (presumably so the full result list
        # gets loaded -- TODO confirm against the live page).
        browser.execute_script(js_bottom)
        time.sleep(2)
        browser.execute_script(js_bottom)
        time.sleep(2)

        # Product names and codes
        code_elems = browser.find_elements(
            By.CSS_SELECTOR, "[class='leftArea']>[class='row']>[class='code']"
        )
        print(key_word+'系列產品數量為'+str(len(code_elems)))
        name_elems = browser.find_elements(
            By.CSS_SELECTOR, '[class="inlineBgImage"]>[class="inlineBriefName"]'
        )
        temp_code = [elem.text for elem in code_elems]
        temp_name = [elem.text for elem in name_elems]

        # Report download links, one detail page per product
        url_lst = []
        counts = len(browser.find_elements(By.CSS_SELECTOR, view_more_css))
        for i in range(counts):
            # Look the buttons up again on every pass: we come back to the
            # list window after each detail page.
            browser.find_elements(By.CSS_SELECTOR, view_more_css)[i].click()
            time.sleep(3)
            browser.switch_to.window(browser.window_handles[-1])
            browser.find_element(By.ID, 'fmenu_7').click()
            temp_for_pdf = browser.find_elements(
                By.CSS_SELECTOR, "[class='c_title']>[target = '_blank']"
            )
            if temp_for_pdf:
                url_lst.append(temp_for_pdf[0].get_attribute('href'))
            else:
                url_lst.append("未披露")
            # Close the detail window and return to the result list.
            browser.close()
            browser.switch_to.window(browser.window_handles[0])
    finally:
        # Always shut the browser/driver down, even when a lookup fails.
        browser.quit()

    # Save as Excel; create the target folder (and parents) when missing.
    tarDir = os.path.join(folder_path, key_word)
    os.makedirs(tarDir, exist_ok=True)
    df = pd.DataFrame({"產品名稱": temp_name, "產品程式碼": temp_code, "投資報告下載連結": url_lst})
    print(df)
    df.to_excel(os.path.join(tarDir, key_word+"產品程式碼和連結.xlsx"), index=False)
    return url_lst


def get_pdf(link, key_word):
    """Download the PDF report referenced by one report page.

    Fetches ``link``, takes the product name from the text of the
    ``<head id="Head1">`` element (the part after the last "--"), and
    saves the first ``.pdf`` hyperlink on the page as
    ``<folder_path>/<key_word>/<product name>.pdf``.  Failures are reported
    with a printed message instead of being raised.

    Args:
        link: URL of the report page.
        key_word: Product series keyword; names the output sub-folder.
    """
    # Fallback label so the failure message works even when the request
    # itself fails before the product name is known.
    pro_name = link
    try:
        r = requests.get(link, timeout=30)
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        head_text = soup.find(name='head', attrs={"id": "Head1"}).text
        pro_name = head_text.replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '').split("--")[-1]
        # Escaped dot: only hrefs that really end in ".pdf" should match.
        pdf_links = soup.find_all(name='a', attrs={"href": re.compile(r'\.pdf$')})
        if pdf_links:
            url = pdf_links[0].get('href')
            filepath = os.path.join(folder_path, key_word, pro_name + ".pdf")
            urllib.request.urlretrieve(url, filename=filepath)
            print(pro_name+"報告下載完成~~~~~O(∩_∩)O~")
        else:
            print(pro_name+'報告下載失敗01')
    except Exception as e:
        # Best-effort: one failed report must not stop the remaining downloads.
        print(pro_name+"報告下載失敗02")
        print(str(e))


def pdf_to_word(my_dir):
    """Convert every PDF in ``my_dir`` to a .docx file via Acrobat and Word.

    Each ``<name>.pdf`` is exported by Acrobat to ``<name>.docx``, re-saved
    by Word as ``<name>T.docx``, and the intermediate ``<name>.docx`` is
    deleted.  Conversion errors are printed and the loop continues.

    Side effect: changes the process working directory to ``my_dir``.

    Args:
        my_dir: Folder that holds the downloaded PDF reports.
    """
    if winerror.E_NOTIMPL not in ERRORS_BAD_CONTEXT:
        ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)
    # Resolve before chdir so later joins stay valid for a relative my_dir.
    my_dir = os.path.abspath(my_dir)
    os.chdir(my_dir)
    file_lst = [x for x in os.listdir('.')
                if os.path.isfile(x) and os.path.splitext(x)[1] == '.pdf']
    for f in file_lst:
        src = os.path.abspath(f)  # source PDF path
        AvDoc = None
        pdDoc = None
        jsObject = None
        try:
            AvDoc = Dispatch("AcroExch.AVDoc")
            if AvDoc.Open(src, ""):
                pdDoc = AvDoc.GetPDDoc()
                jsObject = pdDoc.GetJSObject()
                # splitext keeps file names that contain extra dots intact.
                stem = os.path.splitext(f)[0]
                new = os.path.join(my_dir, stem + '.docx')  # intermediate file
                jsObject.SaveAs(new, "com.adobe.acrobat.docx")
                # Re-save through Word to normalise the document format.
                word = wc.Dispatch('Word.Application')
                try:
                    doc = word.Documents.Open(new)
                    # 16 is the default docx file format.
                    doc.SaveAs(os.path.join(my_dir, stem + 'T' + ".docx"), 16)
                    doc.Close()
                finally:
                    word.Quit()
                # Remove the intermediate Acrobat export.
                if os.path.exists(new):
                    os.remove(new)
        except Exception as e:
            print(str(e))
        finally:
            # AvDoc stays None when Dispatch itself failed.
            if AvDoc is not None:
                AvDoc.Close(True)
            jsObject = None
            pdDoc = None
            AvDoc = None


def date_trans(start):
    """Convert a date written as "YYYY年MM月DD日" to "YYYY-MM-DD".

    Spaces inside the input are removed before parsing.

    Args:
        start: Date string in the Chinese year/month/day form.

    Returns:
        The same date formatted as ``%Y-%m-%d``.

    Raises:
        ValueError: If the text does not match the expected format.
    """
    publish_Time = start.replace(' ', '')
    array = time.strptime(publish_Time, u"%Y年%m月%d日")
    return time.strftime("%Y-%m-%d", array)


def _cell_text(table, row, col):
    """Return the text of one table cell, stripped and with spaces removed."""
    return table.cell(row, col).text.strip().replace(' ', '')


def basic_info(file):
    """Extract basic product data from each report and save it to Excel.

    Reads the first two tables of every document: table 0 for the basic
    facts and table 1 for the direct / indirect investment amounts.  Cell
    positions are hard-coded to the report layout.  One row per document
    is written to ``<file_path>/<key_word>基本資訊.xlsx``.

    Relies on the module-level ``folder_path``, ``key_word`` and
    ``file_path``.

    Args:
        file: Iterable of .docx file names inside ``<folder_path>/<key_word>``.
    """
    wb = Workbook()
    sheet = wb.active
    header = ['產品名稱', '業績比較基準', '產品風險評級', '槓桿上限', '理財產品份額總額(億元)',
              '理財產品成立日', '理財產品到期日',
              '直接_債券', '直接_一般債權', '直接_資產支援證券', '直接_現金及貨幣工具',
              '直接_含權債權', '直接_權益類', '直接_商品及金融衍生品類', '直接_合計',
              '間接_債券', '間接_一般債權', '間接_資產支援證券', '間接_現金及貨幣工具',
              '間接_含權債權', '間接_權益類', '間接_商品及金融衍生品類', '間接_合計']
    sheet.append(header)
    for doc in file:
        document = Document(os.path.join(folder_path, key_word, doc))
        # Read all tables in the Word document
        tables = document.tables

        # Basic information
        table0 = tables[0]
        name = _cell_text(table0, 1, 1)[4:][:-4]
        compe = _cell_text(table0, 6, 1)[2:]
        risk = _cell_text(table0, 7, 1)
        leverage = _cell_text(table0, 8, 1)
        total = _cell_text(table0, 9, 1)
        start = date_trans(_cell_text(table0, 10, 1))
        end = date_trans(_cell_text(table0, 11, 1)[0:11])

        # Investment targets
        invest = tables[1]
        print(doc+"投資表格行數"+str(len(invest.rows))+"-"*20)
        temp_direct = []
        temp_indirect = []
        for i in range(8):
            try:
                direct_amount = invest.cell(i+1, 2).text.replace("\n", "")
                indirect_amount = invest.cell(i+1, 4).text.replace("\n", "")
            except Exception as error:
                # Best-effort read; keep an empty placeholder so the values
                # that follow still line up with the header columns.
                print(error)
                direct_amount = ""
                indirect_amount = ""
            temp_direct.append(direct_amount)
            temp_indirect.append(indirect_amount)

        # Formatted output of the basic information
        row = [name, compe, risk, leverage, total, start, end] + temp_direct + temp_indirect
        sheet.append(row)
    wb.save(os.path.join(file_path, key_word+'基本資訊.xlsx'))


def nonstan_info(file):
    """Extract the non-standard asset table of each report and save it to Excel.

    Uses the second-to-last table of every document.  A "-" in cell (1, 1)
    is treated as "no projects".  All project rows are written to
    ``<file_path>/<key_word>非標投資情況.xlsx``.

    Relies on the module-level ``folder_path``, ``key_word`` and
    ``file_path``.

    Args:
        file: Iterable of .docx file names inside ``<folder_path>/<key_word>``.
    """
    columns = ['產品名稱', '專案名稱', '融資客戶名稱', '交易日',
               '專案剩餘融資期限(單位:天)', '報告期內到期收益率預估(%)', '交易結構']
    records = []
    for doc in file:
        document = Document(os.path.join(folder_path, key_word, doc))
        # Read all tables in the Word document
        tables = document.tables

        # Non-standard asset information
        name = tables[0].cell(1, 1).text.strip().replace(' ', '')[4:][:-4]
        nonstan = tables[-2]
        body_rows = len(nonstan.rows) - 1  # the first row is the table header

        projects = []
        # Guard against a header-only table before touching cell (1, 1).
        if body_rows > 0 and nonstan.cell(1, 1).text != '-':
            for l in range(body_rows):
                projects.append([
                    nonstan.cell(l+1, num).text.replace("\n", '').replace(" ", "")
                    for num in range(6)
                ])
        # Counts real project rows, so a "-" placeholder table reports 0.
        print(name + '非標專案數量為{0}'.format(len(projects)))

        for project in projects:
            records.append(dict(zip(columns, [name] + project)))

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    res = pd.DataFrame(records, columns=columns)
    res.to_excel(os.path.join(file_path, key_word + '非標投資情況.xlsx'), index=False, header=True)


# Crawl workflow
# Define the output folder and the product keyword to crawl
folder_path = 'F:\\PY\\爬蟲\\投資報告備份'
key_word = "卓遠"

# Crawl the product list
url_lst = get_code(key_word)

# Download the reports
for link in url_lst:
    if link != "未披露":
        get_pdf(link, key_word)

# Convert the PDFs
my_dir = folder_path+'/'+key_word
pdf_to_word(my_dir)

# Extract the investment report information
file_path = folder_path+"\\"+key_word
# Test each entry against file_path itself rather than the current
# working directory.
file = [x for x in os.listdir(file_path)
        if os.path.isfile(os.path.join(file_path, x)) and os.path.splitext(x)[1] == '.docx']

# Save the extracted information
basic_info(file)
nonstan_info(file)

有問題歡迎留言哦~~~O(∩_∩)O