# 爬蟲|巨潮資訊網上市公司年報爬取
# 阿新 • • 發佈:2021-01-19
# 爬蟲|巨潮資訊網上市公司年報爬取
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import requests
import os
import random
from PyPDF2 import PdfFileReader
# Make sure the output folder on drive D exists before anything is downloaded.
# exist_ok=True keeps this safe when the script is run repeatedly, so the line
# no longer has to be commented out by hand after the first run.
os.makedirs('D:\\公司年報', exist_ok=True)

# Launch Chrome with a maximized window; `browser` is the shared driver used by
# the rest of the script.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--start-maximized')
browser = webdriver.Chrome(options=chrome_options)
# Fetch a URL and return the raw response body
def get_html_content(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the response body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server (passed to ``requests.get``).
            Without a timeout a stalled connection would block forever.

    Returns:
        The response body as ``bytes`` when the status code is 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: propagated from ``requests.get`` on
            connection errors or when the timeout expires.
    """
    header = {
        "User-Agent":"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Mobile Safari/537.36"
    }
    r = requests.get(url, headers=header, timeout=timeout)
    if r.status_code != 200:
        return None
    # r.content is the undecoded byte payload, so no text encoding needs to
    # be set on the response before returning it.
    return r.content
# Save a PDF
def report_save(url,pdf_name):
    """Download the file at ``url`` and store it as ``<pdf_name>.pdf``.

    The file is written into the hard-coded annual-report folder on drive D.

    Args:
        url: Address of the PDF to download.
        pdf_name: File name to use, without the ``.pdf`` extension.

    Raises:
        RuntimeError: if ``get_html_content`` returned ``None`` (the server
            did not answer with status 200). Raised before the target file
            is opened, so no empty ``.pdf`` is left behind.
    """
    report = get_html_content(url)
    if report is None:
        raise RuntimeError("Failed to download report from " + str(url))
    path = "D:\\公司年報\\" + pdf_name + ".pdf"
    # The with-statement closes the file; no explicit close() is needed.
    with open(path,'wb') as f:
        f.write(report)
# Get the number of pages of an annual report
def get_num_pages(pdf_name):
    """Return the page count of a previously saved annual-report PDF.

    Args:
        pdf_name: File name without the ``.pdf`` extension; the file is
            looked up in the hard-coded annual-report folder on drive D.

    Returns:
        The value reported by ``PdfFileReader.getNumPages()``.
    """
    pdf_reader = PdfFileReader("D:\\公司年報\\" + pdf_name + ".pdf")
    # Encrypted files are unlocked with an empty password before the page
    # count is read.
    if pdf_reader.isEncrypted:
        pdf_reader.decrypt('')
    return pdf_reader.getNumPages()
# ---------------------------------------------------------------------------
# Main crawl: open the cninfo annual-report search page, walk the result
# table, download each report PDF, record its metadata and page count, and
# finally export everything to an Excel sheet.
# NOTE: the browser is left open when the script finishes, as before.
# ---------------------------------------------------------------------------
url = 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_ndbg_szsh#sse'
browser.get(url)

# Change the report publication date range: open the date picker, then
# activate one of its buttons with ENTER.
# NOTE(review): button[7] is presumably a preset range shortcut - confirm
# against the live page.
browser.find_element_by_xpath('//*[@id="main"]/div[2]/div[1]/div[2]/div[1]/div[2]/form/div[1]/div/div/i[1]').click()
browser.find_element_by_xpath('/html/body/div[5]/div[1]/div[1]/button[7]').send_keys(Keys.ENTER)

items = []
count = 1
# `count <= 1` limits the crawl to a single result page; raise the bound to
# walk more pages.
while count<=1:
    count += 1
    time.sleep(2)  # give the result table time to render
    all_tr = browser.find_element_by_xpath('//*[@id="main"]/div[2]/div[1]/div[1]/div[2]/div/div[3]/table/tbody').find_elements_by_xpath('.//tr')
    for tr in all_tr:
        item = {}
        a=random.random()*3
        time.sleep(a)  # random pause between rows

        # Look the row's cells up once instead of re-querying for each column.
        cells = tr.find_elements_by_xpath('./td')
        number = cells[0].find_element_by_xpath('.//span').text
        item['公司程式碼'] = number
        name_pre = cells[1].find_element_by_xpath('.//span').text
        name = name_pre.replace('*','')  # Windows file names cannot contain '*'
        item['公司名稱'] = name
        title_link = cells[2].find_element_by_xpath('.//a')
        title = title_link.text
        item['年報標題'] = title
        date = cells[3].find_element_by_xpath('.//span').text
        item['年報釋出日期'] = date
        # The report covers the year before its publication date.
        year = str(int(date[0:4])-1)
        item['年報對應年份'] = year
        pdf_name = number + "-" + name + "-" + year  # file name for the PDF
        print(pdf_name)

        handle_main = browser.current_window_handle  # remember the list window
        title_link.send_keys(Keys.ENTER)  # open the report detail page

        downloaded = False
        if len(browser.window_handles)>1:
            for handle in browser.window_handles:
                if handle != handle_main:
                    browser.switch_to.window(handle)  # switch to the new window
                    pdf_href = browser.find_element_by_xpath('//*[@id="noticeDetail"]/div/div[2]/div[1]/a').get_attribute('href')  # PDF address
                    print(pdf_href)
                    report_save(pdf_href,pdf_name)
                    downloaded = True
                    browser.close()
                    browser.switch_to.window(handle_main)  # back to the list window

        # Only read the page count when a PDF was actually saved for this row;
        # otherwise there is no file to open.
        if downloaded:
            page_num = get_num_pages(pdf_name)
        else:
            page_num = None
        print(page_num)
        item['年報頁數'] = page_num
        items.append(item)

    # Stop on the last page. Selenium reports a boolean attribute such as
    # `disabled` as "true" when present and None when absent, so test its
    # truthiness rather than comparing against the string 'disabled'.
    next_button = browser.find_element_by_xpath('//*[@id="main"]/div[2]/div[1]/div[1]/div[3]/div/button[2]')
    if next_button.get_attribute('disabled'):
        break
    browser.find_element_by_xpath('//*[@id="main"]/div[2]/div[1]/div[1]/div[3]/div/button[2]/i').click()

# Export the collected rows to Excel.
df = pd.DataFrame(items)
df.to_excel(r'D:/公司年報/上市公司年報.xlsx')