1. 程式人生 > >python金融大資料分析

python金融大資料分析

以下所有內容都是基於python3

時間序列分析

import pandas as pd
import numpy as np
from datetime import time

# Build a time-of-day value (10:01:20); datetime.time carries no calendar date.
MyTime = time(hour=10, minute=1, second=20)
# Build a one-minute-frequency range of timestamps.
index = pd.date_range(start='2000-01-01 00:00:00', end='2010-01-01 23:59:00', freq='min')
# Initialise the time series with one standard-normal draw per timestamp.
ts = pd.Series(np.random.randn(len(index)), index=index)
# Downsample to 5-minute sums. The resample keywords `how=` and `loffset=`
# are not accepted by current pandas, so the aggregation is called as a
# method and the -1 second label shift is applied to the result's index.
ts2 = ts.resample('5min').sum()
ts2.index = ts2.index + pd.Timedelta(seconds=-1)
# Open, high, low and close of every 5-minute bucket.
ts3 = ts.resample('5min').ohlc()
# Date offset: shift every timestamp forward by one business day.
offset = pd.offsets.BDay(1)
print(index + offset)

獲取歷史股票資料

import pandas_datareader.data as web
# Fetch the 'SPY' history starting at 2006-01-01 through the reader's
# get_data_yahoo helper, then show the returned object's type followed by
# its contents, one per line.
data = web.get_data_yahoo('SPY', '2006-01-01')
print(type(data), data, sep='\n')

常見的函式

# Fetch the historical data. Per the original note it includes open, close,
# high and low prices; it is indexed by column name below, so it is treated
# as a table rather than a single series.
data = web.get_data_yahoo('SPY', '2006-01-01')
# Daily returns from the adjusted close.
px = data['Adj Close']
returns = px.pct_change()
# Cumulative return index. The original referenced an undefined name `rets`
# here; the daily returns computed just above are what is compounded.
index = (1 + returns).cumprod()
# 60-observation rolling mean that needs at least 50 non-missing values.
ma60 = index.rolling(window=60, min_periods=50).mean()

分組變換和分析

import random
import string
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Only the stdlib RNG is seeded, so the tickers are reproducible while the
# NumPy draws below differ from run to run.
random.seed(10)


def rands(n):
    """Return a random string of ``n`` uppercase ASCII letters."""
    choices = string.ascii_uppercase
    return ''.join(random.choice(choices) for _ in range(n))


def zscore(group):
    """Standardise ``group``: subtract its mean and divide by its std."""
    return (group - group.mean()) / group.std()


N = 1000
tickers = np.array([rands(5) for _ in range(N)])
M = 500
# Build the DataFrame: three factor columns for the first M tickers.
df = pd.DataFrame({'Momentum': np.random.randn(M) / 200 + 0.03,
                   'Value': np.random.randn(M) / 200 + 0.08,
                   'ShortInterest': np.random.randn(M) / 200 - 0.02},
                  index=tickers[:M])
# Assign every ticker to one of the industries at random.
ind_names = np.array(['FIANACAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N)
industries = pd.Series(ind_names[sampler], index=tickers, name='industry')
# Group the factor table by industry.
by_industry = df.groupby(industries)
# Print some per-industry summaries.
print(by_industry.mean())
print(by_industry.describe())
# Standardise within each industry. `transform` keeps the result on df's own
# index, which the regrouping on the next line relies on; `apply` can add
# the group keys to the index depending on the pandas version.
df_stand = by_industry.transform(zscore)
print(df_stand.groupby(industries).agg(['mean', 'std']))
# Rank within each industry, largest value first.
ind_rank = by_industry.rank(ascending=False)
print(ind_rank.groupby(industries).agg(['min', 'max']))
# Randomly initialise three factors, one value per ticker.
fac1, fac2, fac3 = np.random.rand(3, N)
# Randomly reorder the tickers to use as the index.
ticker_subset = tickers.take(np.random.permutation(N))
# Dependent variable: a weighted sum of the factors plus uniform noise.
port = pd.Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + np.random.rand(N),
                 index=ticker_subset)
factors = pd.DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},
                       index=ticker_subset)
# Correlation of each factor with the dependent variable.
print(factors.corrwith(port))
# Ordinary least squares regression; no intercept column is added.
model = sm.OLS(port.values, factors.values)
results = model.fit()
print(results.params)