Python金融大資料分析
阿新 • • 發佈:2018-12-13
以下所有內容都是基於python3
時間序列分析
import pandas as pd
import numpy as np
from datetime import time
# Build a time-of-day value (10:01:20). Note that datetime.time carries no
# date component, only hour/minute/second.
MyTime = time(hour=10, minute=1, second=20)

# Build a minute-frequency date range. The pandas function is `date_range`;
# `pd.data_range` does not exist and raises AttributeError.
index = pd.date_range(start='2000-01-01 00:00:00', end='2010-01-01 23:59:00', freq='min')

# Initialise the time series with one standard-normal sample per timestamp.
ts = pd.Series(np.random.randn(len(index)), index=index)

# Downsample to 5-minute bins, summing each bin. The `how=` and `loffset=`
# keywords of resample() were removed from pandas: the aggregation is now a
# method call, and the label offset is applied to the result index by hand.
ts2 = ts.resample('5min').sum()
ts2.index = ts2.index + pd.Timedelta(seconds=-1)

# Open / high / low / close of every 5-minute bin.
ts3 = ts.resample('5min').ohlc()

# Date offset: shift every timestamp forward by one business day.
offset = pd.offsets.BDay(1)
print(index + offset)
獲取歷史股票資料
import pandas_datareader.data as web
# Request SPY quotes starting 2006-01-01 through the pandas-datareader
# Yahoo helper, then show the type of the result followed by the result itself.
data = web.get_data_yahoo('SPY', '2006-01-01')
print(type(data), data, sep='\n')
常見的函式
# Fetch the historical data for SPY from 2006-01-01 onward.
# NOTE(review): presumably a DataFrame with open/high/low/close columns; the
# code below only relies on the 'Adj Close' column being present.
data = web.get_data_yahoo('SPY', '2006-01-01')

# Daily returns: percentage change of the adjusted close between
# consecutive rows (the first entry is NaN).
px = data['Adj Close']
returns = px.pct_change()

# Cumulative return index. The original referenced an undefined name `rets`
# here, which raised NameError; the series computed above is `returns`.
index = (1 + returns).cumprod()

# Rolling mean over a 60-observation window; a value is emitted once at
# least 50 observations in the window are non-NaN.
ma60 = index.rolling(window=60, min_periods=50).mean()
分組變換和分析
import random
import string
import numpy as np
import pandas as pd
import statsmodels.api as sm
# Seed Python's `random` module so the generated tickers are reproducible.
random.seed(10)


def rands(n: int) -> str:
    """Return a string of ``n`` randomly chosen uppercase ASCII letters.

    Letters are drawn one at a time with ``random.choice``, so the result
    depends on the current state of the ``random`` module. ``n <= 0``
    yields an empty string.
    """
    # The function body must be indented; the flattened original was not
    # valid Python.
    choices = string.ascii_uppercase
    return ''.join(random.choice(choices) for _ in range(n))
def zscore(group):
    """Standardise ``group``: subtract its mean and divide by its std.

    ``group`` may be any object that supports ``.mean()``, ``.std()`` and
    element-wise arithmetic; the script passes pandas groupby groups.
    The result has the same shape as the input.
    """
    # The function body must be indented; the flattened original was not
    # valid Python.
    return (group - group.mean()) / group.std()
# Create N random 5-letter ticker symbols.
N = 1000
tickers = np.array([rands(5) for _ in range(N)])

# Build a DataFrame of three synthetic factor columns for the first M tickers.
M = 500
df = pd.DataFrame(
    {
        'Momentum': np.random.randn(M) / 200 + 0.03,
        'Value': np.random.randn(M) / 200 + 0.08,
        'ShortInterest': np.random.randn(M) / 200 - 0.02,
    },
    index=tickers[:M],
)

# Randomly assign every ticker to one of the industries.
# The label strings are runtime data and are kept exactly as in the original.
ind_names = np.array(['FIANACAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N)
industries = pd.Series(ind_names[sampler], index=tickers, name='industry')

# Group the factor table by industry (the key Series is aligned on the index).
by_industry = df.groupby(industries)

# Print a few per-industry aggregates.
print(by_industry.mean())
print(by_industry.describe())

# Standardise within each industry. `transform` returns a frame with the
# original ticker index, so it can be grouped by `industries` again below.
# `apply` may prepend the group key to the index (pandas >= 2.0), which
# breaks that second groupby.
df_stand = by_industry.transform(zscore)
print(df_stand.groupby(industries).agg(['mean', 'std']))

# Rank within each industry, largest value first.
ind_rank = by_industry.rank(ascending=False)
print(ind_rank.groupby(industries).agg(['min', 'max']))

# Randomly initialise three factors, one value per ticker (N == 1000).
fac1, fac2, fac3 = np.random.rand(3, N)

# Pick the tickers in a random order to use as the index.
ticker_subset = tickers.take(np.random.permutation(N)[:N])

# Dependent variable: a fixed linear combination of the factors plus
# uniform random noise.
port = pd.Series(
    0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + np.random.rand(N),
    index=ticker_subset,
)
factors = pd.DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3}, index=ticker_subset)

# Correlation of each factor with the portfolio.
print(factors.corrwith(port))

# Ordinary least squares regression of the portfolio on the factors
# (no intercept column is added to the design matrix).
model = sm.OLS(port.values, factors.values)
results = model.fit()
print(results.params)