AdditionalRadar-線性迴歸,因子篩選,多因子模型
阿新 • • 發佈:2019-01-27
# coding=utf-8
import numpy as np
import xlrd
import statsmodels.api as sm
import MySQLdb
import math
#*****************Load data part*****************
# Read the per-feature histogram description from the 'hist_info' sheet.
# The first three sheet rows are skipped and sheet column 0 is ignored:
# sheet column c (c >= 1) fills hist_info[:, c - 1].
data = xlrd.open_workbook('StandarData.xlsx')
tb_hist_info = data.sheet_by_name('hist_info')

feat_num = 14
# Identity mapping 0 .. feat_num-1, stored as floats.
feat_index = np.arange(feat_num, dtype=float)

hist_info_rows = tb_hist_info.nrows - 3
hist_info = np.zeros((hist_info_rows, feat_num))
for sheet_col in range(1, tb_hist_info.ncols):
    hist_info[:, sheet_col - 1] = tb_hist_info.col_values(sheet_col, 3)
#----------------load data from SQL---------------
# Fetch 15 columns (14 factor columns followed by seo_yield) for every row of
# seo_cd_factor, ordered by the `index` column.
sql = """SELECT
-- 大盤點位
t.index_point ,
-- 折價率
t.dis_rate ,
-- 發行價格
t.issue_price ,
t.pe_ttm ,
t.pe_lyr ,
t.pb ,
-- 募集金額
t.collection ,
-- 總市值
t.val_mv ,
-- 發行目的程式碼
t.OBJECT_CODE ,
-- 中證流通PE
t.zzlt_pe ,
-- 中證流通pb
t.zzlt_pb ,
-- 攤薄比例
t.DILUTED_RATIO ,
-- 淨利潤同比
t.yoy_net_profit ,
-- 大股東認購比例
t.holder_subs_rate ,
-- 定增收益
t.seo_yield
FROM seo_cd_factor t
order by t.index ASC
;
"""
# NOTE(review): host and credentials are hard-coded; consider moving them to
# configuration outside the source file.
db = MySQLdb.connect("10.91.24.140","thfund_cia","ciacia","cia" )
try:
    cursor = db.cursor()
    cursor.execute(sql)
    factor = cursor.fetchall()
finally:
    # Release the connection even when the query or the fetch fails.
    db.close()
# Row count is taken from the raw result before it is converted to an array.
sample_num = len(factor)
factor = np.array(factor)
#----------------load data from xlsx---------------
# tb_factor_info = data.sheet_by_name('factor')
# sample_num = tb_factor_info.nrows - 2
# factor_num = 16
# factor = np.zeros((sample_num, factor_num))
# for k in range(0,factor_num):
# factor[:,k] = tb_factor_info.col_values(k,2)
#*****************model generate part*****************
#--------hist the rate base on the factor--------
# For each feature column, every sample goes into the first bin k whose
# threshold hist_info[k, col] is >= the sample's factor value; a sample above
# all thresholds goes into the overflow bin numbered bin_num.  Per bin we
# accumulate the sample count and the summed yield, and remember each
# sample's bin in factor_index.
max_bin_num = hist_info_rows
hist_yield = np.zeros((max_bin_num, feat_num))
hist_sample_num = np.zeros((max_bin_num, feat_num))
factor_index = np.zeros((sample_num, feat_num), dtype='int32')
# The column right after the feat_num factor columns holds the yield.
sample_yield = factor[:, feat_num]

for col in range(feat_num):
    # The last row of hist_info stores the number of thresholds of this column.
    bin_num = int(hist_info[max_bin_num - 1, col])
    for row in range(sample_num):
        factor_value = factor[row, col]
        # Default to the overflow bin; overwritten by the first matching bin.
        target_bin = bin_num
        for k in range(bin_num):
            if factor_value <= hist_info[k, col]:
                target_bin = k
                break
        factor_index[row, col] = target_bin
        hist_sample_num[target_bin, col] += 1
        hist_yield[target_bin, col] += sample_yield[row]
#debug
# for col in range(0,feat_num):
# hist_num_sum = 0
# hist_yield_sum = 0
# for row in range(0,max_bin_num):
# hist_num_sum += hist_sample_num[row,col]
# hist_yield_sum += hist_yield[row,col]
# print hist_num_sum
# print hist_yield_sum
# print col
#--------calc the mean yield------------
# Mean yield of every (bin, feature) cell; cells that received no sample
# keep the value 0 instead of dividing by zero.
hist_mean_yield = np.divide(
    hist_yield,
    hist_sample_num,
    out=np.zeros((max_bin_num, feat_num)),
    where=hist_sample_num != 0
)
#--------map the factor to the mean yield------------
# Replace each sample's bin number by the mean yield of that bin, feature by
# feature: element [i, j] is hist_mean_yield[factor_index[i, j], j].
sample_mapping_yield = hist_mean_yield[factor_index, np.arange(feat_num)]
# ------regression model sample_mapping_yield and sample_yield by factor-----
# 1. Fit OLS of sample_yield on all mapped yields plus an intercept.
# 2. Keep the features whose coefficient p-value is below p_value_thresh.
# 3. Refit on the kept features and write both summaries plus the final
#    model parameters to disk.
mapping_yield = sm.add_constant(sample_mapping_yield)
model_reg = sm.OLS(sample_yield, mapping_yield).fit()
# Drop the intercept's p-value so that p_values[k] belongs to feature k.
p_values = model_reg.pvalues[1:]

# The context manager closes the report even if the second fit raises.
with open('model_info.txt', 'w') as file_model_report:
    file_model_report.write(str(model_reg.summary()) + '\n')
    print(model_reg.summary())

    #---- feature selection and generate new model------
    p_value_thresh = 0.05
    # NaN p-values compare False and are therefore dropped.
    selected = np.asarray(p_values) < p_value_thresh
    new_feat_num = int(np.count_nonzero(selected))
    new_mapping_yield = sample_mapping_yield[:, selected]
    # Builtin int replaces the np.int alias, which newer NumPy no longer has.
    new_feat_index = np.asarray(feat_index)[selected].astype(int)

    mapping_yield = sm.add_constant(new_mapping_yield)
    model_reg = sm.OLS(sample_yield, mapping_yield).fit()
    p_values = model_reg.pvalues

    banner = '\n\n *********************The new model after feature selection********************'
    # Two spaces after the colon reproduce the original two-item print output.
    feature_line = 'Feature index:  %s' % (new_feat_index,)
    file_model_report.write(banner + '\n')
    file_model_report.write(feature_line + '\n')
    file_model_report.write(str(model_reg.summary()) + '\n')

print(banner)
print(feature_line)
print(model_reg.summary())
#--------output the model----------------
np.savetxt('model_para.txt', model_reg.params)
np.savetxt('model_factor_index.txt', new_feat_index)
np.savetxt('hist_mean_yield.txt', hist_mean_yield)
#--------calc the predict result-------------
# np.savetxt('mapping_yield.txt', mapping_yield)
predict_yield = model_reg.predict(mapping_yield)
#----------map the yield to the probability---------
# Piecewise map from predicted yield y to a probability:
#   y < -0.5       -> 0.01
#   -0.5 <= y < 0  -> 0.8 * y + 0.4
#   0 <= y < 1     -> 0.55 * y + 0.4
#   1 <= y < 4     -> 0.05/3 * y + 14/15
#   y >= 4         -> 1
# A NaN yield stays NaN.  Previously it failed every comparison and fell
# through to the final branch, so it was reported as probability 1.
yield_values = np.asarray(predict_yield, dtype=float)
pred_prob = np.select(
    [
        np.isnan(yield_values),
        yield_values < -0.5,
        yield_values < 0,
        yield_values < 1,
        yield_values < 4,
    ],
    [
        np.nan,
        0.01,
        0.8 * yield_values + 0.4,
        0.55 * yield_values + 0.4,
        0.05 / 3.0 * yield_values + 14.0 / 15.0,
    ],
    default=1.0
)
# np.savetxt('pred_prob.txt', pred_prob)
# np.savetxt('predict_yield.txt', predict_yield)
#-------------map the yield to the score-------------
# Piecewise-linear score.  Each entry is (exclusive upper bound, slope,
# intercept); the first entry whose bound exceeds the yield is applied.
# Yields that match no entry (>= 2, or NaN) use the open-ended last segment
# 2.5 * y + 90.
score_segments = (
    (0, 30, 30),
    (0.3, 100.0 / 3.0, 30),
    (0.45, 200.0 / 3.0, 20),
    (0.7, 80, 14),
    (0.9, 50, 35),
    (1.2, 100.0 / 3.0, 50),
    (2, 6.25, 82.5),
)
predict_score = np.zeros(sample_num)
for k in range(sample_num):
    pred_yield = predict_yield[k]
    slope, intercept = 2.5, 90
    for upper_bound, seg_slope, seg_intercept in score_segments:
        if pred_yield < upper_bound:
            slope, intercept = seg_slope, seg_intercept
            break
    predict_score[k] = slope * pred_yield + intercept
#---------output the predict result----------
# Write score, yield and probability back to seo_cd_factor, skipping NaNs.
# NOTE(review): the row is addressed by its position k in the earlier
# ORDER BY `index` result, which assumes the `index` column holds exactly
# 0 .. sample_num-1 -- confirm against the table.
# NOTE(review): host and credentials are hard-coded; consider moving them to
# configuration outside the source file.
db = MySQLdb.connect("10.91.24.140","thfund_cia","ciacia","cia" )
try:
    cursor = db.cursor()
    # (statement template, per-sample values), in the original write order.
    updates = (
        ("""Update seo_cd_factor Set score=%f Where `index`=%d""", predict_score),
        ("""Update seo_cd_factor Set yield=%f Where `index`=%d""", predict_yield),
        ("""Update seo_cd_factor Set probability=%f Where `index`=%d""", pred_prob),
    )
    for k in range(sample_num):
        for template, values in updates:
            if not math.isnan(values[k]):
                cursor.execute(template % (values[k], k))
    # Commit only after every update succeeded.
    db.commit()
finally:
    # Always release the connection, including when an UPDATE raises.
    db.close()
# #*******************predict part****************************
# #--------predict the yield of the testing data------------
# feat_num = new_feat_num
# feat_index = new_feat_index
# data = xlrd.open_workbook('TestData.xlsx')
# tb_factor_info = data.sheet_by_name('test')
# sample_num = tb_factor_info.nrows - 1
# test_data = np.zeros((sample_num, feat_num))
# for k in range(0,feat_num):
# index = feat_index[k]
# test_data[:,k] = tb_factor_info.col_values(index,1)
#
# predict_yield = np.ones(sample_num) * model_reg.params[0]
# for row in range(0,sample_num):
# for col in range(0,feat_num):
# src_f_index = feat_index[col]
# bin_num = int(hist_info[max_bin_num-1,src_f_index])
# factor_value = test_data[row,col]
# bin_index = -1
# for k in range(0,bin_num):
# if factor_value<=hist_info[k,src_f_index]:
# bin_index = k
# break
# if bin_index == -1:
# bin_index = bin_num
# mapping_yield = hist_mean_yield[bin_index, src_f_index]
# predict_yield[row] += model_reg.params[col + 1] * mapping_yield
#
# print predict_yield
#
# #----------map the yild to the probability---------
# pred_prob = np.zeros(sample_num)
# for k in range(0,sample_num):
# pred_yield = predict_yield[k]
# if pred_yield<-0.5:
# pred_prob[k] = 0.01
# elif pred_yield<0:
# pred_prob[k] = 0.8 * pred_yield + 0.4
# elif pred_yield<1:
# pred_prob[k] = 0.55 * pred_yield + 0.4
# elif pred_yield<4:
# pred_prob[k] = 0.05/3.0 * pred_yield + 14.0/15.0
# else:
# pred_prob[k] = 1
#
# print pred_prob
#
# #-------------map the yield to the score-------------
# data = xlrd.open_workbook('StandarData.xlsx')
# tb_score_info = data.sheet_by_name('score_mapping_info')
# score_bin_num = tb_score_info.nrows
# bin_yield = tb_score_info.col_values(0)
# bin_score = tb_score_info.col_values(1)
# predict_score = np.zeros(sample_num)
# for i in range(0,sample_num):
# pred_yield = predict_yield[i]
# for k in range(0,score_bin_num):
# if pred_yield<bin_yield[k]:
# predict_score[i] = bin_score[k]
# break
#
# print predict_score
import numpy as np
import xlrd
import statsmodels.api as sm
import MySQLdb
import math
#*****************Load data part*****************
# Read the per-feature histogram description from the 'hist_info' sheet.
# The first three sheet rows are skipped and sheet column 0 is ignored:
# sheet column c (c >= 1) fills hist_info[:, c - 1].
data = xlrd.open_workbook('StandarData.xlsx')
tb_hist_info = data.sheet_by_name('hist_info')

feat_num = 14
# Identity mapping 0 .. feat_num-1, stored as floats.
feat_index = np.arange(feat_num, dtype=float)

hist_info_rows = tb_hist_info.nrows - 3
hist_info = np.zeros((hist_info_rows, feat_num))
for sheet_col in range(1, tb_hist_info.ncols):
    hist_info[:, sheet_col - 1] = tb_hist_info.col_values(sheet_col, 3)
#----------------load data from SQL---------------
# Fetch 15 columns (14 factor columns followed by seo_yield) for every row of
# seo_cd_factor, ordered by the `index` column.
sql = """SELECT
-- 大盤點位
t.index_point ,
-- 折價率
t.dis_rate ,
-- 發行價格
t.issue_price ,
t.pe_ttm ,
t.pe_lyr ,
t.pb ,
-- 募集金額
t.collection ,
-- 總市值
t.val_mv ,
-- 發行目的程式碼
t.OBJECT_CODE ,
-- 中證流通PE
t.zzlt_pe ,
-- 中證流通pb
t.zzlt_pb ,
-- 攤薄比例
t.DILUTED_RATIO ,
-- 淨利潤同比
t.yoy_net_profit ,
-- 大股東認購比例
t.holder_subs_rate ,
-- 定增收益
t.seo_yield
FROM seo_cd_factor t
order by t.index ASC
;
"""
# NOTE(review): host and credentials are hard-coded; consider moving them to
# configuration outside the source file.
db = MySQLdb.connect("10.91.24.140","thfund_cia","ciacia","cia" )
try:
    cursor = db.cursor()
    cursor.execute(sql)
    factor = cursor.fetchall()
finally:
    # Release the connection even when the query or the fetch fails.
    db.close()
# Row count is taken from the raw result before it is converted to an array.
sample_num = len(factor)
factor = np.array(factor)
#----------------load data from xlsx---------------
# tb_factor_info = data.sheet_by_name('factor')
# sample_num = tb_factor_info.nrows - 2
# factor_num = 16
# factor = np.zeros((sample_num, factor_num))
# for k in range(0,factor_num):
# factor[:,k] = tb_factor_info.col_values(k,2)
#*****************model generate part*****************
#--------hist the rate base on the factor--------
# For each feature column, every sample goes into the first bin k whose
# threshold hist_info[k, col] is >= the sample's factor value; a sample above
# all thresholds goes into the overflow bin numbered bin_num.  Per bin we
# accumulate the sample count and the summed yield, and remember each
# sample's bin in factor_index.
max_bin_num = hist_info_rows
hist_yield = np.zeros((max_bin_num, feat_num))
hist_sample_num = np.zeros((max_bin_num, feat_num))
factor_index = np.zeros((sample_num, feat_num), dtype='int32')
# The column right after the feat_num factor columns holds the yield.
sample_yield = factor[:, feat_num]

for col in range(feat_num):
    # The last row of hist_info stores the number of thresholds of this column.
    bin_num = int(hist_info[max_bin_num - 1, col])
    for row in range(sample_num):
        factor_value = factor[row, col]
        # Default to the overflow bin; overwritten by the first matching bin.
        target_bin = bin_num
        for k in range(bin_num):
            if factor_value <= hist_info[k, col]:
                target_bin = k
                break
        factor_index[row, col] = target_bin
        hist_sample_num[target_bin, col] += 1
        hist_yield[target_bin, col] += sample_yield[row]
#debug
# for col in range(0,feat_num):
# hist_num_sum = 0
# hist_yield_sum = 0
# for row in range(0,max_bin_num):
# hist_num_sum += hist_sample_num[row,col]
# hist_yield_sum += hist_yield[row,col]
# print hist_num_sum
# print hist_yield_sum
# print col
#--------calc the mean yield------------
# Mean yield of every (bin, feature) cell; cells that received no sample
# keep the value 0 instead of dividing by zero.
hist_mean_yield = np.divide(
    hist_yield,
    hist_sample_num,
    out=np.zeros((max_bin_num, feat_num)),
    where=hist_sample_num != 0
)
#--------map the factor to the mean yield------------
# Replace each sample's bin number by the mean yield of that bin, feature by
# feature: element [i, j] is hist_mean_yield[factor_index[i, j], j].
sample_mapping_yield = hist_mean_yield[factor_index, np.arange(feat_num)]
# ------regression model sample_mapping_yield and sample_yield by factor-----
# 1. Fit OLS of sample_yield on all mapped yields plus an intercept.
# 2. Keep the features whose coefficient p-value is below p_value_thresh.
# 3. Refit on the kept features and write both summaries plus the final
#    model parameters to disk.
mapping_yield = sm.add_constant(sample_mapping_yield)
model_reg = sm.OLS(sample_yield, mapping_yield).fit()
# Drop the intercept's p-value so that p_values[k] belongs to feature k.
p_values = model_reg.pvalues[1:]

# The context manager closes the report even if the second fit raises.
with open('model_info.txt', 'w') as file_model_report:
    file_model_report.write(str(model_reg.summary()) + '\n')
    print(model_reg.summary())

    #---- feature selection and generate new model------
    p_value_thresh = 0.05
    # NaN p-values compare False and are therefore dropped.
    selected = np.asarray(p_values) < p_value_thresh
    new_feat_num = int(np.count_nonzero(selected))
    new_mapping_yield = sample_mapping_yield[:, selected]
    # Builtin int replaces the np.int alias, which newer NumPy no longer has.
    new_feat_index = np.asarray(feat_index)[selected].astype(int)

    mapping_yield = sm.add_constant(new_mapping_yield)
    model_reg = sm.OLS(sample_yield, mapping_yield).fit()
    p_values = model_reg.pvalues

    banner = '\n\n *********************The new model after feature selection********************'
    # Two spaces after the colon reproduce the original two-item print output.
    feature_line = 'Feature index:  %s' % (new_feat_index,)
    file_model_report.write(banner + '\n')
    file_model_report.write(feature_line + '\n')
    file_model_report.write(str(model_reg.summary()) + '\n')

print(banner)
print(feature_line)
print(model_reg.summary())
#--------output the model----------------
np.savetxt('model_para.txt', model_reg.params)
np.savetxt('model_factor_index.txt', new_feat_index)
np.savetxt('hist_mean_yield.txt', hist_mean_yield)
#--------calc the predict result-------------
# np.savetxt('mapping_yield.txt', mapping_yield)
predict_yield = model_reg.predict(mapping_yield)
#----------map the yield to the probability---------
# Piecewise map from predicted yield y to a probability:
#   y < -0.5       -> 0.01
#   -0.5 <= y < 0  -> 0.8 * y + 0.4
#   0 <= y < 1     -> 0.55 * y + 0.4
#   1 <= y < 4     -> 0.05/3 * y + 14/15
#   y >= 4         -> 1
# A NaN yield stays NaN.  Previously it failed every comparison and fell
# through to the final branch, so it was reported as probability 1.
yield_values = np.asarray(predict_yield, dtype=float)
pred_prob = np.select(
    [
        np.isnan(yield_values),
        yield_values < -0.5,
        yield_values < 0,
        yield_values < 1,
        yield_values < 4,
    ],
    [
        np.nan,
        0.01,
        0.8 * yield_values + 0.4,
        0.55 * yield_values + 0.4,
        0.05 / 3.0 * yield_values + 14.0 / 15.0,
    ],
    default=1.0
)
# np.savetxt('pred_prob.txt', pred_prob)
# np.savetxt('predict_yield.txt', predict_yield)
#-------------map the yield to the score-------------
# Piecewise-linear score.  Each entry is (exclusive upper bound, slope,
# intercept); the first entry whose bound exceeds the yield is applied.
# Yields that match no entry (>= 2, or NaN) use the open-ended last segment
# 2.5 * y + 90.
score_segments = (
    (0, 30, 30),
    (0.3, 100.0 / 3.0, 30),
    (0.45, 200.0 / 3.0, 20),
    (0.7, 80, 14),
    (0.9, 50, 35),
    (1.2, 100.0 / 3.0, 50),
    (2, 6.25, 82.5),
)
predict_score = np.zeros(sample_num)
for k in range(sample_num):
    pred_yield = predict_yield[k]
    slope, intercept = 2.5, 90
    for upper_bound, seg_slope, seg_intercept in score_segments:
        if pred_yield < upper_bound:
            slope, intercept = seg_slope, seg_intercept
            break
    predict_score[k] = slope * pred_yield + intercept
#---------output the predict result----------
# Write score, yield and probability back to seo_cd_factor, skipping NaNs.
# NOTE(review): the row is addressed by its position k in the earlier
# ORDER BY `index` result, which assumes the `index` column holds exactly
# 0 .. sample_num-1 -- confirm against the table.
# NOTE(review): host and credentials are hard-coded; consider moving them to
# configuration outside the source file.
db = MySQLdb.connect("10.91.24.140","thfund_cia","ciacia","cia" )
try:
    cursor = db.cursor()
    # (statement template, per-sample values), in the original write order.
    updates = (
        ("""Update seo_cd_factor Set score=%f Where `index`=%d""", predict_score),
        ("""Update seo_cd_factor Set yield=%f Where `index`=%d""", predict_yield),
        ("""Update seo_cd_factor Set probability=%f Where `index`=%d""", pred_prob),
    )
    for k in range(sample_num):
        for template, values in updates:
            if not math.isnan(values[k]):
                cursor.execute(template % (values[k], k))
    # Commit only after every update succeeded.
    db.commit()
finally:
    # Always release the connection, including when an UPDATE raises.
    db.close()
# #*******************predict part****************************
# #--------predict the yield of the testing data------------
# feat_num = new_feat_num
# feat_index = new_feat_index
# data = xlrd.open_workbook('TestData.xlsx')
# tb_factor_info = data.sheet_by_name('test')
# sample_num = tb_factor_info.nrows - 1
# test_data = np.zeros((sample_num, feat_num))
# for k in range(0,feat_num):
# index = feat_index[k]
# test_data[:,k] = tb_factor_info.col_values(index,1)
#
# predict_yield = np.ones(sample_num) * model_reg.params[0]
# for row in range(0,sample_num):
# for col in range(0,feat_num):
# src_f_index = feat_index[col]
# bin_num = int(hist_info[max_bin_num-1,src_f_index])
# factor_value = test_data[row,col]
# bin_index = -1
# for k in range(0,bin_num):
# if factor_value<=hist_info[k,src_f_index]:
# bin_index = k
# break
# if bin_index == -1:
# bin_index = bin_num
# mapping_yield = hist_mean_yield[bin_index, src_f_index]
# predict_yield[row] += model_reg.params[col + 1] * mapping_yield
#
# print predict_yield
#
# #----------map the yild to the probability---------
# pred_prob = np.zeros(sample_num)
# for k in range(0,sample_num):
# pred_yield = predict_yield[k]
# if pred_yield<-0.5:
# pred_prob[k] = 0.01
# elif pred_yield<0:
# pred_prob[k] = 0.8 * pred_yield + 0.4
# elif pred_yield<1:
# pred_prob[k] = 0.55 * pred_yield + 0.4
# elif pred_yield<4:
# pred_prob[k] = 0.05/3.0 * pred_yield + 14.0/15.0
# else:
# pred_prob[k] = 1
#
# print pred_prob
#
# #-------------map the yield to the score-------------
# data = xlrd.open_workbook('StandarData.xlsx')
# tb_score_info = data.sheet_by_name('score_mapping_info')
# score_bin_num = tb_score_info.nrows
# bin_yield = tb_score_info.col_values(0)
# bin_score = tb_score_info.col_values(1)
# predict_score = np.zeros(sample_num)
# for i in range(0,sample_num):
# pred_yield = predict_yield[i]
# for k in range(0,score_bin_num):
# if pred_yield<bin_yield[k]:
# predict_score[i] = bin_score[k]
# break
#
# print predict_score