1. 程式人生 > >AdditionalRadar-線性迴歸,因子篩選,多因子模型

AdditionalRadar-線性迴歸,因子篩選,多因子模型

# coding=utf-8
import numpy as np
import xlrd
import statsmodels.api as sm
import MySQLdb
import math


#*****************Load data part*****************
# Read the per-factor binning table from the 'hist_info' sheet of the
# workbook.  Sheet column 0 and the first three sheet rows are skipped; every
# remaining sheet column c is copied into hist_info[:, c - 1].
# NOTE(review): this assumes the sheet has at most feat_num + 1 columns;
# a wider sheet would index past the last column of hist_info -- confirm.
data = xlrd.open_workbook('StandarData.xlsx')
tb_hist_info = data.sheet_by_name('hist_info')

# Number of factors used by the model, and their original positions
# 0 .. feat_num - 1 stored as floats.
feat_num = 14
feat_index = np.arange(feat_num, dtype=float)

# One hist_info row per sheet row below the three skipped rows.
hist_info_rows = tb_hist_info.nrows - 3
hist_info = np.zeros((hist_info_rows, feat_num))
for sheet_col in range(1, tb_hist_info.ncols):
    hist_info[:, sheet_col - 1] = tb_hist_info.col_values(sheet_col, 3)


#----------------load data from SQL---------------
# Fetch every row of seo_cd_factor ordered by `index`.  Each row carries the
# 14 factor columns followed by the realised yield (seo_yield) as the last
# column.  English rendering of the SQL comments embedded in the query:
#   index_point      - market index level
#   dis_rate         - discount rate
#   issue_price      - issue price
#   collection       - amount raised
#   val_mv           - total market value
#   OBJECT_CODE      - issuance purpose code
#   zzlt_pe          - CSI free-float PE
#   zzlt_pb          - CSI free-float PB
#   DILUTED_RATIO    - dilution ratio
#   yoy_net_profit   - net profit year-over-year
#   holder_subs_rate - major shareholder subscription ratio
#   seo_yield        - private placement yield
sql = """SELECT
-- 大盤點位
t.index_point      ,
-- 折價率
t.dis_rate         ,
-- 發行價格
t.issue_price      ,
t.pe_ttm           ,
t.pe_lyr           ,
t.pb               ,
-- 募集金額
t.collection       ,
-- 總市值
t.val_mv           ,
-- 發行目的程式碼
t.OBJECT_CODE      ,
-- 中證流通PE
t.zzlt_pe          ,
-- 中證流通pb
t.zzlt_pb          ,
-- 攤薄比例
t.DILUTED_RATIO    ,
-- 淨利潤同比
t.yoy_net_profit   ,
-- 大股東認購比例
t.holder_subs_rate ,
-- 定增收益
t.seo_yield
         FROM seo_cd_factor t
         order by t.index ASC
;
"""
# NOTE(review): host, user and password are hard-coded here; consider moving
# them to configuration or environment variables.
db = MySQLdb.connect("10.91.24.140","thfund_cia","ciacia","cia" )
try:
    cursor = db.cursor()
    cursor.execute(sql)
    factor = cursor.fetchall()
finally:
    # Always release the connection, including when the query fails.
    db.close()

# One sample per fetched row; convert the tuple-of-tuples to a 2-D array so
# later sections can slice it by column.
sample_num = len(factor)
factor = np.array(factor)
#----------------load data from xlsx---------------
# tb_factor_info = data.sheet_by_name('factor')
# sample_num = tb_factor_info.nrows - 2
# factor_num = 16
# factor = np.zeros((sample_num, factor_num))
# for k in range(0,factor_num):
#     factor[:,k] = tb_factor_info.col_values(k,2)


#*****************model generate part*****************
#--------histogram the yield by factor bin--------
# For every factor column, assign each sample to a bin and accumulate the
# per-bin sample count and yield sum.
#
# Layout of hist_info for a column: the last row holds the number of bin
# edges (bin_num); rows 0 .. bin_num - 1 hold the edges themselves.  A sample
# goes to the first bin k whose edge satisfies value <= edge; a sample that
# matches no edge goes to the extra bin with index bin_num.
max_bin_num = hist_info_rows
hist_yield = np.zeros((max_bin_num, feat_num))
hist_sample_num = np.zeros((max_bin_num, feat_num))
factor_index = np.zeros((sample_num, feat_num), dtype='int32')

# The realised yield is the column right after the feat_num factor columns.
sample_yield = factor[:, feat_num]

for col in range(feat_num):
    bin_num = int(hist_info[max_bin_num - 1, col])
    for row in range(sample_num):
        factor_value = factor[row, col]
        # First edge that is >= the value, or the overflow bin if none is.
        bin_pos = next(
            (edge_pos for edge_pos in range(bin_num)
             if factor_value <= hist_info[edge_pos, col]),
            bin_num)
        factor_index[row, col] = bin_pos
        hist_sample_num[bin_pos, col] += 1
        hist_yield[bin_pos, col] += sample_yield[row]


#debug
# for col in range(0,feat_num):
#     hist_num_sum = 0
#     hist_yield_sum = 0
#     for row in range(0,max_bin_num):
#         hist_num_sum += hist_sample_num[row,col]
#         hist_yield_sum += hist_yield[row,col]
#     print hist_num_sum
#     print hist_yield_sum
#     print col


#--------calc the mean yield------------
# Mean yield of every (bin, factor) cell: yield sum divided by sample count.
# Cells that received no samples keep a mean of 0.
hist_mean_yield = np.zeros((max_bin_num, feat_num))
occupied = hist_sample_num != 0
hist_mean_yield[occupied] = hist_yield[occupied] / hist_sample_num[occupied]


#--------map the factor to the mean yield------------
# Replace every raw factor value by the mean yield of the bin it fell into:
# element [i, j] becomes hist_mean_yield[factor_index[i, j], j].  The column
# selector broadcasts against the (sample_num, feat_num) bin-index matrix.
factor_columns = np.arange(feat_num)
sample_mapping_yield = np.zeros((sample_num, feat_num))
sample_mapping_yield[:, :] = hist_mean_yield[factor_index, factor_columns]


# ------regression model sample_mapping_yield and sample_yield by factor-----
# Fit ordinary least squares of the realised yield on the bin-mapped factor
# yields plus a constant term, then record the fit summary both in
# 'model_info.txt' and on stdout.
mapping_yield = sm.add_constant(sample_mapping_yield)
model_reg = sm.OLS(sample_yield, mapping_yield).fit()

# Keep only the factor p-values.  This assumes add_constant placed the
# constant in column 0.
# NOTE(review): add_constant skips adding the constant when the input already
# has a constant column; p_values would then be one element short -- confirm
# the data cannot produce that.
p_values = model_reg.pvalues[1:]

# The report file stays open on purpose: the section that refits the model
# after feature selection appends to it and closes it.
file_model_report = open('model_info.txt','w')
full_model_summary = str(model_reg.summary())
# write()/print() with a single string emit the same text as the Python 2
# print statement while also being valid Python 3.
file_model_report.write(full_model_summary + '\n')
print(full_model_summary)


#---- feature selection and generate new model------
# Keep the factors whose p-value in the full model is below the threshold.
# Produces:
#   new_feat_num      - number of factors kept
#   new_mapping_yield - (sample_num, new_feat_num) design columns kept
#   new_feat_index    - original positions of the kept factors (integers)
p_value_thresh = 0.05
selected_cols = np.array(
    [k for k in range(feat_num) if p_values[k] < p_value_thresh],
    dtype=int)
new_feat_num = len(selected_cols)
new_mapping_yield = sample_mapping_yield[:, selected_cols]
# Builtin int replaces the np.int alias, which newer NumPy releases removed.
new_feat_index = feat_index[selected_cols].astype(int)


# Refit OLS on the factors kept by feature selection (plus a constant),
# append the result to the already-open report file, close that file, and
# finally echo the same report on stdout.
try:
    mapping_yield = sm.add_constant(new_mapping_yield)
    model_reg = sm.OLS(sample_yield, mapping_yield).fit()
    p_values = model_reg.pvalues

    # Each entry is one line of output.  Joining with a single space
    # reproduces the separator the Python 2 print statement put between
    # its arguments.
    report_lines = [
        '\n\n *********************The new model after feature selection********************',
        ' '.join(('Feature index: ', str(new_feat_index))),
        str(model_reg.summary()),
    ]
    for report_line in report_lines:
        file_model_report.write(report_line + '\n')
finally:
    # Close the report even when the refit or the summary fails.
    file_model_report.close()


for report_line in report_lines:
    print(report_line)


#--------output the model----------------
# Persist the fitted coefficients, the original positions of the selected
# factors and the per-bin mean-yield table as plain-text files.
model_outputs = (
    ('model_para.txt', model_reg.params),
    ('model_factor_index.txt', new_feat_index),
    ('hist_mean_yield.txt', hist_mean_yield),
)
for out_path, out_values in model_outputs:
    np.savetxt(out_path, out_values)


#--------calc the predict result-------------
# np.savetxt('mapping_yield.txt', mapping_yield)
predict_yield = model_reg.predict(mapping_yield)
#----------map the yield to the probability---------
# Piecewise mapping from predicted yield y to a probability:
#   y is NaN        -> NaN (kept as NaN instead of falling into the last
#                      branch and becoming 1, so NaN checks on pred_prob work)
#   y < -0.5        -> 0.01
#   -0.5 <= y < 0   -> 0.8 * y + 0.4
#   0 <= y < 1      -> 0.55 * y + 0.4
#   1 <= y < 4      -> 0.05/3 * y + 14/15
#   y >= 4          -> 1
# np.select takes the first matching condition, so the order below matters.
# NOTE(review): at exactly y = -0.5 the linear segment gives 0.0 while values
# just below give 0.01 -- confirm this step is intended.
yield_values = np.asarray(predict_yield, dtype=float)
pred_prob = np.select(
    [
        np.isnan(yield_values),
        yield_values < -0.5,
        yield_values < 0,
        yield_values < 1,
        yield_values < 4,
    ],
    [
        np.nan,
        0.01,
        0.8 * yield_values + 0.4,
        0.55 * yield_values + 0.4,
        0.05/3.0 * yield_values + 14.0/15.0,
    ],
    default=1.0)


# np.savetxt('pred_prob.txt', pred_prob)
# np.savetxt('predict_yield.txt', predict_yield)


#-------------map the yield to the score-------------
# Piecewise-linear mapping from predicted yield y to a score.  The segments
# are tried in order and the first whose upper bound exceeds y is used:
#   y < 0     -> 30 * y + 30
#   y < 0.3   -> 100/3 * y + 30
#   y < 0.45  -> 200/3 * y + 20
#   y < 0.7   -> 80 * y + 14
#   y < 0.9   -> 50 * y + 35
#   y < 1.2   -> 100/3 * y + 50
#   y < 2     -> 6.25 * y + 82.5
#   otherwise -> 2.5 * y + 90   (a NaN yield also lands here and stays NaN)
score_input = np.asarray(predict_yield, dtype=float)
segment_bounds = [
    score_input < 0,
    score_input < 0.3,
    score_input < 0.45,
    score_input < 0.7,
    score_input < 0.9,
    score_input < 1.2,
    score_input < 2,
]
segment_scores = [
    30 * score_input + 30,
    100.0/3.0 * score_input + 30,
    200.0/3.0 * score_input + 20,
    80 * score_input + 14,
    50 * score_input + 35,
    100.0/3.0 * score_input + 50,
    6.25 * score_input + 82.5,
]
predict_score = np.select(
    segment_bounds, segment_scores, default=2.5 * score_input + 90)


#---------output the predict result----------
# Write score, predicted yield and probability back to seo_cd_factor, one
# UPDATE per value, skipping values that are NaN.  All updates are committed
# together at the end.
# NOTE(review): rows are addressed by their zero-based position k, which
# presumes the table's `index` column holds exactly 0 .. sample_num - 1 in
# the order the samples were loaded -- confirm.
# NOTE(review): host, user and password are hard-coded here; consider moving
# them to configuration or environment variables.
result_updates = (
    ("""Update seo_cd_factor Set score=%f Where `index`=%d""", predict_score),
    ("""Update seo_cd_factor Set yield=%f Where `index`=%d""", predict_yield),
    ("""Update seo_cd_factor Set probability=%f Where `index`=%d""", pred_prob),
)
db = MySQLdb.connect("10.91.24.140","thfund_cia","ciacia","cia" )
try:
    cursor = db.cursor()
    for k in range(0, sample_num):
        for sql_template, values in result_updates:
            if not math.isnan(values[k]):
                sql = sql_template % (values[k], k)
                cursor.execute(sql)
    db.commit()
finally:
    # Release the connection even if an UPDATE or the commit fails.
    db.close()








# #*******************predict part****************************
# #--------predict the yield of the testing data------------
# feat_num = new_feat_num
# feat_index = new_feat_index
# data = xlrd.open_workbook('TestData.xlsx')
# tb_factor_info = data.sheet_by_name('test')
# sample_num = tb_factor_info.nrows - 1
# test_data = np.zeros((sample_num, feat_num))
# for k in range(0,feat_num):
#     index = feat_index[k]
#     test_data[:,k] = tb_factor_info.col_values(index,1)
#
# predict_yield = np.ones(sample_num) * model_reg.params[0]
# for row in range(0,sample_num):
#     for col in range(0,feat_num):
#         src_f_index = feat_index[col]
#         bin_num = int(hist_info[max_bin_num-1,src_f_index])
#         factor_value = test_data[row,col]
#         bin_index = -1
#         for k in range(0,bin_num):
#             if factor_value<=hist_info[k,src_f_index]:
#                 bin_index = k
#                 break
#         if bin_index == -1:
#             bin_index = bin_num
#         mapping_yield = hist_mean_yield[bin_index, src_f_index]
#         predict_yield[row] += model_reg.params[col + 1] * mapping_yield
#
# print predict_yield
#
# #----------map the yield to the probability---------
# pred_prob = np.zeros(sample_num)
# for k in range(0,sample_num):
#     pred_yield = predict_yield[k]
#     if pred_yield<0.5:
#         pred_prob[k] = 0.01
#     elif pred_yield<0:
#         pred_prob[k] = 0.8 * pred_yield + 0.4
#     elif pred_yield<1:
#         pred_prob[k] = 0.55 * pred_yield + 0.4
#     elif pred_yield<4:
#         pred_prob[k] = 0.05/3 * pred_yield + 14/15
#     else:
#         pred_prob[k] = 1
#
# print pred_prob
#
# #-------------map the yield to the score-------------
# data = xlrd.open_workbook('StandarData.xlsx')
# tb_score_info = data.sheet_by_name('score_mapping_info')
# score_bin_num = tb_score_info.nrows
# bin_yield = tb_score_info.col_values(0)
# bin_score = tb_score_info.col_values(1)
# predict_score = np.zeros(sample_num)
# for i in range(0,sample_num):
#     pred_yield = predict_yield[i]
#     for k in range(0,score_bin_num):
#         if pred_yield<bin_yield[k]:
#             predict_score[i] = bin_score[k]
#             break
#
# print predict_score