Kaggle Introduction (NLP): Predicting Stock Rises and Falls from a News Corpus
阿新 · Published: 2019-02-20
import pandas as pd
import numpy as np
import warnings
import random
from matplotlib import pyplot
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import Series
from datetime import datetime  # pandas.datetime was removed; import from the standard library instead
from pandas.plotting import scatter_matrix, autocorrelation_plot  # pandas.tools.plotting was removed; use pandas.plotting
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.metrics import roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA  # statsmodels.tsa.arima_model has been removed
from xgboost import XGBClassifier
Whenever you load data, check its size and data types first to get a general sense of it.
sentence_file = "combined_stock_data.csv"
sentence_df = pd.read_csv(sentence_file, parse_dates=[1])  # parse_dates specifies which column to parse as the time-series date column
print(sentence_df.shape)
print(sentence_df.dtypes)
print(sentence_df)
stock_prices = "DJIA_table.csv"
stock_data = pd.read_csv(stock_prices, parse_dates=[0])
stock_data.head()  # by checking the head or tail we can have an overview of the data
# You will notice that Volume is an int; to keep all data types consistent
# it should be converted to float
print(stock_data.shape)
print(stock_data.dtypes)
merged_dataframe = sentence_df[['Date', 'Label', 'Subjectivity', 'Objectivity', 'Positive', 'Negative', 'Neutral']].merge(stock_data, how='inner', on='Date')  # pandas does not allow combining on='Date' with left_index=True, so merge on 'Date' only
print(merged_dataframe.shape)
merged_dataframe.head()
merged_dataframe['Volume']=merged_dataframe['Volume'].astype(float)
print(merged_dataframe.dtypes)
#recheck
cols=list(merged_dataframe)
print(cols)
cols.append(cols.pop(cols.index('Label')))
merged_dataframe = merged_dataframe.loc[:, cols]  # .ix was removed from pandas; use .loc
merged_dataframe.head()
del merged_dataframe['Volume']
merged_dataframe.head()
# Everything up to this point is data preparation:
# merging the datasets, unifying data types, and reordering columns
# Next we check data quality,
# looking at the impact of missing values and extreme values
# describe() gives a high-level view of data quality
print(merged_dataframe.describe())
# Plot histograms of every column to inspect the distributions
# x-axis: column values, y-axis: frequency
sns.set()
merged_dataframe.hist(sharex = False, sharey = False, xlabelsize = 4, ylabelsize = 4, figsize=(13, 13))
pyplot.show()
# If a distribution is very uneven, apply a further transformation to make it more even
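For illustration only (this is not part of the original notebook): if a column such as Subjectivity turned out to be heavily skewed, a power transform is one way to even it out.
# Illustrative sketch, assuming we transform the Subjectivity column
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method='yeo-johnson')  # Yeo-Johnson also handles zero and negative values
# drop NaNs because the missing values have not been filled yet at this point
transformed_col = pt.fit_transform(merged_dataframe[['Subjectivity']].dropna())
pd.Series(transformed_col.ravel()).hist()
pyplot.show()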
pyplot.scatter(merged_dataframe['Subjectivity'],merged_dataframe['Label'])
pyplot.xlabel('Subjectivity')
pyplot.ylabel('Stock Price Up or Down 0: Down, 1: Up')
pyplot.show()
pyplot.scatter(merged_dataframe['Objectivity'], merged_dataframe['Label'])
pyplot.xlabel('Objectivity')
pyplot.ylabel('Stock Price Up or Down 0: Down, 1: Up')
pyplot.show()
merged_dataframe['Subjectivity'].plot(kind='hist')
pyplot.xlabel('Subjectivity')
pyplot.ylabel('Frequency')
pyplot.show()
merged_dataframe['Objectivity'].plot(kind='hist')
pyplot.xlabel('Objectivity')
pyplot.ylabel('Frequency')
pyplot.show()
print("Size of the Labels column")
print(merged_dataframe.groupby('Label').size())
# How do we find out the relationships between the columns?
# Draw a correlation map
# plot a heat map and a scatter matrix
# Some machine learning algorithms require that features are not strongly correlated; in that case dimensionality reduction is needed
colormap = pyplot.cm.afmhot
pyplot.figure(figsize=(16,12))
pyplot.title('Pearson correlation of continuous features', y=1.05, size=15)
sns.heatmap(merged_dataframe.corr(),linewidths=0.1,vmax=1.0, square=True,
cmap=colormap, linecolor='white', annot=True)
pyplot.show()
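If a model did require removing strongly correlated features, one way to find the offending pairs from this correlation matrix is sketched below (my addition, not the author's code; the 0.9 threshold is arbitrary).
# Sketch: list feature pairs whose absolute Pearson correlation exceeds 0.9
corr = merged_dataframe.select_dtypes(include=[np.number]).corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair only once
pairs = upper.stack()
print(pairs[pairs > 0.9])  # typically the Open/High/Low/Close/Adj Close columns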
md_copy = merged_dataframe.copy()  # assumed: md_copy is a copy of merged_dataframe; its definition is not shown in the original post
md_copy.index = md_copy.index.sort_values()  # important!
merged_dataframe = md_copy
print(merged_dataframe.dtypes)
print(merged_dataframe.count())
# Change the NaN values to the mean value of that column
nan_list = ['Subjectivity', 'Objectivity', 'Positive', 'Negative', 'Neutral']
for col in nan_list:
merged_dataframe[col] = merged_dataframe[col].fillna(merged_dataframe[col].mean())
# Recheck the count
print(merged_dataframe.count())
# Separate the dataframe for input(X) and output variables(y)
X = merged_dataframe.loc[:,'Subjectivity':'Adj Close']
y = merged_dataframe.loc[:,'Label']
# A validation size of 20% is defined here but not used below; the actual split is chronological, 70/30
validation_size = 0.20
# Split the dataset into train and test sets:
# use the initial 70% of the data as the training set and the remaining 30% as the testing set
train_size = int(len(X.index) * 0.7)
print(len(y))
print(train_size)
# Note: .loc slicing is end-inclusive, so the row at index train_size ends up in both the train and test sets here
X_train, X_test = X.loc[0:train_size, :], X.loc[train_size: len(X.index), :]
y_train, y_test = y[0:train_size+1], y.loc[train_size: len(X.index)]
print('Observations: %d' % (len(X.index)))
print('X Training Observations: %d' % (len(X_train.index)))
print('X Testing Observations: %d' % (len(X_test.index)))
print('y Training Observations: %d' % (len(y_train)))
print('y Testing Observations: %d' % (len(y_test)))
pyplot.plot(X_train['Objectivity'])
pyplot.plot([None for i in X_train['Objectivity']] + [x for x in X_test['Objectivity']])
pyplot.show()
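TimeSeriesSplit is imported at the top but never used. For time-ordered data like this it produces expanding-window folds in which the test fold always comes after the training fold; a minimal sketch of how it could replace the single 70/30 cut (my assumption, not the original code):
# Sketch: expanding-window splits that respect the time ordering
tscv = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    print("Fold %d: train %d rows, test %d rows" % (fold, len(train_idx), len(test_idx)))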
num_folds = 10
scoring = 'accuracy'
# Append the models to the models list
models = []
models.append(('LR' , LogisticRegression()))
models.append(('LDA' , LinearDiscriminantAnalysis()))
models.append(('KNN' , KNeighborsClassifier()))
models.append(('CART' , DecisionTreeClassifier()))
models.append(('NB' , GaussianNB()))
models.append(('SVM' , SVC()))
models.append(('RF' , RandomForestClassifier(n_estimators=50)))
models.append(('XGBoost', XGBClassifier()))
Date            datetime64[ns]
Subjectivity           float64
Objectivity            float64
Positive               float64
Negative               float64
Neutral                float64
Open                   float64
High                   float64
Low                    float64
Close                  float64
Adj Close              float64
Label                    int64
dtype: object
Date            1989
Subjectivity    1986
Objectivity     1986
Positive        1986
Negative        1986
Neutral         1986
Open            1989
High            1989
Low             1989
Close           1989
Adj Close       1989
Label           1989
dtype: int64
Date            1989
Subjectivity    1989
Objectivity     1989
Positive        1989
Negative        1989
Neutral         1989
Open            1989
High            1989
Low             1989
Close           1989
Adj Close       1989
Label           1989
dtype: int64
1989
1392
Observations: 1989
X Training Observations: 1393
X Testing Observations: 597
y Training Observations: 1393
y Testing Observations: 597
results =[]
for name, model in models:
    clf = model
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accu_score = accuracy_score(y_test, y_pred)
    print(name + ":" + str(accu_score))
# LR and LDA score the highest here
# But is the LDA result trustworthy??
LR:0.9715242881072027
LDA:0.9413735343383585
KNN:0.609715242881072
CART:0.5209380234505863
NB:0.49748743718592964
SVM:0.5309882747068677
RF:0.5527638190954773
XGBoost:0.5862646566164154
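num_folds and scoring were defined above but never used; presumably they were intended for a cross-validated comparison rather than a single split. A sketch of that comparison (my assumption; note that shuffled folds ignore the time ordering):
# Sketch: 10-fold cross-validated accuracy for each candidate model on the training set
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
for name, model in models:
    cv_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print("%s: %.3f (+/- %.3f)" % (name, cv_scores.mean(), cv_scores.std()))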
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model_lda = LinearDiscriminantAnalysis()
model_lda.fit(rescaledX,y_train)
#estimate accuracy on validation dataset
rescaledValidationX = scaler.transform(X_test)
predictions = model_lda.predict(rescaledValidationX)
print("accuracy score:")
print(accuracy_score(y_test,predictions))
print("confusion matrix")
print(confusion_matrix(y_test,predictions))
print("classification report")
print(classification_report(y_test,predictions))  # very useful: reports precision, recall, and F1 for each class
accuracy score:
0.9413735343383585
confusion matrix
[[252  28]
 [  7 310]]
classification report
             precision    recall  f1-score   support

          0       0.97      0.90      0.94       280
          1       0.92      0.98      0.95       317

avg / total       0.94      0.94      0.94       597
# When the features are on very different ranges, it is best to normalize them with feature scaling
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model_xgb = XGBClassifier()
model_xgb.fit(rescaledX,y_train)
rescaledValidationX = scaler.transform(X_test)
predictions = model_xgb.predict(rescaledValidationX)
print("accuracy score:")
print(accuracy_score(y_test, predictions))
print("confusion matrix: ")
print(confusion_matrix(y_test, predictions))
print("classification report: ")
print(classification_report(y_test, predictions))
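Pipeline is imported at the top but unused; it can bundle the scaler and the classifier so the scale-then-fit steps above are not repeated by hand, and the scaler is only ever fit on training data. A sketch (not in the original post):
# Sketch: chain StandardScaler and XGBClassifier into one estimator
pipe = Pipeline([('scaler', StandardScaler()), ('xgb', XGBClassifier())])
pipe.fit(X_train, y_train)  # the scaler is fit on X_train only, then applied to X_test inside predict
print("Pipeline accuracy:", accuracy_score(y_test, pipe.predict(X_test)))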
# Next, plot the ROC curve and compute AUC to judge whether the LDA result is trustworthy
#generating the roc curve
y_pred_proba = model_lda.predict_proba(rescaledValidationX)[:,1]  # use the scaled test set, since the model was fit on scaled data
# The value at row i, column j is the model's predicted probability that sample i belongs to class j; each row sums to 1.
fpr,tpr,thresholds = roc_curve(y_test,y_pred_proba)
roc_auc = auc(fpr,tpr)
#plot ROC curve
print("roc auc is :" +str(roc_auc))
pyplot.plot([0,1],[0,1],'k--')
pyplot.plot(fpr,tpr)
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.title('ROC Curve')
pyplot.show()
kfold_val = KFold(n_splits=num_folds, shuffle=True, random_state=42)  # recent sklearn requires shuffle=True when random_state is set
auc_score = cross_val_score(model_lda,X_test,y_test,cv=kfold_val,scoring = 'roc_auc')
print("AUC using cross val: "+str(auc_score))
mean_auc = np.mean(auc_score)
print("Mean AUC score is: "+str(mean_auc))
#Scaling Random Forests
model_rf = RandomForestClassifier(n_estimators=1000)
model_rf.fit(rescaledX,y_train)
#estimate accuracy on validation dataset
rescaledValidationX = scaler.transform(X_test)
predictions = model_rf.predict(rescaledValidationX)
print("accuracy score:")
print(accuracy_score(y_test, predictions))
print("confusion matrix: ")
print(confusion_matrix(y_test, predictions))
print("classification report: ")
print(classification_report(y_test, predictions))
accuracy score:
0.5644891122278057
confusion matrix:
[[102 178]
 [ 82 235]]
classification report:
             precision    recall  f1-score   support

          0       0.55      0.36      0.44       280
          1       0.57      0.74      0.64       317

avg / total       0.56      0.56      0.55       597
#fine tuning XGBoost
#The main hyperparameters to tune are n_estimators and max_depth
#n_estimators: XGBoost is an additive model; many trees are built on different samples of the data, and the model learns from each round of training. The optimal number of estimators is usually unknown in advance, so the practical way to find it is to train with different values and compare.
#model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
#ignore warnings
import warnings
warnings.filterwarnings("ignore")
n_estimators = [150,200,250,300,450,500,550,600,800,1000]
max_depth=[i for i in range(1,12)]
print(max_depth)
#initialize best depth,best score,best estimator
max_score = 0
best_depth = 0
best_estimator = 0
for n in n_estimators:
    for md in max_depth:
        model = XGBClassifier(n_estimators=n, max_depth=md)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        if score > max_score:
            max_score = score
            best_depth = md
            best_estimator = n
        #print("score is: " + str(score) + " at depth of " + str(md) + " and estimator " + str(n))
print("Best score is " + str(max_score) + " at depth of " + str(best_depth) + " and estimator of " + str(best_estimator))
#The results show that depth 3 and 500 estimators work best; accuracy rises to 0.616
#This step takes a while, so be patient
#?? the accuracy actually dropped
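The same search can also be expressed with GridSearchCV, which was imported earlier but never used; unlike the loop above, it scores each combination by cross-validation on the training set rather than on the test set. A sketch (my addition; with 110 combinations it is also slow):
# Sketch: grid search over the same n_estimators and max_depth values
param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth}
grid = GridSearchCV(XGBClassifier(), param_grid, scoring='accuracy', cv=3)
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_, "best CV accuracy:", grid.best_score_)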
imp_features_df = merged_dataframe[['Low', "Neutral", 'Close', 'Objectivity']]
Xi_train, Xi_test = imp_features_df.loc[0:train_size, :], imp_features_df.loc[train_size: len(X.index), :]
clf = XGBClassifier(n_estimators=500, max_depth=3)
clf.fit(Xi_train, y_train)
yi_pred = clf.predict(Xi_test)
score = accuracy_score(y_test, yi_pred)
print("Score is "+ str(score))
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit(X)
transformed = pca.transform(X)
print(transformed.shape)
print(type(transformed))
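It is worth checking how much of the variance the 3 components actually retain (and note that fitting PCA on all of X, including the test rows, leaks information from the test set); a quick check:
# Sketch: proportion of variance captured by each principal component
print(pca.explained_variance_ratio_)
print("Total variance explained:", pca.explained_variance_ratio_.sum())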
pca_df = pd.DataFrame(transformed)
X_train_pca,X_test_pca = pca_df.loc[0:train_size,:],pca_df.loc[train_size:len(X.index),:]
clf = XGBClassifier(n_estimators = 500,max_depth = 3)
clf.fit(X_train_pca,y_train)
y_pred_pca =clf.predict(X_test_pca)
score = accuracy_score(y_test,y_pred_pca)
print("Score is "+str(score))
#But the score alone is not necessarily trustworthy; we still need to look at the confusion_matrix and the classification_report
pca_matrix = confusion_matrix(y_test,y_pred_pca)
pca_report = classification_report(y_test,y_pred_pca)
print("Confusion Matrix: \n" + str(pca_matrix))
print("Classification report: \n" + str(pca_report))
Score is 0.9547738693467337
Confusion Matrix:
[[266  14]
 [ 13 304]]
Classification report:
             precision    recall  f1-score   support

          0       0.95      0.95      0.95       280
          1       0.96      0.96      0.96       317

avg / total       0.95      0.95      0.95       597
y_pred_proba_pca = clf.predict_proba(X_test_pca)[:,1]
fpr,tpr,thresholds = roc_curve(y_test,y_pred_proba_pca)
roc_auc = auc(fpr,tpr)
print("AUC score is "+str(roc_auc))
print("roc auc is : "+str(roc_auc))
pyplot.plot([0, 1], [0, 1], 'k--')
pyplot.plot(fpr, tpr)
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.title('ROC Curve')
pyplot.show()
AUC score is 0.9857875168995043
roc auc is : 0.9857875168995043
This suggests the result is fairly trustworthy.