NLP中kaggle比賽例項《每日新聞對股票市場的預測》進階版
阿新 • • 發佈:2019-01-02
這篇比基礎版加了什麼呢?
基礎版是直接將文字放入TF-IDF,雖然簡單方便,但還是不夠嚴謹。我們可以對資料做進一步的預處理:先將文字小寫化,把文字切分成小的tokens,接著刪去如the、a、that等停用詞(stop words),再用正則表示式刪除含有數字的詞。用這些手段將資料處理乾淨後,再進行特徵提取。
# Advanced version of the "daily news -> stock market" Kaggle example.
#
# Pipeline: load the CSV, merge all headline columns into one text field,
# split into train/test by date, clean the text (lowercase, strip quotes,
# tokenize, drop stop words and tokens containing digits, lemmatize),
# vectorize with TF-IDF, train an RBF-kernel SVM and report ROC AUC.
import re
from datetime import date

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

# Path of the dataset CSV. The original post left this blank; it must be
# filled in before the script can run.
DATA_PATH = ''

# Load the data.
data = pd.read_csv(DATA_PATH)

# Merge every "Top*" headline column into a single text field per row so that
# all of the day's news is considered. Joining the individual cell values
# (rather than the repr of the whole array) avoids leaking brackets and
# line wraps from NumPy's array formatting into the text.
data['combined_news'] = data.filter(regex='Top.*').apply(
    lambda row: ' '.join(str(value) for value in row.values), axis=1
)

# Split into train / test sets by date (string comparison on the Date column).
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']

# NOTE(review): the original script used y_train / y_test without defining
# them. 'Label' is assumed to be the name of the target column -- confirm
# against the CSV header.
y_train = train['Label'].values
y_test = test['Label'].values


def _tokenize(text_column):
    """Lowercase the text, strip quote characters and split on whitespace."""
    return (
        text_column.str.lower()
        .str.replace('"', '', regex=False)
        .str.replace("'", '', regex=False)
        .str.split()
    )


# Text preprocessing: each entry becomes a list of lowercase tokens.
X_train = _tokenize(train['combined_news'])
X_test = _tokenize(test['combined_news'])

# Stop words to drop. A set gives O(1) membership tests in check().
# Requires the NLTK 'stopwords' corpus to be installed.
stop = set(stopwords.words('english'))

# Compiled once: matches any token that contains a digit.
_DIGIT_RE = re.compile(r'\d')


def hasNumbers(inputStrings):
    """Return True if the given string contains at least one digit."""
    return bool(_DIGIT_RE.search(inputStrings))


# Lemmatizer (requires the NLTK 'wordnet' corpus to be installed).
wordnet_lemmatizer = WordNetLemmatizer()


def check(word):
    """Return True if the word should be kept, False if it should be dropped.

    A word is dropped when it is a stop word or contains a digit.
    """
    return word not in stop and not hasNumbers(word)


def _clean_tokens(tokens):
    """Drop unwanted tokens and lemmatize the remaining ones."""
    return [wordnet_lemmatizer.lemmatize(item) for item in tokens if check(item)]


# Run the whole cleaning step over both sets. (The original applied it to
# X_train twice and never to X_test.)
X_train = X_train.apply(_clean_tokens)
X_test = X_test.apply(_clean_tokens)

# External libraries such as sklearn only accept string input, so turn each
# token list back into a string. Tokens must be separated by a space,
# otherwise the whole document collapses into one giant "word".
X_train = X_train.apply(' '.join)
X_test = X_test.apply(' '.join)

# Fit the TF-IDF vocabulary on the training set only, then reuse it for the
# test set so both matrices share the same feature space.
feature_extraction = TfidfVectorizer(lowercase=False)
X_train = feature_extraction.fit_transform(X_train.values)
X_test = feature_extraction.transform(X_test.values)

# Train the model.
clf = SVC(probability=True, kernel='rbf')
clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second class in
# clf.classes_, used here as the score for ROC AUC.
predictions = clf.predict_proba(X_test)
print('ROC_AUC yieds' + str(roc_auc_score(y_test, predictions[:, 1])))