對文字抽取詞袋模型特徵
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: each feature is the occurrence count of one word.
vec = CountVectorizer(
    analyzer='word',  # build features from word tokens (not character n-grams)
    max_features=4000,  # keep only the 4000 most frequent words in the vocabulary
)
# Learn the bag-of-words vocabulary from x_train and build the training
# matrix in one pass. fit_transform() yields the same matrix as fit()
# followed by transform(), without tokenising the training corpus twice,
# and leaves `vec` fitted for later transform() calls.
train_features = vec.fit_transform(x_train)
classifier = MultinomialNB()
# train_features has shape [n_samples, n_features]; n_features is capped by
# the vectorizer's max_features setting (4000 here).
classifier.fit(train_features, y_train)
# Rebuild the vectorizer so that it also counts multi-word n-grams
# (word sequences of length 1 through 4) instead of single words only.
vec = CountVectorizer(
    max_features=20000,  # keep only the 20000 most frequent n-grams
    ngram_range=(1, 4),  # n-gram lengths from 1 to 4, both inclusive
    analyzer='word',  # n-grams are built from word tokens
)
更可靠的驗證方式是交叉驗證,但交叉驗證最好保證每一份(fold)裡的樣本類別分佈也相對均衡,因此我們這裡使用 StratifiedKFold。
from sklearn.cross_validation import StratifiedKFold
# x is the training data and y holds the labels; the 4:1 train_index :
# test_index ratio presumably corresponds to n_folds == 5 (n_folds is set
# outside this view — TODO confirm).
# NOTE(review): passing y and n_folds to the constructor is the legacy
# sklearn.cross_validation API imported above; that module was removed in
# scikit-learn 0.20 in favour of sklearn.model_selection — confirm the pinned
# scikit-learn version before upgrading.
stratifiedk_fold = StratifiedKFold(y, n_folds=n_folds, shuffle=shuffle)
for train_index, test_index in stratifiedk_fold:
X_train, X_test = x[train_index], x[test_index]
y_train = y[train_index]