利用skift實現fasttext模型
阿新 • • 發佈:2018-12-17
skift: 用於Python fastText的scikit-learn 包裝器
什麼是 skift?
skift包括幾個scikit-learn相容包裝器,裡面封裝了fasttext模型,fasttext原理類似於word2vec,主要用於文字快速分類。其優勢在於分類速度快,使用n-gram特徵容易獲得文字句子區域性資訊、構造新詞。缺點是隨著語料的增長,記憶體需求也會增長。那麼如何解決記憶體問題呢?fasttext提出三種解決方法,包括
- 過濾掉出現次數少的詞;
- 使用Hash儲存
- 採用word粒度,而非char粒度
例如句子: 我喜歡去中國, 如果採用char粒度,則使用2-gram的話,產生的特徵為
我喜 喜歡 歡去 去中 中國
如果採用word粒度的話,產生的特徵為
我喜歡 喜歡去 去中國
關於fasttext原理比較好的參考有FastText文字分類演算法學習筆記和FastText的內部機制,這裡不詳闡述。
下面使用skift實現fastText來做細粒度情感分析
from tqdm import tqdm
from skift import FirstColFtClassifier
from sklearn.model_selection import KFold
import numpy as np
import os
import pickle
class BasicModel(object):
    """Base model exposing mini-batch iteration and a micro-F1 helper."""

    def __init__(self):
        # No state of its own; subclasses add what they need.
        pass

    def create_model(self, kfold_X_train, y_train, kfold_X_test, y_test, test):
        # Hook: subclasses build and return their concrete estimator here.
        pass

    def batch_iter(self, data, batch_size, num_epochs=1, shuffle=True):
        """Yield mini-batches of `data` for `num_epochs` epochs.

        With `shuffle` the order is re-permuted at the start of each epoch.
        The last batch of an epoch may hold fewer than `batch_size` items.
        """
        samples = np.array(data)
        n_samples = len(samples)
        n_batches = int((n_samples - 1) / batch_size) + 1
        for _ in range(num_epochs):
            if shuffle:
                order = np.random.permutation(np.arange(n_samples))
                epoch_data = samples[order]
            else:
                epoch_data = samples
            for b in range(n_batches):
                lo = b * batch_size
                hi = min((b + 1) * batch_size, n_samples)
                yield epoch_data[lo:hi]

    def get_f1_score(self, x, y, verbose=False):
        """Return an F1 score where label 0 means "negative".

        x is the predicted label array, y the gold label array. A false
        positive is either flagging a negative sample or assigning the
        wrong positive class (over-/mis-judged); a false negative is a
        positive sample predicted as 0 (missed).
        """
        tp = np.sum(np.logical_and(y > 0, x == y))
        # Over-judged or mis-judged predictions.
        fp = np.sum(np.logical_and(x > 0, y == 0)) + np.sum(np.logical_and(x * y > 0, y != x))
        # Missed positives.
        fn = np.sum(np.logical_and(y > 0, x == 0))
        precision = float(tp) / (float(tp + fp) + 1e-8)
        recall = float(tp) / (float(tp + fn) + 1e-8)
        f1 = 2 * precision * recall / (precision + recall + 1e-8)
        if verbose:
            print('P->', precision)
            print('R->', recall)
            print('F->', f1)
        return f1
class BasicStaticModel(BasicModel):
    """K-fold cross-validation driver for per-label models.

    Trains one model per label column (10 tasks, 4 classes each — TODO
    confirm these counts against the dataset), accumulates out-of-fold
    probabilities for the training set and fold-averaged probabilities
    for the test set, then pickles both under ../data/result-ml/.
    """

    def __init__(self, config=None, n_folds=5, name='BasicStaticModel'):
        # config: optional settings object; only its `is_debug` flag is read.
        self.n_folds = n_folds
        self.name = name
        self.config = config
        # Fixed random_state keeps fold splits reproducible across runs.
        self.kf = KFold(n_splits=n_folds, shuffle=True, random_state=10)

    def train_predict(self, train, train_y, test, option=None):
        """Run k-fold training/validation and persist the predictions.

        train:   (n_train, ...) feature array
        train_y: (n_train, 10) integer label matrix, one column per task
        test:    (n_test, ...) feature array
        option:  unused; kept for interface compatibility
        """
        name = self.name
        # Fold-averaged class probabilities for the test set.
        predict = np.zeros((test.shape[0], 10, 4))
        # Out-of-fold class probabilities for the training set.
        oof_predict = np.zeros((train.shape[0], 10, 4))
        scores_f1 = []
        for train_index, dev_index in self.kf.split(train):
            kfold_X_train, kfold_X_val = train[train_index], train[dev_index]
            y_train, y_dev = train_y[train_index], train_y[dev_index]
            model_dict = {}
            print('start train model:')
            for idx in tqdm(range(10)):
                label = y_train[:, idx]
                model = self.create_model()
                model.fit(kfold_X_train, label)
                model_dict[idx] = model
            print('complete train model')
            print('start validate model')
            f1_scores = []
            for idx in tqdm(range(10)):
                label_dev = y_dev[:, idx]
                model = model_dict[idx]
                dev_prob = model.predict_proba(kfold_X_val)
                test_prob = model.predict_proba(test)
                oof_predict[dev_index, idx] = dev_prob
                # Each fold contributes 1/n_folds of the final test probability.
                predict[:, idx] += test_prob / self.n_folds
                dev_predict = np.argmax(dev_prob, 1)
                f1_scores.append(self.get_f1_score(dev_predict, label_dev))
            f1_score = np.mean(f1_scores)
            scores_f1.append(f1_score)
            print('f1_scores-> ', f1_scores)
            print('f1_score: ', f1_score)
            # Bug fix: the original read self.config.is_debug unconditionally,
            # which raised AttributeError when config was None (the default).
            if getattr(self.config, 'is_debug', False):
                break
        print('Total f1->', scores_f1)
        print("Total f1'mean is ", np.mean(scores_f1))
        # Persist results (file names embed the mean CV F1 for bookkeeping).
        os.makedirs('../data/result-ml', exist_ok=True)
        with open('../data/result-ml/{}_oof_f1_{}.pkl'.format(name, str(np.mean(scores_f1))), 'wb') as f:
            pickle.dump(oof_predict, f)
        with open('../data/result-ml/{}_pre_f1_{}.pkl'.format(name, str(np.mean(scores_f1))), 'wb') as f:
            pickle.dump(predict, f)
        print('done')
class Fasttext(BasicStaticModel):
    """fastText classifier built on skift's FirstColFtClassifier wrapper."""

    def __init__(self, name='basicModel', n_folds=5, config=None):
        # All bookkeeping (fold splitter, name, config) lives in the base class.
        super().__init__(name=name, n_folds=n_folds, config=config)

    def create_model(self):
        """Return a fresh, unfitted fastText estimator (overrides the base hook)."""
        return FirstColFtClassifier(lr=1.0, epoch=10, wordNgrams=1, minCount=5, verbose=2)