word2vec 對影評情感進行預測
阿新 • • 發佈:2019-01-01
上篇用了CountVectorizer進行文字embedding,忽視了文字詞中上下文的語義。因此這裡用到了word2vec。
word2vec訓練詞向量。
import os
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
# Original bug: the class is ``Word2Vec`` (capital V), not ``Word2vec``.
from gensim.models.word2vec import Word2Vec


def load_dataset(name, nrows=None):
    """Load one of the IMDB review TSV files from ../data.

    Parameters
    ----------
    name : str
        One of "unlabeled_train", "labeled_train", "test".
    nrows : int, optional
        If given, read only the first ``nrows`` rows (handy for quick tests).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``name`` is not a known dataset key.
    """
    datasets = {
        "unlabeled_train": "unlabelTrainData.tsv",
        "labeled_train": "labeledTrainData.tsv",
        "test": "testData.tsv",
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join("..", "data", datasets[name])
    # Original bug: read_csv was called with ``data_File`` (NameError).
    # escapechar is needed because the TSVs contain backslash-escaped quotes.
    df = pd.read_csv(data_file, sep="\t", escapechar="\\", nrows=nrows)
    return df
讀入無標籤資料
用於訓練生成word2vec詞向量
# The unlabeled data is used only to train the word2vec embeddings.
df = load_dataset('unlabeled_train')

# Stop-word list, one word per line; a set gives O(1) membership tests.
# ``with`` ensures the file handle is closed (original left it open).
with open('../stopwords.txt') as f:
    eng_stopwords = {line.rstrip() for line in f}


def clean_text(text, remove_stopwords=False):
    """Strip HTML, keep letters only, lowercase, and split into words.

    Parameters
    ----------
    text : str
        Raw review text (may contain HTML markup).
    remove_stopwords : bool
        If True, drop words found in ``eng_stopwords``.

    Returns
    -------
    list of str
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words


# Pre-trained Punkt sentence splitter for English.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def print_call_counts(f):
    """Decorator: print a progress message every 1000 calls of ``f``."""
    n = 0

    def wrapped(*args, **kwargs):
        nonlocal n
        n += 1
        # Original bugs: ``n%1000=1`` was an assignment (syntax error), the
        # format string was missing its closing quote, and ``f._name_``
        # should be ``f.__name__``.
        if n % 1000 == 0:
            print("method {} called {} times".format(f.__name__, n))
        return f(*args, **kwargs)
    return wrapped


@print_call_counts
def split_sentences(review):
    """Split a review into a list of cleaned word lists, one per sentence."""
    # Original bug: assigned ``saw_sentences`` but read ``raw_sentences``.
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences


# ``%time`` is an IPython magic and is invalid in a plain .py script;
# call the expression directly instead.
sentences = sum(df.review.apply(split_sentences), [])
gensim的word2vec訓練詞嵌入模型
# Hyperparameters for word2vec training.
# Original bug: the variable was named ``max_features`` but referenced as
# ``num_features`` (and misspelled ``num_faetures``) below — unified here.
num_features = 300      # dimensionality of the word vectors
min_word_count = 10     # ignore words occurring fewer times than this
num_workers = 4         # number of training threads
context = 10            # context window size
downsampling = 1e-3     # downsampling rate for very frequent words

# NOTE(review): the reload step later uses '300features_40minwords_10context',
# which implies min_word_count=40 — confirm which setting actually produced
# the saved model. The stray space after the first ``{}`` in the original
# format string is removed so the saved name matches the one loaded later.
model_name = "{}features_{}minwords_{}context.model".format(
    num_features, min_word_count, context)

# Original bugs: ``word2vec.Word2vec`` (module not imported, wrong case) and
# ``window=context.sample=downsampling`` (a ``.`` where a ``,`` belongs).
model = Word2Vec(sentences,
                 workers=num_workers,
                 size=num_features,
                 min_count=min_word_count,
                 window=context,
                 sample=downsampling)

# Precompute L2-normalised vectors and discard training state to save memory
# (gensim <4.0 API; the model can no longer be trained further after this).
model.init_sims(replace=True)
model.save(os.path.join("..", "models", model_name))
檢視訓練的詞向量的結果
model.most_similar("man")
#結果為
[('woman', 0.6256189346313477),
('lady', 0.5953349471092224),
('lad', 0.576863169670105),
('person', 0.5407935380935669),
('farmer', 0.5382746458053589),
('chap', 0.536788821220398),
('soldier', 0.5292650461196899),
('men', 0.5261573791503906),
('monk', 0.5237958431243896),
('guy', 0.5213091373443604)]
model.most_similar("queen")
Out[11]:
[('princess', 0.6749982833862305),
('maid', 0.6223365068435669),
('bride', 0.6201028227806091),
('belle', 0.6200867891311646),
('temple', 0.6171057224273682),
('stripper', 0.608874499797821),
('catherine', 0.6072724461555481),
('eva', 0.6019693613052368),
('dancer', 0.594109833240509),
('sylvia', 0.5933606624603271)]
model.most_similar("awful")
Out[12]:
[('terrible', 0.7551683187484741),
('atrocious', 0.7340768575668335),
('horrible', 0.7315883040428162),
('dreadful', 0.7080680131912231),
('abysmal', 0.7010548114776611),
('horrendous', 0.6951696872711182),
('appalling', 0.691646933555603),
('horrid', 0.6708598136901855),
('amateurish', 0.6481891870498657),
('embarrassing', 0.6306308507919312)]
讀入以上訓練好的word2vec模型
# Reload the trained word2vec model from disk.
model_name = '300features_40minwords_10context.model'
model = Word2Vec.load(os.path.join('..', 'models', model_name))

# Vectorise the labeled training set.
df = load_dataset("labeled_train")


def to_review_vector(review):
    """Represent a review as the mean of its words' word2vec vectors.

    Averaging discards word order but produces a fixed-length document
    vector that a standard classifier can consume.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    pandas.Series of length ``model.vector_size``.
    """
    words = clean_text(review, remove_stopwords=True)
    vectors = np.array([model[w] for w in words if w in model])
    if vectors.size == 0:
        # Original bug: a review with no in-vocabulary words made
        # ``mean(axis=0)`` yield NaN; fall back to a zero vector instead.
        return pd.Series(np.zeros(model.vector_size))
    return pd.Series(vectors.mean(axis=0))


train_data_features = df.review.apply(to_review_vector)
建立分類器並預測
# Train a random forest on the averaged word vectors and score the test set.
# Original bug: RandomForestClassifier was never imported anywhere in the file.
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

# Vectorise the test set with the same pipeline and predict sentiment.
df = load_dataset('test')
test_data_features = df.review.apply(to_review_vector)
result = forest.predict(test_data_features)

# Write a Kaggle-style submission file: one (id, sentiment) row per review.
output = pd.DataFrame({'id': df.id, 'sentiment': result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_model.csv'), index=False)
#以下為結果
id sentiment
0 12311_10 1
1 8348_2 0
2 5828_4 0
3 7186_2 0
4 12128_7 1
到此,程式結束。
運用了word2vec使得文字的上下文關聯語義得以儲存。
雖然word2vec表示的詞向量不僅考慮了詞之間的語義資訊,還壓縮了維度,但有時候我們需要得到sentence/document的向量。雖然可以直接將sentence/document中所有詞的向量取均值作為其向量表示,但這樣會忽略單詞之間的排列順序對句子或文字資訊的影響。所以引出了word2vec的延伸——doc2vec。