
Predicting Movie Review Sentiment with word2vec

The previous post used CountVectorizer for the text embedding, which ignores the contextual semantics of the words in the text. Here we use word2vec instead.

First, train word vectors with word2vec.

import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier

def load_dataset(name, nrows=None):
    datasets = {
        "unlabeled_train": "unlabeledTrainData.tsv",
        "labeled_train": "labeledTrainData.tsv",
        "test": "testData.tsv"
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join("..", "data", datasets[name])
    df = pd.read_csv(data_file, sep="\t", escapechar="\\", nrows=nrows)
    return df

Load the unlabeled data

It is used to train the word2vec word vectors.

df = load_dataset('unlabeled_train')

eng_stopwords = set(line.rstrip() for line in open('../stopwords.txt'))

def clean_text(text, remove_stopwords=False):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words
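
As a quick sanity check, clean_text should strip HTML tags and punctuation and, when asked, drop stopwords. The sample string below is made up for illustration; the exact stopword output depends on the contents of stopwords.txt:

sample = "<b>This movie</b> was great!!! I'd watch it again."
print(clean_text(sample))
# ['this', 'movie', 'was', 'great', 'i', 'd', 'watch', 'it', 'again']
print(clean_text(sample, remove_stopwords=True))
# with typical English stopwords removed, roughly ['movie', 'great', 'watch']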

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
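
If the punkt tokenizer data is not already installed locally, it can be fetched once through nltk's downloader:

import nltk
nltk.download('punkt')  # one-time download of the sentence tokenizer data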

def print_call_counts(f):
    n = 0
    def wrapped(*args, **kwargs):
        nonlocal n
        n += 1
        if n % 1000 == 1:
            print("method {} called {} times".format(f.__name__, n))
        return f(*args, **kwargs)
    return wrapped
@print_call_counts
def split_sentences(review):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences
%time sentences = sum(df.review.apply(split_sentences), [])
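
Note that sum(list_of_lists, []) flattens the per-review sentence lists but re-copies the accumulator on every addition, which is quadratic in the number of reviews. A linear-time alternative from the standard library (same result, assuming the same split_sentences):

import itertools
sentences = list(itertools.chain.from_iterable(df.review.apply(split_sentences)))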

Train the word-embedding model with gensim's word2vec

num_features = 300     # dimensionality of the word vectors
min_word_count = 40    # ignore words with fewer total occurrences
num_workers = 4        # number of parallel training threads
context = 10           # context window size
downsampling = 1e-3    # downsampling rate for frequent words
model_name = "{}features_{}minwords_{}context.model".format(num_features, min_word_count, context)

# train the model
model = Word2Vec(sentences, workers=num_workers, size=num_features,
                 min_count=min_word_count, window=context, sample=downsampling)
# precompute L2-normalized vectors; saves memory but makes the model read-only
model.init_sims(replace=True)
model.save(os.path.join("..", "models", model_name))
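
The snippet above uses the pre-4.0 gensim API. On gensim 4.x (an assumption about the reader's environment), size was renamed to vector_size and init_sims is deprecated, so the training call would look roughly like this:

# equivalent call under gensim 4.x, where `size` became `vector_size`
model = Word2Vec(sentences, workers=num_workers, vector_size=num_features,
                 min_count=min_word_count, window=context, sample=downsampling)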


Inspect the trained word vectors

model.most_similar("man")
# Output:
[('woman', 0.6256189346313477),
 ('lady', 0.5953349471092224),
 ('lad', 0.576863169670105),
 ('person', 0.5407935380935669),
 ('farmer', 0.5382746458053589),
 ('chap', 0.536788821220398),
 ('soldier', 0.5292650461196899),
 ('men', 0.5261573791503906),
 ('monk', 0.5237958431243896),
 ('guy', 0.5213091373443604)]


model.most_similar("queen")

Out[11]:
[('princess', 0.6749982833862305),
 ('maid', 0.6223365068435669),
 ('bride', 0.6201028227806091),
 ('belle', 0.6200867891311646),
 ('temple', 0.6171057224273682),
 ('stripper', 0.608874499797821),
 ('catherine', 0.6072724461555481),
 ('eva', 0.6019693613052368),
 ('dancer', 0.594109833240509),
 ('sylvia', 0.5933606624603271)]


model.most_similar("awful")


Out[12]:
[('terrible', 0.7551683187484741),
 ('atrocious', 0.7340768575668335),
 ('horrible', 0.7315883040428162),
 ('dreadful', 0.7080680131912231),
 ('abysmal', 0.7010548114776611),
 ('horrendous', 0.6951696872711182),
 ('appalling', 0.691646933555603),
 ('horrid', 0.6708598136901855),
 ('amateurish', 0.6481891870498657),
 ('embarrassing', 0.6306308507919312)]
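
Old-style gensim models also expose doesnt_match, which picks the word least similar to the rest of a list; a quick check (the exact answer depends on the trained corpus):

print(model.doesnt_match("man woman child kitchen".split()))
# likely prints 'kitchen', the only non-person word in the list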

Load the word2vec model trained above

model_name = '300features_40minwords_10context.model'
model = Word2Vec.load(os.path.join('..', 'models', model_name))

# process the labeled training set
df = load_dataset("labeled_train")

def to_review_vector(review):
    words = clean_text(review, remove_stopwords=True)
    array = np.array([model[w] for w in words if w in model])
    # average the per-word word2vec vectors to get a single vector for the review
    return pd.Series(array.mean(axis=0))

train_data_features = df.review.apply(to_review_vector)
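
One edge case worth guarding against: if every word of a review is missing from the model's vocabulary, the array above is empty and mean(axis=0) returns NaN. A defensive variant (the zero-vector fallback is our own choice, not part of the original post):

def to_review_vector_safe(review):
    words = clean_text(review, remove_stopwords=True)
    vectors = [model[w] for w in words if w in model]
    if not vectors:  # no in-vocabulary words: fall back to an all-zero vector
        return pd.Series(np.zeros(num_features))
    return pd.Series(np.mean(vectors, axis=0))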

Build the classifier and make predictions

forest = RandomForestClassifier(n_estimators = 100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)
df = load_dataset('test')
test_data_features = df.review.apply(to_review_vector)
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_model.csv'), index=False)
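
The Kaggle test set carries no labels, so before submitting it can be worth estimating generalization with k-fold cross-validation on the training features. A sketch (df was reassigned to the test set above, so the labels are reloaded; roc_auc matches the metric of the original competition):

from sklearn.model_selection import cross_val_score

train_df = load_dataset('labeled_train')  # reload labels; df now holds the test set
scores = cross_val_score(forest, train_data_features, train_df.sentiment,
                         cv=5, scoring='roc_auc')
print("mean AUC over 5 folds:", scores.mean())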


# the first few rows of the output look like this
         id  sentiment
0  12311_10          1
1    8348_2          0
2    5828_4          0
3    7186_2          0
4   12128_7          1

That completes the program.

Using word2vec preserves the contextual semantics of the text.

Although word2vec word vectors capture semantic relationships between words while also compressing the dimensionality, we sometimes need a vector for a whole sentence or document. Simply averaging the vectors of all the words in a sentence/document, as done above, discards word order, which does affect the meaning of the sentence or document. This motivates word2vec's extension, doc2vec, sketched below.
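
A minimal sketch of gensim's Doc2Vec on the same reviews, using the gensim 4.x parameter names mentioned earlier; the hyperparameters are illustrative, not tuned:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# each review becomes a TaggedDocument: its token list plus a unique tag
documents = [TaggedDocument(clean_text(review), [i])
             for i, review in enumerate(df.review)]
d2v = Doc2Vec(documents, vector_size=num_features, window=context,
              min_count=min_word_count, workers=num_workers)
# infer_vector produces a vector for an unseen document; the paragraph vector
# is trained jointly with word contexts rather than by plain averaging
vec = d2v.infer_vector(clean_text("an example review text"))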