Chinese Sentiment Analysis with Word2vec
阿新 • Published: 2018-11-10
'''
Chinese sentiment analysis
'''
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import jieba
import joblib  # serialize models to binary; sklearn.externals.joblib is deprecated, use the standalone package
from sklearn.svm import SVC

'''
Data preprocessing: load the data, preprocess it, and split it into training and test sets
'''
def load_file_and_processing():
    # header=None keeps the first review from being consumed as a column header,
    # so pos[0]/neg[0] below can be accessed without raising an error
    neg = pd.read_excel('H:/word2vect_3data/Chinese_data/neg.xls', header=None)
    pos = pd.read_excel('H:/word2vect_3data/Chinese_data/pos.xls', header=None)
    cw = lambda x: list(jieba.cut(x))  # segment each review with jieba
    pos['words'] = pos[0].apply(cw)
    neg['words'] = neg[0].apply(cw)
    # use 1 for positive sentiment, 0 for negative
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos['words'], neg['words'])), y, test_size=0.2)
    np.save('H:/word2vect_3data/Chinese_data/y_train.npy', y_train)
    np.save('H:/word2vect_3data/Chinese_data/y_test.npy', y_test)
    return x_train, x_test

'''
Average all word vectors of a sentence to produce one vector per sentence
'''
def build_sentence_vector(text, size, imdb_w2v):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in text:
        try:
            vec += imdb_w2v.wv[word].reshape((1, size))  # look vectors up via .wv (required in gensim 4)
            count += 1
        except KeyError:
            continue  # skip words pruned by min_count or never seen in training
    if count != 0:
        vec /= count
    return vec

'''
Compute the word vectors
'''
def get_train_vecs(x_train, x_test):
    n_dim = 300
    # initialize the model and vocabulary; words occurring fewer than
    # min_count times are dropped (the default is 5)
    imdb_w2v = Word2Vec(vector_size=n_dim, min_count=10)  # use size=n_dim on gensim < 4
    imdb_w2v.build_vocab(x_train)
    # train the model on the training reviews
    imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count, epochs=imdb_w2v.epochs)
    train_vecs = np.concatenate([build_sentence_vector(z, n_dim, imdb_w2v) for z in x_train])
    np.save('H:/word2vect_3data/Chinese_data/train_vecs.npy', train_vecs)
    print('train_vecs size:')
    print(train_vecs.shape)
    # continue training on the test reviews (words outside the training vocabulary are ignored)
    imdb_w2v.train(x_test, total_examples=len(x_test), epochs=imdb_w2v.epochs)
    imdb_w2v.save('H:/word2vect_3data/Chinese_data/w2v_model.pkl')
    # build the test sentence vectors
    test_vecs = np.concatenate([build_sentence_vector(z, n_dim, imdb_w2v) for z in x_test])
    np.save('H:/word2vect_3data/Chinese_data/test_vecs.npy', test_vecs)
    print('test_vecs size:')
    print(test_vecs.shape)

def get_data():
    train_vecs = np.load('H:/word2vect_3data/Chinese_data/train_vecs.npy')
    y_train = np.load('H:/word2vect_3data/Chinese_data/y_train.npy')
    test_vecs = np.load('H:/word2vect_3data/Chinese_data/test_vecs.npy')
    y_test = np.load('H:/word2vect_3data/Chinese_data/y_test.npy')
    return train_vecs, test_vecs, y_train, y_test

'''
Train the classifier
'''
def svm_train(train_vecs, y_train, test_vecs, y_test):
    clf = SVC(kernel='rbf', verbose=True)
    clf.fit(train_vecs, y_train)
    joblib.dump(clf, 'H:/word2vect_3data/Chinese_data/model.pkl')
    print(clf.score(test_vecs, y_test))

'''
Build the vector for a sentence to be classified
'''
def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('H:/word2vect_3data/Chinese_data/w2v_model.pkl')
    train_vecs = build_sentence_vector(words, n_dim, imdb_w2v)
    return train_vecs

'''
Run sentiment analysis on a single sentence
'''
def svm_predict(string):
    words = jieba.lcut(string)  # jieba.lcut returns a list directly
    words_vecs = get_predict_vecs(words)
    clf = joblib.load('H:/word2vect_3data/Chinese_data/model.pkl')
    result = clf.predict(words_vecs)
    if int(result[0]) == 1:
        print('positive')
    else:
        print('negative')
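To tie the steps together, here is a minimal end-to-end driver. This is a sketch rather than part of the original post: it assumes neg.xls and pos.xls exist at the paths used above, that jieba, gensim, scikit-learn and joblib are installed, and the sample review string is made up for illustration.

if __name__ == '__main__':
    x_train, x_test = load_file_and_processing()       # load, segment, and split the reviews
    get_train_vecs(x_train, x_test)                    # train Word2Vec and save the sentence vectors
    train_vecs, test_vecs, y_train, y_test = get_data()
    svm_train(train_vecs, y_train, test_vecs, y_test)  # train the SVM and print its test accuracy
    svm_predict('電池很耐用,螢幕也很清晰')                # hypothetical review text; prints 'positive' or 'negative'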