Python使用doc2vec和LR進行文字分類
阿新 • • 發佈:2019-02-08
(1)資料預處理
a.對文字資料進行貼標籤處理,標籤資料類似如下:
平素體質:健康狀況:良,既往有“高血壓病史”多年。#1
其中1表示患有高血壓,0表示沒有患有高血壓。
然後進行分開,文字儲存在一個檔案,標籤儲存在一個檔案,文字內容和標籤行對行對應。
b.對文字檔案的內容進行分詞。
import jieba
# Read the labelled corpus and write two line-aligned files: the
# jieba-segmented text and the class label of each record.
# Every input line is expected to look like "<text>#<label>".
# The outputs are written as UTF-8 because the training step reads them
# back with encoding='utf-8'; `with` closes all three files even on error.
with open(u'/home/ubuntu/file/資料平衡分類', encoding='utf-8') as file, \
        open(u'/home/ubuntu/file/資料平衡無分類', 'w', encoding='utf-8') as filenoclass, \
        open(u'/home/ubuntu/file/資料平衡分類結果', 'w', encoding='utf-8') as fileclass:
    for lines in file:
        # Split on the LAST '#' so a '#' inside the text cannot be
        # mistaken for the label separator.
        text, sep, label = lines.strip().rpartition('#')
        if not sep:
            # No separator (e.g. a blank line): skip the record in BOTH
            # outputs so text and labels stay aligned line by line.
            continue
        # One document per line, every token followed by a single space.
        filenoclass.write(''.join(seg + ' ' for seg in jieba.cut(text)))
        filenoclass.write('\n')
        fileclass.write(label + '\n')
(2)訓練doc2vec得到文字向量
import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
# Labels written by the preprocessing step, one per line and aligned with
# the documents. They are read eagerly so the handle is closed right away
# and the labels can be iterated more than once.
with open(u'/home/ubuntu/file/資料平衡分類結果', encoding='utf-8') as label_file:
    fileclass = label_file.readlines()
# Pass the path instead of an open handle: TaggedLineDocument then opens
# and closes the corpus file itself, so no file object is left open here.
documents = gensim.models.doc2vec.TaggedLineDocument(u'/home/ubuntu/file/資料平衡無分類')
# `vector_size` replaces the older `size` keyword, which gensim 4 rejects.
# NOTE(review): min_count=100 ignores every word seen fewer than 100
# times; confirm the corpus is large enough for a non-empty vocabulary.
model = gensim.models.Doc2Vec(documents, vector_size=100, window=8, min_count=100, workers=8)
# Show the learned vector of the document tagged 1.
print(model.docvecs[1])
(3)準備進行分類的資料
def getData():
    """Build the train/test split of document vectors and their labels.

    Reads one label per line from the module-level ``fileclass`` and one
    vector per document from the module-level ``model``; row ``i`` of the
    feature table is indexed ``'p<i>'``.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. Note the order
        differs from what ``train_test_split`` itself returns. 40% of the
        rows go to the test part, with ``random_state=0``.
    """
    # One label per line, in the same order as the documents.
    tigs = [tig.strip() for tig in fileclass]
    data_dict = {'p' + str(i): model.docvecs[i] for i in range(len(model.docvecs))}
    print(tigs)
    print(data_dict)
    # The dict yields one COLUMN per document; transpose so that every
    # row is a document and every column a vector component.
    data = pd.DataFrame(data_dict).T
    # Keep the labels in their own Series rather than as a column of
    # `data`: the original passed the label column to the classifier as a
    # feature, which leaks the answer and inflates the reported accuracy.
    # pandas raises ValueError here if the label count differs from the
    # number of documents.
    labels = pd.Series(tigs, index=data.index, name='class0')
    X_train1, X_test1, y_train1, y_test1 = train_test_split(data, labels, test_size=0.4, random_state=0)
    return X_train1, y_train1, X_test1, y_test1
(4)準備測試方法
def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Args:
        testPre: predicted labels (any iterable, e.g. a list or array).
        testClass: true labels in the same order as ``testPre``.

    Returns:
        A float in [0, 1]; ``0.0`` when there are no predictions.

    Raises:
        ValueError: if the two inputs have different lengths.
    """
    # Materialise both inputs so they are compared by POSITION. Indexing a
    # pandas Series with an integer is a label lookup, which fails or
    # mismatches on the shuffled index that train_test_split produces.
    predicted = list(testPre)
    actual = list(testClass)
    if len(predicted) != len(actual):
        raise ValueError(
            'testPre and testClass differ in length: %d vs %d'
            % (len(predicted), len(actual))
        )
    if not predicted:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    rightNum = sum(1 for guess, truth in zip(predicted, actual) if truth == guess)
    return float(rightNum) / float(len(predicted))
(5)進行模型訓練
import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
# Labels written by the preprocessing step, one per line and aligned with
# the documents. They are read eagerly so the handle is closed right away
# and the labels can be iterated more than once.
with open(u'/home/ubuntu/file/資料平衡分類結果', encoding='utf-8') as label_file:
    fileclass = label_file.readlines()
# Pass the path instead of an open handle: TaggedLineDocument then opens
# and closes the corpus file itself, so no file object is left open here.
documents = gensim.models.doc2vec.TaggedLineDocument(u'/home/ubuntu/file/資料平衡無分類')
# `vector_size` replaces the older `size` keyword, which gensim 4 rejects.
# NOTE(review): min_count=100 ignores every word seen fewer than 100
# times; confirm the corpus is large enough for a non-empty vocabulary.
model = gensim.models.Doc2Vec(documents, vector_size=100, window=8, min_count=100, workers=8)
# Show the learned vector of the document tagged 1.
print(model.docvecs[1])
#使用邏輯迴歸進行預測
def LR():
    """Return a new, unfitted LogisticRegression with default settings."""
    return LogisticRegression()
def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Args:
        testPre: predicted labels (any iterable, e.g. a list or array).
        testClass: true labels in the same order as ``testPre``.

    Returns:
        A float in [0, 1]; ``0.0`` when there are no predictions.

    Raises:
        ValueError: if the two inputs have different lengths.
    """
    # Materialise both inputs so they are compared by POSITION. Indexing a
    # pandas Series with an integer is a label lookup, which fails or
    # mismatches on the shuffled index that train_test_split produces.
    predicted = list(testPre)
    actual = list(testClass)
    if len(predicted) != len(actual):
        raise ValueError(
            'testPre and testClass differ in length: %d vs %d'
            % (len(predicted), len(actual))
        )
    if not predicted:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    rightNum = sum(1 for guess, truth in zip(predicted, actual) if truth == guess)
    return float(rightNum) / float(len(predicted))
def getData():
    """Build the train/test split of document vectors and their labels.

    Reads one label per line from the module-level ``fileclass`` and one
    vector per document from the module-level ``model``; row ``i`` of the
    feature table is indexed ``'p<i>'``.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. Note the order
        differs from what ``train_test_split`` itself returns. 40% of the
        rows go to the test part, with ``random_state=0``.
    """
    # One label per line, in the same order as the documents.
    tigs = [tig.strip() for tig in fileclass]
    data_dict = {'p' + str(i): model.docvecs[i] for i in range(len(model.docvecs))}
    print(tigs)
    print(data_dict)
    # The dict yields one COLUMN per document; transpose so that every
    # row is a document and every column a vector component.
    data = pd.DataFrame(data_dict).T
    # Keep the labels in their own Series rather than as a column of
    # `data`: the original passed the label column to the classifier as a
    # feature, which leaks the answer and inflates the reported accuracy.
    # pandas raises ValueError here if the label count differs from the
    # number of documents.
    labels = pd.Series(tigs, index=data.index, name='class0')
    X_train1, X_test1, y_train1, y_test1 = train_test_split(data, labels, test_size=0.4, random_state=0)
    return X_train1, y_train1, X_test1, y_test1
# Split the data, fit the logistic-regression classifier on the training
# part and report its accuracy on the held-out part.
T = getData()
trainMatrix, trainClass, testMatrix, testClass = T
clf_LR = LR()
clf_LR.fit(trainMatrix, trainClass)
testPre = clf_LR.predict(testMatrix)
print('Logistic Regression recognition rate: ', getRecognitionRate(testPre, testClass))