python中文本挖掘精簡版
阿新 • • 發佈:2018-07-23
gamma 邏輯 data 算法 pickle kit xls form 精簡版
"""Simplified Chinese text-mining demo.

Loads text descriptions and category labels for a train/test split from
four .xls workbooks, vectorizes the text with TF-IDF, trains an SVM
classifier, and reports its accuracy plus weighted precision/recall/F1.
"""
import xlrd
import jieba  # Chinese word segmentation (used by the full version of this script)
import sys
import importlib
import os  # stdlib file/dir operations (os.listdir in the full version)
import pickle  # persistence helper (kept from the full version)
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pylab import mpl
from sklearn.naive_bayes import MultinomialNB  # fixed: original had 'importMultinomialNB' (missing space)
from sklearn import svm
from sklearn import metrics
from sklearn.datasets.base import Bunch  # NOTE(review): path removed in modern sklearn; use sklearn.utils.Bunch there
from sklearn.feature_extraction.text import TfidfVectorizer

importlib.reload(sys)  # Python-2 holdover from the original post; harmless no-op here

# Global containers shared by the loader functions below.
trainContentdatasave = []  # all train segmentation results (unused in this simplified version)
testContentdatasave = []   # all test segmentation results (unused in this simplified version)
trainContentdata = []      # training text descriptions
testContentdata = []       # test text descriptions
trainlabeldata = []        # training category labels
testlabeldata = []         # test category labels


def _read_first_column(filename, target):
    """Append every value of column 0 of Sheet1 in *filename* to *target*.

    Shared helper: the four loaders below were four byte-identical copies
    of this loop differing only in file name and destination list.
    """
    wb = xlrd.open_workbook(filename)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        target.append(ws.cell(r, 0).value)


def importTrainContentdata():
    """Load training text descriptions into trainContentdata."""
    _read_first_column('20180716_train.xls', trainContentdata)


def importTestContentdata():
    """Load test text descriptions into testContentdata."""
    _read_first_column('20180716_test.xls', testContentdata)


def importTrainlabeldata():
    """Load training category labels into trainlabeldata."""
    _read_first_column('20180716_train_label.xls', trainlabeldata)


def importTestlabeldata():
    """Load test category labels into testlabeldata."""
    _read_first_column('20180716_test_label.xls', testlabeldata)


def metrics_result(actual, predict):
    """Print weighted precision, recall and F1 for *predict* vs *actual*.

    Fixed: the original called metrics_result(a, b) without ever
    defining it; 'metrics' was already imported for this purpose.
    """
    print('precision: {0:.3f}'.format(
        metrics.precision_score(actual, predict, average='weighted')))
    print('recall:    {0:.3f}'.format(
        metrics.recall_score(actual, predict, average='weighted')))
    print('f1-score:  {0:.3f}'.format(
        metrics.f1_score(actual, predict, average='weighted')))


if __name__ == "__main__":
    importTrainContentdata()
    importTestContentdata()
    importTrainlabeldata()
    importTestlabeldata()

    # Alternative classifiers tried in the original post (kept for reference):
    #   clf = MultinomialNB(alpha=0.052)          # multinomial Naive Bayes
    #   lr  = LogisticRegression(C=3)             # logistic regression
    tv = TfidfVectorizer()
    train_data = tv.fit_transform(trainContentdata)
    test_data = tv.transform(testContentdata)

    # Fixed: original referenced bare 'SVC' but only 'from sklearn import svm'
    # is in scope, so it must be qualified as svm.SVC.
    clf = svm.SVC(C=1500)
    clf.fit(train_data, trainlabeldata)
    print(clf.score(test_data, testlabeldata))

    # Fixed: 'predicted' was iterated below without ever being assigned.
    predicted = clf.predict(test_data)

    a = []  # true labels as ints
    b = []  # predicted labels as ints
    for i in range(len(predicted)):
        b.append(int(float(predicted[i])))
        # Fixed: original read int(test_set.label[i][0]) but 'test_set' was
        # never defined; the true labels live in testlabeldata (xls floats).
        a.append(int(float(testlabeldata[i])))

    # Optional dump of the predictions, kept disabled as in the original:
    # with open('F:/goverment/ArticleMining/predict.txt', 'w') as f:
    #     for value in b:
    #         f.write(str(value))
    #         f.write('\n')
    #     f.write("寫好了")

    metrics_result(a, b)
python中文本挖掘精簡版