
Text mining in Python, condensed version


import xlrd
import jieba
import sys
import importlib
import os        # Python built-in package for file/directory operations; os.listdir is the function we will use
import pickle    # persistence; in Python 3 the old cPickle is simply pickle
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pylab import mpl
from sklearn.naive_bayes import MultinomialNB            # multinomial Naive Bayes
from sklearn import svm
from sklearn import metrics
from sklearn.svm import SVC                               # SVC is used directly below
from sklearn.utils import Bunch                           # formerly sklearn.datasets.base.Bunch; unused in this condensed version
from sklearn.feature_extraction.text import TfidfVectorizer

importlib.reload(sys)   # Python 2 leftover (sys.setdefaultencoding); harmless under Python 3

# Turn the texts and their categories into vector form
trainContentdatasave = []   # holds the segmented words of all training and test data
testContentdatasave = []
trainContentdata = []
testContentdata = []
trainlabeldata = []
testlabeldata = []

# Load the training and test texts
def importTrainContentdata():
    file = "20180716_train.xls"
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        trainContentdata.append(ws.cell(r, 0).value)

def importTestContentdata():
    file = "20180716_test.xls"
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        testContentdata.append(ws.cell(r, 0).value)

# Load the training and test labels
def importTrainlabeldata():
    file = "20180716_train_label.xls"
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        trainlabeldata.append(ws.cell(r, 0).value)

def importTestlabeldata():
    file = "20180716_test_label.xls"
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        testlabeldata.append(ws.cell(r, 0).value)

# Not defined in the condensed source; a minimal stand-in built on sklearn.metrics
def metrics_result(actual, predict):
    print("precision: {0:.3f}".format(metrics.precision_score(actual, predict, average="weighted")))
    print("recall:    {0:.3f}".format(metrics.recall_score(actual, predict, average="weighted")))
    print("f1-score:  {0:.3f}".format(metrics.f1_score(actual, predict, average="weighted")))

if __name__ == "__main__":
    importTrainContentdata()
    importTestContentdata()
    importTrainlabeldata()
    importTestlabeldata()

    '''Naive Bayes
    clf = MultinomialNB(alpha=0.052).fit(train_set.tdm, train_set.label)
    #clf = svm.SVC(C=0.7, kernel='poly', gamma=10, decision_function_shape='ovr')
    clf.fit(train_set.tdm, train_set.label)
    predicted = clf.predict(test_set.tdm)

    Logistic regression
    tv = TfidfVectorizer()
    train_data = tv.fit_transform(X_train)
    test_data = tv.transform(X_test)
    lr = LogisticRegression(C=3)
    lr.fit(train_set.tdm, train_set.label)
    predicted = lr.predict(test_set.tdm)
    print(lr.score(test_set.tdm, test_set.label))
    #print(test_set.tdm)

    #SVM
    clf = SVC(C=1500)
    clf.fit(train_set.tdm, train_set.label)
    predicted = clf.predict(test_set.tdm)
    print(clf.score(test_set.tdm, test_set.label))
    '''

    # Active path: TF-IDF features + SVM classifier
    tv = TfidfVectorizer()
    train_data = tv.fit_transform(trainContentdata)
    test_data = tv.transform(testContentdata)
    clf = SVC(C=1500)
    clf.fit(train_data, trainlabeldata)
    print(clf.score(test_data, testlabeldata))
    predicted = clf.predict(test_data)   # the condensed source used `predicted` below without defining it here

    # Cast predictions and true labels to int for the evaluation helper
    a = []
    b = []
    for i in range(len(predicted)):
        b.append(int(float(predicted[i])))
        a.append(int(float(testlabeldata[i])))

    '''
    f = open('F:/goverment/ArticleMining/predict.txt', 'w')
    for i in range(len(predicted)):
        f.write(str(b[i]))
        f.write('\n')
    f.write("finished writing")
    f.close()
    #for i in range(len(predicted)):
    #    print(b[i])
    '''
    metrics_result(a, b)
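
One thing worth noting: the script imports jieba but never calls it, so TfidfVectorizer falls back to its default token pattern, which handles unsegmented Chinese text poorly. Below is a minimal sketch of how word segmentation could be wired in before vectorization; the cut_words helper and the sample documents are illustrative only and are not part of the original pipeline, where trainContentdata/testContentdata would take their place.

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

def cut_words(text):
    # Segment a Chinese sentence into a whitespace-joined string for the vectorizer
    return " ".join(jieba.lcut(text))

# Hypothetical sample documents
docs_train = ["今天天氣很好", "我喜歡自然語言處理"]
docs_test = ["天氣不錯"]

tv = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")   # keep single-character words as tokens
train_data = tv.fit_transform([cut_words(d) for d in docs_train])
test_data = tv.transform([cut_words(d) for d in docs_test])
print(train_data.shape, test_data.shape)

Pre-segmenting with spaces keeps the rest of the pipeline unchanged; passing tokenizer=jieba.lcut to TfidfVectorizer would be an equivalent alternative.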
