sklearn svm實現文字分類 入門
阿新 • • 發佈:2019-01-02
正在學習sklearn , 實驗室專案需要實現一些文字的分類的功能。
sklearn提供了許多機器學習方面用得到的現成工具,其中也包括分類器。sklearn本身在這裡不作介紹,有官網,也有許多部落格可以參考,我自己也還在學習中。
最開始是參照著這篇文章:
https://segmentfault.com/a/1190000002472791
用的是樸素貝葉斯,文字向量化用的是HashingVectorizer
實現過後,效果不夠好,在這個基礎上改用了 TfidfVectorizer,CountVectorizer,其中TfidfVectorizer效果較好,達到了50%左右,但是對於實驗來說是不夠的
參照著寫了使用svm來進行分類,改了資料處理的部分,按照0.65左右的比例(訓練集約佔65%)在整個資料集中隨機地劃分出訓練集與測試集來比較效果。
資料從txt讀取的,格式如下:
男默女淚啊:0
自殺者永世不得為人乃鐵律,!不珍惜生命:0
發達國家都能結婚了,中國人的思維還在百年前。差勁啊:0
愛不是這麼樣表達的,不一定需要擁有,社會這樣我們改變不了什麼,但是,非要死嗎:0
資料本身是存在樣本不均勻問題的,且文字較短。故而有些向量化工具效果不好。
程式碼:
# -*- coding: utf-8 -*-
from sklearn import datasets
from sklearn import svm
import random
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import numpy
#調整了格式,一行是一條資料
def inputdata(filename, encoding='utf-8'):
    """Read a text file and return its lines as a list.

    Each returned element is one line of the file, including its trailing
    newline (if any), decoded to text using ``encoding``.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding of the file. Defaults to UTF-8.

    Returns:
        A list with one string per line of the file.
    """
    # io.open behaves the same on Python 2 and 3 and accepts ``encoding``;
    # imported locally so this function does not rely on a file-level import.
    import io

    # The context manager guarantees the handle is closed even on error.
    with io.open(filename, 'r', encoding=encoding) as f:
        return f.readlines()
def _parse_labeled_lines(lines):
    """Split ``"<text>:<label>"`` records into parallel text and label lists.

    The label is everything after the LAST ':' so that colons inside the text
    are preserved and labels with more than one digit are parsed correctly.
    Blank (whitespace-only) lines are skipped.
    """
    words = []
    tags = []
    for line in lines:
        line = line.strip()
        if not line:
            # A trailing empty line in the data file is not a record.
            continue
        text, _, label = line.rpartition(':')
        words.append(text)
        tags.append(int(label))
    return words, tags


def splitset(trainset, testset):
    """Separate texts from integer labels for the train and test records.

    Args:
        trainset: Iterable of ``"<text>:<label>"`` strings used for training.
        testset: Iterable of ``"<text>:<label>"`` strings used for testing.

    Returns:
        A 4-tuple ``(train_words, train_tags, test_words, test_tags)`` where
        the ``*_words`` lists hold the texts and the ``*_tags`` lists hold the
        corresponding labels as ints.

    Raises:
        ValueError: If a non-blank record does not end in an integer label.
    """
    train_words, train_tags = _parse_labeled_lines(trainset)
    test_words, test_tags = _parse_labeled_lines(testset)
    return train_words, train_tags, test_words, test_tags
#完成開啟檔案後的準備工作
def comma_tokenizer(x):
    """Tokenize ``x`` with jieba, passing ``cut_all=True``.

    Despite the name, no comma splitting happens here: the text is handed
    straight to ``jieba.cut`` and its result is returned unchanged.
    """
    return jieba.cut(x, cut_all=True)
def tfvectorize(train_words, test_words):
    """Convert raw texts into TF-IDF feature matrices.

    The vectorizer is fitted on ``train_words`` only and then reused to
    transform ``test_words``, so both matrices share one vocabulary.

    Args:
        train_words: Training texts.
        test_words: Test texts.

    Returns:
        A pair ``(train_data, test_data)`` of transformed feature matrices.
    """
    vectorizer = TfidfVectorizer(
        tokenizer=comma_tokenizer,
        binary=False,
        decode_error='ignore',
        stop_words='english',
    )
    train_matrix = vectorizer.fit_transform(train_words)
    test_matrix = vectorizer.transform(test_words)
    return train_matrix, test_matrix
#按比例劃分訓練集與測試集
def splitDataset(dataset, splitRatio):
    """Randomly partition ``dataset`` into a training part and a test part.

    ``int(len(dataset) * splitRatio)`` items are drawn at random (without
    replacement) for the training set; everything left over becomes the test
    set. The input ``dataset`` itself is left unmodified.

    Args:
        dataset: Sequence of records to split.
        splitRatio: Fraction of the records that go into the training set.

    Returns:
        A pair ``(trainSet, testSet)`` of lists.

    Raises:
        ValueError: If ``splitRatio`` asks for more records than exist.
    """
    # Work on a real copy; a plain assignment would alias the caller's list
    # and pop() would then empty it as a side effect.
    remaining = list(dataset)
    train_size = int(len(remaining) * splitRatio)
    if train_size > len(remaining):
        raise ValueError(
            'splitRatio must not exceed 1, got {0!r}'.format(splitRatio))

    train_set = []
    while len(train_set) < train_size:
        index = random.randrange(len(remaining))
        train_set.append(remaining.pop(index))
    return train_set, remaining
#得到準確率和召回率
def evaluate(actual, pred):
    """Print macro-averaged precision and recall for a set of predictions.

    Args:
        actual: True labels.
        pred: Predicted labels, aligned with ``actual``.
    """
    m_precision = metrics.precision_score(actual, pred, average='macro')
    m_recall = metrics.recall_score(actual, pred, average='macro')
    # Single-argument print(...) produces identical output on Python 2 and 3,
    # unlike the Python-2-only print statement.
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
#建立svm分類器
def train_clf(train_data, train_tags, C=10.0):
    """Fit an RBF-kernel SVM classifier on the training data.

    Args:
        train_data: Feature matrix of the training samples.
        train_tags: Labels of the training samples; converted to a numpy
            array before fitting.
        C: Penalty parameter passed to ``svm.SVC``. Defaults to 10.0.

    Returns:
        The fitted ``svm.SVC`` instance.
    """
    # Only non-default settings are spelled out. ``gamma='auto'`` is kept
    # explicit because the library default for gamma has changed between
    # scikit-learn releases, and ``decision_function_shape=None`` is omitted
    # because newer releases reject it.
    clf = svm.SVC(C=C, kernel='rbf', gamma='auto')
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf
def covectorize(train_words, test_words):
    """Convert raw texts into token-count feature matrices.

    The vectorizer is fitted on ``train_words`` only and then reused to
    transform ``test_words``, so both matrices share one vocabulary.

    Args:
        train_words: Training texts.
        test_words: Test texts.

    Returns:
        A pair ``(train_data, test_data)`` of transformed feature matrices.
    """
    vectorizer = CountVectorizer(
        tokenizer=comma_tokenizer,
        binary=False,
        decode_error='ignore',
        stop_words='english',
    )
    train_matrix = vectorizer.fit_transform(train_words)
    test_matrix = vectorizer.transform(test_words)
    return train_matrix, test_matrix
if __name__ == '__main__':
    # Load the labelled corpus: one record per line.
    linelist = inputdata('data/newdata.txt')

    # Randomly assign about 65% of the records to training, the rest to test.
    trainset, testset = splitDataset(linelist, 0.65)
    print('train number: {0}'.format(len(trainset)))
    print('test number: {0}'.format(len(testset)))

    # Separate each record into its text and its integer label.
    train_words, train_tags, test_words, test_tags = splitset(trainset, testset)

    # Token-count features; call tfvectorize(train_words, test_words) here
    # instead to compare against TF-IDF features.
    train_data, test_data = covectorize(train_words, test_words)

    clf = train_clf(train_data, train_tags)
    # Named ``predictions`` rather than ``re`` to avoid shadowing the stdlib
    # module name.
    predictions = clf.predict(test_data)
    evaluate(numpy.asarray(test_tags), predictions)
在svm演算法下,使用TfidfVectorizer時,C(懲罰因子)要設到100.0左右,效果才能達到85%左右;而使用CountVectorizer時,C在10.0左右就可以達到80%多。
可以看出,使用什麼樣的特徵抽取方法,以及模型設定什麼樣的引數,對預測結果是有非常大影響的。
這部分還在學習中。