
Text Classification with an LDA Topic Model and SVM

Using an LDA topic model to extract text features and a linear SVM as the classifier turns out to work poorly, with F1 = 0.654:

Precision: 0.680, Recall: 0.649, F1: 0.654

A RandomForestClassifier does not fare much better:

Precision: 0.680, Recall: 0.668, F1: 0.670

By contrast, practically any deep learning model (textCNN, LSTM+Attention) reaches 0.95+ F1 on this task, with no manual feature engineering and no word segmentation.
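For reference, here is a minimal textCNN sketch. The framework (Keras) and the vocabulary/sequence sizes are my assumptions for illustration; the post does not specify them. The point is how little preprocessing such a model needs: characters are mapped to integer ids and everything else is learned end to end.

import numpy as np
from tensorflow.keras import layers, models

vocab_size, seq_len, n_classes = 6000, 400, 5  # assumed sizes for the Sohu data

inp = layers.Input(shape=(seq_len,))
emb = layers.Embedding(vocab_size, 128)(inp)
# Convolutions with several window sizes, max-pooled and concatenated
convs = [layers.GlobalMaxPooling1D()(layers.Conv1D(128, k, activation='relu')(emb))
         for k in (2, 3, 4)]
out = layers.Dense(n_classes, activation='softmax')(layers.Concatenate()(convs))

model = models.Model(inp, out)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()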

Now for the LDA pipeline in detail. Extracting LDA features requires vectorizing the texts with CountVectorizer first, which in turn requires word segmentation. Since the corpus is fairly large (Sohu news dataset, 5 categories × 3000 articles each), jieba segmentation is parallelized across processes, implemented here with a ProcessPoolExecutor process pool.
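One detail worth knowing before reading the code: CountVectorizer's default token_pattern, r"(?u)\b\w\w+\b", only keeps tokens of two or more word characters, so single-character Chinese words in the comma-joined jieba output are silently dropped. A quick toy check (the strings below are made up):

from sklearn.feature_extraction.text import CountVectorizer

docs = ['我,喜歡,機器,學習', '機器,學習,很,有趣']  # toy comma-joined jieba output
ct = CountVectorizer()  # default token_pattern r"(?u)\b\w\w+\b"
bow = ct.fit_transform(docs)
print(ct.get_feature_names_out())  # ['喜歡' '學習' '有趣' '機器']: '我' and '很' are gone
print(bow.toarray())               # [[1 1 0 1], [0 1 1 1]]

The full script: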

import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import multiprocessing
from concurrent.futures import ProcessPoolExecutor,as_completed
from utils import log
from tqdm import tqdm
import time
import pickle as pk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score


def transform_text(text, stopwords):
    # Segment an article with jieba, dropping stopwords and blank tokens
    words = [w for w in jieba.cut(text) if w.strip() and (w not in stopwords)]
    return ','.join(words)


def cut_texts(lock, texts, stopwords, processName, doc_list=[]):
    # Process + lock variant of multiprocess segmentation (kept for reference; unused below).
    # Note the mutable default argument: doc_list persists across calls.
    log('Process {} is cutting texts...'.format(processName))
    docs = []
    for text in tqdm(texts):
        doc = transform_text(text, stopwords)
        # log(doc)
        docs.append(doc)
    lock.acquire()
    doc_list.extend(docs)
    lock.release()


def cut_texts_pool(texts, stopwords, processName):
    # Segmentation worker; run in parallel below via a process pool
    log('Process {} is cutting texts...'.format(processName))
    docs = []
    for text in tqdm(texts):
        doc = transform_text(text, stopwords)
        # log(doc)
        docs.append(doc)
    log('Process {} finished cutting.'.format(processName))
    return docs


def hard_work(processName):
    # Test helper that simulates a time-consuming task
    log('Process {} is running...'.format(processName))
    time.sleep(2)
    log('Process {} finished.'.format(processName))
    return processName


def mp_pool_test(texts=None, res=None):
    # Smoke test for the process pool
    n_process = multiprocessing.cpu_count()
    pool = ProcessPoolExecutor()
    fs = []
    for i in range(n_process):
        f = pool.submit(hard_work, i)
        fs.append(f)
    names = []
    for f in as_completed(fs):
        name = f.result()
        names.append(name)
    log(names)


def partition(iterable_, n_partition):
    # Split the texts into n_partition roughly equal chunks
    assert isinstance(n_partition, int) and n_partition > 0, 'Invalid value for "n_partition"'
    temp = list(iterable_)
    total = len(temp)
    assert total > n_partition, 'Size of iterable is less than "n_partition"'
    partition_size = total // n_partition
    res = []
    for i in range(n_partition - 1):
        res.append(temp[partition_size * i:partition_size * (i + 1)])
    res.append(temp[partition_size * (n_partition - 1):])
    return res


def mp_cut_pool(texts):
    # Create one worker process per CPU
    n_process = multiprocessing.cpu_count()
    chunks = partition(texts, n_process)
    # Segment in parallel via the process pool
    pool = ProcessPoolExecutor(max_workers=n_process)
    fs = []
    docs = []
    for i in range(n_process):
        # submit schedules a task: first argument is the target function, the rest are its arguments
        f = pool.submit(cut_texts_pool, chunks[i], [], i)  # f is a Future
        fs.append(f)
    # as_completed yields each Future as soon as it finishes
    for f in as_completed(fs):
        # f.result() returns the worker's return value
        docs.extend(f.result())
    return docs


class LDA_Transformer:
    def __init__(self, n_features):
        self.n_features = n_features

    def fit(self, texts):
        log('Building CountVectorizer with texts...')
        ct = CountVectorizer()
        self.count_vectorizer = ct
        log(type(texts))
        if isinstance(texts, list):
            log('Len of texts:{}'.format(len(texts)))
            # log(texts)
        else:
            log('Shape of texts:{}'.format(texts.shape))
        print('texts[0]', texts[0])
        ctv = ct.fit_transform(texts)
        log('Building LDA model with CountVectorizer..')
        # n_components is the number of LDA topics, playing a role similar to
        # the dimensionality of a word embedding
        lda = LatentDirichletAllocation(n_components=self.n_features)
        lda.fit(ctv)
        log('Done building LDA model.')
        self.lda_model = lda

    def transform(self, texts):
        count_vec = self.count_vectorizer.transform(texts)
        return self.lda_model.transform(count_vec)


def build_data():
    df = pd.read_excel('data/souhu_news_400_500.xlsx')
    texts = list(df['content'])  # text column
    log(df.columns)
    docs = mp_cut_pool(texts)
    lda_transformer = LDA_Transformer(64)
    lda_transformer.fit(docs)
    # Persist the fitted LDA transformer
    with open('output/lda_transformer.pkl', 'wb') as f:
        pk.dump(lda_transformer, f)
    indices = list(range(df.shape[0]))
    np.random.shuffle(indices)
    df = df.iloc[indices]
    # Keep the segmented docs aligned with the shuffled rows; transform must see
    # the same comma-joined representation the CountVectorizer was fit on
    docs = [docs[i] for i in indices]
    dic = {topic: i for i, topic in enumerate(list(df['topic'].unique()))}
    y = [dic[topic] for topic in list(df['topic'])]
    with open('data/y_lda.pkl', 'wb') as f:
        pk.dump(y, f)
    X = lda_transformer.transform(docs)
    with open('data/X_lda.pkl', 'wb') as f:
        pk.dump(X, f)
    log('Training data is saved.')


def load_train_data():
    with open('data/X_lda.pkl', 'rb') as f:
        X = pk.load(f)
    with open('data/y_lda.pkl', 'rb') as f:
        y = pk.load(f)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test


def main():
    log('Building training data...')
    build_data()
    log('Loading training data with LDA features...')
    X_train, X_test, y_train, y_test = load_train_data()
    log('Training classifier...')
    # Swap between the two classifiers reported above
    # model = LinearSVC()
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    log('Evaluating model...')
    acc = model.score(X_test, y_test)
    log('Accuracy:{}'.format(acc))
    y_pred = model.predict(X_test)
    p = precision_score(y_test, y_pred, average='macro')
    r = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    log('Precision:{:.3f},Recall:{:.3f},F1:{:.3f}'.format(p, r, f1))


if __name__ == '__main__':
    main()
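Note that the script only pickles the LDA transformer, not the classifier. A hypothetical inference sketch (file names follow the script above; unpickling requires LDA_Transformer to be importable, and the classifier is refit from the saved features since it was never persisted):

import pickle as pk
import jieba
from sklearn.svm import LinearSVC

# Load the fitted LDA feature extractor saved by build_data()
with open('output/lda_transformer.pkl', 'rb') as f:
    lda_transformer = pk.load(f)

# Refit a classifier from the saved training features
with open('data/X_lda.pkl', 'rb') as f:
    X = pk.load(f)
with open('data/y_lda.pkl', 'rb') as f:
    y = pk.load(f)
clf = LinearSVC().fit(X, y)

# Classify a new article: same comma-joined jieba representation as in training
text = '...'  # some new article
doc = ','.join(w for w in jieba.cut(text) if w.strip())
features = lda_transformer.transform([doc])  # shape (1, 64): a topic distribution
print(clf.predict(features))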