中文短文字分類
阿新 • • 發佈:2018-11-30
特徵提取+樸素貝葉斯模型:
# Chinese short-text classification: bag-of-words features + multinomial
# naive Bayes. The four corpora are labelled 0..3 in the order they are loaded.
import random

import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the stop-word list: one word per line, no header. quoting=3 is
# csv.QUOTE_NONE, so quote characters in the file are kept as ordinary text.
stopwords = pd.read_csv(
    'D://input_py//day06//stopwords.txt',
    index_col=False,
    quoting=3,
    sep="\t",
    names=['stopword'],
    encoding='utf-8',
)
stopwords = stopwords['stopword'].values
# `token in ndarray` scans the whole array for every token; a set gives
# constant-time membership tests with the same result for string tokens.
stopword_set = set(stopwords)


def _load_segments(csv_path):
    """Read one corpus CSV and return its ``segment`` column as a list.

    Rows that contain a NaN in any column are dropped before the column is
    extracted.
    """
    frame = pd.read_csv(csv_path, encoding='utf-8', sep=',')
    frame.dropna(inplace=True)
    return frame.segment.values.tolist()


# Load the corpora, one CSV per category.
laogong = _load_segments('D://input_py//day06//beilaogongda.csv')
laopo = _load_segments('D://input_py//day06//beilaopoda.csv')
erzi = _load_segments('D://input_py//day06//beierzida.csv')
nver = _load_segments('D://input_py//day06//beinverda.csv')


def preprocess_text(content_lines, sentences, category):
    """Tokenise each line, clean the tokens and append labelled samples.

    Args:
        content_lines: Iterable of raw text lines belonging to one category.
        sentences: List that receives one ``(text, category)`` tuple per
            successfully tokenised line, where ``text`` is the kept tokens
            joined by single spaces. The list is mutated in place.
        category: Label stored alongside every sample built from
            ``content_lines``.

    Lines that jieba cannot tokenise are printed and skipped.
    """
    for line in content_lines:
        try:
            segs = jieba.lcut(line)
        except Exception:
            # Best effort: show the offending line and move on to the next.
            print(line)
            continue
        kept = [
            tok for tok in segs
            if not str(tok).isdigit()       # drop purely numeric tokens
            and tok.strip()                 # drop whitespace-only tokens
            and len(tok) > 1                # drop single-character tokens
            and tok not in stopword_set     # drop stop words
        ]
        sentences.append((" ".join(kept), category))


sentences = []
preprocess_text(laogong, sentences, 0)
preprocess_text(laopo, sentences, 1)
preprocess_text(erzi, sentences, 2)
preprocess_text(nver, sentences, 3)

# Shuffle with a fixed seed: the split below uses a fixed random_state, but
# it is only reproducible if the order of `sentences` is reproducible too.
random.Random(1256).shuffle(sentences)

# Bag-of-words features over the space-separated tokens.
vec = CountVectorizer(
    analyzer='word',     # features are word tokens, not character n-grams
    max_features=4000,   # keep only the 4000 most frequent terms
)

# Split the corpus into training and test sets.
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1256)

# Learn the vocabulary from the training texts only.
vec.fit(x_train)

# Train the multinomial naive Bayes classifier.
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)

# score() reports the mean accuracy on the test split (it is not an AUC).
print(classifier.score(vec.transform(x_test), y_test))
結果評分(測試集上的平均準確率)為:0.6587
特徵提取+SVM模型:
# Chinese short-text classification: word n-gram count features + linear SVM.
# The four corpora are labelled 0..3 in the order they are loaded.
import random

import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load the stop-word list: one word per line, no header. quoting=3 is
# csv.QUOTE_NONE, so quote characters in the file are kept as ordinary text.
stopwords = pd.read_csv(
    'D://input_py//day06//stopwords.txt',
    index_col=False,
    quoting=3,
    sep="\t",
    names=['stopword'],
    encoding='utf-8',
)
stopwords = stopwords['stopword'].values
# `token in ndarray` scans the whole array for every token; a set gives
# constant-time membership tests with the same result for string tokens.
stopword_set = set(stopwords)


def _load_segments(csv_path):
    """Read one corpus CSV and return its ``segment`` column as a list.

    Rows that contain a NaN in any column are dropped before the column is
    extracted.
    """
    frame = pd.read_csv(csv_path, encoding='utf-8', sep=',')
    frame.dropna(inplace=True)
    return frame.segment.values.tolist()


# Load the corpora, one CSV per category.
laogong = _load_segments('D://input_py//day06//beilaogongda.csv')
laopo = _load_segments('D://input_py//day06//beilaopoda.csv')
erzi = _load_segments('D://input_py//day06//beierzida.csv')
nver = _load_segments('D://input_py//day06//beinverda.csv')


def preprocess_text(content_lines, sentences, category):
    """Tokenise each line, clean the tokens and append labelled samples.

    Args:
        content_lines: Iterable of raw text lines belonging to one category.
        sentences: List that receives one ``(text, category)`` tuple per
            successfully tokenised line, where ``text`` is the kept tokens
            joined by single spaces. The list is mutated in place.
        category: Label stored alongside every sample built from
            ``content_lines``.

    Lines that jieba cannot tokenise are printed and skipped.
    """
    for line in content_lines:
        try:
            segs = jieba.lcut(line)
        except Exception:
            # Best effort: show the offending line and move on to the next.
            print(line)
            continue
        kept = [
            tok for tok in segs
            if not str(tok).isdigit()       # drop purely numeric tokens
            and tok.strip()                 # drop whitespace-only tokens
            and len(tok) > 1                # drop single-character tokens
            and tok not in stopword_set     # drop stop words
        ]
        sentences.append((" ".join(kept), category))


sentences = []
preprocess_text(laogong, sentences, 0)
preprocess_text(laopo, sentences, 1)
preprocess_text(erzi, sentences, 2)
preprocess_text(nver, sentences, 3)

# Shuffle with a fixed seed: the split below uses a fixed random_state, but
# it is only reproducible if the order of `sentences` is reproducible too.
random.Random(1256).shuffle(sentences)

# Split the corpus into training and test sets.
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1256)

# Richer features than plain bag-of-words: word n-grams.
vec = CountVectorizer(
    analyzer='word',      # features are word tokens, not character n-grams
    ngram_range=(1, 4),   # use word n-grams of length 1 through 4
    max_features=20000,   # keep only the 20000 most frequent n-grams
)
# Learn the vocabulary from the training texts only.
vec.fit(x_train)

# Train a support vector classifier with a linear kernel.
svm = SVC(kernel='linear')
svm.fit(vec.transform(x_train), y_train)

# score() reports the mean accuracy on the test split.
print(svm.score(vec.transform(x_test), y_test))
結果評分(測試集上的平均準確率)為:0.9976