tf-idf + svm 文字分類
阿新 • • 發佈:2019-01-01
01分類
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt


def create_model(d_train, d_test):
    """Train a TF-IDF + linear SVM on titles and export 0/1 predictions.

    The vectorizer is fitted on ``d_train.title`` only and then reused to
    transform ``d_test.title``. The classifier is fitted against
    ``d_train.two_category``.

    Args:
        d_train: Table exposing ``title`` and ``two_category`` columns.
        d_test: Table exposing a ``title`` column. It is modified in place:
            a ``"01category"`` column holding the predictions is added.

    Returns:
        None. The augmented ``d_test`` is written to
        ``wr01_pre_1025.xlsx`` without the index.
    """
    print("訓練樣本 = %d" % len(d_train))
    print("測試樣本 = %d" % len(d_test))

    # TF-IDF feature extraction over unigrams and bigrams; min_df=2 is
    # passed so that terms below that document frequency are ignored.
    tfidf = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
    train_matrix = tfidf.fit_transform(d_train.title)
    print("訓練樣本特徵表長度為 " + str(train_matrix.shape))
    # To inspect the learned vocabulary, print a slice of the vectorizer's
    # feature names here.

    test_matrix = tfidf.transform(d_test.title)
    print("測試樣本特徵長度為:" + str(test_matrix.shape))

    # Support vector machine.
    # C is the penalty coefficient of the objective; it balances the
    # classification margin against misclassified samples (default 1.0).
    # kernel may be rbf, linear, poly or sigmoid; the library default is rbf.
    classifier = SVC(kernel="linear", C=1.0)
    fitted = classifier.fit(train_matrix, d_train.two_category)
    print(fitted)
    # classifier.score(test_matrix, d_test.two_category) could be printed
    # here when the test table carries labels.

    predictions = classifier.predict(test_matrix)
    d_test["01category"] = predictions
    d_test.to_excel("wr01_pre_1025.xlsx", index=False)


print("對新樣本進行01預測")
df = pd.read_excel("wr01_new_train1012.xlsx")  # training data
d_train = df
d_test = pd.read_excel("wr_100樣本1023.xlsx")  # data to predict
create_model(d_train, d_test)
60分類
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


def create_model(d_train, d_test):
    """Train a TF-IDF + linear SVM on titles and export SKU predictions.

    The vectorizer is fitted on ``d_train.title`` only and then reused to
    transform ``d_test.title``. The classifier is fitted against
    ``d_train.sku``.

    Args:
        d_train: Table exposing ``title`` and ``sku`` columns.
        d_test: Table exposing a ``title`` column. It is modified in place:
            a ``"pre_skuno"`` column holding the predictions is added, so
            pass an independent frame rather than a slice of another one.

    Returns:
        None. The augmented ``d_test`` is written to
        ``wr60_svm_pre1012.xlsx`` without the index.
    """
    print("訓練樣本 = %d" % len(d_train))
    print("測試樣本 = %d" % len(d_test))

    # TF-IDF feature extraction over unigrams and bigrams; min_df=2 is
    # passed so that terms below that document frequency are ignored.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
    features = vectorizer.fit_transform(d_train.title)
    print("訓練樣本特徵表長度為 " + str(features.shape))
    # To inspect the learned vocabulary, print a slice of the vectorizer's
    # feature names here.

    test_features = vectorizer.transform(d_test.title)
    print("測試樣本特徵表長度為 " + str(test_features.shape))

    # Support vector machine.
    # C is the penalty coefficient of the objective; it balances the
    # classification margin against misclassified samples (default 1.0).
    # kernel may be rbf, linear, poly or sigmoid; the library default is rbf.
    svmmodel = SVC(C=1.0, kernel="linear")
    nn = svmmodel.fit(features, d_train.sku)
    print(nn)
    # svmmodel.score(test_features, d_test.sku) could be printed here when
    # the test table carries labels.

    pre_test = svmmodel.predict(test_features)
    d_test["pre_skuno"] = pre_test
    d_test.to_excel("wr60_svm_pre1012.xlsx", index=False)


print("對新樣本進行60個車型預測")
d_train = pd.read_excel("wr60_train1012.xlsx")  # training data
# pandas renamed the `sheetname` keyword to `sheet_name`; the old spelling
# is rejected by current releases.
df = pd.read_excel("wr機器學習分析報告.xlsx", sheet_name="01預測")  # data to predict
# Keep only rows with pre_category == 1. The explicit copy gives
# create_model an independent frame to add its prediction column to,
# instead of a filtered slice of df (which triggers SettingWithCopyWarning).
d_test = df[df.pre_category == 1].copy()
create_model(d_train, d_test)

# Recorded output of one run: 75987 training rows, 32606 test rows,
# training feature matrix of shape (75987, 18040), fitted estimator:
# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
#   max_iter=-1, probability=False, random_state=None, shrinking=True,
#   tol=0.001, verbose=False)
# 0.920137398025
# 0.933329022245
# NOTE(review): the two trailing numbers are presumably values from the
# disabled score() call on two different runs; this is not verifiable here.