1. 程式人生 > tf-idf + svm 文字分類

tf-idf + svm 文字分類

01分類

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score , roc_auc_score , roc_curve
import matplotlib.pyplot as plt



def create_model(d_train, d_test):
    """Fit a linear SVM on TF-IDF title features and save predictions for d_test.

    Both frames are read through their ``title`` column; ``d_train`` also
    supplies the target labels in ``two_category``.  The predicted labels are
    stored in a new ``01category`` column on ``d_test`` (the frame passed in
    is modified) and the frame is then written to ``wr01_pre_1025.xlsx``.

    Args:
        d_train: Labelled samples used to fit the vectorizer and classifier.
        d_test: Samples to label.

    Returns:
        None.  Sample counts, feature-matrix shapes and the fitted model are
        printed to stdout.
    """
    print("訓練樣本 = %d" % len(d_train))
    print("測試樣本 = %d" % len(d_test))

    # TF-IDF feature extraction over 1- and 2-grams, with min_df=2.
    tfidf_options = {"ngram_range": (1, 2), "min_df": 2}
    tfidf = TfidfVectorizer(**tfidf_options)

    train_texts = d_train.title
    train_matrix = tfidf.fit_transform(train_texts)
    print("訓練樣本特徵表長度為 " + str(train_matrix.shape))

    # The test titles are mapped onto the vocabulary fitted on the training
    # titles, so only transform() is used here.
    test_texts = d_test.title
    test_matrix = tfidf.transform(test_texts)
    print("測試樣本特徵長度為:"+str(test_matrix.shape))

    # Support vector machine.
    # C is the penalty coefficient of the objective function; it balances the
    # classification margin against misclassified samples (default C = 1.0).
    # kernel may be rbf, linear, poly or sigmoid; the library default is rbf.
    classifier = SVC(kernel="linear", C=1.0)
    fitted = classifier.fit(train_matrix, d_train.two_category)
    print(fitted)

    # If d_test carries true labels, accuracy can be checked with
    # classifier.score(test_matrix, d_test.two_category).
    d_test["01category"] = classifier.predict(test_matrix)
    d_test.to_excel("wr01_pre_1025.xlsx", index=False)



# Driver for the "01" stage: load the labelled workbook and the workbook of
# new samples, then let create_model fit the classifier and write the
# predictions.
print("對新樣本進行01預測")
df = d_train = pd.read_excel("wr01_new_train1012.xlsx")  # training data; `df` and `d_train` name the same frame

d_test = pd.read_excel("wr_100樣本1023.xlsx")  # samples to predict
create_model(d_train, d_test)



60分類

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score , roc_auc_score , roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split



def create_model(d_train, d_test):
    """Fit a linear SVM on TF-IDF title features and save sku predictions.

    Both frames are read through their ``title`` column; ``d_train`` also
    supplies the target labels in ``sku``.  The predicted labels are stored
    in a new ``pre_skuno`` column on ``d_test`` (the frame passed in is
    modified) and the frame is then written to ``wr60_svm_pre1012.xlsx``.

    Args:
        d_train: Labelled samples used to fit the vectorizer and classifier.
        d_test: Samples to label.

    Returns:
        None.  Sample counts, feature-matrix shapes and the fitted model are
        printed to stdout.
    """
    print("訓練樣本 = %d" % len(d_train))
    print("測試樣本 = %d" % len(d_test))

    # TF-IDF feature extraction over 1- and 2-grams, with min_df=2.
    tfidf_options = {"ngram_range": (1, 2), "min_df": 2}
    tfidf = TfidfVectorizer(**tfidf_options)

    train_texts = d_train.title
    train_matrix = tfidf.fit_transform(train_texts)
    print("訓練樣本特徵表長度為 " + str(train_matrix.shape))

    # The test titles are mapped onto the vocabulary fitted on the training
    # titles, so only transform() is used here.
    test_texts = d_test.title
    test_matrix = tfidf.transform(test_texts)
    print("測試樣本特徵表長度為 "+ str(test_matrix.shape))

    # Support vector machine.
    # C is the penalty coefficient of the objective function; it balances the
    # classification margin against misclassified samples (default C = 1.0).
    # kernel may be rbf, linear, poly or sigmoid; the library default is rbf.
    classifier = SVC(kernel="linear", C=1.0)
    fitted = classifier.fit(train_matrix, d_train.sku)
    print(fitted)

    # If d_test carries true labels, accuracy can be checked with
    # classifier.score(test_matrix, d_test.sku).
    d_test["pre_skuno"] = classifier.predict(test_matrix)
    d_test.to_excel("wr60_svm_pre1012.xlsx", index=False)

# Driver for the "60" stage: fit on the sku-labelled workbook and predict a
# sku for every row that the 01 stage marked with pre_category == 1.
print("對新樣本進行60個車型預測")
d_train = pd.read_excel("wr60_train1012.xlsx")  # training data

# `sheet_name` is the supported keyword of read_excel; the older `sheetname`
# spelling was deprecated in pandas 0.21 and is rejected by current releases.
df = pd.read_excel("wr機器學習分析報告.xlsx", sheet_name="01預測")  # candidate rows
# NOTE(review): the 01 script in this file writes its predictions to a column
# named "01category"; confirm that this sheet really has a `pre_category`
# column.
# Take an explicit copy of the filtered rows: create_model adds a prediction
# column to d_test, which on a bare slice of `df` raises pandas'
# SettingWithCopyWarning.
d_test = df[df.pre_category == 1].copy()
create_model(d_train, d_test)

# 訓練樣本 = 75987
# 測試樣本 = 32606
# 訓練樣本特徵表長度為 (75987, 18040)
# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
#   max_iter=-1, probability=False, random_state=None, shrinking=True,
#   tol=0.001, verbose=False)
# 0.920137398025
#0.933329022245