# 貝葉斯公式實戰
# 阿新 • • 發佈:2021-01-06
# 技術標籤:ML
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
def nb_news():
    """
    Classify the 20 Newsgroups posts with a multinomial naive Bayes model.

    Fetches the dataset with ``subset="all"``, splits it into training and
    test parts, converts the raw text into TF-IDF features, fits a
    ``MultinomialNB`` estimator, and prints two evaluations: an element-wise
    comparison of predictions against the true labels, and the value
    returned by the estimator's ``score`` method on the test part.

    :return: None
    """
    # 1) Load the data.
    dataset = fetch_20newsgroups(subset="all")

    # 2) Split documents and labels using train_test_split's default settings.
    docs_train, docs_test, labels_train, labels_test = train_test_split(
        dataset.data, dataset.target
    )

    # 3) Feature engineering: TF-IDF text features. The vectorizer is fitted
    #    on the training documents only; the test documents are merely
    #    transformed with the already-fitted vectorizer.
    vectorizer = TfidfVectorizer()
    features_train = vectorizer.fit_transform(docs_train)
    features_test = vectorizer.transform(docs_test)

    # 4) Naive Bayes estimator: fit on the training features.
    classifier = MultinomialNB()
    classifier.fit(features_train, labels_train)

    # 5) Evaluation.
    # Method one: compare each prediction directly with its true label.
    predicted = classifier.predict(features_test)
    print("預測值和真實值對比:\n", labels_test == predicted)

    # Method two: let the estimator compute its score on the test part.
    accuracy = classifier.score(features_test, labels_test)
    print("準確率為:\n", accuracy)
# def nbcls():
# """
# 樸素貝葉斯對新聞資料集進行預測
# :return:
# """
# # 獲取新聞的資料,20個類別
# news = fetch_20newsgroups(subset='all')
#
# # 進行資料集分割
# x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.3)
#
# # 對於文字資料,進行特徵抽取
# tf = TfidfVectorizer()
#
# x_train = tf.fit_transform(x_train)
# # 這裡打印出來的列表是:訓練集當中的所有不同詞的組成的一個列表
# print(tf.get_feature_names())
# # print(x_train.toarray())
#
# # 不能呼叫fit_transform
# x_test = tf.transform(x_test)
#
# # estimator估計器流程
# mlb = MultinomialNB(alpha=1.0)
#
# mlb.fit(x_train, y_train)
#
# # 進行預測
# y_predict = mlb.predict(x_test)
#
# print("預測每篇文章的類別:", y_predict[:100])
# print("真實類別為:", y_test[:100])
#
# print("預測準確率為:", mlb.score(x_test, y_test))
#
# return None
# Run the classification demo only when this file is executed as a script,
# so that importing the module does not trigger the dataset download.
if __name__ == "__main__":
    nb_news()