機器學習之新聞文字分類。
阿新 • 發佈:2021-06-20
新聞文字分類首先需要通過大量的訓練之後獲得一個存放關鍵字的表,
之後再輸入一個新聞內容,通過程式碼就可以自動判斷出這個新聞的類別,
我這裡是在已經有了新聞文字的關鍵詞表後的處理。
# encoding=utf-8
"""News text classification.

Pipeline: store an article to disk, segment it with jieba (dropping stop
words), build CountVectorizer + TF-IDF features over the training corpus,
and classify the article with a 3-nearest-neighbour model.

NOTE(review): the original imported ``reload`` from the removed-in-3.12
``imp`` module and called ``reload(sys)`` — a Python-2 encoding hack with no
effect on Python 3 — so both have been dropped.
"""
import os

import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

VECTOR_DIR = 'vectors.bin'
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 200
TEST_SPLIT = 0.2

# Predicted label -> user-facing category message.  Keys are the string
# labels stored one-per-line in dataset_train/y_train.txt.
_CATEGORY_BY_LABEL = {
    '1': "此新聞為娛樂類新聞",
    '2': "此新聞為汽車類新聞",
    '3': "此新聞為遊戲類新聞",
    '4': "此新聞為科技類新聞",
    '5': "此新聞為綜合體育最新類新聞",
    '6': "此新聞為財經類新聞",
    '7': "此新聞為房產類新聞",
    '8': "此新聞為教育類新聞",
    '9': "此新聞為軍事類新聞",
}


def deposit_txt(title, content):
    """Write *title* + *content* to news/news.txt for later processing."""
    textpath = "news/news.txt"
    with open(textpath, 'w+', encoding='utf-8') as f:
        f.write(title + content)


def EnumPathFiles(path, callback, stop_words_list):
    """Walk *path* and call ``callback(root, filename, stop_words_list)``
    for every file in the tree.

    Bug fix: the original iterated ``os.walk`` (which already descends into
    every subdirectory) AND recursed manually on each directory entry, so
    files below the top level were processed more than once.  A single walk
    covers the whole tree exactly once.
    """
    if not os.path.isdir(path):
        print('Error:"', path, '" is not a directory or does not exist.')
        return
    for root, dirs, files in os.walk(path):
        for d in dirs:
            print(d)  # progress output only; os.walk handles the descent
        for f in files:
            callback(root, f, stop_words_list)


def ProsessofWords(textpath, stop_words_list):
    """Segment the file at *textpath* with jieba, drop stop words and tabs,
    and overwrite the file with the space-separated tokens."""
    with open(textpath, 'r', encoding='utf-8') as f:
        text = f.read()
    seg_list = jieba.cut(text, cut_all=False)
    # Keep tokens that are neither stop words nor tab characters.  Joining
    # once is O(n), unlike the original quadratic ``outstr += word`` loop;
    # the trailing space matches the original output byte-for-byte.
    kept = [word for word in seg_list
            if word not in stop_words_list and word != '\t']
    outstr = " ".join(kept) + " " if kept else ""
    with open(textpath, 'w+', encoding='utf-8') as f:
        f.write(outstr)


def callback1(path, filename, stop_words_list):
    """Per-file callback for EnumPathFiles: segment one file in place."""
    # os.path.join is portable; the original hard-coded the Windows '\\'
    # separator, which breaks on POSIX systems.
    textpath = os.path.join(path, filename)
    print(textpath)
    ProsessofWords(textpath, stop_words_list)


def fenci():
    """Load the stop-word list and segment every file under news/."""
    stopwords_file = "stopword/stopword.txt"
    stop_words = []
    with open(stopwords_file, "r", encoding='utf-8') as stop_f:
        for line in stop_f:  # iterate lazily instead of readlines()
            line = line.strip()
            if line:  # skip blank lines
                stop_words.append(line)
    print(len(stop_words))
    EnumPathFiles(r'news', callback1, stop_words)


def CV_Tfidf():
    """Train a KNN classifier on TF-IDF features and classify news/news.txt.

    Returns the category message for the first recognised prediction, or
    ``None`` when no prediction matches a known label (same fall-through
    as the original if/elif chain).
    """
    # --- load texts ---
    print('(1) load texts...')
    with open('dataset_train/x_train.txt', encoding='utf-8') as f:
        train_texts = f.read().split('\n')
    with open('dataset_train/y_train.txt', encoding='utf-8') as f:
        train_labels = f.read().split('\n')
    with open('news/news.txt', encoding='utf-8') as f:
        test_texts = f.read().split('\n')
    all_text = train_texts + test_texts

    # --- feature extraction ---
    print('(2) doc to var...')
    # Learn one shared vocabulary over train + test so both count matrices
    # have identical columns (the original's counts_all result was unused).
    count_v0 = CountVectorizer()
    count_v0.fit(all_text)
    count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_)
    counts_train = count_v1.fit_transform(train_texts)
    print("the shape of train is " + repr(counts_train.shape))
    count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
    counts_test = count_v2.fit_transform(test_texts)
    print("the shape of test is " + repr(counts_test.shape))
    # Bug fix: fit the TF-IDF transformer on the TRAINING counts only and
    # reuse it on the test counts.  The original re-fit it on the test set,
    # giving the two matrices inconsistent IDF weights (test-set leakage).
    tfidftransformer = TfidfTransformer()
    x_train = tfidftransformer.fit_transform(counts_train)
    x_test = tfidftransformer.transform(counts_test)
    y_train = train_labels

    # --- KNN classification ---
    print('(3) KNN...')
    knnclf = KNeighborsClassifier(n_neighbors=3)
    knnclf.fit(x_train, y_train)
    preds = knnclf.predict(x_test).tolist()
    for pred in preds:
        print(pred)
        # Dict lookup replaces the nine-branch if/elif chain; like the
        # original, return on the first prediction with a known label.
        message = _CATEGORY_BY_LABEL.get(pred)
        if message is not None:
            return message


def news(title, content):
    """End-to-end entry point: store the article, segment it, classify it."""
    deposit_txt(title, content)
    fenci()
    return CV_Tfidf()