Python 機器學習,載入樣本集,對資料分類
阿新 • • 發佈:2021-06-25
import functools
import os

import langid
import nltk
import numpy
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Label whose prediction (with enough confidence) makes isCyber() return True.
CYBER_LABEL = "CS"
# Label of the second training corpus.
OTHER_LABEL = "CL"
# Minimum predicted probability of CYBER_LABEL required by isCyber().
CYBER_THRESHOLD = 0.7


def _tokenize_to_text(text):
    """Tokenize *text* with NLTK and return the tokens as one string.

    Every token is followed by a single space (so a non-empty result ends
    with a trailing space), which is the format the vectorizer is fed with
    both at training and at prediction time.
    """
    return "".join(f"{token} " for token in nltk.word_tokenize(text))


def preprocess(path_name):
    """Read a UTF-8 text file and return its tokens as one spaced string.

    Args:
        path_name: Path of the text document to tokenize.

    Returns:
        The document's NLTK tokens, each followed by a space.
    """
    # The context manager closes the file even if reading fails.
    with open(path_name, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return _tokenize_to_text(raw_text)


def loadtrainset(path, classtag):
    """Tokenize every file in a directory and label each with *classtag*.

    Args:
        path: Directory that holds the training documents.
        classtag: Class label assigned to every document in *path*.

    Returns:
        A ``(texts, tags)`` pair of equal-length lists: the tokenized
        documents and the class label of each one.
    """
    # Sort for a reproducible document order and skip anything that is not a
    # regular file (os.listdir also returns sub-directories, which cannot be
    # opened as text).
    candidates = (os.path.join(path, name) for name in sorted(os.listdir(path)))
    file_paths = [candidate for candidate in candidates if os.path.isfile(candidate)]

    processed_textset = [preprocess(file_path) for file_path in file_paths]
    allclasstags = [classtag] * len(processed_textset)
    return processed_textset, allclasstags


def train():
    """Train a multinomial naive Bayes classifier on the two corpora.

    Documents are read from ``data/CS`` and ``data/CL`` and labelled with
    the matching class tag.

    Returns:
        A ``(count_vector, clf)`` pair: the fitted ``CountVectorizer`` and
        the fitted ``MultinomialNB`` classifier.
    """
    processed_textdata1, class1 = loadtrainset("data/CS", CYBER_LABEL)
    processed_textdata2, class2 = loadtrainset("data/CL", OTHER_LABEL)
    integrated_train_data = processed_textdata1 + processed_textdata2
    classtags_list = class1 + class2

    # Term-count matrix: element [i][j] is the count of term j in document i.
    count_vector = CountVectorizer()
    vector_matrix = count_vector.fit_transform(integrated_train_data)

    # With use_idf=False no IDF weighting is applied; the transformer only
    # normalizes each row of term counts.
    train_tfidf = TfidfTransformer(use_idf=False).fit_transform(vector_matrix)

    clf = MultinomialNB().fit(train_tfidf, classtags_list)
    return count_vector, clf


@functools.lru_cache(maxsize=1)
def _trained_model():
    """Return the ``(count_vector, clf)`` pair, training only once per process.

    NOTE: because of the cache, changes to the training directories made
    after the first call are not picked up until the process restarts (or
    ``_trained_model.cache_clear()`` is called).
    """
    return train()


def isCyber(content, threshold=CYBER_THRESHOLD):
    """Tell whether *content* is classified as ``"CS"`` with enough confidence.

    Only English text (as detected by ``langid``) is classified. Chinese
    text is detected but not classified yet; any other language is rejected.

    Args:
        content: The text to classify.
        threshold: Minimum predicted probability of the ``"CS"`` class for
            the text to be accepted. Defaults to 0.7.

    Returns:
        True if the text is English, the predicted class is ``"CS"`` and its
        probability is at least *threshold*; False otherwise.
    """
    content_lang = langid.classify(content)[0]
    if content_lang == "zh":
        # Placeholder kept from the original script: Chinese is recognized
        # but there is no classifier for it, so only an empty line is printed.
        print()
        return False
    if content_lang != "en":
        return False

    count_vector, clf = _trained_model()
    new_count_vector = count_vector.transform([_tokenize_to_text(content)])
    # use_idf=False makes the transformer stateless (row normalization only),
    # so fitting it on the single test document is safe.
    new_tfidf = TfidfTransformer(use_idf=False).fit_transform(new_count_vector)

    predict_result = clf.predict(new_tfidf)
    probabilities = clf.predict_proba(new_tfidf)

    # predict_proba columns follow clf.classes_ (sorted labels, i.e. [CL, CS]
    # when both corpora are present); look the column up instead of
    # hard-coding index 1.
    known_classes = list(clf.classes_)
    if CYBER_LABEL in known_classes:
        cyber_probability = probabilities[0][known_classes.index(CYBER_LABEL)]
    else:
        cyber_probability = 0.0

    print(predict_result)
    print(probabilities)
    print(cyber_probability)

    return bool(predict_result[0] == CYBER_LABEL and cyber_probability >= threshold)


if __name__ == '__main__':
    content = '''These pandemic days flow by in waves of exhilaration and stillness. Who knew a trip to the grocery store could be so exciting? Bread-and-milk runs have become surgical raids: Sterilize the grocery cart with a disinfectant wipe, scout out the TP aisle, exchange sideways glances with the could-be infected, grab the essentials, and get the hell out of there. Later, as another news alert interrupts the Netflix stream, the group text explodes: “This is crazy,” everyone says from their respective couches. Few hasten to add that crazy is also sort of fun.'''
    isCyber(content)