1. 程式人生 > >作業12

作業12

lower 樸素 nump 訓練 ram strong and from metrics

"""SMS spam classification: NLTK preprocessing, TF-IDF features, Naive Bayes.

Requires the NLTK ``punkt``, ``stopwords`` and ``wordnet`` resources
(install once with ``nltk.download(...)``) and scikit-learn.
"""
import csv

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Sample text used to try out the preprocessing function.
text = '''The world is changing
and time is spinning fast
it's so amazing how you came into my life
I know it seems all hope is gone
I know you feel you can't be strong
and once again the story ends with you and I
And anytime you feel like you just can't go on
just hold on to my love
and you'll never be alone
Hold on
we can make it through the fire
and my love   
'''

# Built once: the stop-word list and lemmatizer do not change between calls,
# and a set gives O(1) membership tests instead of scanning a list per token.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocessing(text):
    """Return *text* normalised to a space-joined string of lemmas.

    Steps: sentence- and word-tokenize, lower-case, drop tokens shorter
    than three characters, drop English stop words, then lemmatize.

    Tokens are lower-cased *before* the stop-word check because the NLTK
    stop-word list is lower-case; filtering first would let capitalised
    stop words such as "The" or "And" through.
    """
    tokens = [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    tokens = [
        token for token in tokens
        if len(token) >= 3 and token not in _STOP_WORDS
    ]
    return ' '.join(_LEMMATIZER.lemmatize(token) for token in tokens)


print(preprocessing(text))

# Read the data set: one message per row, "<label>\t<message>".
file_path = r'D:\SMSSpamCollectionjs.txt'
sms_data = []
sms_label = []
with open(file_path, 'r', encoding='utf-8', newline='') as sms:
    csv_reader = csv.reader(sms, delimiter='\t')
    for line in csv_reader:
        if len(line) < 2:
            # Skip blank or malformed rows instead of raising IndexError.
            continue
        sms_label.append(line[0])
        sms_data.append(preprocessing(line[1]))

# Split into training and test sets at a 0.7 / 0.3 ratio.
sms_data = np.array(sms_data)
sms_label = np.array(sms_label)
x_train, x_test, y_train, y_test = train_test_split(
    sms_data, sms_label, test_size=0.3, random_state=0, stratify=sms_label
)

# Vectorize the messages with TF-IDF.
vectorizer = TfidfVectorizer(
    min_df=2,
    ngram_range=(1, 2),
    stop_words='english',
    strip_accents='unicode',
    norm='l2',
)
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)
print(repr(X_train))

a = X_train.toarray()
print(a)
# Print the non-zero entries of (at most) the first 1000 training rows.
# np.nonzero walks the array in row-major order and adapts to the real
# matrix shape, so no vocabulary size has to be hard-coded.
for i, j in zip(*np.nonzero(a[:1000])):
    print(i, j, a[i, j])

# Naive Bayes classifier.
clf = MultinomialNB().fit(X_train, y_train)
y_nb_pred = clf.predict(X_test)

# Show the classification results.
print(y_nb_pred.shape, y_nb_pred)  # predictions for x_test
print(confusion_matrix(y_test, y_nb_pred))
# scikit-learn metrics take (y_true, y_pred); swapping them would exchange
# precision and recall in the report.
cr = classification_report(y_test, y_nb_pred)
print(cr)

作業12