作業12
阿新 • • 發佈:2018-12-06
lower 樸素 nump 訓練 ram strong and from metrics
# SMS spam classification exercise: NLTK text preprocessing, TF-IDF features
# and a multinomial Naive Bayes classifier.
#
# NOTE: the NLTK data used below (sentence/word tokenizer models, the
# "stopwords" corpus and "wordnet") must already be installed locally,
# e.g. via nltk.download(...).
import csv

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

text = (
    "The world is changing and time is spinning fast it's so amazing how "
    "you came into my life I know it seems all hope is gone I know you feel "
    "you can't be strong and once again the story ends with you and I And "
    "anytime you feel like you just can't go on just hold on to my love and "
    "you'll never be alone Hold on we can make it through the fire and my love"
)

# Built once instead of on every preprocessing() call: the stop-word list is
# read from disk by NLTK, and set membership is O(1).
_STOP_WORDS = frozenset(stopwords.words("english"))
_LEMMATIZER = WordNetLemmatizer()
MIN_TOKEN_LENGTH = 3


# Preprocessing
def preprocessing(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is split into sentences and then words, lowercased, stripped of
    English stop words and of tokens shorter than ``MIN_TOKEN_LENGTH``
    characters, and finally lemmatized with the WordNet lemmatizer.

    Args:
        text: Raw message text.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives the filters.
    """
    # Lowercase *before* the stop-word check: the NLTK stop-word list is all
    # lowercase, so "The" or "And" would otherwise slip through.
    lowered = (
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    )
    kept = [
        token
        for token in lowered
        if len(token) >= MIN_TOKEN_LENGTH and token not in _STOP_WORDS
    ]
    return " ".join(_LEMMATIZER.lemmatize(token) for token in kept)


# A bare expression is discarded outside a notebook, so print the demo result.
print(preprocessing(text))

# Read the dataset: one message per line, "<label>\t<message>".
file_path = r"D:\SMSSpamCollectionjs.txt"
sms_data = []
sms_label = []
# NOTE(review): csv.reader honours double quotes by default; if messages in
# this file contain unbalanced '"' characters, rows may be merged. Consider
# quoting=csv.QUOTE_NONE after checking the data.
with open(file_path, "r", encoding="utf-8", newline="") as sms:
    csv_reader = csv.reader(sms, delimiter="\t")
    for line in csv_reader:
        if len(line) < 2:
            # Blank or malformed row: no label/message pair to use.
            continue
        sms_label.append(line[0])
        sms_data.append(preprocessing(line[1]))

# Split into training and test sets with a 0.7 / 0.3 ratio.
sms_data = np.array(sms_data)
sms_label = np.array(sms_label)
x_train, x_test, y_train, y_test = train_test_split(
    sms_data, sms_label, test_size=0.3, random_state=0, stratify=sms_label
)

# Vectorize the messages.
vectorizer = TfidfVectorizer(
    min_df=2,
    ngram_range=(1, 2),
    stop_words="english",
    strip_accents="unicode",
    norm="l2",
)
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)
print(repr(X_train))

# Dump the non-zero TF-IDF weights of the first rows. The column count comes
# from the matrix itself rather than a hard-coded vocabulary size, and
# np.nonzero yields indices in row-major order, i.e. the same order as a
# nested row/column loop.
MAX_ROWS_TO_SHOW = 1000
a = X_train.toarray()
print(a)
shown = a[:MAX_ROWS_TO_SHOW]
for i, j in zip(*np.nonzero(shown)):
    print(i, j, shown[i, j])

# Naive Bayes classifier.
clf = MultinomialNB().fit(X_train, y_train)
y_nb_pred = clf.predict(X_test)

# Show the classification results.
print(y_nb_pred.shape, y_nb_pred)  # predictions for x_test
# Both metrics take the ground truth first, then the predictions.
print(confusion_matrix(y_test, y_nb_pred))
cr = classification_report(y_test, y_nb_pred)
print(cr)
作業12