bag_of_words: English sentiment classification
阿新 · Published 2018-11-10
This exercise uses only the labeled training data; the unlabeled data is not used yet and will be added in a future update.
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer   # word counts
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix                  # evaluation
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')  # run once if the stopword list is not installed

# Load the labeled training data
datafile = os.path.join('H:/word2vect_3data/labeledTrainData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
# print('Number of reviews: {}'.format(len(df)))

# Preprocess each review as follows:
#   1. strip the HTML tags
#   2. remove punctuation and other non-letters
#   3. split into words/tokens
#   4. drop stopwords
#   5. re-join the tokens into a cleaned sentence

def display(text, title):
    print(title)
    print('\n----------separator----------\n')
    print(text)

raw_example = df.review[0]
# display(raw_example, 'raw data')
# example = BeautifulSoup(raw_example, 'html.parser').get_text()  # strip HTML tags
# example_letters = re.sub(r'[^a-zA-Z]', ' ', example)  # re.sub replaces every non-letter with a space
# display(example_letters, 'after removing tags and non-letters')
# words = example_letters.lower().split()  # lowercase, then tokenize
# words_stop = [w for w in words if w not in stopwords.words('english')]
# display(words_stop, 'after removing stopwords')

# Bundle the steps above into a single function
eng_stopwords = set(stopwords.words('english'))

def clean_text(text):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    words = [w for w in words if w not in eng_stopwords]
    # join with a space, otherwise all tokens run together with no gaps
    return ' '.join(words)

# Build the cleaned-text column
df['clean_review'] = df.review.apply(clean_text)

# Convert to bag-of-words features
vectorizer = CountVectorizer(max_features=5000)  # keep the 5000 most frequent words
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
# print(train_data_features.shape)  # (25000, 5000)

# Train the classifier
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_data_features, df.sentiment)

# Predict on the training set and inspect the confusion matrix
# (this is an optimistic check, since the forest has seen these reviews)
confusion = confusion_matrix(df.sentiment, forest.predict(train_data_features))
# print(confusion)

# Load the test data and apply the same preprocessing
datafile_test = os.path.join('H:/word2vect_3data/testData.tsv')
df_test = pd.read_csv(datafile_test, sep='\t', escapechar='\\')
df_test['clean_review'] = df_test.review.apply(clean_text)

# Use transform(), not fit_transform(): the test set must be encoded with the
# vocabulary learned on the training set, or the feature columns will not match
test_data_features = vectorizer.transform(df_test.clean_review).toarray()
test_pred = forest.predict(test_data_features)

output = pd.DataFrame({'id': df_test.id, 'sentiment': test_pred})
output.to_csv('H:/word2vect_3data/submission.csv', index=False)  # index=False keeps the submission file to two columns
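To sanity-check the preprocessing, clean_text can be called on a small review fragment; the snippet below is made up for illustration, not taken from the dataset:

# A made-up review fragment, purely for illustration
sample = 'This movie was <br/>AWESOME! Truly, the best film of 2018.'
print(clean_text(sample))
# -> movie awesome truly best film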
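Note that the confusion matrix above is computed on the very reviews the forest was trained on, so it overstates accuracy. Cross-validation gives a more honest estimate before submitting; a minimal sketch, assuming train_data_features and df from the script above are already in memory:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy on the bag-of-words features;
# expect a noticeably lower number than the training-set fit
scores = cross_val_score(RandomForestClassifier(n_estimators=100),
                         train_data_features, df.sentiment,
                         cv=5, scoring='accuracy')
print('CV accuracy: {:.3f} +/- {:.3f}'.format(scores.mean(), scores.std()))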