1. 程式人生 > >bag_of_words------英文情感分類問題

bag_of_words------英文情感分類問題

本次練習的訓練集只使用了有標註的資料(labeledTrainData.tsv);未標註的資料尚未使用,後續會再更新。

import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer     #計數
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix                    #評估準則
import nltk
from nltk.corpus import stopwords


# ---------------------------------------------------------------------------
# Load the training data.
# ---------------------------------------------------------------------------
# The file is read as tab-separated text with a backslash as escape character.
datafile = os.path.join('H:/word2vect_3data/labeledTrainData.tsv')
df = pd.read_csv(
    datafile,
    sep='\t',
    escapechar='\\',
)
# Debug aid: uncomment to print how many reviews were loaded.
# print('Number of reviews:{}'.format(len(df)))

# Processing applied to the movie-review data:
#     1. strip the HTML tags
#     2. remove punctuation
#     3. split into words/tokens
#     4. drop stop words
#     5. re-join the remaining words into a new sentence
def display(text, title):
    """Print *title*, a separator line, and then *text* to stdout.

    Args:
        text: Object shown below the separator (rendered by ``print``).
        title: Heading shown above the separator.
    """
    separator = "\n----------分割線----------\n"
    # One print call; sep="\n" reproduces the line breaks of three prints.
    print(title, separator, text, sep="\n")

# Take the first review as a worked example.
raw_example = df['review'][0]
# Step-by-step walk-through of the cleaning pipeline (left disabled):
# display(raw_example, 'raw data')
#
# # Strip the HTML tags.
# example = BeautifulSoup(raw_example, 'html.parser').get_text()
#
# # Replace every non-letter with a space (re.sub substitutes regex matches).
# example_letters = re.sub(r'[^a-zA-Z]', ' ', example)
# # display(example_letters, 'after removing tags and non-letters')
#
# # Lowercase, then split into words.
# words = example_letters.lower().split()
#
# words_stop = [w for w in words if w not in stopwords.words('english')]
# display(words_stop, 'stop words removed')

# Bundle the processing steps above into a single function.

# Built once as a set so the membership test in clean_text is a hash lookup.
eng_stopwords = set(stopwords.words('english'))


def clean_text(text):
    """Return *text* reduced to lowercase words with stop words removed.

    Steps: extract the text from the HTML with BeautifulSoup, replace every
    character that is not an ASCII letter with a space, lowercase and split
    on whitespace, drop the words found in ``eng_stopwords``, and join what
    is left with single spaces.

    Args:
        text: Review text to clean.

    Returns:
        The cleaned words joined into one space-separated string.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    kept = (
        token
        for token in letters_only.lower().split()
        if token not in eng_stopwords
    )
    # Join on a space; otherwise all words would run together.
    return ' '.join(kept)
# ---------------------------------------------------------------------------
# Build the new feature column.
# ---------------------------------------------------------------------------
# Store the cleaned version of every review next to the original text.
df['clean_review'] = df['review'].apply(clean_text)

# ---------------------------------------------------------------------------
# Convert to bag-of-words features.
# ---------------------------------------------------------------------------
# Limit the vocabulary to the top 5000 terms.
vectorizer = CountVectorizer(max_features=5000)
train_data_features = (
    vectorizer
    .fit_transform(df['clean_review'])
    .toarray()
)
# print(train_data_features.shape)  # original author reports '(25000, 5000)'

# ---------------------------------------------------------------------------
# Train the classifier.
# ---------------------------------------------------------------------------
# `forest` is bound to the value returned by fit(), as before.
forest = RandomForestClassifier(n_estimators=100).fit(
    train_data_features,
    df['sentiment'],
)

# ---------------------------------------------------------------------------
# Predict on the training set.
# ---------------------------------------------------------------------------
# Confusion matrix of the true labels against the forest's predictions for
# the same rows it was fitted on.
predict_values = confusion_matrix(
    y_true=df['sentiment'],
    y_pred=forest.predict(train_data_features),
)
# print(predict_values)

# ---------------------------------------------------------------------------
# Load the test data and process it.
# ---------------------------------------------------------------------------
datafile_test = os.path.join('H:/word2vect_3data/testData.tsv')
df_test = pd.read_csv(datafile_test, sep='\t', escapechar='\\')

# Apply the same cleaning to the test reviews as to the training reviews.
df_test['clean_review'] = df_test['review'].apply(clean_text)

# Use transform(), NOT fit_transform(): re-fitting would learn a fresh
# vocabulary from the test reviews, so feature column i would no longer stand
# for the word the forest saw in column i during training and the predictions
# would be meaningless.
test_data_feature = vectorizer.transform(df_test['clean_review']).toarray()
test_pre = forest.predict(test_data_feature)

output = pd.DataFrame({'id': df_test['id'], 'sentiment': test_pre})
# index=False keeps the file to the two columns built above; by default
# pandas would also write the row index as an extra unnamed leading column.
output.to_csv('H:/word2vect_3data/submission.csv', index=False)