TensorFlow Exercise 2: Classifying Comments with an NN and a CNN
阿新 · Posted 2018-12-27
This post is a supplement to the previous one:
- It uses a large dataset, to show how to handle data that cannot all be loaded into memory at once. If you have plenty of RAM, pretend I said nothing.
- It covers saving and reusing a trained model.
- The model itself is unchanged: still a simple feedforward neural network (update: a CNN model has been added).
- If you want to run the code in this post, a GPU build or a powerful VPS is recommended; on my small laptop the waiting nearly killed me.
Before getting into the main text, here is the basic development workflow of a machine-learning model that I sketched out.
The dataset
The dataset contains 1.6 million tweets, labeled negative, neutral, or positive. (I don't know whether a comparable ready-made Weibo dataset exists.)
Data format: CSV files with the emoticons removed; the fields are as follows:
- 0 – the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
- 1 – the id of the tweet (2087)
- 2 – the date of the tweet (Sat May 16 23:58:44 UTC 2009)
- 3 – the query (lyx). If there is no query, then this value is NO_QUERY.
- 4 – the user that tweeted (robotickilldozr)
- 5 – the text of the tweet (Lyx is cool)
training.1600000.processed.noemoticon.csv (238 MB)
testdata.manual.2009.06.14.csv (74 KB)
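As a quick sanity check on the field layout, the raw CSV can be inspected with pandas (the preprocessing script below imports it anyway). A minimal sketch; the column names are my own labels for the six fields listed above, not something stored in the file:

import pandas as pd

# Ad-hoc column names following the field list above.
cols = ['polarity', 'id', 'date', 'query', 'user', 'text']
df = pd.read_csv('testdata.manual.2009.06.14.csv',
                 encoding='latin-1', header=None, names=cols)
print(df['polarity'].value_counts())   # 0 = negative, 2 = neutral, 4 = positive
print(df['text'].head())

The same call works on the 238 MB training file if you have the RAM for it; the point of this post, though, is what to do when you don't.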
Data preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
import numpy as np
import pandas as pd
from collections import OrderedDict

org_train_file = 'training.1600000.processed.noemoticon.csv'
org_test_file = 'testdata.manual.2009.06.14.csv'

# Extract the useful fields from the raw file
def usefull_filed(org_file, output_file):
    output = open(output_file, 'w')
    with open(org_file, buffering=10000, encoding='latin-1') as f:
        try:
            for line in f:   # "4","2193601966","Tue Jun 16 08:40:49 PDT 2009","NO_QUERY","AmandaMarie1028","Just woke up. Having no school is the best feeling ever "
                line = line.replace('"', '')
                clf = line.split(',')[0]   # 4
                if clf == '0':
                    clf = [0, 0, 1]   # negative tweet
                elif clf == '2':
                    clf = [0, 1, 0]   # neutral tweet
                elif clf == '4':
                    clf = [1, 0, 0]   # positive tweet

                tweet = line.split(',')[-1]
                outputline = str(clf) + ':%:%:%:' + tweet
                output.write(outputline)   # [0, 0, 1]:%:%:%: that's a bummer. You shoulda got David Carr of Third Day to do it. ;D
        except Exception as e:
            print(e)
    output.close()   # done; the processed training file is about 127.5 MB

usefull_filed(org_train_file, 'training.csv')
usefull_filed(org_test_file, 'tesing.csv')

# Build the vocabulary (lexicon)
def create_lexicon(train_file):
    lex = []
    lemmatizer = WordNetLemmatizer()
    with open(train_file, buffering=10000, encoding='latin-1') as f:
        try:
            count_word = {}   # word -> occurrence count
            for line in f:
                tweet = line.split(':%:%:%:')[1]
                words = word_tokenize(tweet.lower())   # tokenize the tweet text, not the whole line
                for word in words:
                    word = lemmatizer.lemmatize(word)
                    if word not in count_word:
                        count_word[word] = 1
                    else:
                        count_word[word] += 1

            count_word = OrderedDict(sorted(count_word.items(), key=lambda t: t[1]))
            for word in count_word:
                if count_word[word] < 100000 and count_word[word] > 100:   # drop words that are too common or too rare
                    lex.append(word)
        except Exception as e:
            print(e)
    return lex

lex = create_lexicon('training.csv')

with open('lexcion.pickle', 'wb') as f:
    pickle.dump(lex, f)


"""
# Convert each line of text into a feature vector
def string_to_vector(input_file, output_file, lex):
    output_f = open(output_file, 'w')
    lemmatizer = WordNetLemmatizer()
    with open(input_file, buffering=10000, encoding='latin-1') as f:
        for line in f:
            label = line.split(':%:%:%:')[0]
            tweet = line.split(':%:%:%:')[1]
            words = word_tokenize(tweet.lower())
            words = [lemmatizer.lemmatize(word) for word in words]

            features = np.zeros(len(lex))
            for word in words:
                if word in lex:
                    features[lex.index(word)] = 1   # a word may appear more than once in a sentence; += 1 would also work, the difference is small

            features = list(features)
            output_f.write(str(label) + ":" + str(features) + '\n')
    output_f.close()


f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()

# The lexcion vocabulary is about 112k; training.vec would be roughly 112k * 1600000, about 170 GB, which is far too large.
# So instead of pre-vectorizing, we convert on the fly while training.
# string_to_vector('training.csv', 'training.vec', lex)
# string_to_vector('tesing.csv', 'tesing.vec', lex)
"""
The code above converts the raw data into training.csv and tesing.csv, which contain only the label and the tweet text. The lexcion.pickle file stores the vocabulary.
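To double-check what preprocessing produced, the pickled vocabulary can be inspected directly; its length is also the dimensionality of every feature vector used by the models below. A rough sketch:

import pickle

with open('lexcion.pickle', 'rb') as f:
    lex = pickle.load(f)

print(len(lex))   # vocabulary size == length of each bag-of-words feature vector
print(lex[:10])   # a few sample entries that survived the frequency filter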
If the data file is too large to load into memory in one go, you can also import the data into a database.
Dask can handle large CSV files as well.
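For reference, a minimal sketch of reading the original 238 MB CSV with Dask, which parses it lazily in chunks instead of loading everything at once (this assumes dask[dataframe] is installed; the column names are the same ad-hoc labels used earlier):

import dask.dataframe as dd

cols = ['polarity', 'id', 'date', 'query', 'user', 'text']
df = dd.read_csv('training.1600000.processed.noemoticon.csv',
                 encoding='latin-1', header=None, names=cols)
# Nothing is actually read until .compute() is called; the label counts
# are then aggregated chunk by chunk.
print(df['polarity'].value_counts().compute())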
On to the long training run
import os
import random
import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()
def get_random_line(file, point):
    file.seek(point)
    file.readline()            # discard the (probably partial) line at the seek point
    return file.readline()     # return the next complete line

# Randomly sample n records from the file
def get_n_random_line(file_name, n=150):
    lines = []
    file = open(file_name, encoding='latin-1')
    total_bytes = os.stat(file_name).st_size
    for i in range(n):
        random_point = random.randint(0, total_bytes)
        lines.append(get_random_line(file, random_point))
    file.close()
    return lines
def get_test_dataset(test_file):
    with open(test_file, encoding='latin-1') as f:
        test_x = []
        test_y = []
        lemmatizer = WordNetLemmatizer()
        for line in f:
            label = line.split(':%:%:%:')[0]
            tweet = line.split(':%:%:%:')[1]
            words = word_tokenize(tweet.lower())
            words = [lemmatizer.lemmatize(word) for word in words]
            features = np.zeros(len(lex))
            for word in words:
                if word in lex:
                    features[lex.index(word)] = 1
            test_x.append(list(features))
            test_y.append(eval(label))
    return test_x, test_y

test_x, test_y = get_test_dataset('tesing.csv')
#######################################################################
n_input_layer = len(lex)   # input layer size
n_layer_1 = 2000           # hidden layer 1
n_layer_2 = 2000           # hidden layer 2 ("hidden" sounds mysterious, but it is just any layer between input and output)
n_output_layer = 3         # output layer size
def neural_network(data):
    # weights and biases of the first hidden layer
    layer_1_w_b = {'w_': tf.Variable(tf.random_normal([n_input_layer, n_layer_1])), 'b_': tf.Variable(tf.random_normal([n_layer_1]))}
    # weights and biases of the second hidden layer
    layer_2_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_1, n_layer_2])), 'b_': tf.Variable(tf.random_normal([n_layer_2]))}
    # weights and biases of the output layer
    layer_output_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_2, n_output_layer])), 'b_': tf.Variable(tf.random_normal([n_output_layer]))}

    # w·x + b
    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)   # activation
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2)   # activation
    layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])
    return layer_output
X = tf.placeholder('float')
Y = tf.placeholder('float')
batch_size = 90
def train_neural_network(X, Y):
    predict = neural_network(X)
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=predict))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        lemmatizer = WordNetLemmatizer()
        saver = tf.train.Saver()
        i = 0
        pre_accuracy = 0
        while True:   # train until interrupted
            batch_x = []
            batch_y = []

            # if a model.ckpt file already exists:
            #     saver.restore(session, 'model.ckpt')   # resume from the saved session

            try:
                lines = get_n_random_line('training.csv', batch_size)
                for line in lines:
                    label = line.split(':%:%:%:')[0]
                    tweet = line.split(':%:%:%:')[1]
                    words = word_tokenize(tweet.lower())
                    words = [lemmatizer.lemmatize(word) for word in words]

                    features = np.zeros(len(lex))
                    for word in words:
                        if word in lex:
                            features[lex.index(word)] = 1   # a word may appear more than once in a sentence; += 1 would also work, the difference is small

                    batch_x.append(list(features))
                    batch_y.append(eval(label))
                session.run([optimizer, cost_func], feed_dict={X: batch_x, Y: batch_y})
            except Exception as e:
                print(e)

            # accuracy on the test set
            if i > 100:
                correct = tf.equal(tf.argmax(predict, 1), tf.argmax(Y, 1))
                accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
                accuracy = accuracy.eval({X: test_x, Y: test_y})
                if accuracy > pre_accuracy:   # keep the checkpoint with the best accuracy so far
                    print('accuracy: ', accuracy)
                    pre_accuracy = accuracy
                    saver.save(session, 'model.ckpt')   # save the session
                i = 0
            i += 1

train_neural_network(X, Y)
The program above uses about 600 MB of memory, with a peak around 1 GB.
Running it, the trained model is saved as model.ckpt.
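Depending on the TensorFlow version, saver.save() writes either a single model.ckpt file or a model.ckpt.data/.index/.meta set, plus a small 'checkpoint' index file. Either way, the prefix to pass to saver.restore() can be looked up with a small sketch like this:

import tensorflow as tf

ckpt = tf.train.latest_checkpoint('.')   # the directory the training script ran in
print(ckpt)                              # prints the checkpoint prefix, e.g. 'model.ckpt'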
Using the trained model
import tensorflow as tf
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()
n_input_layer = len(lex)   # input layer size
n_layer_1 = 2000           # hidden layer 1
n_layer_2 = 2000           # hidden layer 2 ("hidden" sounds mysterious, but it is just any layer between input and output)
n_output_layer = 3         # output layer size
def neural_network(data):
    # weights and biases of the first hidden layer
    layer_1_w_b = {'w_': tf.Variable(tf.random_normal([n_input_layer, n_layer_1])), 'b_': tf.Variable(tf.random_normal([n_layer_1]))}
    # weights and biases of the second hidden layer
    layer_2_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_1, n_layer_2])), 'b_': tf.Variable(tf.random_normal([n_layer_2]))}
    # weights and biases of the output layer
    layer_output_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_2, n_output_layer])), 'b_': tf.Variable(tf.random_normal([n_output_layer]))}

    # w·x + b
    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)   # activation
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2)   # activation
    layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])
    return layer_output
X = tf.placeholder('float')
def prediction(tweet_text):
    predict = neural_network(X)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(session, 'model.ckpt')

        lemmatizer = WordNetLemmatizer()
        words = word_tokenize(tweet_text.lower())
        words = [lemmatizer.lemmatize(word) for word in words]

        features = np.zeros(len(lex))
        for word in words:
            if word in lex:
                features[lex.index(word)] = 1

        # print(predict.eval(feed_dict={X: [features]}))   # shape: [[val1, val2, val3]]
        res = session.run(tf.argmax(predict.eval(feed_dict={X: [features]}), 1))
        return res
prediction("I am very happe")
The code above uses a simple feedforward model; below is a CNN version.
# https://github.com/Lab41/sunny-side-up
import os
import random
import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()
def get_random_line(file, point):
    file.seek(point)
    file.readline()            # discard the (probably partial) line at the seek point
    return file.readline()     # return the next complete line

# Randomly sample n records from the file
def get_n_random_line(file_name, n=150):
    lines = []
    file = open(file_name, encoding='latin-1')
    total_bytes = os.stat(file_name).st_size
    for i in range(n):
        random_point = random.randint(0, total_bytes)
        lines.append(get_random_line(file, random_point))
    file.close()
    return lines
def get_test_dataset(test_file):
    with open(test_file, encoding='latin-1') as f:
        test_x = []
        test_y = []
        lemmatizer = WordNetLemmatizer()
        for line in f:
            label = line.split(':%:%:%:')[0]
            tweet = line.split(':%:%:%:')[1]
            words = word_tokenize(tweet.lower())
            words = [lemmatizer.lemmatize(word) for word in words]
            features = np.zeros(len(lex))
            for word in words:
                if word in lex:
                    features[lex.index(word)] = 1
            test_x.append(list(features))
            test_y.append(eval(label))
    return test_x, test_y

test_x, test_y = get_test_dataset('tesing.csv')
##############################################################################
input_size = len(lex)
num_classes = 3
X = tf.placeholder(tf.int32, [None, input_size])
Y = tf.placeholder(tf.float32, [None, num_classes])
dropout_keep_prob = tf.placeholder(tf.float32)
batch_size = 90
def neural_network():
    # embedding layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        embedding_size = 128
        W = tf.Variable(tf.random_uniform([input_size, embedding_size], -1.0, 1.0))
        embedded_chars = tf.nn.embedding_lookup(W, X)
        embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

    # convolution + max-pooling layer
    num_filters = 128
    filter_sizes = [3, 4, 5]
    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
            conv = tf.nn.conv2d(embedded_chars_expanded, W, strides=[1, 1, 1, 1], padding="VALID")
            h = tf.nn.relu(tf.nn.bias_add(conv, b))
            pooled = tf.nn.max_pool(h, ksize=[1, input_size - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID')
            pooled_outputs.append(pooled)

    num_filters_total = num_filters * len(filter_sizes)
    h_pool = tf.concat(pooled_outputs, 3)   # TF 1.x argument order: (values, axis)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

    # dropout
    with tf.name_scope("dropout"):
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

    # output
    with tf.name_scope("output"):
        W = tf.get_variable("W", shape=[num_filters_total, num_classes], initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        output = tf.nn.xw_plus_b(h_drop, W, b)

    return output
def train_neural_network():
    output = neural_network()

    optimizer = tf.train.AdamOptimizer(1e-3)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=output))
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars)

    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        lemmatizer = WordNetLemmatizer()
        i = 0
        while True:
            batch_x = []
            batch_y = []

            # if a model.ckpt file already exists:
            #     saver.restore(sess, 'model.ckpt')   # resume from the saved session

            try:
                lines = get_n_random_line('training.csv', batch_size)
                for line in lines:
                    label = line.split(':%:%:%:')[0]
                    tweet = line.split(':%:%:%:')[1]
                    words = word_tokenize(tweet.lower())
                    words = [lemmatizer.lemmatize(word) for word in words]

                    features = np.zeros(len(lex))
                    for word in words:
                        if word in lex:
                            features[lex.index(word)] = 1   # a word may appear more than once in a sentence; += 1 would also work, the difference is small

                    batch_x.append(list(features))
                    batch_y.append(eval(label))

                _, loss_ = sess.run([train_op, loss], feed_dict={X: batch_x, Y: batch_y, dropout_keep_prob: 0.5})
                print(loss_)
            except Exception as e:
                print(e)

            if i % 10 == 0:
                predictions = tf.argmax(output, 1)
                correct_predictions = tf.equal(predictions, tf.argmax(Y, 1))
                accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
                accur = sess.run(accuracy, feed_dict={X: test_x[0:50], Y: test_y[0:50], dropout_keep_prob: 1.0})
                print('accuracy:', accur)
                # note: unlike the feedforward version, this loop never calls saver.save();
                # add a saver.save(sess, ...) here if you want to reuse the trained CNN
            i += 1

train_neural_network()
With the CNN model, the accuracy improves noticeably.
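For completeness, here is a rough sketch of running predictions with the trained CNN, analogous to prediction() in the feedforward section. It assumes you added a saver.save(sess, 'cnn_model.ckpt') call to the training loop above (the loop as written creates a Saver but never saves), and that lex, X, dropout_keep_prob and neural_network() are defined exactly as in the CNN script, in a fresh process:

def cnn_prediction(tweet_text):
    output = neural_network()
    lemmatizer = WordNetLemmatizer()
    with tf.Session() as sess:
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(sess, 'cnn_model.ckpt')   # hypothetical checkpoint name, see the note above

        words = [lemmatizer.lemmatize(w) for w in word_tokenize(tweet_text.lower())]
        features = np.zeros(len(lex))
        for word in words:
            if word in lex:
                features[lex.index(word)] = 1

        # dropout is disabled at inference time
        scores = sess.run(output, feed_dict={X: [features], dropout_keep_prob: 1.0})
        return np.argmax(scores, 1)

print(cnn_prediction("it was a wonderful day"))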