
tf2: Comment Classification with an NN and a CNN

This post is a follow-up to the previous one:

  1. It uses a larger dataset and shows how to handle data that cannot all be loaded into memory at once. If you have plenty of RAM, ignore this point.
  2. It covers saving a trained model and using it later.
  3. The model itself is unchanged: a simple feedforward neural network (update: a CNN model has been added).
  4. If you want to run the code in this post, a GPU build of TensorFlow or a powerful VPS is recommended; waiting on my little laptop nearly killed me.

Before the main content, here is the basic development workflow of a machine learning model:

The dataset

The dataset contains 1.6 million tweets, labeled negative, neutral, or positive. (I don't know whether a ready-made Weibo dataset exists.)

Data format: CSV files with emoticons removed, with the following fields:

  • 0 – the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
  • 1 – the id of the tweet (2087)
  • 2 – the date of the tweet (Sat May 16 23:58:44 UTC 2009)
  • 3 – the query (lyx). If there is no query, then this value is NO_QUERY.
  • 4 – the user that tweeted (robotickilldozr)
  • 5 – the text of the tweet (Lyx is cool)

training.1600000.processed.noemoticon.csv(238M)
testdata.manual.2009.06.14.csv(74K)
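
To get a feel for the raw data before preprocessing, you can peek at a few rows. A minimal sketch (not part of the original post), assuming pandas is installed and using the column layout listed above:

import pandas as pd

# the raw file has no header row; name the six columns described above
columns = ['polarity', 'id', 'date', 'query', 'user', 'text']
sample = pd.read_csv('training.1600000.processed.noemoticon.csv',
                     encoding='latin-1', header=None, names=columns, nrows=5)
print(sample[['polarity', 'text']])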

Data preprocessing

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
 
import pickle
import numpy as np
import pandas as pd
from collections import OrderedDict
 
org_train_file = 'training.1600000.processed.noemoticon.csv'
org_test_file = 'testdata.manual.2009.06.14.csv'
 
# extract the useful fields from the original file
def usefull_filed(org_file, output_file):
	output = open(output_file, 'w')
	with open(org_file, buffering=10000, encoding='latin-1') as f:
		try:
			for line in f:                # "4","2193601966","Tue Jun 16 08:40:49 PDT 2009","NO_QUERY","AmandaMarie1028","Just woke up. Having no school is the best feeling ever "
				line = line.replace('"', '')
				clf = line.split(',')[0]   # 4
				if clf == '0':
					clf = [0, 0, 1]  # negative tweet
				elif clf == '2':
					clf = [0, 1, 0]  # neutral tweet
				elif clf == '4':
					clf = [1, 0, 0]  # positive tweet
 
				tweet = line.split(',')[-1]
				outputline = str(clf) + ':%:%:%:' + tweet
				output.write(outputline)  # [0, 0, 1]:%:%:%: that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
		except Exception as e:
			print(e)
	output.close()  # done; the processed file is about 127.5 MB
 
usefull_filed(org_train_file, 'training.csv')
usefull_filed(org_test_file, 'tesing.csv')
 
# build the lexicon (vocabulary)
def create_lexicon(train_file):
	lex = []
	lemmatizer = WordNetLemmatizer()
	with open(train_file, buffering=10000, encoding='latin-1') as f:
		try:
			count_word = {}  # count how often each word appears
			for line in f:
				tweet = line.split(':%:%:%:')[1]
				words = word_tokenize(tweet.lower())
				for word in words:
					word = lemmatizer.lemmatize(word)
					if word not in count_word:
						count_word[word] = 1
					else:
						count_word[word] += 1
 
			count_word = OrderedDict(sorted(count_word.items(), key=lambda t: t[1]))
			for word in count_word:
				if count_word[word] < 100000 and count_word[word] > 100:  # filter out words that are too common or too rare
					lex.append(word)
		except Exception as e:
			print(e)
	return lex
 
lex = create_lexicon('training.csv')
 
with open('lexcion.pickle', 'wb') as f:
	pickle.dump(lex, f)
 
 
"""
# convert each tweet string into a feature vector
def string_to_vector(input_file, output_file, lex):
	output_f = open(output_file, 'w')
	lemmatizer = WordNetLemmatizer()
	with open(input_file, buffering=10000, encoding='latin-1') as f:
		for line in f:
			label = line.split(':%:%:%:')[0]
			tweet = line.split(':%:%:%:')[1]
			words = word_tokenize(tweet.lower())
			words = [lemmatizer.lemmatize(word) for word in words]
 
			features = np.zeros(len(lex))
			for word in words:
				if word in lex:
					features[lex.index(word)] = 1  # a word may appear more than once in a sentence; += 1 would also work, but it makes little difference
			
			features = list(features)
			output_f.write(str(label) + ":" + str(features) + '\n')
	output_f.close()
 
 
f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()
 
# the lexicon is about 112k; training.vec would be roughly 112k * 1,600,000 ≈ 170 GB, far too large, so the vectors are generated on the fly during training instead
# string_to_vector('training.csv', 'training.vec', lex)
# string_to_vector('tesing.csv', 'tesing.vec', lex)
"""

The code above converts the raw data into training.csv and tesing.csv, which contain only the label and the tweet text. The lexcion.pickle file stores the lexicon.

If the data file is too large to load into memory at once, you can import the data into a database; Dask can also handle large CSV files (a sketch follows).
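
A minimal sketch of the Dask route (not part of the original post; it assumes the dask package is installed). Dask reads the CSV lazily in blocks, so the whole file never has to sit in memory at once:

import dask.dataframe as dd

# read the original 238 MB file lazily; each partition is loaded only when needed
df = dd.read_csv('training.1600000.processed.noemoticon.csv',
                 encoding='latin-1', header=None,
                 names=['polarity', 'id', 'date', 'query', 'user', 'text'])
print(df.npartitions)                           # number of on-disk blocks
print(df['polarity'].value_counts().compute())  # label distribution, computed block by block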

The long training begins

import os
import random 
import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
 
f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()
 
 
def get_random_line(file, point):
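	# seek to a random byte offset, skip the (probably partial) line found there, and return the next full line (may return '' near end of file)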
	file.seek(point)
	file.readline()
	return file.readline()
# randomly pick n records from the file
def get_n_random_line(file_name, n=150):
	lines = []
	file = open(file_name, encoding='latin-1')
	total_bytes = os.stat(file_name).st_size 
	for i in range(n):
		random_point = random.randint(0, total_bytes)
		lines.append(get_random_line(file, random_point))
	file.close()
	return lines
 
 
def get_test_dataset(test_file):
	with open(test_file, encoding='latin-1') as f:
		test_x = []
		test_y = []
		lemmatizer = WordNetLemmatizer()
		for line in f:
			label = line.split(':%:%:%:')[0]
			tweet = line.split(':%:%:%:')[1]
			words = word_tokenize(tweet.lower())
			words = [lemmatizer.lemmatize(word) for word in words]
			features = np.zeros(len(lex))
			for word in words:
				if word in lex:
					features[lex.index(word)] = 1
			
			test_x.append(list(features))
			test_y.append(eval(label))
	return test_x, test_y
 
test_x, test_y = get_test_dataset('tesing.csv')
 
 
#######################################################################
 
n_input_layer = len(lex)  # size of the input layer

n_layer_1 = 2000     # hidden layer
n_layer_2 = 2000     # hidden layer; it sounds mysterious, but it is just any layer between the input and output layers

n_output_layer = 3       # size of the output layer
 
 
def neural_network(data):
	# weights and biases for the first hidden layer
	layer_1_w_b = {'w_':tf.Variable(tf.random_normal([n_input_layer, n_layer_1])), 'b_':tf.Variable(tf.random_normal([n_layer_1]))}
	# weights and biases for the second hidden layer
	layer_2_w_b = {'w_':tf.Variable(tf.random_normal([n_layer_1, n_layer_2])), 'b_':tf.Variable(tf.random_normal([n_layer_2]))}
	# weights and biases for the output layer
	layer_output_w_b = {'w_':tf.Variable(tf.random_normal([n_layer_2, n_output_layer])), 'b_':tf.Variable(tf.random_normal([n_output_layer]))}

	# w·x + b
	layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
	layer_1 = tf.nn.relu(layer_1)  # activation function
	layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
	layer_2 = tf.nn.relu(layer_2)  # activation function
	layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])
 
	return layer_output
 
 
X = tf.placeholder('float')
Y = tf.placeholder('float')
batch_size = 90
 
def train_neural_network(X, Y):
	predict = neural_network(X)
	cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=predict))
	optimizer = tf.train.AdamOptimizer().minimize(cost_func)
 
	with tf.Session() as session:
		session.run(tf.global_variables_initializer())
 
		lemmatizer = WordNetLemmatizer()
		saver = tf.train.Saver()
		i = 0
		pre_accuracy = 0
		while True:   # keep training indefinitely
			batch_x = []
			batch_y = []

			# if a model.ckpt file already exists:
			#	saver.restore(session, 'model.ckpt')  # restore the saved session
 
			try:
				lines = get_n_random_line('training.csv', batch_size)
				for line in lines:
					label = line.split(':%:%:%:')[0]
					tweet = line.split(':%:%:%:')[1]
					words = word_tokenize(tweet.lower())
					words = [lemmatizer.lemmatize(word) for word in words]
 
					features = np.zeros(len(lex))
					for word in words:
						if word in lex:
							features[lex.index(word)] = 1  # a word may appear more than once in a sentence; += 1 would also work, but it makes little difference
				
					batch_x.append(list(features))
					batch_y.append(eval(label))
 
				session.run([optimizer, cost_func], feed_dict={X:batch_x,Y:batch_y})
			except Exception as e:
				print(e)
 
			# accuracy check
			if i > 100:
				correct = tf.equal(tf.argmax(predict,1), tf.argmax(Y,1))
				accuracy = tf.reduce_mean(tf.cast(correct,'float'))
				accuracy = accuracy.eval({X:test_x, Y:test_y})
				if accuracy > pre_accuracy:  # keep the model with the highest accuracy so far
					print('accuracy: ', accuracy)
					pre_accuracy = accuracy
					saver.save(session, 'model.ckpt')  # save the session
				i = 0
			i += 1
 
 
train_neural_network(X,Y)

The program above uses about 600 MB of memory, with a peak around 1 GB.

Run the script; the trained model is saved as model.ckpt (a new checkpoint is written whenever test accuracy improves).

Using the trained model

import tensorflow as tf
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
 
f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()
 
n_input_layer = len(lex)  # size of the input layer

n_layer_1 = 2000     # hidden layer
n_layer_2 = 2000     # hidden layer; it sounds mysterious, but it is just any layer between the input and output layers

n_output_layer = 3       # size of the output layer
def neural_network(data):
	# weights and biases for the first hidden layer
	layer_1_w_b = {'w_':tf.Variable(tf.random_normal([n_input_layer, n_layer_1])), 'b_':tf.Variable(tf.random_normal([n_layer_1]))}
	# weights and biases for the second hidden layer
	layer_2_w_b = {'w_':tf.Variable(tf.random_normal([n_layer_1, n_layer_2])), 'b_':tf.Variable(tf.random_normal([n_layer_2]))}
	# weights and biases for the output layer
	layer_output_w_b = {'w_':tf.Variable(tf.random_normal([n_layer_2, n_output_layer])), 'b_':tf.Variable(tf.random_normal([n_output_layer]))}

	# w·x + b
	layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
	layer_1 = tf.nn.relu(layer_1)  # activation function
	layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
	layer_2 = tf.nn.relu(layer_2)  # activation function
	layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])
 
	return layer_output
 
X = tf.placeholder('float')
def prediction(tweet_text):
	predict = neural_network(X)
 
	with tf.Session() as session:
		session.run(tf.global_variables_initializer())
		saver = tf.train.Saver()
		saver.restore(session, 'model.ckpt')
 
		lemmatizer = WordNetLemmatizer()
		words = word_tokenize(tweet_text.lower())
		words = [lemmatizer.lemmatize(word) for word in words]
 
		features = np.zeros(len(lex))
		for word in words:
			if word in lex:
				features[lex.index(word)] = 1
		
		#print(predict.eval(feed_dict={X:[features]})) [[val1,val2,val3]]
		res = session.run(tf.argmax(predict.eval(feed_dict={X:[features]}),1 ))
		return res
 
 
prediction("I am very happe")

The code above uses a simple feedforward model; below, the same task is tackled with a CNN model.

# https://github.com/Lab41/sunny-side-up
import os
import random
import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
 
f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()
 
def get_random_line(file, point):
	file.seek(point)
	file.readline()
	return file.readline()
# randomly pick n records from the file
def get_n_random_line(file_name, n=150):
	lines = []
	file = open(file_name, encoding='latin-1')
	total_bytes = os.stat(file_name).st_size 
	for i in range(n):
		random_point = random.randint(0, total_bytes)
		lines.append(get_random_line(file, random_point))
	file.close()
	return lines
 
def get_test_dataset(test_file):
	with open(test_file, encoding='latin-1') as f:
		test_x = []
		test_y = []
		lemmatizer = WordNetLemmatizer()
		for line in f:
			label = line.split(':%:%:%:')[0]
			tweet = line.split(':%:%:%:')[1]
			words = word_tokenize(tweet.lower())
			words = [lemmatizer.lemmatize(word) for word in words]
			features = np.zeros(len(lex))
			for word in words:
				if word in lex:
					features[lex.index(word)] = 1
			
			test_x.append(list(features))
			test_y.append(eval(label))
	return test_x, test_y
 
test_x, test_y = get_test_dataset('tesing.csv')
##############################################################################
input_size = len(lex)
num_classes = 3
 
X = tf.placeholder(tf.int32, [None, input_size])
Y = tf.placeholder(tf.float32, [None, num_classes])
 
dropout_keep_prob = tf.placeholder(tf.float32)
 
batch_size = 90
 
def neural_network():
	# embedding layer
	with tf.device('/cpu:0'), tf.name_scope("embedding"):
		embedding_size = 128
		W = tf.Variable(tf.random_uniform([input_size, embedding_size], -1.0, 1.0))
		embedded_chars = tf.nn.embedding_lookup(W, X)
		embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
	# convolution + maxpool layer
	num_filters = 128
	filter_sizes = [3,4,5]
	pooled_outputs = []
	for i, filter_size in enumerate(filter_sizes):
		with tf.name_scope("conv-maxpool-%s" % filter_size):
			filter_shape = [filter_size, embedding_size, 1, num_filters]
			W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
			b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
			conv = tf.nn.conv2d(embedded_chars_expanded, W, strides=[1, 1, 1, 1], padding="VALID")
			h = tf.nn.relu(tf.nn.bias_add(conv, b))
			pooled = tf.nn.max_pool(h, ksize=[1, input_size - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID')
			pooled_outputs.append(pooled)
 
	num_filters_total = num_filters * len(filter_sizes)
	h_pool = tf.concat(pooled_outputs, 3)
	h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
	# dropout
	with tf.name_scope("dropout"):
		h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)
	# output
	with tf.name_scope("output"):
		W = tf.get_variable("W", shape=[num_filters_total, num_classes], initializer=tf.contrib.layers.xavier_initializer())
		b = tf.Variable(tf.constant(0.1, shape=[num_classes]))
		output = tf.nn.xw_plus_b(h_drop, W, b)
		
	return output
 
def train_neural_network():
	output = neural_network()
 
	optimizer = tf.train.AdamOptimizer(1e-3)
	loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=output))
	grads_and_vars = optimizer.compute_gradients(loss)
	train_op = optimizer.apply_gradients(grads_and_vars)
 
	saver = tf.train.Saver(tf.global_variables())
	with tf.Session() as sess:
		sess.run(tf.global_variables_initializer())
 
		lemmatizer = WordNetLemmatizer()
		i = 0
		while True:
			batch_x = []
			batch_y = []
 
			# if a model.ckpt file already exists:
			#	saver.restore(session, 'model.ckpt')  # restore the saved session
			try:
				lines = get_n_random_line('training.csv', batch_size)
				for line in lines:
					label = line.split(':%:%:%:')[0]
					tweet = line.split(':%:%:%:')[1]
					words = word_tokenize(tweet.lower())
					words = [lemmatizer.lemmatize(word) for word in words]
 
					features = np.zeros(len(lex))
					for word in words:
						if word in lex:
							features[lex.index(word)] = 1  # a word may appear more than once in a sentence; += 1 would also work, but it makes little difference
				
					batch_x.append(list(features))
					batch_y.append(eval(label))
 
				_, loss_ = sess.run([train_op, loss], feed_dict={X:batch_x, Y:batch_y, dropout_keep_prob:0.5})
				print(loss_)
			except Exception as e:
				print(e)
 
			if i % 10 == 0:
				predictions = tf.argmax(output, 1)
				correct_predictions = tf.equal(predictions, tf.argmax(Y, 1))
				accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
				accur = sess.run(accuracy, feed_dict={X:test_x[0:50], Y:test_y[0:50], dropout_keep_prob:1.0})
				print('accuracy:', accur)
 
			i += 1
 
train_neural_network()

With the CNN model, accuracy improves noticeably over the feedforward network.
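
The accuracy check above only looks at the first 50 test examples. Below is a minimal sketch (not from the original post) of evaluating on the whole test set, reusing output, X, Y, dropout_keep_prob, test_x and test_y from the CNN script; in practice the ops should be built once, outside any loop:

def full_test_accuracy(sess, output, batch=100):
	# same classification ops as in the training loop
	predictions = tf.argmax(output, 1)
	correct = tf.equal(predictions, tf.argmax(Y, 1))
	accuracy_op = tf.reduce_mean(tf.cast(correct, 'float'))
	total_correct = 0.0
	for start in range(0, len(test_x), batch):
		xs = test_x[start:start + batch]
		ys = test_y[start:start + batch]
		acc = sess.run(accuracy_op, feed_dict={X: xs, Y: ys, dropout_keep_prob: 1.0})
		total_correct += acc * len(xs)  # weight each batch by its actual size
	return total_correct / len(test_x)

Calling this every few hundred steps gives a much less noisy accuracy estimate than a single 50-example slice.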
