Recommendation Systems 3-1
Train a word2vec skip-gram model on the text8 data from http://mattmahoney.net/dc/textdata
1. Import the libraries
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

print('check:libs well prepared')
2. Download the dataset and unzip it
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    # Skip the download if the file already exists locally
    if not os.path.exists(filename):
        print('download...')
        filename, _ = urlretrieve(url + filename, filename)
    # Verify the file size
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print('exception %s' % statinfo.st_size)
    return filename

filename = maybe_download('text8.zip', 31344016)
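The heading above mentions unzipping, but the snippet only downloads the archive, and the `words` list used in the next step is never built. A minimal sketch of the missing read step (the helper name `read_data` is my own choice) could be:

def read_data(filename):
    # Read the single file inside the zip archive and split it into a list of words
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data(filename)
print('Data size %d' % len(words))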
3. Encode the words and replace low-frequency words with UNK
vocabulary_size = 50000

def build_dataset(words):
    count = [['UNK', -1]]
    # Occurrence count of each word; keep the vocabulary_size - 1 most frequent ones
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    # Mapping from word to integer id
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    # Mapping from integer id back to word
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

# Training data after the mapping
data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('original data', words[:10])
print('training data', data[:10])
Output:
Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
original data ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
training data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]
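As a quick sanity check of the two mappings (ids follow frequency order, so UNK is 0 and 'the', the most frequent real word, is 1):

print(dictionary['the'])            # 1, the most frequent real word
print(reverse_dictionary[0])        # 'UNK'
print(reverse_dictionary[data[0]])  # 'anarchism', the first word of the corpus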
4. Generate skip-gram training data
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    # batch holds the input (center) words, labels the context words to predict
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window context | center word | skip_window context ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        # Wrap around so the data can be reused cyclically
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # the center word sits in the middle of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Sample a context position not yet used for this center word
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
print('data:', [reverse_dictionary[di] for di in data[:8]])
data_index = 0
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=2)
print(' batch:', [reverse_dictionary[bi] for bi in batch])
print(' labels:', [reverse_dictionary[li] for li in labels.reshape(8)])
Output:
data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']
batch: ['as', 'as', 'a', 'a', 'term', 'term', 'of', 'of']
labels: ['originated', 'anarchism', 'of', 'term', 'as', 'a', 'abuse', 'first']
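With skip_window=2 and num_skips=2, each center word is paired with 2 of its 4 surrounding words, which matches the output above ('as' is paired with 'originated' and 'anarchism', and so on). Raising num_skips reuses each center word more often; for example (the sampled pairs are random, so the exact labels will vary):

data_index = 0
batch, labels = generate_batch(batch_size=8, num_skips=4, skip_window=2)
# Now every center word appears 4 times in batch, once per sampled context word
print(' batch:', [reverse_dictionary[bi] for bi in batch])
print(' labels:', [reverse_dictionary[li] for li in labels.reshape(8)])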
5. Define the network structure
batch_size = 128
embedding_size = 128  # dimension of the embedding vectors
skip_window = 1       # how many words to consider on each side of the center word
num_skips = 2         # how many (center, context) pairs to generate per center word
valid_size = 16       # number of random words used to evaluate similarity
valid_window = 100    # validation words are drawn from the 100 most frequent words
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64      # number of negative examples for sampled softmax
graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):
    # Input data
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Variables
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Embeddings of the current training batch
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)

    # Batch loss with sampled softmax
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases,
                                   inputs=embed, labels=train_labels,
                                   num_sampled=num_sampled, num_classes=vocabulary_size))

    # Minimize the loss and update the parameters
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Normalize the embeddings
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm

    # Use the current embeddings to find words similar to the validation words
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
6. Run the training process
num_steps = 100000
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    average_loss = 0
    for step in range(num_steps + 1):
        batch_data, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        # Print the average loss every 2000 steps
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # Print the nearest words to the validation words every 10000 steps
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 5  # the 5 most similar words
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    final_embeddings = normalized_embeddings.eval()
Sample output:
Average loss at step 0: 8.125039
Nearest to their: liliuokalani, kobe, aeolian, judeo, gutman,
Nearest to state: emulates, matching, heritage, coder, rebounding,
Nearest to the: super, represent, whitacre, swine, clothing,
Nearest to system: populace, harshness, bungee, pounds, nist,
Nearest to between: infimum, macedonians, abyss, ziegler, lorica,
Nearest to such: wrath, comecon, ignite, winfield, revolution,
Nearest to up: coexist, breads, applesoft, azores, dogs,
Nearest to this: apart, vorarlberg, par, jardines, syntax,
Nearest to if: knowles, hindi, defeated, biochemical, lonergan,
Nearest to from: usp, martov, hormonal, pd, clouds,
Nearest to s: mediating, bit, challenges, lys, lavos,
Nearest to however: eps, lambdamoo, eternally, zanetti, cavers,
Nearest to over: yardbirds, duct, mayer, breaks, plagues,
......
Nearest to between: with, within, among, through, against,
Nearest to such: well, intelligent, known, certain, these,
Nearest to up: out, off, down, back, them,
Nearest to this: which, it, another, itself, some,
Nearest to if: when, though, where, before, because,
Nearest to from: through, into, protestors, muir, in,
Nearest to s: whose, his, isbn, my, dedicates,
Nearest to however: but, although, that, though, especially,
Nearest to over: overshadowed, around, between, through, within,
Nearest to also: often, still, now, never, sometimes,
Nearest to used: designed, referred, seen, considered, known,
Nearest to on: upon, in, through, under, bathroom,
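After training, final_embeddings holds one unit-length vector per vocabulary word, so cosine similarity reduces to a dot product. A small sketch for querying the nearest neighbors of an arbitrary word (the query 'france' is only an illustration and is assumed to be in the vocabulary):

def nearest_words(query, top_k=5):
    # Dot products against all normalized embeddings = cosine similarities
    query_vec = final_embeddings[dictionary[query]]
    sims = np.dot(final_embeddings, query_vec)
    # Skip the first entry of the sorted list, which is the query word itself
    nearest = (-sims).argsort()[1:top_k + 1]
    return [reverse_dictionary[idx] for idx in nearest]

print(nearest_words('france'))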
7. Visualization
num_points = 400
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])
def plot(embeddings, labels):
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(20, 20))  # in inches
    for i, label in enumerate(labels):
        x, y = embeddings[i, :]
        pylab.scatter(x, y)
        pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                       ha='right', va='bottom')
    pylab.show()
words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)
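If the embeddings are meant to be reused later, for example by the rest of this recommendation series, it may be worth persisting them together with the vocabulary. A minimal sketch, with file names chosen arbitrarily:

# Persist the learned embeddings and the id-to-word mapping for later reuse
np.save('text8_embeddings.npy', final_embeddings)
with open('text8_vocab.txt', 'w') as f:
    for i in range(vocabulary_size):
        f.write(reverse_dictionary[i] + '\n')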