TensorFlow Word2vec from Scratch
阿新 · Published: 2018-12-25
Word2vec: the Skip-Gram Model
from __future__ import division, print_function, absolute_import
import collections
import os
import random
import urllib.request
import zipfile
import numpy as np
import tensorflow as tf
learning_rate = 0.1          # SGD learning rate
batch_size = 128             # number of (center, context) pairs per training step
num_steps = 3000000          # total training steps
display_step = 10000         # print the average loss every this many steps
eval_step = 200000           # run the nearest-neighbor evaluation every this many steps
eval_words = [b'five', b'of', b'going', b'hardware', b'american', b'britain']  # words to evaluate (bytes, matching the corpus tokens)
embedding_size = 200         # dimension of the embedding vectors
max_vocabulary_size = 50000  # keep at most this many distinct words
min_occurrence = 10          # drop words that appear fewer than this many times
skip_window = 3              # number of context words considered on each side of the center word
num_skips = 2                # number of context words sampled per center word
num_sampled = 64             # number of negative examples for the NCE loss
url = 'http://mattmahoney.net/dc/text8.zip'  # download the archive yourself beforehand
data_path = r'E:\learn\pc_code\tensorflow\12.24\text8.zip'
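# Optional (not in the original post): if the archive is not already at data_path,
# it could be fetched with the already-imported urllib.request. This is only a
# convenience sketch; the original comment assumes you download the file yourself.
if not os.path.exists(data_path):
    print("Downloading text8.zip ...")
    urllib.request.urlretrieve(url, data_path)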
# Read the corpus: one long list of lowercase word tokens (as bytes)
with zipfile.ZipFile(data_path) as f:
    text_words = f.read(f.namelist()[0]).lower().split()
# Start the vocabulary with a placeholder for unknown (rare) words
count = [('UNK', -1)]
count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))
# Drop words that occur fewer than min_occurrence times
for i in range(len(count) - 1, -1, -1):
    if count[i][1] < min_occurrence:
        count.pop(i)
    else:
        break  # count is sorted by frequency, so we can stop at the first frequent word
vocabulary_size = len(count)
# Assign an integer id to every word in the vocabulary
word2id = dict()
for i, (word, _) in enumerate(count):
    word2id[word] = i
# Replace every word in the corpus with its id (0 = UNK)
data = list()
unk_count = 0
for word in text_words:
    index = word2id.get(word, 0)
    if index == 0:
        unk_count += 1
    data.append(index)
count[0] = ('UNK', unk_count)
id2word = dict(zip(word2id.values(), word2id.keys()))
print("Words count:", len(text_words))
print("Unique words:", len(set(text_words)))
print("vocabulary_size:", vocabulary_size)
print("Most Commen words:", count[:10])
data_index = 0
# Generate a batch of (center word, context word) training pairs for skip-gram
def next_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # The sliding window spans skip_window words on each side of the center word
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        # Sample num_skips context positions (every position except the center)
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        # Slide the window one word to the right
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Back up a little so words at the end of a batch are reused at the start of the next one
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels
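# Quick demonstration (not in the original post): one small batch of skip-gram pairs.
# Each row pairs a center word (batch) with one of its sampled context words (labels).
demo_batch, demo_labels = next_batch(8, num_skips, skip_window)
for center, context in zip(demo_batch, demo_labels[:, 0]):
    print(id2word[center], "->", id2word[context])
data_index = 0  # reset the corpus cursor so training still starts from the beginning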
# Inputs: a batch of center-word ids; labels: the corresponding context-word ids
X = tf.placeholder(tf.int32, shape=[None])
Y = tf.placeholder(tf.int32, shape=[None, 1])
with tf.device('/cpu:0'):
    # Embedding matrix and the lookup of the embeddings for the current batch
    embedding = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
    X_embed = tf.nn.embedding_lookup(embedding, X)
    # Weights and biases for the NCE loss
    nce_weights = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
# Average noise-contrastive estimation loss: num_sampled negative classes per batch
loss_op = tf.reduce_mean(
    tf.nn.nce_loss(
        weights=nce_weights,
        biases=nce_biases,
        labels=Y,
        inputs=X_embed,
        num_sampled=num_sampled,
        num_classes=vocabulary_size
    )
)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(loss_op)
# Cosine similarity between the evaluation embeddings and every word embedding
# (each row is L2-normalized before the dot product)
X_embed_norm = X_embed / tf.sqrt(tf.reduce_sum(tf.square(X_embed), 1, keepdims=True))
embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
cosine_sim_op = tf.matmul(X_embed_norm, embedding_norm, transpose_b=True)
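# Aside (not in the original post): cosine_sim_op is just the dot product of L2-normalized
# vectors. A tiny NumPy illustration of the same computation on made-up 3-d "embeddings":
_a = np.array([[1.0, 2.0, 3.0]])                    # one "query" embedding
_b = np.array([[1.0, 2.0, 3.0], [3.0, 0.0, 4.0]])   # two "vocabulary" embeddings
_a_norm = _a / np.sqrt(np.sum(_a ** 2, axis=1, keepdims=True))
_b_norm = _b / np.sqrt(np.sum(_b ** 2, axis=1, keepdims=True))
print("Cosine similarities:", _a_norm.dot(_b_norm.T))  # 1.0 for the identical vector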
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    # Ids of the words used for nearest-neighbor evaluation
    x_test = np.array([word2id[w] for w in eval_words])
    average_loss = 0
    for step in range(1, num_steps + 1):
        batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)
        _, loss = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
        average_loss += loss
        if step % display_step == 0 or step == 1:
            if step > 1:
                average_loss /= display_step
            print("Step " + str(step) + ", Average Loss= " +
                  "{:.4f}".format(average_loss))
            average_loss = 0
        # Evaluation: print the nearest neighbors of each evaluation word
        if step % eval_step == 0 or step == 1:
            print("Evaluation...")
            sim = sess.run(cosine_sim_op, feed_dict={X: x_test})
            for i in range(len(eval_words)):
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = '"%s" nearest neighbors:' % eval_words[i]
                for k in range(top_k):
                    log_str = '%s %s,' % (log_str, id2word[nearest[k]])
                print(log_str)
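    # Optional follow-up (not in the original post): keep the trained, L2-normalized
    # embeddings for later reuse. 'text8_embeddings.npy' is an assumed file name.
    final_embedding = sess.run(embedding_norm)
    np.save('text8_embeddings.npy', final_embedding)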