
TensorFlow Exercise 2: Computing Word Similarity with a Word2vec Model

#-*- coding: utf8 -*-
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Basic word2vec example."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import matplotlib
matplotlib.use('Agg')  # use a non-interactive backend so the t-SNE plot can be saved without a display
import collections
import math
import os
import random
from tempfile import gettempdir
import zipfile
import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/'


# pylint: disable=redefined-outer-name
# Download and verify the text8 dataset
def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  local_filename = os.path.join(gettempdir(), filename)
  # Skip the download if the file already exists
  if not os.path.exists(local_filename):
    local_filename, _ = urllib.request.urlretrieve(url + filename,
                                                   local_filename)
  statinfo = os.stat(local_filename)
  # Verify the file size
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    print(statinfo.st_size)
    raise Exception('Failed to verify ' + local_filename +
                    '. Can you get to it with a browser?')
  return local_filename

filename = maybe_download('text8.zip', 31344016)


# Read the data into a list of strings.
# Read the dataset and convert it into the list vocabulary (one word per element)
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words."""
  # Unzip the file
  with zipfile.ZipFile(filename) as f:
    # tf.compat.as_str converts the bytes into a string, which is then split into a list of words
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data

# Read the corpus. vocabulary is the list of words in the raw training text; these words are also used to build the dictionary.
vocabulary = read_data(filename)
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000


def build_dataset(words, n_words):
  """Process raw inputs into a dataset."""
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(n_words - 1))  # most_common returns the n_words-1 most frequent words, already sorted by count in descending order
  dictionary = dict()
  # Assign every dictionary word a sequential index. For example, with a five-word dictionary
  # A, B, C, D, E indexed A(0), B(1), C(2), D(3), E(4), the one-hot vector for A is
  # [1,0,0,0,0] and the one-hot vector for C is [0,0,1,0,0].
  for word, _ in count:
    dictionary[word] = len(dictionary)  # len(dictionary) is the index assigned to word; the most frequent word gets code 0, the next 1, and so on
  data = list()
  unk_count = 0
  for word in words:
    # count (and hence dictionary) only contains the top words, so any word outside the top maps to index 0 ('UNK')
    index = dictionary.get(word, 0)
    if index == 0:  # dictionary['UNK']
      unk_count += 1  # unk_count counts occurrences of out-of-vocabulary words
    data.append(index)
  count[0][1] = unk_count  # total number of occurrences of out-of-vocabulary words
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  # dictionary: word -> code (each word's index)
  # reversed_dictionary: dictionary with keys and values swapped, i.e. code -> word
  # count: occurrence counts of the dictionary words, with all out-of-vocabulary words merged into 'UNK'
  # data: the training text with every word replaced by its code
  return data, count, dictionary, reversed_dictionary

# Filling 4 global variables:
# data - list of codes (integers from 0 to vocabulary_size-1).
#   This is the original text but words are replaced by their codes
# count - list of [word, count] pairs for the most common words (rare words merged into 'UNK')
# dictionary - map of words(strings) to their codes(integers)
# reverse_dictionary - maps codes(integers) to words(strings)
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
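
# A quick illustration of what build_dataset produces, using a hypothetical toy corpus and
# n_words=3 (made up here for illustration; tie-breaking among equal counts follows Counter's ordering):
#   words = ['the', 'cat', 'sat', 'on', 'the', 'mat']
#   data               -> [1, 2, 0, 0, 1, 0]        # 'sat', 'on', 'mat' fall back to UNK (code 0)
#   count              -> [['UNK', 3], ('the', 2), ('cat', 1)]
#   dictionary         -> {'UNK': 0, 'the': 1, 'cat': 2}
#   reverse_dictionary -> {0: 'UNK', 1: 'the', 2: 'cat'}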

data_index = 0  # position of the next word to read from data

# Step 3: Function to generate a training batch for the skip-gram model.
# batch_size: number of training examples generated per batch
# skip_window: number of context words on each side of the center word
# num_skips: number of (center, context) examples drawn from each center word's window;
#            each example's input x is the center word and its label y is a sampled context word
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  # batch_size is the number of examples per batch and num_skips is the number of examples drawn
  # per center word, so batch_size = num_skips * k, where k is the number of center words per batch
  assert batch_size % num_skips == 0 
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]  # window length: the center word plus its context
  buffer = collections.deque(maxlen=span)
  if data_index + span > len(data):  # data holds word codes; the most frequent word has code 0, the next 1, and so on
    data_index = 0
  buffer.extend(data[data_index:data_index + span])  # buffer holds a sliding window of span word codes
  data_index += span

  for i in range(batch_size // num_skips):  # batch_size // num_skips: number of center words used, i.e. how many times the window slides
    context_words = [w for w in range(span) if w != skip_window]  # indices (within buffer) of the center word's context words
    words_to_use = random.sample(context_words, num_skips)  # randomly pick num_skips of those context indices
    # generate the training examples for one center word
    for j, context_word in enumerate(words_to_use):
      # batch holds the same center word for all num_skips examples, while labels holds one of its
      # context words, e.g. center -> context-2, center -> context-1, center -> context+1, center -> context+2
      batch[i * num_skips + j] = buffer[skip_window]  # skip_window is the position of the center word in buffer; buffer[skip_window] is its code
      labels[i * num_skips + j, 0] = buffer[context_word]  # buffer[context_word] is the code of the context word
    if data_index == len(data):
      buffer[:] = data[:span]
      data_index = span
    else:
      buffer.append(data[data_index])  # buffer has a fixed length, so appending pushes the oldest code out
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels
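
# A small worked example of the skip-gram pairing this function implements (a hypothetical
# sentence, not taken from text8): with skip_window=1 and num_skips=2, for the text
# "the quick brown fox", when the center word is "quick" the window is
# ["the", "quick", "brown"], and the two generated examples are
# (input="quick", label="the") and (input="quick", label="brown").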

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

# Step 4: Build and train a skip-gram model.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
num_sampled = 64      # Number of negative (noise) words to sample for each positive example during training.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
# Build a validation set by randomly sampling some of the most frequent words, then check whether
# the words closest to them in the embedding space are semantically related.
# For each validation word we look up its 8 nearest neighbours, which requires computing the
# similarity between the validation word vectors and every word vector.
valid_size = 16     # Random set of words to evaluate similarity on (number of validation words).
valid_window = 100  # Only pick dev samples from the head of the distribution (the 100 most frequent words).
# Randomly choose valid_size validation words from the valid_window most frequent words
valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # valid_size distinct integers drawn from [0, valid_window)


graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])  # 1-D, e.g. [1, 2]
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])  # 2-D, e.g. [[1], [2]]
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)  # codes of the validation words

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    # The embedding matrix is the projection layer that sits between the input and output layers.
    # This line builds the lookup table of word vectors:
    # embedding_size is the dimensionality of the dense vector each word is mapped to.
    # Randomly initialize the vectors of all words: vocabulary size 50,000, vector dimension 128.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))  # shape [vocabulary_size, embedding_size], values uniform in [-1.0, 1.0)
    # Look up the vectors for train_inputs in embeddings, representing each input word as a dense vector
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
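
    # A small illustration of embedding_lookup (hypothetical values): if embeddings were
    #   [[0.1, 0.2],
    #    [0.3, 0.4],
    #    [0.5, 0.6]]
    # then tf.nn.embedding_lookup(embeddings, [2, 0]) would return
    #   [[0.5, 0.6],
    #    [0.1, 0.2]]
    # i.e. it gathers the rows indexed by train_inputs, so embed has shape
    # [batch_size, embedding_size].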

    # Construct the variables for the NCE loss
    # truncated_normal draws from a (truncated) normal distribution with mean 0 and the given stddev.
    # For the output layer y = wx + b, nce_weights plays the role of w and nce_biases of b.
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # Explanation of the meaning of NCE loss:
  #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  # num_sampled is the number of negative samples. The target is effectively a one-hot vector: the
  # single 1 marks the positive class and the 0s are negative classes. Training pushes up the
  # positive class, and instead of updating every negative class, only num_sampled of them are
  # randomly drawn to compute the loss.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))  # num_classes is the number of output classes, i.e. the vocabulary size
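
  # Shapes involved in the NCE loss (for reference): embed is [batch_size, embedding_size],
  # nce_weights is [vocabulary_size, embedding_size] and nce_biases is [vocabulary_size].
  # Roughly speaking, for every (center, context) pair the loss uses the logit of the true context
  # row plus the logits of num_sampled randomly drawn negative rows, and treats them as independent
  # binary classification targets (true context = 1, noise words = 0).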

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))  # L2 norm of each word vector
  normalized_embeddings = embeddings / norm  # normalize every word vector to unit length
  valid_embeddings = tf.nn.embedding_lookup(  # the normalized embedding vectors of the validation words
      normalized_embeddings, valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)  # similarity between each validation word and every word in the vocabulary
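
  # Why a plain matmul gives cosine similarity here: cos(u, v) = (u · v) / (|u| |v|), and the rows
  # of normalized_embeddings all have length 1, so the dot products computed by the matmul are
  # exactly the cosine similarities. similarity has shape [valid_size, vocabulary_size].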

  # Add the op that initializes all TensorFlow variables.
  init = tf.global_variables_initializer()

# Step 5: Begin training.
num_steps = 100001  # maximum number of training steps

with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')

  average_loss = 0
  for step in xrange(num_steps):
    # Generate one batch of training data.
    # Each step, the optimizer updates embeddings (through the rows looked up into embed),
    # nce_weights and nce_biases using the gradients of the NCE loss; this is how the
    # word vectors get adjusted during training.
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val
    
    # Every 2000 steps, compute and print the average loss
    if step % 2000 == 0:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step ', step, ': ', average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    # Every 10000 steps, compute the similarity between the validation words and all words,
    # and print the 8 nearest neighbours of each validation word
    if step % 10000 == 0:
      sim = similarity.eval()  # similarity between each validation word and every word
      # iterate over the validation words
      for i in xrange(valid_size): 
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        # sim[i, :] is the similarity between validation word i and every word. argsort returns the
        # indices that would sort the array in ascending order; because sim is negated, the result is
        # ordered by similarity from high to low. The most similar entry is the word itself, so it is
        # skipped by taking positions 1..top_k.
        nearest = (-sim[i, :]).argsort()[1:top_k + 1] 
        log_str = 'Nearest to %s:' % valid_word
        # collect the top_k most similar words for this validation word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]  # map the word code back to the word
          log_str = '%s %s,' % (log_str, close_word)
        print(log_str)
  final_embeddings = normalized_embeddings.eval()

# Step 6: Visualize the embeddings.


# pylint: disable=missing-docstring
# Function to draw visualization of distance between embeddings.
def plot_with_labels(low_dim_embs, labels, filename):
  assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]  # low_dim_embs holds the word vectors reduced to 2 dimensions
    plt.scatter(x, y)  # plot each word's position as a scatter point
    plt.annotate(label,  # annotate the point with the word itself
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')

  plt.savefig(filename)  # save the figure to a local file

try:
  # pylint: disable=g-import-not-at-top
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt
  # sklearn.manifold.TSNE performs dimensionality reduction; here the original 128-dimensional embeddings are reduced directly to 2 dimensions
  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
  plot_only = 500  # only visualize the 500 most frequent words
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(), 'tsne.png'))
  # The visualization shows that words that end up close together are semantically very similar.
  # To get good results when training Word2Vec, use a large-scale corpus and tune the
  # hyperparameters to find the most suitable values.
except ImportError as ex:
  print('Please install sklearn, matplotlib, and scipy to show embeddings.')
  print(ex)
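
# A minimal sketch (assuming training above completed, so final_embeddings, dictionary and
# reverse_dictionary are in scope) of how one might query the nearest neighbours of an arbitrary
# word after training; nearest_words is a helper name introduced here for illustration only.
def nearest_words(word, k=8):
  idx = dictionary.get(word, 0)  # unknown words fall back to 'UNK'
  sims = np.dot(final_embeddings, final_embeddings[idx])  # rows are unit-length, so this is cosine similarity
  return [reverse_dictionary[i] for i in (-sims).argsort()[1:k + 1]]  # skip the word itself

# For example, nearest_words('five') should return mostly other number words,
# consistent with the log printed during training.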

The output looks like the following:

Average loss at step  92000 :  4.66911555839
Average loss at step  94000 :  4.72632780838
Average loss at step  96000 :  4.70242216027
Average loss at step  98000 :  4.59530179763
Average loss at step  100000 :  4.70903932524
Nearest to may: can, would, will, should, must, could, might, thibetanus,
Nearest to on: in, through, upon, mcghee, under, comprises, agouti, at,
Nearest to than: or, but, and, peacocks, circ, while, absorbing, universality,
Nearest to other: many, various, different, some, ssbn, these, two, are,
Nearest to when: if, however, while, where, but, before, after, though,
Nearest to th: eight, six, supervision, nine, kylix, seven, four, amalthea,
Nearest to he: it, she, they, there, who, but, maus, hoffmann,
Nearest to i: we, pontificia, they, t, j, callithrix, circ, you,
Nearest to by: michelob, kapoor, was, be, with, as, takeover, drying,
Nearest to state: agave, lobbying, rgya, degeneration, ursus, donaghy, kapoor, centralised,
Nearest to up: out, pear, them, calypso, off, down, infectors, altenberg,
Nearest to time: rgya, lauderdale, period, circ, saturday, agouti, progenitors, reactionary,
Nearest to as: michelob, microcebus, circ, ursus, leontopithecus, by, conjugate, kapoor,
Nearest to five: four, seven, three, eight, six, nine, zero, two,
Nearest to are: were, is, have, although, while, be, pulmonic, merger,
Nearest to only: michelob, pontificia, theodosius, but, circ, ursus, stenella, agouti,