
TensorFlow Exercise 2: Computing Word Similarity with a Word2vec Model

#-*- coding: utf8 -*-
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Basic word2vec example."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import matplotlib
matplotlib.use('Agg')  # use a non-interactive backend so the t-SNE plot can be saved without a display
import collections
import math
import os
import random
from tempfile import gettempdir
import zipfile
import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/'


# pylint: disable=redefined-outer-name
# Download and verify the text8 dataset
def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  local_filename = os.path.join(gettempdir(), filename)
  # Skip the download if the file already exists
  if not os.path.exists(local_filename):
    local_filename, _ = urllib.request.urlretrieve(url + filename,
                                                   local_filename)
  statinfo = os.stat(local_filename)
  # Verify the file size
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    print(statinfo.st_size)
    raise Exception('Failed to verify ' + local_filename +
                    '. Can you get to it with a browser?')
  return local_filename

filename = maybe_download('text8.zip', 31344016)


# Read the data into a list of strings.
# Read the dataset and convert it into the list vocabulary (one word per element)
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words."""
  # Unzip the file
  with zipfile.ZipFile(filename) as f:
    # tf.compat.as_str converts the bytes into a string, which is then split into a list of words
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data

# Read the corpus. vocabulary is the list of words in the raw training text; these words are also used to build the dictionary.
vocabulary = read_data(filename)
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000


def build_dataset(words, n_words):
  """Process raw inputs into a dataset."""
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(n_words - 1))  # most_common returns the n_words-1 most frequent words, already sorted by count in descending order
  dictionary = dict()
  # Assign every dictionary word a sequential index. For example, with a five-word dictionary
  # A, B, C, D, E indexed A(0), B(1), C(2), D(3), E(4), the one-hot vector for A is
  # [1,0,0,0,0] and the one-hot vector for C is [0,0,1,0,0].
  for word, _ in count:
    dictionary[word] = len(dictionary)  # len(dictionary) is the index assigned to word; the most frequent word gets code 0, the next 1, and so on
  data = list()
  unk_count = 0
  for word in words:
    # count (and hence dictionary) only contains the top words, so any word outside the top maps to index 0 ('UNK')
    index = dictionary.get(word, 0)
    if index == 0:  # dictionary['UNK']
      unk_count += 1  # unk_count counts occurrences of out-of-vocabulary words
    data.append(index)
  count[0][1] = unk_count  # total number of occurrences of out-of-vocabulary words
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  # dictionary: word -> code (each word's index)
  # reversed_dictionary: dictionary with keys and values swapped, i.e. code -> word
  # count: occurrence counts of the dictionary words, with all out-of-vocabulary words merged into 'UNK'
  # data: the training text with every word replaced by its code
  return data, count, dictionary, reversed_dictionary

# Filling 4 global variables:
# data - list of codes (integers from 0 to vocabulary_size-1).
#   This is the original text but words are replaced by their codes
# count - list of [word, count] pairs for the most common words (rare words merged into 'UNK')
# dictionary - map of words(strings) to their codes(integers)
# reverse_dictionary - maps codes(integers) to words(strings)
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
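
# A quick illustration of what build_dataset produces, using a hypothetical toy corpus and
# n_words=3 (made up here for illustration; tie-breaking among equal counts follows Counter's ordering):
#   words = ['the', 'cat', 'sat', 'on', 'the', 'mat']
#   data               -> [1, 2, 0, 0, 1, 0]        # 'sat', 'on', 'mat' fall back to UNK (code 0)
#   count              -> [['UNK', 3], ('the', 2), ('cat', 1)]
#   dictionary         -> {'UNK': 0, 'the': 1, 'cat': 2}
#   reverse_dictionary -> {0: 'UNK', 1: 'the', 2: 'cat'}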

data_index = 0  # position of the next word to read from data

# Step 3: Function to generate a training batch for the skip-gram model.
# batch_size: number of training examples generated per batch
# skip_window: number of context words on each side of the center word
# num_skips: number of (center, context) examples drawn from each center word's window;
#            each example's input x is the center word and its label y is a sampled context word
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  # batch_size is the number of examples per batch and num_skips is the number of examples drawn
  # per center word, so batch_size = num_skips * k, where k is the number of center words per batch
  assert batch_size % num_skips == 0 
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]  # window length: the center word plus its context
  buffer = collections.deque(maxlen=span)
  if data_index + span > len(data):  # data holds word codes; the most frequent word has code 0, the next 1, and so on
    data_index = 0
  buffer.extend(data[data_index:data_index + span])  # buffer holds a sliding window of span word codes
  data_index += span

  for i in range(batch_size // num_skips):  # batch_size // num_skips: number of center words used, i.e. how many times the window slides
    context_words = [w for w in range(span) if w != skip_window]  # indices (within buffer) of the center word's context words
    words_to_use = random.sample(context_words, num_skips)  # randomly pick num_skips of those context indices
    # generate the training examples for one center word
    for j, context_word in enumerate(words_to_use):
      # batch holds the same center word for all num_skips examples, while labels holds one of its
      # context words, e.g. center -> context-2, center -> context-1, center -> context+1, center -> context+2
      batch[i * num_skips + j] = buffer[skip_window]  # skip_window is the position of the center word in buffer; buffer[skip_window] is its code
      labels[i * num_skips + j, 0] = buffer[context_word]  # buffer[context_word] is the code of the context word
    if data_index == len(data):
      buffer[:] = data[:span]
      data_index = span
    else:
      buffer.append(data[data_index])  # buffer has a fixed length, so appending pushes the oldest code out
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels
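
# A small worked example of the skip-gram pairing this function implements (a hypothetical
# sentence, not taken from text8): with skip_window=1 and num_skips=2, for the text
# "the quick brown fox", when the center word is "quick" the window is
# ["the", "quick", "brown"], and the two generated examples are
# (input="quick", label="the") and (input="quick", label="brown").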

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

# Step 4: Build and train a skip-gram model.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
num_sampled = 64      # Number of negative (noise) words to sample for each positive example during training.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
# Build a validation set by randomly sampling some of the most frequent words, then check whether
# the words closest to them in the embedding space are semantically related.
# For each validation word we look up its 8 nearest neighbours, which requires computing the
# similarity between the validation word vectors and every word vector.
valid_size = 16     # Random set of words to evaluate similarity on (number of validation words).
valid_window = 100  # Only pick dev samples from the head of the distribution (the 100 most frequent words).
# Randomly choose valid_size validation words from the valid_window most frequent words
valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # valid_size distinct integers drawn from [0, valid_window)


graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])  # 1-D, e.g. [1, 2]
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])  # 2-D, e.g. [[1], [2]]
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)  # codes of the validation words

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    # The embedding matrix is the projection layer that sits between the input and output layers.
    # This line builds the lookup table of word vectors:
    # embedding_size is the dimensionality of the dense vector each word is mapped to.
    # Randomly initialize the vectors of all words: vocabulary size 50,000, vector dimension 128.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))  # shape [vocabulary_size, embedding_size], values uniform in [-1.0, 1.0)
    # Look up the vectors for train_inputs in embeddings, representing each input word as a dense vector
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
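
    # A small illustration of embedding_lookup (hypothetical values): if embeddings were
    #   [[0.1, 0.2],
    #    [0.3, 0.4],
    #    [0.5, 0.6]]
    # then tf.nn.embedding_lookup(embeddings, [2, 0]) would return
    #   [[0.5, 0.6],
    #    [0.1, 0.2]]
    # i.e. it gathers the rows indexed by train_inputs, so embed has shape
    # [batch_size, embedding_size].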

    # Construct the variables for the NCE loss
    # truncated_normal draws from a (truncated) normal distribution with mean 0 and the given stddev.
    # For the output layer y = wx + b, nce_weights plays the role of w and nce_biases of b.
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # Explanation of the meaning of NCE loss:
  #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  # num_sampled is the number of negative samples. The target is effectively a one-hot vector: the
  # single 1 marks the positive class and the 0s are negative classes. Training pushes up the
  # positive class, and instead of updating every negative class, only num_sampled of them are
  # randomly drawn to compute the loss.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))  # num_classes is the number of output classes, i.e. the vocabulary size
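
  # Shapes involved in the NCE loss (for reference): embed is [batch_size, embedding_size],
  # nce_weights is [vocabulary_size, embedding_size] and nce_biases is [vocabulary_size].
  # Roughly speaking, for every (center, context) pair the loss uses the logit of the true context
  # row plus the logits of num_sampled randomly drawn negative rows, and treats them as independent
  # binary classification targets (true context = 1, noise words = 0).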

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))  # L2 norm of each word vector
  normalized_embeddings = embeddings / norm  # normalize every word vector to unit length
  valid_embeddings = tf.nn.embedding_lookup(  # the normalized embedding vectors of the validation words
      normalized_embeddings, valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)  # similarity between each validation word and every word in the vocabulary
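
  # Why a plain matmul gives cosine similarity here: cos(u, v) = (u · v) / (|u| |v|), and the rows
  # of normalized_embeddings all have length 1, so the dot products computed by the matmul are
  # exactly the cosine similarities. similarity has shape [valid_size, vocabulary_size].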

  # Add the op that initializes all TensorFlow variables.
  init = tf.global_variables_initializer()

# Step 5: Begin training.
num_steps = 100001  # maximum number of training steps

with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')

  average_loss = 0
  for step in xrange(num_steps):
    # Generate one batch of training data.
    # Each step, the optimizer updates embeddings (through the rows looked up into embed),
    # nce_weights and nce_biases using the gradients of the NCE loss; this is how the
    # word vectors get adjusted during training.
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val
    
    # Every 2000 steps, compute and print the average loss
    if step % 2000 == 0:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step ', step, ': ', average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    # Every 10000 steps, compute the similarity between the validation words and all words,
    # and print the 8 nearest neighbours of each validation word
    if step % 10000 == 0:
      sim = similarity.eval()  # similarity between each validation word and every word
      # iterate over the validation words
      for i in xrange(valid_size): 
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        # sim[i, :] is the similarity between validation word i and every word. argsort returns the
        # indices that would sort the array in ascending order; because sim is negated, the result is
        # ordered by similarity from high to low. The most similar entry is the word itself, so it is
        # skipped by taking positions 1..top_k.
        nearest = (-sim[i, :]).argsort()[1:top_k + 1] 
        log_str = 'Nearest to %s:' % valid_word
        # collect the top_k most similar words for this validation word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]  # map the word code back to the word
          log_str = '%s %s,' % (log_str, close_word)
        print(log_str)
  final_embeddings = normalized_embeddings.eval()

# Step 6: Visualize the embeddings.


# pylint: disable=missing-docstring
# Function to draw visualization of distance between embeddings.
def plot_with_labels(low_dim_embs, labels, filename):
  assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]  # low_dim_embs holds the word vectors reduced to 2 dimensions
    plt.scatter(x, y)  # plot each word's position as a scatter point
    plt.annotate(label,  # annotate the point with the word itself
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')

  plt.savefig(filename)  # save the figure to a local file

try:
  # pylint: disable=g-import-not-at-top
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt
  # sklearn.manifold.TSNE performs dimensionality reduction; here the original 128-dimensional embeddings are reduced directly to 2 dimensions
  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
  plot_only = 500  # only visualize the 500 most frequent words
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(), 'tsne.png'))
  # The visualization shows that words that end up close together are semantically very similar.
  # To get good results when training Word2Vec, use a large-scale corpus and tune the
  # hyperparameters to find the most suitable values.
except ImportError as ex:
  print('Please install sklearn, matplotlib, and scipy to show embeddings.')
  print(ex)
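
# A minimal sketch (assuming training above completed, so final_embeddings, dictionary and
# reverse_dictionary are in scope) of how one might query the nearest neighbours of an arbitrary
# word after training; nearest_words is a helper name introduced here for illustration only.
def nearest_words(word, k=8):
  idx = dictionary.get(word, 0)  # unknown words fall back to 'UNK'
  sims = np.dot(final_embeddings, final_embeddings[idx])  # rows are unit-length, so this is cosine similarity
  return [reverse_dictionary[i] for i in (-sims).argsort()[1:k + 1]]  # skip the word itself

# For example, nearest_words('five') should return mostly other number words,
# consistent with the log printed during training.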

The output looks like the following:

Average loss at step  92000 :  4.66911555839
Average loss at step  94000 :  4.72632780838
Average loss at step  96000 :  4.70242216027
Average loss at step  98000 :  4.59530179763
Average loss at step  100000 :  4.70903932524
Nearest to may: can, would, will, should, must, could, might, thibetanus,
Nearest to on: in, through, upon, mcghee, under, comprises, agouti, at,
Nearest to than: or, but, and, peacocks, circ, while, absorbing, universality,
Nearest to other: many, various, different, some, ssbn, these, two, are,
Nearest to when: if, however, while, where, but, before, after, though,
Nearest to th: eight, six, supervision, nine, kylix, seven, four, amalthea,
Nearest to he: it, she, they, there, who, but, maus, hoffmann,
Nearest to i: we, pontificia, they, t, j, callithrix, circ, you,
Nearest to by: michelob, kapoor, was, be, with, as, takeover, drying,
Nearest to state: agave, lobbying, rgya, degeneration, ursus, donaghy, kapoor, centralised,
Nearest to up: out, pear, them, calypso, off, down, infectors, altenberg,
Nearest to time: rgya, lauderdale, period, circ, saturday, agouti, progenitors, reactionary,
Nearest to as: michelob, microcebus, circ, ursus, leontopithecus, by, conjugate, kapoor,
Nearest to five: four, seven, three, eight, six, nine, zero, two,
Nearest to are: were, is, have, although, while, be, pulmonic, merger,
Nearest to only: michelob, pontificia, theodosius, but, circ, ursus, stenella, agouti,