TensorFlow Exercise 2: Computing Word Similarity with the Word2vec Model
阿新 · Published: 2019-01-30
# -*- coding: utf-8 -*-
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Basic word2vec example."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import matplotlib
matplotlib.use('Agg')  # jason

import collections
import math
import os
import random
from tempfile import gettempdir
import zipfile
import pdb

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/'


# pylint: disable=redefined-outer-name
# Download and verify the text8 dataset.
def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  local_filename = os.path.join(gettempdir(), filename)
  # Skip the download if the file is already there.
  if not os.path.exists(local_filename):
    local_filename, _ = urllib.request.urlretrieve(url + filename,
                                                   local_filename)
  statinfo = os.stat(local_filename)
  # Check the file size.
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    print(statinfo.st_size)
    raise Exception('Failed to verify ' + local_filename +
                    '. Can you get to it with a browser?')
  return local_filename


pdb.set_trace()
filename = maybe_download('text8.zip', 31344016)


# Read the data into a list of strings.
# Read the dataset into the list `vocabulary` (one word per element).
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words."""
  # Unzip the archive.
  with zipfile.ZipFile(filename) as f:
    # tf.compat.as_str converts the raw bytes into a list of words.
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data


# Read the corpus. `vocabulary` is the list of words of the training text
# (i.e. the sentences themselves); the same words are also used to build the dictionary.
vocabulary = read_data(filename)
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000


def build_dataset(words, n_words):
  """Process raw inputs into a dataset."""
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(n_words - 1))
  # most_common returns the n_words - 1 most frequent words, already sorted by
  # descending frequency.
  dictionary = dict()
  # Give every dictionary word an index. For example, with a five-word
  # dictionary A, B, C, D, E numbered 0..4, the one-hot vector for A is
  # [1, 0, 0, 0, 0] and the one-hot vector for C is [0, 0, 1, 0, 0].
  for word, _ in count:
    dictionary[word] = len(dictionary)
    # len(dictionary) is the index of the current word: the most frequent word
    # gets code 0, the next gets 1, and so on.
  data = list()
  unk_count = 0
  for word in words:
    # count (and therefore dictionary) only holds the top words, so any word
    # outside the top words falls back to index 0.
    index = dictionary.get(word, 0)
    if index == 0:  # dictionary['UNK']
      unk_count += 1  # unk_count is the number of non-top words seen
    data.append(index)
  count[0][1] = unk_count  # total number of occurrences of non-top words
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  # dictionary: word -> code (each word's index)
  # reversed_dictionary: dictionary with keys and values swapped, code -> word
  # count: occurrence counts of the dictionary words, with all non-top words merged into UNK
  # data: the training text with every word replaced by its code
  return data, count, dictionary, reversed_dictionary


# Filling 4 global variables:
# data - list of codes (integers from 0 to vocabulary_size-1).
#   This is the original text but words are replaced by their codes
# count - map of words(strings) to count of occurrences
# dictionary - map of words(strings) to their codes(integers)
# reverse_dictionary - maps codes(integers) to words(strings)
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

data_index = 0  # current position in the word sequence


# Step 3: Function to generate a training batch for the skip-gram model.
# batch_size: number of training examples generated per batch
# skip_window: number of words on each side of the center word, i.e. how many
#              words to its left (or right) belong to its window
# num_skips: number of examples drawn from one center word's window; each
#            example's x is the center word and y is a sampled context word
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  # batch_size is the number of examples per batch and num_skips the number of
  # examples drawn per center word, so batch_size == num_skips * k, where k is
  # the number of center words used.
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  # span is the length of the whole window: the center word plus its context words.
  buffer = collections.deque(maxlen=span)
  if data_index + span > len(data):
    # data holds word codes; the most frequent word has code 0, the next 1, and so on.
    data_index = 0
  buffer.extend(data[data_index:data_index + span])
  # buffer holds the span word codes of the current window.
  data_index += span
  for i in range(batch_size // num_skips):
    # batch_size // num_skips is the number of center words, i.e. how many
    # times the window slides.
    context_words = [w for w in range(span) if w != skip_window]
    # context_words: window positions of the context words of one center word.
    words_to_use = random.sample(context_words, num_skips)
    # Randomly pick num_skips context positions.
    # Generate the training examples for this center word.
    for j, context_word in enumerate(words_to_use):
      # Within one window every value in batch is the same center word, and
      # labels holds the matching context words; the pairs look like
      # center -> context-2, center -> context-1, center -> context+1, center -> context+2.
      batch[i * num_skips + j] = buffer[skip_window]
      # skip_window is the position of the center word, so buffer[skip_window]
      # is the center word's code.
      labels[i * num_skips + j, 0] = buffer[context_word]
      # buffer[context_word] is the context word's code.
    if data_index == len(data):
      # Reached the end of the data: restart the window at the beginning.
      # (The original snippet used `buffer[:] = data[:span]`, but a deque does
      # not support slice assignment; extend() has the intended effect.)
      buffer.extend(data[:span])
      data_index = span
    else:
      buffer.append(data[data_index])
      # buffer has a fixed maximum length, so this pushes the oldest word out.
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels


batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
        reverse_dictionary[labels[i, 0]])

# Step 4: Build and train a skip-gram model.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
num_sampled = 64  # Number of negative examples to sample, i.e. the number of
                  # noise words used as negatives during training.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
# In other words: sample a few of the most frequent words and check whether the
# words closest to them in the embedding space are actually related. For each
# validation word we will look for its 8 nearest words, which requires computing
# the similarity between every word vector and the validation word's vector.
valid_size = 16  # Random set of words to evaluate similarity on
                 # (number of validation words to sample).
valid_window = 100  # Only pick dev samples in the head of the distribution.
# Validation words are drawn only from the valid_window (100) most frequent
# words: valid_size of them are chosen at random.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
# np.random.choice picks valid_size distinct integers from the range [0, valid_window).

graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])  # 1-D, e.g. [1, 2]
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])  # 2-D, e.g. [[1], [2]]
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)  # the validation set

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    # The embedding is the middle dimension of the model, the projection layer
    # sitting between the input and the output. This line builds the embedding
    # table; embedding_size is the dimensionality of the dense vector each word
    # is mapped to. The word vectors of all words are initialised at random:
    # vocabulary size 50,000, vector dimension 128.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    # [vocabulary_size, embedding_size] is the shape; -1.0 and 1.0 are the
    # minimum and maximum of the uniform initialiser.
    # Look up the vectors of the training inputs train_inputs in the embedding
    # table, so that every input is represented as a vector.
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    # truncated_normal draws from a normal distribution with mean 0 (the
    # default) and the given stddev. In w*x + b, nce_weights plays the role of
    # w (the weights between the projection layer and the output layer) and
    # nce_biases plays the role of b.
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # Explanation of the meaning of NCE loss:
  #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  # num_sampled is the number of negative samples. The label can be seen as a
  # one-hot vector: the single 1 marks the positive example and the 0s are the
  # negative examples. Training mainly pushes the score of the positive example
  # up; for the negatives it is enough to randomly sample a subset when
  # computing the loss.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))
  # num_classes is the number of classes, i.e. the number of words in the dictionary.

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  # norm is the L2 length of each word vector.
  normalized_embeddings = embeddings / norm
  # normalized_embeddings are the word vectors rescaled to unit length.
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  # valid_embeddings are the (normalised) embedding vectors of the validation words.
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)
  # This computes the similarity between the validation vectors and all word vectors.

  # Add variable initializer.
  # This only creates the initialisation op; it is run later with init.run().
  init = tf.global_variables_initializer()

# Step 5: Begin training.
num_steps = 100001  # maximum number of training steps

with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')

  average_loss = 0
  for step in xrange(num_steps):
    # Generate the training data for this batch. Each optimizer step updates
    # the trainable variables -- embeddings, nce_weights and nce_biases -- via
    # the gradients of the NCE loss; normalized_embeddings is just a view
    # derived from the updated embeddings.
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()).
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val

    # Every 2000 steps, compute and print the average loss.
    if step % 2000 == 0:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step ', step, ': ', average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    # Every 10000 steps, compute the similarity between the validation words
    # and all words, and print the 8 words closest to each validation word.
    if step % 10000 == 0:
      sim = similarity.eval()
      # sim holds the similarity between every validation word and every word.
      # Iterate over the validation words.
      for i in xrange(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        # sim[i, :] is the similarity between the i-th validation word and all
        # words. argsort returns the indices that would sort the array in
        # ascending order; because sim is negated, (-sim[i, :]).argsort() sorts
        # by descending similarity. The most similar word is the validation
        # word itself, so it is skipped and the next top_k codes are kept.
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log_str = 'Nearest to %s:' % valid_word
        # Collect the top_k most similar words for this validation word.
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]  # look up the word by its code
          log_str = '%s %s,' % (log_str, close_word)
        print(log_str)
  final_embeddings = normalized_embeddings.eval()

# Step 6: Visualize the embeddings.


# pylint: disable=missing-docstring
# Function to draw visualization of distance between embeddings.
def plot_with_labels(low_dim_embs, labels, filename):
  assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    # low_dim_embs are the word vectors reduced to two dimensions.
    plt.scatter(x, y)  # plot the word's position as a point
    plt.annotate(label,  # draw the word itself next to the point
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')

  plt.savefig(filename)  # save the figure to a local file


try:
  # pylint: disable=g-import-not-at-top
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt

  # sklearn.manifold.TSNE performs the dimensionality reduction; here the
  # original 128-dimensional embeddings are reduced directly to 2 dimensions.
  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
  plot_only = 500  # visualise only the 500 most frequent words
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(), 'tsne.png'))
  # In the resulting plot, words that end up close to each other tend to be
  # semantically similar. To train a good Word2Vec model, use a large corpus
  # and tune the hyperparameters to find the most suitable values.

except ImportError as ex:
  print('Please install sklearn, matplotlib, and scipy to show embeddings.')
  print(ex)
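To make the pairing rule inside generate_batch easier to see, here is a small self-contained sketch (a simplified re-implementation written only for illustration, not part of the original script) that produces (center word, context word) pairs from a toy sequence of word codes, using the same skip_window=1 and num_skips=2 settings as the demo call in Step 3:

import collections
import random

import numpy as np


def toy_skipgram_batch(data, batch_size=8, num_skips=2, skip_window=1):
  """Simplified version of the skip-gram pairing logic, for illustration only."""
  assert batch_size % num_skips == 0 and num_skips <= 2 * skip_window
  span = 2 * skip_window + 1                        # context + center + context
  batch = np.zeros(batch_size, dtype=np.int32)
  labels = np.zeros((batch_size, 1), dtype=np.int32)
  buffer = collections.deque(data[:span], maxlen=span)
  data_index = span
  for i in range(batch_size // num_skips):          # one iteration per center word
    context_positions = [w for w in range(span) if w != skip_window]
    for j, pos in enumerate(random.sample(context_positions, num_skips)):
      batch[i * num_skips + j] = buffer[skip_window]   # x = center word
      labels[i * num_skips + j, 0] = buffer[pos]       # y = one context word
    buffer.append(data[data_index % len(data)])        # slide the window by one word
    data_index += 1
  return batch, labels


# Toy "text" made of word codes 0..9.
b, l = toy_skipgram_batch(list(range(10)))
for x, y in zip(b, l[:, 0]):
  print(x, '->', y)   # e.g. 1 -> 0, 1 -> 2, 2 -> 1, 2 -> 3, ...

Each code is paired with its left and right neighbour, which is exactly the "center -> context" pattern printed by the demo call above.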
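The objective itself is handled by tf.nn.nce_loss. As a rough intuition only (this is a simplified negative-sampling sketch; the real NCE loss additionally corrects the logits for the noise distribution and samples negatives for the whole batch at once), the loss for a single (center, context) pair treats the true context word as a positive logistic example and num_sampled random "noise" words as negatives:

import numpy as np


def sigmoid(x):
  return 1.0 / (1.0 + np.exp(-x))


def sampled_pair_loss(center_vec, context_id, weights, biases, num_sampled, rng):
  """Simplified negative-sampling loss for one (center, context) pair.

  center_vec : embedding of the center word, shape [embedding_size]
  context_id : code of the true context word
  weights    : output weights, shape [vocabulary_size, embedding_size]  (like nce_weights)
  biases     : output biases, shape [vocabulary_size]                   (like nce_biases)
  """
  # Positive term: push the score of the true context word up.
  pos_logit = weights[context_id] @ center_vec + biases[context_id]
  loss = -np.log(sigmoid(pos_logit))
  # Negative terms: push the scores of a few random noise words down.
  # (A real implementation corrects for accidentally sampling the true word.)
  for neg_id in rng.integers(0, len(weights), size=num_sampled):
    neg_logit = weights[neg_id] @ center_vec + biases[neg_id]
    loss += -np.log(sigmoid(-neg_logit))
  return loss


rng = np.random.default_rng(0)
V, D = 50, 8                      # tiny vocabulary and embedding size, for illustration
emb = rng.normal(size=(V, D))
w, b = rng.normal(size=(V, D)), np.zeros(V)
print(sampled_pair_loss(emb[3], context_id=7, weights=w, biases=b, num_sampled=5, rng=rng))

Minimising this pushes the score of true (center, context) pairs up and the score of random pairs down, which is what shapes the embedding space.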
Running the script produces output like the following:
Average loss at step 92000 : 4.66911555839
Average loss at step 94000 : 4.72632780838
Average loss at step 96000 : 4.70242216027
Average loss at step 98000 : 4.59530179763
Average loss at step 100000 : 4.70903932524
Nearest to may: can, would, will, should, must, could, might, thibetanus,
Nearest to on: in, through, upon, mcghee, under, comprises, agouti, at,
Nearest to than: or, but, and, peacocks, circ, while, absorbing, universality,
Nearest to other: many, various, different, some, ssbn, these, two, are,
Nearest to when: if, however, while, where, but, before, after, though,
Nearest to th: eight, six, supervision, nine, kylix, seven, four, amalthea,
Nearest to he: it, she, they, there, who, but, maus, hoffmann,
Nearest to i: we, pontificia, they, t, j, callithrix, circ, you,
Nearest to by: michelob, kapoor, was, be, with, as, takeover, drying,
Nearest to state: agave, lobbying, rgya, degeneration, ursus, donaghy, kapoor, centralised,
Nearest to up: out, pear, them, calypso, off, down, infectors, altenberg,
Nearest to time: rgya, lauderdale, period, circ, saturday, agouti, progenitors, reactionary,
Nearest to as: michelob, microcebus, circ, ursus, leontopithecus, by, conjugate, kapoor,
Nearest to five: four, seven, three, eight, six, nine, zero, two,
Nearest to are: were, is, have, although, while, be, pulmonic, merger,
Nearest to only: michelob, pontificia, theodosius, but, circ, ursus, stenella, agouti,
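Because final_embeddings is already normalised to unit length, cosine similarity reduces to a dot product, so any "Nearest to ..." line can be reproduced for an arbitrary word. Here is a small sketch, assuming the script above has been run so that final_embeddings, dictionary and reverse_dictionary are available (the query word 'three' is just an example):

import numpy as np


def nearest_words(query, top_k=8):
  """Print the top_k words whose embeddings are closest to the query word's."""
  idx = dictionary.get(query, 0)                    # code 0 is UNK for unknown words
  sims = final_embeddings @ final_embeddings[idx]   # cosine similarity (unit vectors)
  nearest = (-sims).argsort()[1:top_k + 1]          # skip the query word itself
  print('Nearest to %s:' % query,
        ', '.join(reverse_dictionary[i] for i in nearest))


nearest_words('three')   # expected to print other number words, as in the output above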