TensorFlow in Practice (18): Word Vectors and Training a Word-Vector Model on a Wikipedia Corpus
Word-vector embeddings require processing large text corpora efficiently; this is what word2vec does. The simplest approach feeds words into the learning system as one-hot encodings: a vector as long as the vocabulary, with a 1 at the word's position and 0 everywhere else. Such vectors are very high-dimensional and cannot capture semantic relations between different words. Representing a word by its co-occurrences addresses semantic relatedness: traverse a large corpus, count the words that appear within a certain distance of each word, and represent each word by the normalized counts of its neighbours. Words used in similar contexts then receive similar representations. The occurrence vectors can be reduced with PCA or a similar method to obtain a denser representation. This works well, but it requires tracking the full co-occurrence matrix, whose width and height both equal the vocabulary size.

In 2013, Mikolov, Tomas, et al. proposed a way to compute word representations from context: "Efficient estimation of word representations in vector space" (arXiv preprint arXiv:1301.3781, 2013). Their skip-gram model starts from random representations and uses a simple classifier that predicts a context word from the current word; the error is propagated through both the classifier weights and the word representations, and both are adjusted to reduce the prediction error. Trained over a large corpus, the word vectors come to approximate the compressed co-occurrence vectors.
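To make the two baseline representations concrete, the following minimal NumPy sketch contrasts one-hot vectors with co-occurrence counts; the tiny corpus and the window of one word to each side are illustrative assumptions, not part of the original code.

import numpy as np

# Tiny illustrative corpus; any tokenized text would do.
corpus = 'the cat sat on the mat the dog sat on the rug'.split()
vocabulary = sorted(set(corpus))
index = {word: i for i, word in enumerate(vocabulary)}

# One-hot encoding: one vocabulary-sized row per token with a single 1.
# It is high-dimensional and carries no notion of semantic similarity.
one_hot = np.zeros((len(corpus), len(vocabulary)))
one_hot[np.arange(len(corpus)), [index[w] for w in corpus]] = 1

# Co-occurrence counts within a window of one word to each side.
# Rows of words used in similar contexts become similar, and PCA could
# compress them into a denser representation.
cooccurrence = np.zeros((len(vocabulary), len(vocabulary)))
for i, word in enumerate(corpus):
    for j in range(max(0, i - 1), min(len(corpus), i + 2)):
        if j != i:
            cooccurrence[index[word], index[corpus[j]]] += 1

print(dict(zip(vocabulary, cooccurrence[index['sat']])))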
The pipeline downloads the dump file and extracts the words from its pages, counts word occurrences to build a vocabulary of common words, and then encodes the extracted pages with that vocabulary. The file is read line by line and results are written to disk immediately, and checkpoints are stored between the individual steps so that a crash does not force the program to start over.
__iter__ walks over the pages as lists of word indices. encode returns the vocabulary index of a word string, and decode returns the word string for a vocabulary index. _read_pages extracts the words from the Wikipedia dump file (compressed XML) and stores them in the pages file, one page per line with words separated by spaces. The bz2 module's open function reads the file, and the intermediate results are compressed as well. A regular expression captures any consecutive sequence of letters as well as individual special characters. _build_vocabulary counts the words in the pages file and writes the most frequent ones to the vocabulary file. One-hot encoding requires a vocabulary, and each word is encoded by its index in it. To drop misspellings and extremely rare words, the vocabulary contains only the vocabulary_size - 1 most common words; every word not in the vocabulary is replaced by the <unk> token.
Training examples are formed on the fly and organized into large batches so that the classifier never has to hold all of them in memory. The skip-gram model predicts the context words of the current word: while traversing the text, the current word serves as input and each surrounding word as a target, yielding one training example per pair. With a context size of R, every word generates 2R examples, R words to its left and R to its right. Since nearby words matter more for the semantic context, fewer training examples should be created for distant context words; to achieve this, the context size for each word is drawn at random from [1, D = 10]. Training pairs are formed according to the skip-gram model, and NumPy arrays group the resulting numerical stream into batches of data.
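The training script later in this article imports two helper generators, skipgrams and batched, whose implementations are not listed here. A minimal sketch that is consistent with the description above (signatures assumed from those imports) could look as follows.

import random
import numpy as np

def skipgrams(pages, max_context):
    """Yield (current word, context word) index pairs from encoded pages."""
    for words in pages:
        for index, current in enumerate(words):
            # Draw the context size per word so that distant context words
            # produce fewer training examples than nearby ones.
            context = random.randint(1, max_context)
            for target in words[max(0, index - context):index]:
                yield current, target
            for target in words[index + 1:index + 1 + context]:
                yield current, target

def batched(iterator, batch_size):
    """Group the stream of pairs into NumPy batches of equal size."""
    iterator = iter(iterator)
    while True:
        data = np.zeros(batch_size, dtype=np.int32)
        target = np.zeros(batch_size, dtype=np.int32)
        try:
            for index in range(batch_size):
                data[index], target[index] = next(iterator)
        except StopIteration:
            return
        yield data, target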
Initially, every word is represented by a random vector. A classifier predicts the context word from the current word's intermediate representation; the error is propagated back to fine-tune both the classifier weights and the input word representation. The model is optimized with MomentumOptimizer, which is not especially clever but very efficient.
The classifier is the core of the model. The noise-contrastive estimation (NCE) loss performs very well here; it models a softmax classifier. tf.nn.nce_loss draws new random vectors as negative (contrastive) examples and thereby approximates the softmax classifier.
import bz2
import collections
import os
import re
from lxml import etree
from helpers import download
class Wikipedia:

    TOKEN_REGEX = re.compile(r'[A-Za-z]+|[!?.:,()]')

    def __init__(self, url, cache_dir, vocabulary_size=10000):
        self._cache_dir = os.path.expanduser(cache_dir)
        self._pages_path = os.path.join(self._cache_dir, 'pages.bz2')
        self._vocabulary_path = os.path.join(self._cache_dir, 'vocabulary.bz2')
        if not os.path.isfile(self._pages_path):
            print('Read pages')
            self._read_pages(url)
        if not os.path.isfile(self._vocabulary_path):
            print('Build vocabulary')
            self._build_vocabulary(vocabulary_size)
        with bz2.open(self._vocabulary_path, 'rt') as vocabulary:
            print('Read vocabulary')
            self._vocabulary = [x.strip() for x in vocabulary]
        self._indices = {x: i for i, x in enumerate(self._vocabulary)}

    def __iter__(self):
        with bz2.open(self._pages_path, 'rt') as pages:
            for page in pages:
                words = page.strip().split()
                words = [self.encode(x) for x in words]
                yield words

    @property
    def vocabulary_size(self):
        return len(self._vocabulary)

    def encode(self, word):
        return self._indices.get(word, 0)

    def decode(self, index):
        return self._vocabulary[index]

    def _read_pages(self, url):
        wikipedia_path = download(url, self._cache_dir)
        with bz2.open(wikipedia_path) as wikipedia, \
                bz2.open(self._pages_path, 'wt') as pages:
            for _, element in etree.iterparse(wikipedia, tag='{*}page'):
                if element.find('./{*}redirect') is not None:
                    continue
                page = element.findtext('./{*}revision/{*}text')
                words = self._tokenize(page)
                pages.write(' '.join(words) + '\n')
                element.clear()

    def _build_vocabulary(self, vocabulary_size):
        counter = collections.Counter()
        with bz2.open(self._pages_path, 'rt') as pages:
            for page in pages:
                words = page.strip().split()
                counter.update(words)
        common = counter.most_common(vocabulary_size - 1)
        common = ['<unk>'] + [x[0] for x in common]
        with bz2.open(self._vocabulary_path, 'wt') as vocabulary:
            for word in common:
                vocabulary.write(word + '\n')

    @classmethod
    def _tokenize(cls, page):
        words = cls.TOKEN_REGEX.findall(page)
        words = [x.lower() for x in words]
        return words
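_read_pages relies on helpers.download, which is also not reproduced in this article. A minimal sketch, assuming it merely caches the dump file in cache_dir and returns the local path:

import os
import urllib.request

def download(url, cache_dir):
    """Download url into cache_dir once and return the local file path."""
    os.makedirs(cache_dir, exist_ok=True)
    path = os.path.join(cache_dir, os.path.basename(url))
    if not os.path.isfile(path):
        urllib.request.urlretrieve(url, path)
    return path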
import tensorflow as tf
import numpy as np
from helpers import lazy_property
class EmbeddingModel:

    def __init__(self, data, target, params):
        self.data = data
        self.target = target
        self.params = params
        self.embeddings
        self.cost
        self.optimize

    @lazy_property
    def embeddings(self):
        initial = tf.random_uniform(
            [self.params.vocabulary_size, self.params.embedding_size],
            -1.0, 1.0)
        return tf.Variable(initial)

    @lazy_property
    def optimize(self):
        optimizer = tf.train.MomentumOptimizer(
            self.params.learning_rate, self.params.momentum)
        return optimizer.minimize(self.cost)

    @lazy_property
    def cost(self):
        embedded = tf.nn.embedding_lookup(self.embeddings, self.data)
        weight = tf.Variable(tf.truncated_normal(
            [self.params.vocabulary_size, self.params.embedding_size],
            stddev=1.0 / self.params.embedding_size ** 0.5))
        bias = tf.Variable(tf.zeros([self.params.vocabulary_size]))
        target = tf.expand_dims(self.target, 1)
        return tf.reduce_mean(tf.nn.nce_loss(
            weight, bias, embedded, target,
            self.params.contrastive_examples,
            self.params.vocabulary_size))
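The lazy_property decorator imported from helpers ensures that each property constructs its part of the graph only once and caches the result. Its implementation is not shown in this article; a common version (an assumption, not the book's exact listing) looks like this:

import functools

def lazy_property(function):
    attribute = '_lazy_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        # Build the graph nodes on first access and reuse them afterwards.
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper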
import collections
import tensorflow as tf
import numpy as np
from batched import batched
from EmbeddingModel import EmbeddingModel
from skipgrams import skipgrams
from Wikipedia import Wikipedia
from helpers import AttrDict
WIKI_DOWNLOAD_DIR = './wikipedia'
params = AttrDict(
    vocabulary_size=10000,
    max_context=10,
    embedding_size=200,
    contrastive_examples=100,
    learning_rate=0.5,
    momentum=0.5,
    batch_size=1000,
)
data = tf.placeholder(tf.int32, [None])
target = tf.placeholder(tf.int32, [None])
model = EmbeddingModel(data, target, params)
corpus = Wikipedia(
    'https://dumps.wikimedia.org/enwiki/20160501/'
    'enwiki-20160501-pages-meta-current1.xml-p000000010p000030303.bz2',
    WIKI_DOWNLOAD_DIR,
    params.vocabulary_size)
examples = skipgrams(corpus, params.max_context)
batches = batched(examples, params.batch_size)
sess = tf.Session()
sess.run(tf.initialize_all_variables())
average = collections.deque(maxlen=100)
for index, batch in enumerate(batches):
    feed_dict = {data: batch[0], target: batch[1]}
    cost, _ = sess.run([model.cost, model.optimize], feed_dict)
    average.append(cost)
    print('{}: {:5.1f}'.format(index + 1, sum(average) / len(average)))
    if index > 100000:
        break
embeddings = sess.run(model.embeddings)
np.save(WIKI_DOWNLOAD_DIR + '/embeddings.npy', embeddings)
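helpers.AttrDict is likewise not listed in this article. The sketch below assumes it is simply a dict whose keys can also be read as attributes, which is all the code above needs:

class AttrDict(dict):
    """A dict whose entries can also be accessed as attributes, e.g. params.batch_size."""

    def __getattr__(self, key):
        if key not in self:
            raise AttributeError(key)
        return self[key]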
References:
TensorFlow for Machine Intelligence (《面向機器智慧的TensorFlow實踐》)