Deep Learning Language Models (3): The word2vec Negative Sampling Model (Keras Version)
阿新 · Published: 2019-01-10
Contents:
Deep Learning Language Models (1): The Development of word2vec
Deep Learning Language Models (2): Word Vectors and the Neural Probabilistic Network Model (Keras Version)
Deep Learning Language Models (3): The word2vec Negative Sampling Model (Keras Version)
The code is adapted from: https://spaces.ac.cn/archives/4515
That post draws its negatives uniformly at random, whereas the version here uses frequency-based negative sampling. A few details are still left unimplemented, but the overall framework is as follows.
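One such detail: in the original word2vec papers, negatives are not drawn from the raw unigram distribution but from the unigram counts raised to the power 3/4, which slightly boosts rare words. If you want that behaviour, the sampling table can be built from weighted counts instead of raw counts. A minimal sketch (the helper name build_table is mine; it assumes a dict of raw counts like the dictionary.dfs used in the code below):

import numpy as np

def build_table(dfs, M_num=1000, power=0.75):
    # weight each count by count**0.75, as in Mikolov et al.'s negative-sampling scheme
    weights = {wid: cnt ** power for wid, cnt in dfs.items()}
    total = sum(weights.values())
    M, cum = {}, 0.0
    for wid, w in weights.items():
        left = int(np.ceil(cum * M_num))   # first slot owned by this word
        cum += w / total
        right = int(cum * M_num)           # last slot owned by this word
        for slot in range(left, right + 1):
            M[slot] = wid                  # slot -> word id, proportional to weighted frequency
    return M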
# coding=utf-8
'''
Created on 2018-09-15
@author: admin
'''
from gensim import corpora, models, similarities
import numpy as np
import keras.backend as K
from keras.engine.topology import Layer


class NegativeLayer(Layer):
    def __init__(self, nb_negative, M, M_num, **kwargs):
        self.nb_negative = nb_negative
        self.M = M
        self.M_num = M_num
        super(NegativeLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        super(NegativeLayer, self).build(input_shape)

    def call(self, x, mask=None):
        batch = 0
        if str(x.shape[0]).isdigit() == False:
            # the batch dimension is symbolic at graph-build time; fall back to the actual
            # batch size, which is 4 here (each padded sentence yields 4 context windows)
            batch = 4
        else:
            batch = x.shape[0]
        # negative sampling: draw slots uniformly from [0, M_num] and map them through the
        # table M, so word ids are picked in proportion to their frequency.
        # Note that the sampling runs in numpy when the graph is built, so the same negatives
        # are reused for every batch (one of the unimplemented details).
        final_output = np.array([[self.M[i] for i in j] for j in np.random.randint(0, self.M_num + 1, size=(batch, self.nb_negative))])
        # convert to a tensor
        final_output = K.tensorflow_backend._to_tensor(final_output, dtype=np.int32)
        return final_output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.nb_negative)
if __name__ == '__main__':
    text = [["我", "今天", "打", "籃球"],
            ["我", "今天", "打", "足球"],
            ["我", "今天", "打", "羽毛球"],
            ["我", "今天", "打", "網球"],
            ["我", "今天", "打", "排球"],
            ["我", "今天", "打", "氣球"],
            ["我", "今天", "打", "遊戲"],
            ["我", "今天", "打", "冰球"],
            ["我", "今天", "打", "人"],
            ["我", "今天", "打", "檯球"],
            ["我", "今天", "打", "桌球"],
            ["我", "今天", "打", "水"],
            ["我", "今天", "打", "籃球"],
            ["我", "今天", "打", "足球"],
            ["我", "今天", "打", "羽毛球"],
            ["我", "今天", "打", "網球"],
            ["我", "今天", "打", "排球"],
            ["我", "今天", "打", "氣球"],
            ]
    # build the dictionary with gensim
    dictionary = corpora.Dictionary(text, prune_at=2000000)
    # print the words in the dictionary
    for key in dictionary.iterkeys():
        print(key, dictionary.get(key), dictionary.dfs[key])
    # save the dictionary
    dictionary.save_as_text('word_dict.dict', sort_by_word=True)
    # load the dictionary
    dictionary = dictionary.load_from_text('word_dict.dict')
    L = {}
    # total number of word occurrences; dictionary.dfs maps {word id: number of documents containing the word}
    allword_num = np.sum(list(dictionary.dfs.values()))
    print(allword_num)
    # 72
    # build the negative-sampling table:
    # normalise the word frequencies so they stack up on [0, 1], then cut [0, 1] into M_num
    # equal slots, so that each slot can be mapped back to a word id
    sum = 0
    M = {}
    M_num = 1000
    for id, num in dictionary.dfs.items():
        # round up
        left = int(np.ceil(sum / (1 / M_num)))
        sum = sum + num / allword_num
        # record each word's relative frequency (only used for the printout below)
        L[id] = num / allword_num
        # round down
        right = int(sum / (1 / M_num))
        print(id, left, right)
        # 11 0 13
        # 0 14 263
        # 10 264 277
        # 12 278 291
        # 1 292 541
        # 2 542 791
        # 7 792 819
        # 13 820 833
        # 8 834 861
        # 14 862 875
        # 9 875 888
        # 3 889 916
        # 6 917 944
        # 5 945 972
        # 4 973 1000
        for i in range(left, right + 1):
            M[i] = id
    print(L)
    # {11: 0.013888888888888888, 0: 0.25, 10: 0.013888888888888888, 12: 0.013888888888888888, 1: 0.25, 2: 0.25, 7: 0.027777777777777776, 13: 0.013888888888888888, 8: 0.027777777777777776, 14: 0.013888888888888888, 9: 0.013888888888888888, 3: 0.027777777777777776, 6: 0.027777777777777776, 5: 0.027777777777777776, 4: 0.027777777777777776}
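    # Optional sanity check of the table M: drawing slots uniformly from [0, M_num] and
    # mapping them through M should return each word id with a frequency close to the
    # relative frequencies printed in L above.
    samples_check = [M[i] for i in np.random.randint(0, M_num + 1, size=100000)]
    freq_check = {wid: samples_check.count(wid) / len(samples_check) for wid in dictionary.dfs}
    print(freq_check)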
    # vocabulary size
    word_num = len(dictionary.keys())
    # number of sentences used to build each batch
    sentence_batch_size = 1
    # context window size (words on each side of the centre word)
    window = 3
    def data_generator():  # training-data generator
        while True:
            x, y = [], []
            _ = 0
            for sentence in text:
                # pad both ends of the sentence with the id word_num
                sentence = [word_num] * window + [dictionary.token2id[w] for w in sentence if w in dictionary.token2id] + [word_num] * window
                for i in range(window, len(sentence) - window):
                    x.append(sentence[i - window:i] + sentence[i + 1:i + 1 + window])
                    # the loss is sparse_categorical_crossentropy, so the targets are ids, not one-hot vectors
                    y.append([sentence[i]])
                _ += 1
                if _ == sentence_batch_size:
                    x, y = np.array(x), np.array(y)
                    # the positive word is always the first neuron of the sampled output layer,
                    # so every label is 0 (again because the loss is sparse_categorical_crossentropy)
                    z = np.zeros((len(x), 1))
                    print("input data:", x.shape)
                    print("target words:", y.shape)
                    print("labels:", z.shape)
                    yield [x, y], z
                    x, y = [], []
                    _ = 0
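    # For reference, peek at one batch from the generator: each 4-word sentence is padded to
    # 10 ids and yields 4 context windows, so x has shape (4, 6) (3 left + 3 right word ids),
    # y has shape (4, 1) with the centre-word ids, and z is all zeros because the positive
    # word always sits at position 0 of the sampled output layer.
    (x_demo, y_demo), z_demo = next(data_generator())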
    from keras.models import Sequential
    from keras.layers import Dense, Activation, Embedding, Reshape, Flatten, Input, Lambda
    from keras.models import Model

    # word-vector dimension
    word_size = 100
    # number of negative samples per positive word
    nb_negative = 16

    input_words = Input(shape=(window * 2,), dtype='int32')
    input_vecs = Embedding(word_num + 1, word_size, name='word2vec')(input_words)
    input_vecs_sum = Lambda(lambda x: K.sum(x, axis=1))(input_vecs)  # CBOW: simply sum the context word vectors

    # build random negative samples and combine them with the target word
    target_word = Input(shape=(1,), dtype='int32')
    negatives = NegativeLayer(nb_negative, M, M_num)(target_word)
    # the target word goes first, followed by the negatives; a negative can occasionally
    # coincide with the positive word, but the probability is small
    samples = Lambda(lambda x: K.concatenate(x))([target_word, negatives])

    # Embedding layers are used instead of a Dense output layer so that only the output weights
    # of the sampled positive and negative words are updated, which greatly reduces memory use
    # and computation
    softmax_weights = Embedding(word_num + 1, word_size, name='W')(samples)
    softmax_biases = Embedding(word_num + 1, 1, name='b')(samples)
    # store the output-layer parameters in Embedding layers and reproduce a Dense layer with
    # backend matrix multiplication: a softmax over the 1 + nb_negative sampled words only
    softmax = Lambda(lambda x:
        K.softmax((K.batch_dot(x[0], K.expand_dims(x[1], 2)) + x[2])[:, :, 0])
    )([softmax_weights, input_vecs_sum, softmax_biases])

    # because the target word was placed first when building the samples, the softmax target id
    # is always 0, which is exactly how the z labels are written in data_generator
    model = Model(inputs=[input_words, target_word], outputs=softmax)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    model.fit_generator(data_generator(), steps_per_epoch=np.ceil(dictionary.num_docs / sentence_batch_size), epochs=100, max_queue_size=1, workers=1)
    # save the model weights
    model.save_weights("DNNword-vec2.h5")
    # load the model weights
    model.load_weights("DNNword-vec2.h5", by_name=True)

    # the weights of the 'word2vec' Embedding layer are the word vectors; here they are the
    # first entry of model.get_weights(), and model.get_layer('word2vec').get_weights()[0]
    # is an equivalent, more explicit way to fetch them
    embeddings = model.get_weights()[0]
    # normalise the vectors to unit length
    normalized_embeddings = embeddings / (embeddings ** 2).sum(axis=1).reshape((-1, 1)) ** 0.5
    dictionary.id2token = {j: i for i, j in dictionary.token2id.items()}
    # return the 15 most similar words
    def most_similar(w, dictionary):
        v = normalized_embeddings[dictionary.token2id[w]]
        # the vectors are already unit-normalised, so the dot product is the cosine similarity
        sims = np.dot(normalized_embeddings, v)
        sort = sims.argsort()[::-1]
        sort = sort[sort > 0]
        return [(dictionary.id2token[i], sims[i]) for i in sort[:15] if i in dictionary.id2token]

    for sim in most_similar(u'網球', dictionary):
        print(sim[0], sim[1])
# 網球 0.99999994
# 羽毛球 0.9787248
# 籃球 0.978495
# 排球 0.9773369
# 人 0.9761201
# 水 0.9760275
# 氣球 0.9753146
# 桌球 0.9731983
# 冰球 0.97278094
# 遊戲 0.9711289
# 足球 0.9660615
# 檯球 0.96072686
# 我 -0.3409065
# 打 -0.42166257
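As a sanity check on this toy corpus, the neighbours can be compared with what gensim's built-in CBOW with negative sampling produces on the same text list. The parameter names below assume gensim 3.x (in gensim 4.x, size becomes vector_size and iter becomes epochs):

from gensim.models import Word2Vec

w2v = Word2Vec(text, size=100, window=3, min_count=1, sg=0, negative=16, hs=0, iter=100)
for word, score in w2v.wv.most_similar(u'網球', topn=15):
    print(word, score)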