tensorflow如何正確加載預訓練詞向量
阿新 • • 發佈:2019-01-10
global color news doc ... elf import loading initial
使用預訓練詞向量和隨機初始化詞向量的差異還是挺大的,現在說一說我使用預訓練詞向量的流程。
一、構建本語料的詞匯表,作為我的基礎詞匯
二、遍歷該詞匯表,從預訓練詞向量中提取出該詞對應的詞向量
三、初始化embeddings變量,將數據賦值給tensor
樣例代碼:
# -*- coding: utf-8 -*-
"""Minimal pipeline for loading pretrained word2vec vectors into TensorFlow.

Steps:
  1. Build the vocabulary of the target corpus (``Wordlist``).
  2. Extract vectors for those words from a pretrained word2vec binary file
     (``load_bin_vec``), randomly initializing any missing words
     (``add_unknown_words``).
  3. Pack them into an embedding matrix (``get_W``) used to initialize a
     ``tf.Variable`` for ``tf.nn.embedding_lookup``.

This is only simple preprocessing; adapt as needed for a real model.
"""
import numpy as np


class Wordlist(object):
    """Maps each word in a vocabulary file (one word per line) to an int id."""

    def __init__(self, filename, maxn=100000):
        # Each line: "word [extra columns...]"; only the first token is used.
        with open(filename) as f:
            lines = [line.split() for line in f.readlines()[:maxn]]
        self.size = len(lines)
        # word -> row index. Index 0 doubles as the unknown-word id in getID,
        # so the word on the first line shares id 0 with unknowns (as in the
        # original code) — reserve line 0 for a padding token if that matters.
        self.voc = {tokens[0]: idx for idx, tokens in enumerate(lines)}

    def getID(self, word):
        """Return the id of ``word``, or 0 (unknown-word id) if absent."""
        # dict.get replaces the original bare try/except, which also hid
        # unrelated errors.
        return self.voc.get(word, 0)


def get_W(word_vecs, k=300):
    """Build the embedding matrix.

    W[i] is the vector for the word indexed by i; row 0 is an all-zero
    padding/unknown vector.

    Returns:
        (W, word_idx_map): the (len(word_vecs)+1, k) float32 matrix and a
        dict mapping word -> row index (indices start at 1).
    """
    vocab_size = len(word_vecs)
    word_idx_map = {}
    W = np.zeros(shape=(vocab_size + 1, k), dtype='float32')
    for i, word in enumerate(word_vecs, start=1):
        W[i] = word_vecs[word]
        word_idx_map[word] = i
    return W, word_idx_map


def load_bin_vec(fname, vocab):
    """Load word vectors from a Google (Mikolov) word2vec binary file.

    Only words present in ``vocab`` are kept; vectors for other words are
    skipped over without being stored.

    Returns:
        (word_vecs, pury_word_vec): dict word -> float32 vector, and the
        list of kept vectors in file order.
    """
    word_vecs = {}
    pury_word_vec = []
    with open(fname, 'rb') as f:
        header = f.readline()  # "<vocab_size> <layer1_size>\n"
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for _ in range(vocab_size):
            # Word is stored as raw bytes terminated by a space; some files
            # also emit a '\n' before each entry, which we drop.
            chars = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    break
                if ch != b'\n':
                    chars.append(ch)
            word = b''.join(chars).decode('utf-8', errors='ignore')
            if word in vocab:
                # frombuffer replaces the deprecated np.fromstring.
                word_vecs[word] = np.frombuffer(f.read(binary_len),
                                                dtype='float32')
                pury_word_vec.append(word_vecs[word])
            else:
                f.read(binary_len)  # skip this word's vector
    return word_vecs, pury_word_vec


def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """Add random vectors for vocab words missing from ``word_vecs``.

    Only words occurring in at least ``min_df`` documents get a vector.
    The 0.25 bound is chosen so the unknown vectors have (approximately)
    the same variance as the pretrained ones.
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)


if __name__ == "__main__":
    # TensorFlow is only needed for the demo, so import it here to keep the
    # helper functions importable without TF installed.
    import tensorflow as tf

    w2v_file = "GoogleNews-vectors-negative300.bin"  # Google News word2vec binary
    print("loading data...")
    vocab = Wordlist('vocab.txt')  # vocabulary of the target corpus
    w2v, pury_word2vec = load_bin_vec(w2v_file, vocab.voc)
    add_unknown_words(w2v, vocab.voc)
    W, word_idx_map = get_W(w2v)

    # Simple embedding_lookup demo; replace [0, 1, 2] with real doc ids.
    Wa = tf.Variable(W)
    embedding_input = tf.nn.embedding_lookup(Wa, [0, 1, 2])

    # NOTE: tf.Session / global_variables_initializer are TF1 APIs.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        embeddings = sess.run(Wa)
tensorflow如何正確加載預訓練詞向量