Assignment1詞向量
提前導包:
# Environment setup shared by every snippet below: interpreter check,
# imports, sentence-boundary tokens and fixed RNG seeds.
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import nltk
from nltk.corpus import reuters  # was `importreuters` (missing space -> SyntaxError)
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

# Markers wrapped around every document when the corpus is read.
START_TOKEN = '<START>'
END_TOKEN = '<END>'

# Seed both RNGs so repeated runs give the same results.
np.random.seed(0)
random.seed(0)
Tip:`from nltk.corpus import reuters` 這一句需要事先下載 reuters 語料庫,解壓後放到 C:\Users\Administrator\nltk_data\corpora 目錄下,這樣執行時就不必再下載。
1.基於共現矩陣得到的詞向量
1.1載入語料,給每句話都加上start_token和end_token
def read_corpus(category="crude"):
    """Load every Reuters file of one category as a list of tokens.

    Each token is lower-cased, and each document is wrapped in
    START_TOKEN / END_TOKEN.

    Params:
        category (string): category name
    Return:
        list of lists, with words from each of the processed files
    """
    documents = []
    for file_id in reuters.fileids(category):
        tokens = [token.lower() for token in reuters.words(file_id)]
        documents.append([START_TOKEN] + tokens + [END_TOKEN])
    return documents
測試一下:
# Read the corpus with the default category and show its first document.
reuters_corpus = read_corpus()
first_document = reuters_corpus[:1]
pprint.pprint(first_document, compact=True, width=100)  # pretty-print, wrapped at 100 columns
[['<START>', 'japan', 'to', 'revise', 'long', '-', 'term', 'energy', 'demand', 'downwards', 'the',
'ministry', 'of', 'international', 'trade', 'and', 'industry', '(', 'miti', ')', 'will', 'revise',
'its', 'long', '-', 'term', 'energy', 'supply', '/', 'demand', 'outlook', 'by', 'august', 'to',
'meet', 'a', 'forecast', 'downtrend', 'in', 'japanese', 'energy', 'demand', ',', 'ministry',
'officials', 'said', '.', 'miti', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for',
'primary', 'energy', 'supplies', 'in', 'the', 'year', '2000', 'to', '550', 'mln', 'kilolitres',
'(', 'kl', ')', 'from', '600', 'mln', ',', 'they', 'said', '.', 'the', 'decision', 'follows',
'the', 'emergence', 'of', 'structural', 'changes', 'in', 'japanese', 'industry', 'following',
'the', 'rise', 'in', 'the', 'value', 'of', 'the', 'yen', 'and', 'a', 'decline', 'in', 'domestic',
'electric', 'power', 'demand', '.', 'miti', 'is', 'planning', 'to', 'work', 'out', 'a', 'revised',
'energy', 'supply', '/', 'demand', 'outlook', 'through', 'deliberations', 'of', 'committee',
'meetings', 'of', 'the', 'agency', 'of', 'natural', 'resources', 'and', 'energy', ',', 'the',
'officials', 'said', '.', 'they', 'said', 'miti', 'will', 'also', 'review', 'the', 'breakdown',
'of', 'energy', 'supply', 'sources', ',', 'including', 'oil', ',', 'nuclear', ',', 'coal', 'and',
'natural', 'gas', '.', 'nuclear', 'energy', 'provided', 'the', 'bulk', 'of', 'japan', "'", 's',
'electric', 'power', 'in', 'the', 'fiscal', 'year', 'ended', 'march', '31', ',', 'supplying',
'an', 'estimated', '27', 'pct', 'on', 'a', 'kilowatt', '/', 'hour', 'basis', ',', 'followed',
'by', 'oil', '(', '23', 'pct', ')', 'and', 'liquefied', 'natural', 'gas', '(', '21', 'pct', '),',
'they', 'noted', '.', '<END>']]
1.2得到詞典(去掉重複詞)
def distinct_words(corpus):
    """Determine the sorted list of distinct words in the corpus.

    Params:
        corpus (list of list of strings): corpus of documents
    Return:
        corpus_words (list of strings): list of distinct words across the
            corpus, sorted (using python 'sorted' function)
        num_corpus_words (integer): number of distinct words across the corpus
    """
    # The set comprehension flattens and de-duplicates in one pass; `sorted`
    # accepts any iterable, so no intermediate list is needed.
    corpus_words = sorted({word for document in corpus for word in document})
    return corpus_words, len(corpus_words)
測試一下:
# Toy corpus: two short sentences wrapped in the START/END markers.
test_corpus = [
    "{} All that glitters isn't gold {}".format(START_TOKEN, END_TOKEN).split(" "),
    "{} All's well that ends well {}".format(START_TOKEN, END_TOKEN).split(" "),
]
test_corpus_words, num_corpus_words = distinct_words(test_corpus)

# Expected vocabulary and its size.
ans_test_corpus_words = sorted([START_TOKEN, "All", "ends", "that", "gold", "All's", "glitters", "isn't", "well", END_TOKEN])
ans_num_corpus_words = len(ans_test_corpus_words)

# The number of distinct words must match.
assert num_corpus_words == ans_num_corpus_words, "Incorrect number of distinct words. Correct: {}. Yours: {}".format(ans_num_corpus_words, num_corpus_words)

# The words themselves must match, in sorted order.
assert test_corpus_words == ans_test_corpus_words, "Incorrect corpus_words.\nCorrect: {}\nYours: {}".format(str(ans_test_corpus_words), str(test_corpus_words))

# Report success.
separator = "-" * 80
print(separator)
print("Passed All Tests!")
print(separator)
--------------------------------------------------------------------------------
Passed All Tests!
--------------------------------------------------------------------------------
1.3計算共現矩陣
舉個例子:Co-Occurrence with Fixed Window of n=1
Document 1: "all that glitters is not gold"
Document 2: "all is well that ends well"
def compute_co_occurrence_matrix(corpus, window_size=4):
    """Compute co-occurrence matrix for the given corpus and window_size (default of 4).

    Each word in a document is placed at the center of a window in turn; the
    words up to `window_size` positions to its left and to its right are its
    context. Words near the edges have a smaller number of co-occurring words.

    For example, if we take the document "<START> All that glitters is not gold <END>"
    with window size of 4, "All" will co-occur with "<START>", "that",
    "glitters", "is", and "not".

    Params:
        corpus (list of list of strings): corpus of documents
        window_size (int): size of context window
    Return:
        M (a symmetric numpy array of shape (number of unique words in the corpus, number of unique words in the corpus)):
            Co-occurrence matrix of word counts.
            The ordering of the words in the rows/columns is the same as the
            ordering of the words given by the distinct_words function.
        word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, num_words = distinct_words(corpus)
    word2Ind = {word: index for index, word in enumerate(words)}
    M = np.zeros((num_words, num_words))

    for document in corpus:
        for position, center in enumerate(document):
            center_idx = word2Ind[center]
            # Clamp the left edge at 0. A negative slice start counts from the
            # END of the list, which would drop (or misplace) the left context
            # of words near the start of a document and make M asymmetric.
            left_edge = max(position - window_size, 0)
            left_context = document[left_edge:position]
            right_context = document[position + 1:position + window_size + 1]
            for neighbor in left_context + right_context:
                M[center_idx, word2Ind[neighbor]] += 1

    return M, word2Ind
測試一下:
# Toy corpus and the co-occurrence matrix under test (window of 1).
test_corpus = [
    "{} All that glitters isn't gold {}".format(START_TOKEN, END_TOKEN).split(" "),
    "{} All's well that ends well {}".format(START_TOKEN, END_TOKEN).split(" "),
]
M_test, word2Ind_test = compute_co_occurrence_matrix(test_corpus, window_size=1)

# Expected M and word2Ind.
M_test_ans = np.array(
    [[0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,],
     [0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,],
     [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,],
     [0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,],
     [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,],
     [0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,],
     [1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,],
     [0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,],
     [0., 0., 1., 0., 1., 1., 0., 0., 0., 1.,],
     [1., 0., 0., 1., 1., 0., 0., 0., 1., 0.,]]
)
ans_test_corpus_words = sorted([START_TOKEN, "All", "ends", "that", "gold", "All's", "glitters", "isn't", "well", END_TOKEN])
word2Ind_ans = dict(zip(ans_test_corpus_words, range(len(ans_test_corpus_words))))

# word2Ind must match exactly.
assert (word2Ind_ans == word2Ind_test), "Your word2Ind is incorrect:\nCorrect: {}\nYours: {}".format(word2Ind_ans, word2Ind_test)

# M must have the expected shape. "Correct" reports the reference shape and
# "Yours" the computed one.
assert (M_test.shape == M_test_ans.shape), "M matrix has incorrect shape.\nCorrect: {}\nYours: {}".format(M_test_ans.shape, M_test.shape)

# M must match the reference entry by entry.
for w1, idx1 in word2Ind_ans.items():
    for w2, idx2 in word2Ind_ans.items():
        student = M_test[idx1, idx2]
        correct = M_test_ans[idx1, idx2]
        if student != correct:
            print("Correct M:")
            print(M_test_ans)
            print("Your M: ")
            print(M_test)
            raise AssertionError("Incorrect count at index ({}, {})=({}, {}) in matrix M. Yours has {} but should have {}.".format(idx1, idx2, w1, w2, student, correct))

# Report success.
print("-" * 80)
print("Passed All Tests!")
print("-" * 80)
--------------------------------------------------------------------------------
Passed All Tests!
--------------------------------------------------------------------------------
1.4使用奇異值分解對共現矩陣降維(降成二維是為了在座標軸中展示)
def reduce_to_k_dim(M, k=2):
    """Reduce a matrix of word rows to k columns with Scikit-Learn's TruncatedSVD.

    Reference:
        - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

    Params:
        M (numpy matrix of shape (number of unique words in the corpus, number of unique words in the corpus)): co-occurrence matrix of word counts
        k (int): embedding size of each word after dimension reduction
    Return:
        M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensional word embeddings.
            In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10  # iteration count handed to TruncatedSVD
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    reducer = TruncatedSVD(n_components=k, n_iter=n_iters)
    M_reduced = reducer.fit_transform(M)

    print("Done.")
    return M_reduced
測試一下:
# Toy corpus pushed through the co-occurrence and reduction steps.
test_corpus = [
    "{} All that glitters isn't gold {}".format(START_TOKEN, END_TOKEN).split(" "),
    "{} All's well that ends well {}".format(START_TOKEN, END_TOKEN).split(" "),
]
M_test, word2Ind_test = compute_co_occurrence_matrix(test_corpus, window_size=1)
M_test_reduced = reduce_to_k_dim(M_test, k=2)

# The result must have one row per distinct word (10) and k=2 columns.
assert M_test_reduced.shape[0] == 10, "M_reduced has {} rows; should have {}".format(M_test_reduced.shape[0], 10)
assert M_test_reduced.shape[1] == 2, "M_reduced has {} columns; should have {}".format(M_test_reduced.shape[1], 2)

# Report success.
separator = "-" * 80
print(separator)
print("Passed All Tests!")
print(separator)
Running Truncated SVD over 10 words...
Done.
--------------------------------------------------------------------------------
Passed All Tests!
--------------------------------------------------------------------------------
1.5使用matplotlib展示
def plot_embeddings(M_reduced, word2Ind, words):
    """Scatter-plot the embeddings of the words listed in "words", labelling each point.

    NOTE: only the requested words are drawn, not every row of M_reduced / word2Ind.

    Params:
        M_reduced (numpy matrix of shape (number of unique words in the corpus, 2)): matrix of 2-dimensional word embeddings
        word2Ind (dict): dictionary that maps word to indices for matrix M
        words (list of strings): words whose embeddings we want to visualize
    """
    for label in words:
        row = word2Ind[label]
        x_coord, y_coord = M_reduced[row, 0], M_reduced[row, 1]

        # One red 'x' marker per word, with the word written at the same position.
        plt.scatter(x_coord, y_coord, marker='x', color='red')
        plt.text(x_coord, y_coord, label)
測試一下:
separator = "-" * 80
print(separator)
print("Outputted Plot:")

# Five hand-placed points: the four corners of a square plus its center.
M_reduced_plot_test = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1], [0, 0]])
words = ['test1', 'test2', 'test3', 'test4', 'test5']
# Each word maps to the row with the same position in the list.
word2Ind_plot_test = {word: row for row, word in enumerate(words)}
plot_embeddings(M_reduced_plot_test, word2Ind_plot_test, words)

print(separator)
--------------------------------------------------------------------------------
Outputted Plot:
--------------------------------------------------------------------------------
1.6呼叫以上方法
# Full pipeline on the Reuters corpus: count co-occurrences, then reduce to 2-D.
reuters_corpus = read_corpus()
M_co_occurrence, word2Ind_co_occurrence = compute_co_occurrence_matrix(reuters_corpus)
M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)

# Rescale (normalize) the rows to make them each of unit-length.
M_lengths = np.linalg.norm(M_reduced_co_occurrence, axis=1)
# Reshaped to a column so the division broadcasts across each row.
M_normalized = M_reduced_co_occurrence / M_lengths.reshape(-1, 1)

words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']

plot_embeddings(M_normalized, word2Ind_co_occurrence, words)
Running Truncated SVD over 8185 words...
Done.
2.基於Glove得到的詞向量
2.1載入向量模型
def load_embedding_model():
    """Load GloVe vectors through gensim's downloader.

    Return:
        wv_from_bin: the loaded "glove-wiki-gigaword-200" vectors
            (400000 embeddings in the recorded run, each of length 200)
    """
    import gensim.downloader as api
    wv_from_bin = api.load("glove-wiki-gigaword-200")
    # Gensim 4 removed `KeyedVectors.vocab` in favour of `key_to_index`;
    # fall back to `vocab` so gensim 3.x installs keep working.
    vocab = getattr(wv_from_bin, "key_to_index", None)
    if vocab is None:
        vocab = wv_from_bin.vocab
    print("Loaded vocab size %i" % len(vocab))
    return wv_from_bin


wv_from_bin = load_embedding_model()
[==================================================] 100.0% 252.1/252.1MB downloaded
Loaded vocab size 400000
2.2計算嵌入矩陣
def get_matrix_of_vectors(wv_from_bin, required_words=('barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela')):
    """Put a random sample of the GloVe vectors, plus the required words, into a matrix M.

    Param:
        wv_from_bin: KeyedVectors object; the 400000 GloVe vectors loaded from file
        required_words: words that must be present in M even if the random
            sample missed them (only iterated, never modified)
    Return:
        M: numpy matrix shape (num words, 200) containing the vectors
        word2Ind: dictionary mapping each word to its row number in M
    """
    import random
    # Gensim 4 removed `KeyedVectors.vocab` in favour of `key_to_index`;
    # fall back to `vocab` so gensim 3.x installs keep working.
    vocab = getattr(wv_from_bin, "key_to_index", None)
    if vocab is None:
        vocab = wv_from_bin.vocab
    words = list(vocab)
    print("Shuffling words ...")
    random.seed(224)  # fixed seed so the sampled words are the same on every run
    random.shuffle(words)
    words = words[:10000]
    print("Putting %i words into word2Ind and matrix M..." % len(words))
    word2Ind = {}
    M = []
    # Sampled words first, then the required ones. The dict membership test
    # skips required words that were already sampled, in O(1) per word.
    for w in words + list(required_words):
        if w in word2Ind:
            continue
        try:
            # NOTE(review): `word_vec` is a deprecated alias of `get_vector`
            # in gensim 4 -- confirm the minimum gensim version before switching.
            vector = wv_from_bin.word_vec(w)
        except KeyError:
            # Word has no vector: leave it out rather than fail.
            continue
        word2Ind[w] = len(M)
        M.append(vector)
    M = np.stack(M)
    print("Done.")
    return M, word2Ind


M, word2Ind = get_matrix_of_vectors(wv_from_bin)
M_reduced = reduce_to_k_dim(M, k=2)  # shape (10010, 2) in the recorded run

# Rescale (normalize) the rows to make them each of unit-length
M_lengths = np.linalg.norm(M_reduced, axis=1)  # per-row L2 norm; shape (10010,) in the recorded run
M_reduced_normalized = M_reduced / M_lengths[:, np.newaxis]  # np.newaxis inserts an axis so the division broadcasts

words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']
plot_embeddings(M_reduced_normalized, word2Ind, words)
Shuffling words ...
Putting 10000 words into word2Ind and matrix M...
Done.
Running Truncated SVD over 10010 words...
Done.
和共現矩陣得到的結果類似。
2.3前十個最相似的單詞:根據與給定單詞的餘弦相似度,對詞表中的其他單詞進行排序
# Look up the words most similar to "leaves" and print the (word, score) pairs.
query_word = 'leaves'
similarwords = wv_from_bin.most_similar(query_word)
print(similarwords)
[('ends', 0.6128067970275879), ('leaf', 0.6027014255523682), ('stems', 0.5998532772064209), ('takes', 0.5902855396270752), ('leaving', 0.5761634111404419), ('grows', 0.5663397312164307), ('flowers', 0.5600922107696533), ('turns', 0.5536050796508789), ('leave', 0.5496848225593567), ('goes', 0.5434924960136414)]
2.4計算兩個單詞間的餘弦距離
# Compare the cosine distance of a synonym pair with that of an antonym pair.
w1, w2, w3 = 'happy', 'cheerful', 'sad'
w1_w2_dis = wv_from_bin.distance(w1, w2)
w1_w3_dis = wv_from_bin.distance(w1, w3)

synonym_message = "Synonyms {}, {} have cosine distance: {}"
antonym_message = "Antonyms {}, {} have cosine distance: {}"
print(synonym_message.format(w1, w2, w1_w2_dis))
print(antonym_message.format(w1, w3, w1_w3_dis))
Synonyms happy, cheerful have cosine distance: 0.5172466933727264
Antonyms happy, sad have cosine distance: 0.40401363372802734
# Analogy query: add 'woman' and 'king', subtract 'man'.
analogy_matches = wv_from_bin.most_similar(positive=['woman', 'king'], negative=['man'])
pprint.pprint(analogy_matches)
[('queen', 0.6978678703308105),
('princess', 0.6081745028495789),
('monarch', 0.5889754891395569),
('throne', 0.5775108933448792),
('prince', 0.5750998258590698),
('elizabeth', 0.5463595986366272),
('daughter', 0.5399125814437866),
('kingdom', 0.5318052172660828),
('mother', 0.5168544054031372),
('crown', 0.5164473056793213)]
2.5嵌入向量存在偏見
# The same 'worker' analogy asked in both gender directions, for comparison.
woman_worker_matches = wv_from_bin.most_similar(positive=['woman', 'worker'], negative=['man'])
pprint.pprint(woman_worker_matches)
print()
man_worker_matches = wv_from_bin.most_similar(positive=['man', 'worker'], negative=['woman'])
pprint.pprint(man_worker_matches)
[('employee', 0.6375863552093506),
('workers', 0.6068919897079468),
('nurse', 0.5837947130203247),
('pregnant', 0.5363885760307312),
('mother', 0.5321309566497803),
('employer', 0.5127025842666626),
('teacher', 0.5099577307701111),
('child', 0.5096741914749146),
('homemaker', 0.5019455552101135),
('nurses', 0.4970571994781494)]
[('workers', 0.611325740814209),
('employee', 0.5983108878135681),
('working', 0.5615329742431641),
('laborer', 0.5442320108413696),
('unemployed', 0.5368517637252808),
('job', 0.5278826951980591),
('work', 0.5223963260650635),
('mechanic', 0.5088937282562256),
('worked', 0.5054520964622498),
('factory', 0.4940453767776489)]