TensorFlow -- News Reading and Personalized Search System (Code)
阿新 • Published: 2019-02-04
1. Import modules
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os        # operating system: file handling
import random    # random sampling of context words
import zipfile   # unzipping the corpus file

import numpy as np
from tempfile import gettempdir
from six.moves import urllib
from six.moves import xrange  # Python 2/3 compatible range
import tensorflow as tf
2. Fetch the file, unzip it, and do initial processing
def zip_file(filename, expected_bytes):  # unzip the given corpus file
    # filename: the corpus file used to train the model (the corpus is simply a large text)
    # expected_bytes: used to verify that the file is complete and correct
    local_filename = os.path.join('.\\', filename)  # path to the input file
    statinfo = os.stat(local_filename)  # file attributes (stat = statistics)
    if statinfo.st_size == expected_bytes:  # compare the actual size with the expected size to detect corruption
        print('good file', filename)
    else:
        raise Exception('bad file')
    # unzip the file
    with zipfile.ZipFile(local_filename) as f:  # context manager: a simplified way to handle exceptions and cleanup
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()  # decompress, convert to a string, split into a word list
    return data
Take a look at the corpus words
vocabulary = zip_file('text8.zip', 31344016)  # all words in the training corpus (the input is English, so no word segmentation is needed)
print('Data size', len(vocabulary))  # total number of words

vocabulary_size = 50000  # keep only the 50,000 most frequent words (the input-layer encoding dimension); rare words carry little signal
# in a real project this number should be chosen from corpus statistics
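A quick way to sanity-check the 50,000 cut-off is to look at the frequency distribution directly. The sketch below is illustrative only (the names word_counts, total, covered are introduced just for this check); it reuses the vocabulary list loaded above and reports how much of the corpus the top-N words cover:

word_counts = collections.Counter(vocabulary)
total = len(vocabulary)
for n in (10000, 50000, 100000):
    covered = sum(c for _, c in word_counts.most_common(n))
    print('top %d words cover %.2f%% of the corpus' % (n, 100.0 * covered / total))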
3. Build the input data (batches)
# turn the raw word list into a structured dataset used to generate batch samples
def build_dataset(words, n_words):  # words: raw data (vocabulary); n_words: vocabulary_size
    count = [['UNK', -1]]  # count collects word frequencies
    # UNK (unknown) stands for all the filtered-out rare words; -1 is a placeholder until the real count is known
    count.extend(collections.Counter(words).most_common(n_words - 1))  # append the n_words - 1 most frequent words after UNK
    dictionary = {}  # word -> integer code; the more frequent the word, the smaller its code
    i = 0
    for word, _ in count:
        dictionary[word] = i
        i += 1
    data = []  # the original text encoded as integers
    unk_count = 0  # number of rare words that were filtered out
    for word in words:  # words: the original text
        index = dictionary.get(word, 0)  # default: if the key does not exist, return 0 (the code for UNK)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    # swap keys and values; zip pairs the two sequences element by element
    return data, count, dictionary, reversed_dictionary
Check the word statistics
data, count, dictionary, reversed_dictionary = build_dataset(vocabulary, vocabulary_size)
print('Most common words:', count[:5])  # the five most frequent words
data_index = 0  # current position in the original text
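To see what build_dataset produced, it can help to print the first few encoded tokens next to the words they stand for. This is only a small check added here for illustration, not part of the original pipeline:

print('Sample data:', data[:10])
print('Decoded back:', [reversed_dictionary[i] for i in data[:10]])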
4. Skip-gram: generating batches of training samples
def generate_batch(batch_size, num_skips, skip_window):
    # skip_window: number of words to consider on each side of the center word
    # num_skips: how many (center, context) pairs to sample per center word
    global data_index
    assert batch_size % num_skips == 0  # each center word contributes num_skips samples
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)  # training inputs, a single row
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)  # training labels, a single column
    span = 2 * skip_window + 1  # size of the sliding window: [skip_window, center, skip_window]
    buffer = collections.deque(maxlen=span)  # double-ended queue holding the current window of text
    if data_index + span > len(data):
        data_index = 0  # the corpus is reused cyclically
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        # indices of the context around the center word, e.g. [0, 1, 2, 4, 5, 6] when skip_window = 3
        random.shuffle(context_words)  # sample the context words in random order
        words_to_use = collections.deque(context_words)
        for j in range(num_skips):
            batch[i * num_skips + j] = buffer[skip_window]  # the center word
            context_word = words_to_use.pop()
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer.extend(data[0:span])  # wrap around when the end of the corpus is reached
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # back up a little so that words at the end of a batch are not skipped
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels  # inputs and labels are returned separately
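Before wiring the batches into the network, a small test run of generate_batch makes the (center word -> context word) pairs visible. The parameter values and variable names (demo_batch, demo_labels) below are chosen only for this illustration:

demo_batch, demo_labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(demo_batch[i], reversed_dictionary[demo_batch[i]],
          '->', demo_labels[i, 0], reversed_dictionary[demo_labels[i, 0]])
data_index = 0  # reset the position so that training starts from the beginning of the corpus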
5. Define the network hyperparameters
batch_size = 128       # a reasonable trade-off between speed and gradient quality
embedding_size = 300   # number of hidden-layer units, i.e. the dimensionality of the word vectors (word2vec)
skip_window = 2
num_skips = 2          # keep the sampling value small (an empirical choice)
num_sampled = 64       # number of negative samples; used by the sampled softmax / NCE computation
valid_size = 16        # size of the validation set
valid_window = 100     # validation words are drawn from the ids below 100, i.e. the most frequent words
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

graph = tf.Graph()     # intuitively, all the tensors below form one computation graph
6. Build the network
with graph.as_default():
    train_input = tf.placeholder(tf.int32, shape=[batch_size])      # placeholder for one batch of input word ids (one row)
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])  # placeholder for the corresponding labels (one column)
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    with tf.device('/cpu:0'):
        # one row per vocabulary word, initialized uniformly in [-1, 1); embeddings is the input-side weight matrix
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        # look up the sub-matrix for the batch, i.e. the weight rows of the batch samples
        embed = tf.nn.embedding_lookup(embeddings, train_input)
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        # truncated_normal is a normal distribution with both tails cut off;
        # stddev = 1.0 / sqrt(embedding_size) is a well-known initialization trick
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))  # constant bias vector

    # loss function
    # NCE: noise-contrastive estimation
    # it approximates the full softmax (average cross-entropy) loss by sampling negative classes
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=train_labels,
                                         inputs=embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))
    # optimizer: gradient descent with learning rate 1.0 (a small enough step to avoid oscillation)
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # for validation: normalize the embeddings and measure word-to-word distance (cosine of the angle)
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings,
                           normalized_embeddings,
                           transpose_b=True)  # dot product of unit vectors = cosine of the angle

    init = tf.global_variables_initializer()  # initialize all global variables

num_steps = 100000  # number of training iterations
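The similarity tensor above is nothing more than cosine similarity computed in bulk. As a sanity check of the idea (pure NumPy, independent of the graph, with throwaway vectors a and b), two L2-normalized vectors multiplied together give the cosine of the angle between them:

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 4.0, 6.0])
a_hat = a / np.sqrt(np.sum(a ** 2))
b_hat = b / np.sqrt(np.sum(b ** 2))
print(np.dot(a_hat, b_hat))  # 1.0: parallel vectors, maximum similarity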
7. Training
with tf.Session(graph=graph) as session:
    init.run()
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_input: batch_inputs, train_labels: batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        # run() evaluates the optimizer step and the loss in one pass
        average_loss += loss_val
        if step % 2000 == 0:  # report the model's average loss every 2000 steps
            if step > 0:
                average_loss /= 2000
            print('Average loss at step', step, ':', average_loss)
            average_loss = 0
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reversed_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbours
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reversed_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()  # the normalized word vectors, the final output
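Once final_embeddings is available, nearest neighbours of any word can be looked up offline with plain NumPy. This is a minimal sketch under a couple of assumptions: the helper nearest_words is introduced here for illustration, and the query word ('news' is only an example) must have made it into the 50,000-word vocabulary:

def nearest_words(query, k=8):
    # rows of final_embeddings are already L2-normalized, so a dot product is a cosine similarity
    query_vec = final_embeddings[dictionary[query]]
    sims = np.dot(final_embeddings, query_vec)
    best = (-sims).argsort()[1:k + 1]  # skip index 0, which is the query word itself
    return [reversed_dictionary[i] for i in best]

print(nearest_words('news'))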