
TensorFlow -- News Reading and Personalized Search System (Code)

1. Import modules
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os                        # operating-system module: file operations
import random                    # random sampling (used by generate_batch below)
import zipfile                   # unzipping the corpus archive
import numpy as np
from tempfile import gettempdir
from six.moves import urllib
from six.moves import xrange     # Python 2/3 compatible loop range

import tensorflow as tf
2. Fetch the file, unzip it, and preprocess
def zip_file(filename, expected_bytes):              # unzip the corpus file
    # filename: the corpus file used to train the model (the corpus is just a body of text)
    # expected_bytes: used to check that the file is complete and correct
    local_filename = os.path.join('.\\', filename)   # path to the input file
    statinfo = os.stat(local_filename)               # file attributes; stat = statistics
    if statinfo.st_size == expected_bytes:           # does the actual size match? detects a corrupted file
        print('good file', filename)
    else:
        raise Exception('bad file')
    with zipfile.ZipFile(local_filename) as f:       # `with` is a concise way to handle exceptions/cleanup
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()  # decompress into a string, then split into a word list
    return data
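The urllib and gettempdir imports above hint at a download step for the corpus. A minimal sketch of that step, assuming the corpus is the standard text8.zip hosted at http://mattmahoney.net/dc/ (the helper name maybe_download is mine, not from the original):

def maybe_download(filename, expected_bytes, url='http://mattmahoney.net/dc/'):
    """Download filename into the temp directory if it is not already there."""
    local_filename = os.path.join(gettempdir(), filename)
    if not os.path.exists(local_filename):
        local_filename, _ = urllib.request.urlretrieve(url + filename, local_filename)
    statinfo = os.stat(local_filename)
    if statinfo.st_size != expected_bytes:
        raise Exception('Failed to verify ' + local_filename)
    return local_filename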
Inspect the corpus words
vocabulary = zip_file('text8.zip',31344016) # all words of the training corpus (English input, so no word segmentation needed)
print('Data size',len(vocabulary))          # total number of words

vocabulary_size = 50000                     # keep only the 50,000 most frequent words (the dimensionality of the input-layer encoding); rare words carry little signal
                                            # in a real project this cutoff should be chosen from corpus statistics
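As a rough illustration of choosing that cutoff from statistics (my own sketch, not part of the original code), you can measure how much of the corpus a candidate vocabulary size covers:

def coverage(words, n_words):
    """Fraction of corpus tokens covered by the n_words most frequent word types."""
    counts = collections.Counter(words)
    covered = sum(c for _, c in counts.most_common(n_words))
    return covered / len(words)

# e.g. coverage(vocabulary, 50000) tells you what share of the text
# survives when everything outside the top 50,000 words becomes UNK.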
3. Build the input data (batches)
# statistics: structure the raw word list so batch samples can be generated

def build_dataset(words, n_words):  # words: the raw data `vocabulary`; n_words: vocabulary_size
    count = [['UNK', -1]]           # the count list holds word frequencies
    # UNK (unknown) stands for all filtered-out low-frequency words; -1 is a placeholder until they are counted
    count.extend(collections.Counter(words).most_common(n_words - 1))
                                    # extend appends the top (n_words - 1) words after UNK
    dictionary = {}                 # word -> integer code; the more frequent the word, the smaller its code
    i = 0
    for word, _ in count:
        dictionary[word] = i
        i += 1
    data = []                       # stores the encoded original text
    unk_count = 0                   # counts the low-frequency words that were filtered out
    for word in words:              # words: the original text
        index = dictionary.get(word, 0)  # default: return 0 (the UNK code) if the key is absent
        if index == 0:              # only count words that fell back to UNK
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
                                    # swap keys and values (code -> word); zip pairs the two sequences
    return data, count, dictionary, reversed_dictionary
Check the word statistics
data,count,dictionary,reversed_dictionary = build_dataset(vocabulary,vocabulary_size)
print('most common words:',count[:5])       # the five most frequent words
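For intuition, a tiny example of what build_dataset returns (the toy sentence and variable names are mine):

toy_words = 'the cat sat on the mat the cat'.split()
toy_data, toy_count, toy_dict, toy_rev = build_dataset(toy_words, n_words=4)
print(toy_count)   # e.g. [['UNK', 2], ('the', 3), ('cat', 2), ('sat', 1)]
print(toy_dict)    # e.g. {'UNK': 0, 'the': 1, 'cat': 2, 'sat': 3}
print(toy_data)    # the sentence re-encoded with those codes; 'on' and 'mat' both map to UNK (0)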

data_index = 0  # current position in the original text
4. The skip-gram method: generating batch samples
def generate_batch(batch_size,num_skips,skip_windows): # skip_windows: number of context words on each side of the center word
    global data_index
    assert batch_size % num_skips == 0                 # each center word yields num_skips (center, context) samples
    assert num_skips <= 2 * skip_windows

    batch = np.ndarray(shape = (batch_size),dtype = np.int32)     # training inputs, a single row
    labels = np.ndarray(shape = (batch_size,1),dtype = np.int32)  # training labels, a single column
    span = 2 * skip_windows+1                    # length of the sliding window (center word plus context)
    buffer = collections.deque(maxlen = span)    # double-ended queue holding the current window of text
    if (data_index + span > len(data)):
        data_index = 0                           # the training corpus is cycled through
    buffer.extend(data[data_index:data_index + span])

    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_windows]
        # positions of the context around the center word, e.g. [0, 1, 2, 4, 5, 6]
        random.shuffle(context_words)            # random sampling of context positions
        word_to_use = collections.deque(context_words)
        for j in range(num_skips):
            batch[i * num_skips + j] = buffer[skip_windows]  # the center word
            context_word = word_to_use.pop()
            labels[i * num_skips + j,0] = buffer[context_word]
        if data_index == len(data):
            buffer.extend(data[0:span])          # wrap around when the end of the corpus is reached
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1

    data_index = (data_index + len(data) - span) % len(data)
    return batch,labels                         # inputs and labels are stored separately
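A quick sanity check of the sampler, in the spirit of the official word2vec tutorial (the demo call and variable names below are my own):

demo_batch, demo_labels = generate_batch(batch_size=8, num_skips=2, skip_windows=1)
for i in range(8):
    print(demo_batch[i], reversed_dictionary[demo_batch[i]],
          '->', demo_labels[i, 0], reversed_dictionary[demo_labels[i, 0]])
# each line prints one (center word -> context word) training pair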
5. Define the network hyperparameters
batch_size = 128     # a moderate batch size
embedding_size = 300 # number of hidden-layer nodes, i.e. the dimensionality of the word vectors (word2vec)
skip_window = 2
num_skips = 2        # samples per center word; kept small (an empirical value)
num_sampled = 64     # number of negative samples, used by the softmax approximation

valid_size = 16      # size of the validation set
valid_window = 100   # validation words are drawn from the 100 most frequent words (values < 100)
valid_examples = np.random.choice(valid_window,valid_size,replace = False)

graph = tf.Graph()   # intuitively, all the tensors below form a single graph
6. Build the neural network
with graph.as_default():
    train_input = tf.placeholder(tf.int32,shape = [batch_size])    # one row; placeholder that will hold the batch inputs
    train_labels = tf.placeholder(tf.int32, shape=[batch_size,1])  # one column; placeholder for the labels


    with tf.device('/cpu:0'):
        # the embedding matrix holds a vector for every word; random_uniform() initializes it
        # from a uniform distribution; embeddings is the weight matrix of the hidden layer
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

        # look up the sub-matrix for this batch, i.e. the weight rows of the batch samples
        embed = tf.nn.embedding_lookup(embeddings,train_input)

        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
        # truncated_normal is a normal distribution with both tails cut off;
        # stddev = 1.0 / sqrt(embedding_size) is a well-known initialization trick
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))        # the bias vector

    # Build the loss function.
    # NCE: noise-contrastive estimation.
    # It approximates the full softmax (with its averaged cross-entropy loss) so the loss stays cheap to compute.
    loss = tf.reduce_mean(tf.nn.nce_loss(weights = nce_weights,
                                    biases = nce_biases,
                                    labels = train_labels,
                                    inputs = embed,
                                    num_sampled = num_sampled,
                                    num_classes = vocabulary_size))
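    # Side note (my own, not in the original): tf.nn.sampled_softmax_loss takes the same
    # arguments and is another common cheap approximation of the full softmax; swapping it
    # in would look like:
    #   tf.nn.sampled_softmax_loss(weights=nce_weights, biases=nce_biases,
    #                              labels=train_labels, inputs=embed,
    #                              num_sampled=num_sampled, num_classes=vocabulary_size)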


    # Optimizer: gradient descent (1.0 is the learning rate)
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)   # keep the step size in check to avoid oscillation

    # For evaluation: normalize the embeddings, then measure word-to-word distance as the cosine of the angle
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),1,keep_dims = True))
    normalized_embeddings = embeddings/norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,valid_dataset)
    similarity = tf.matmul(valid_embeddings,
                            normalized_embeddings,
                            transpose_b = True)        # inner product of unit vectors = cosine of the angle

    init = tf.global_variables_initializer()           # initialize all global variables
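Why normalizing and then taking a matrix product yields cosine similarity: for unit-length vectors the dot product is exactly the cosine of the angle between them. A small NumPy illustration (not part of the graph):

a = np.array([1.0, 2.0, 2.0])
b = np.array([2.0, 1.0, 2.0])
a_hat = a / np.sqrt(np.sum(a ** 2))   # same normalization as embeddings / norm above
b_hat = b / np.sqrt(np.sum(b ** 2))
print(np.dot(a_hat, b_hat))           # 0.888..., i.e. cos(angle between a and b)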

num_steps = 100000                                     # number of training iterations
7. Training
with tf.Session(graph = graph) as session:
    init.run()

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs,batch_labels = generate_batch(batch_size,num_skips,skip_window)
        feed_dict = {train_input:batch_inputs,train_labels:batch_labels}

        _, loss_val = session.run([optimizer,loss],feed_dict = feed_dict)
        # the ops passed to run() are evaluated in order
        average_loss += loss_val

        if step % 2000 == 0:        # report the average model loss every 2000 steps
            if step > 0:
                average_loss /= 2000
            print('average loss at step', step, ':', average_loss)
            average_loss = 0

        if step % 10000 == 0:       # every 10000 steps, show the nearest neighbours of the validation words
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reversed_dictionary[valid_examples[i]]
                top_k = 8           # number of nearest neighbors
                nearest = (-sim[i,:]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reversed_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)

    final_embeddings = normalized_embeddings.eval()  # the normalized word vectors, the final output
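With training done, final_embeddings is what the news-reading / personalized-search side would build on: encode a query word (or an article, e.g. by averaging its word vectors) and rank candidates by cosine similarity. A hedged sketch with NumPy (the helper name nearest_words and the example query are mine):

def nearest_words(query, k=8):
    """Return the k words whose vectors are closest (by cosine) to the query word."""
    idx = dictionary.get(query, 0)               # fall back to UNK for unknown words
    sims = np.dot(final_embeddings, final_embeddings[idx])  # rows are already unit length
    nearest = (-sims).argsort()[1:k + 1]         # skip position 0, which is the query itself
    return [reversed_dictionary[j] for j in nearest]

print(nearest_words('news'))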