Recommendation Systems 3-1
Train a word2vec skip-gram model on the text8 data from http://mattmahoney.net/dc/textdata
1. Import the libraries
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

print('check:libs well prepared')
2. Download the dataset and unzip it
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    # Skip the download if the file already exists locally
    if not os.path.exists(filename):
        print('download...')
        filename, _ = urlretrieve(url + filename, filename)
    # Verify the file size
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print('exception %s' % statinfo.st_size)
    return filename

filename = maybe_download('text8.zip', 31344016)
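The heading above mentions unzipping, but the snippet only downloads the archive, and the `words` list used in the next step is never built. A minimal sketch of the missing read step (the helper name `read_data` is my own choice) could be:

def read_data(filename):
    # Read the single file inside the zip archive and split it into a list of words
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data(filename)
print('Data size %d' % len(words))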
3. Encode the words and replace low-frequency words with UNK
vocabulary_size = 50000

def build_dataset(words):
    count = [['UNK', -1]]
    # Occurrence count of each word; keep the vocabulary_size - 1 most frequent ones
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    # Mapping from word to integer id
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    # Mapping from integer id back to word
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

# Training data after the mapping
data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('original data', words[:10])
print('training data', data[:10])
Output:
Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
original data ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
training data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]
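As a quick sanity check of the two mappings (ids follow frequency order, so UNK is 0 and 'the', the most frequent real word, is 1):

print(dictionary['the'])            # 1, the most frequent real word
print(reverse_dictionary[0])        # 'UNK'
print(reverse_dictionary[data[0]])  # 'anarchism', the first word of the corpus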
4. Generate skip-gram training data
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    # batch holds the input (center) words, labels the context words to predict
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window context | center word | skip_window context ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        # Wrap around so the data can be reused cyclically
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # the center word sits in the middle of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Sample a context position not yet used for this center word
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
print('data:', [reverse_dictionary[di] for di in data[:8]])
data_index = 0
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=2)
print(' batch:', [reverse_dictionary[bi] for bi in batch])
print(' labels:', [reverse_dictionary[li] for li in labels.reshape(8)])
Output:
data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']
batch: ['as', 'as', 'a', 'a', 'term', 'term', 'of', 'of']
labels: ['originated', 'anarchism', 'of', 'term', 'as', 'a', 'abuse', 'first']
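With skip_window=2 and num_skips=2, each center word is paired with 2 of its 4 surrounding words, which matches the output above ('as' is paired with 'originated' and 'anarchism', and so on). Raising num_skips reuses each center word more often; for example (the sampled pairs are random, so the exact labels will vary):

data_index = 0
batch, labels = generate_batch(batch_size=8, num_skips=4, skip_window=2)
# Now every center word appears 4 times in batch, once per sampled context word
print(' batch:', [reverse_dictionary[bi] for bi in batch])
print(' labels:', [reverse_dictionary[li] for li in labels.reshape(8)])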
5. Define the network structure
batch_size = 128
embedding_size = 128  # dimension of the embedding vectors
skip_window = 1       # how many words to consider on each side of the center word
num_skips = 2         # how many (center, context) pairs to generate per center word
valid_size = 16       # number of random words used to evaluate similarity
valid_window = 100    # validation words are drawn from the 100 most frequent words
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64      # number of negative examples for sampled softmax
graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):
    # Input data
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Variables
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Embeddings of the current training batch
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)

    # Batch loss with sampled softmax
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases,
                                   inputs=embed, labels=train_labels,
                                   num_sampled=num_sampled, num_classes=vocabulary_size))

    # Minimize the loss and update the parameters
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Normalize the embeddings
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm

    # Use the current embeddings to find words similar to the validation words
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
6. Run the training process
num_steps = 100000
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    average_loss = 0
    for step in range(num_steps + 1):
        batch_data, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        # Print the average loss every 2000 steps
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # Print the nearest words to the validation words every 10000 steps
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 5  # the 5 most similar words
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    final_embeddings = normalized_embeddings.eval()
Sample output:
Average loss at step 0: 8.125039
Nearest to their: liliuokalani, kobe, aeolian, judeo, gutman,
Nearest to state: emulates, matching, heritage, coder, rebounding,
Nearest to the: super, represent, whitacre, swine, clothing,
Nearest to system: populace, harshness, bungee, pounds, nist,
Nearest to between: infimum, macedonians, abyss, ziegler, lorica,
Nearest to such: wrath, comecon, ignite, winfield, revolution,
Nearest to up: coexist, breads, applesoft, azores, dogs,
Nearest to this: apart, vorarlberg, par, jardines, syntax,
Nearest to if: knowles, hindi, defeated, biochemical, lonergan,
Nearest to from: usp, martov, hormonal, pd, clouds,
Nearest to s: mediating, bit, challenges, lys, lavos,
Nearest to however: eps, lambdamoo, eternally, zanetti, cavers,
Nearest to over: yardbirds, duct, mayer, breaks, plagues,
......
Nearest to between: with, within, among, through, against,
Nearest to such: well, intelligent, known, certain, these,
Nearest to up: out, off, down, back, them,
Nearest to this: which, it, another, itself, some,
Nearest to if: when, though, where, before, because,
Nearest to from: through, into, protestors, muir, in,
Nearest to s: whose, his, isbn, my, dedicates,
Nearest to however: but, although, that, though, especially,
Nearest to over: overshadowed, around, between, through, within,
Nearest to also: often, still, now, never, sometimes,
Nearest to used: designed, referred, seen, considered, known,
Nearest to on: upon, in, through, under, bathroom,
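After training, final_embeddings holds one unit-length vector per vocabulary word, so cosine similarity reduces to a dot product. A small sketch for querying the nearest neighbors of an arbitrary word (the query 'france' is only an illustration and is assumed to be in the vocabulary):

def nearest_words(query, top_k=5):
    # Dot products against all normalized embeddings = cosine similarities
    query_vec = final_embeddings[dictionary[query]]
    sims = np.dot(final_embeddings, query_vec)
    # Skip the first entry of the sorted list, which is the query word itself
    nearest = (-sims).argsort()[1:top_k + 1]
    return [reverse_dictionary[idx] for idx in nearest]

print(nearest_words('france'))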
7. Visualization
num_points = 400
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])
def plot(embeddings, labels):
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(20, 20))  # in inches
    for i, label in enumerate(labels):
        x, y = embeddings[i, :]
        pylab.scatter(x, y)
        pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                       ha='right', va='bottom')
    pylab.show()
words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)
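If the embeddings are meant to be reused later, for example by the rest of this recommendation series, it may be worth persisting them together with the vocabulary. A minimal sketch, with file names chosen arbitrarily:

# Persist the learned embeddings and the id-to-word mapping for later reuse
np.save('text8_embeddings.npy', final_embeddings)
with open('text8_vocab.txt', 'w') as f:
    for i in range(vocabulary_size):
        f.write(reverse_dictionary[i] + '\n')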