
Factorization Machines: Introduction and Code Implementation

Introduction

Factorization Machines (FM) combine the strengths of SVMs with those of factorization models: even on highly sparse data they can still estimate interactions between features, and the model can be computed and optimized in linear time.

Advantages

  1. Parameters can be estimated reliably even from very sparse data.

  2. The FM model equation can be computed in linear time.

  3. FM is a general predictor: it works with any real-valued feature vector.

Feature vector example

[Figure: example feature vectors built from user, movie, other-movie-rating, time and last-movie-rated indicators, with the rating as target]

Algorithm principles

  1. Model equation: the FM model combines a linear part (bias plus per-feature weights) with factorized pairwise interactions; the equation is written out below.
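For reference, the degree-2 FM model equation (as given in Rendle's original paper) is

\hat{y}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n} \sum_{j=i+1}^{n} \langle v_i, v_j \rangle \, x_i x_j,
\qquad
\langle v_i, v_j \rangle = \sum_{f=1}^{k} v_{i,f} \, v_{j,f}

where w_0 is the global bias, w_i the weight of feature i, and v_i the k-dimensional factor vector of feature i.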
  2. Expressiveness:

For any symmetric positive semi-definite matrix W there exists a matrix V such that W = V·V^T, provided the number of columns k of V is chosen large enough; in other words, with a suitable k the FM can express any such interaction matrix W. When the data is very sparse, however, there is not enough information to estimate a full W directly, so a small k is chosen instead and W is approximated through W = V·V^T.
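As a quick numerical illustration (a minimal NumPy sketch, not part of the original post): a symmetric positive semi-definite W factors exactly as V·V^T when k equals the dimension of W, e.g. via its eigendecomposition, while a small k only approximates it.

import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(6, 6))
W = A @ A.T                              # a symmetric positive semi-definite "interaction" matrix

# Eigendecomposition W = U diag(lam) U^T, so V = U * sqrt(lam) gives W = V V^T exactly
lam, U = np.linalg.eigh(W)
V = U * np.sqrt(np.clip(lam, 0, None))
print(np.allclose(W, V @ V.T))           # True: full-rank V reproduces W

# With a small k (few columns of V) we only approximate W -- this is the FM setting under sparsity
k = 2
V_k = U[:, -k:] * np.sqrt(lam[-k:])      # keep the k largest eigen-directions
print(np.linalg.norm(W - V_k @ V_k.T))   # approximation error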

  3. Parameter Estimation Under Sparsity:

Because FM factorizes the interaction parameters, it breaks their mutual independence: the factor vectors are shared across interactions, so the data observed for one interaction also helps estimate the parameters of related interactions.

  4. Computation:
    Evaluated naively, the pairwise interaction term of the model equation has time complexity O(k·n²), since every pair of features has to be considered.

However, the pairwise interactions can be rewritten so that they are computable in linear time, O(k·n); the reformulated formula and the first step of its derivation are given below.
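Written out, the linear-time reformulation (implemented in both code listings below) is

\sum_{i=1}^{n} \sum_{j=i+1}^{n} \langle v_i, v_j \rangle x_i x_j
= \frac{1}{2} \sum_{f=1}^{k} \left[ \Big( \sum_{i=1}^{n} v_{i,f} x_i \Big)^2 - \sum_{i=1}^{n} v_{i,f}^2 x_i^2 \right]

The first step of the derivation uses the identity (\sum_i a_i)^2 = \sum_i \sum_j a_i a_j with a_i = v_{i,f} x_i: the full double sum counts every i < j pair twice plus the diagonal terms, so subtracting the diagonal and halving leaves exactly the pairs with i < j. Each bracket is a sum of n terms evaluated for each of the k factors, hence the O(k·n) complexity.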

FM as Predictor

  • Regression

  • Binary classification

  • Ranking

In all of these settings, L2 regularization can be applied to the model parameters to prevent overfitting; a small sketch of the classification variant follows.
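As an example (a minimal sketch, not part of the original code), the FM output can be turned into a binary classifier in the TensorFlow 1.x style used below by replacing the squared error with a sigmoid log loss; this assumes the y_hat, y and l2_norm tensors defined in the code further down and labels in {0, 1}.

# Hypothetical classification head on top of the FM output y_hat (not in the original post)
logits = y_hat                                   # raw FM score
cls_loss = tf.add(
    tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)),
    l2_norm)                                     # same L2 penalty as in the regression version
prob = tf.sigmoid(logits)                        # P(y = 1 | x)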

Learning FM

The model parameters are learned with (stochastic) gradient descent; the gradient of the model equation with respect to each parameter is given below, and in the code the optimizer computes it automatically.
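For completeness, the partial derivatives of the model equation (as given in the FM paper) are

\frac{\partial \hat{y}(x)}{\partial \theta} =
\begin{cases}
1 & \text{if } \theta \text{ is } w_0 \\
x_i & \text{if } \theta \text{ is } w_i \\
x_i \sum_{j=1}^{n} v_{j,f} x_j - v_{i,f} x_i^2 & \text{if } \theta \text{ is } v_{i,f}
\end{cases}

The sum \sum_j v_{j,f} x_j does not depend on i, so it can be precomputed once per factor f; in the TensorFlow code below these gradients are not written by hand but obtained automatically by the optimizer.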

Code implementation

Toy-data version (TensorFlow 1.x):


import numpy as np
import tensorflow as tf  # TensorFlow 1.x API (tf.placeholder / tf.Session)

x_data = np.matrix([
    # Users | Movies | Movie Ratings | Time | Last Movies Rated
    # A B C | TI NH SW ST | TI NH SW ST |    | TI NH SW ST
    [1, 0, 0, 1, 0, 0, 0, 0.3, 0.3, 0.3, 0, 13, 0, 0, 0, 0],
    [1, 0, 0, 0, 1, 0, 0, 0.3, 0.3, 0.3, 0, 14, 1, 0, 0, 0],
    [1, 0, 0, 0, 0, 1, 0, 0.3, 0.3, 0.3, 0, 16, 0, 1, 0, 0],
    [0, 1, 0, 0, 0, 1, 0, 0, 0, 0.5, 0.5, 5, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0, 1, 0, 0, 0.5, 0.5, 8, 0, 0, 1, 0],
    [0, 0, 1, 1, 0, 0, 0, 0.5, 0, 0.5, 0, 9, 0, 0, 0, 0],
    [0, 0, 1, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 12, 1, 0, 0, 0]
])

# ratings
y_data = np.array([5, 3, 1, 4, 5, 1, 5])

# Let's add an axis to make tensorflow happy.
y_data.shape += (1, )

n, p = x_data.shape

# number of latent factors
k = 5

# design matrix
X = tf.placeholder('float32', [n, p])
# target vector
y = tf.placeholder('float32', [n, 1])

# bias and weights
w0 = tf.Variable(tf.zeros([1]))
W = tf.Variable(tf.zeros([p]))

# interaction factors, randomly initialized
V = tf.Variable(tf.random_normal([k, p], stddev=0.01))

# estimate of y, initialized to 0.
y_hat = tf.Variable(tf.zeros([n, 1]))

# linear part: w0 + sum_i W_i * x_i
linear_terms = tf.add(w0,
                      tf.reduce_sum(
                          tf.multiply(W, X), 1, keepdims=True))

# pairwise interactions via the O(k*n) reformulation:
# 0.5 * sum_f [ (X V^T)^2 - (X^2)(V^2)^T ]
interactions = (tf.multiply(0.5,
                            tf.reduce_sum(
                                tf.subtract(
                                    tf.pow(tf.matmul(X, tf.transpose(V)), 2),
                                    tf.matmul(tf.pow(X, 2), tf.transpose(tf.pow(V, 2)))),
                                1, keepdims=True)))

y_hat = tf.add(linear_terms, interactions)

# L2 regularized sum of squares loss function over W and V
lambda_w = tf.constant(0.001, name='lambda_w')
lambda_v = tf.constant(0.001, name='lambda_v')

l2_norm = (tf.reduce_sum(
    tf.add(
        tf.multiply(lambda_w, tf.pow(W, 2)),
        tf.multiply(lambda_v, tf.pow(V, 2)))))

error = tf.reduce_mean(tf.square(tf.subtract(y, y_hat)))
loss = tf.add(error, l2_norm)

eta = tf.constant(0.1)
optimizer = tf.train.AdagradOptimizer(eta).minimize(loss)

# that's a lot of iterations
N_EPOCHS = 1000

# Launch the graph.
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(N_EPOCHS):
        # indices = np.arange(n)
        # np.random.shuffle(indices)
        # x_data, y_data = x_data[indices], y_data[indices]
        sess.run(optimizer, feed_dict={X: x_data, y: y_data})

    print('MSE: ', sess.run(error, feed_dict={X: x_data, y: y_data}))
    print('Loss (regularized error):', sess.run(loss, feed_dict={X: x_data, y: y_data}))
    print('Predictions:', sess.run(y_hat, feed_dict={X: x_data, y: y_data}))
    print('Learnt weights:', sess.run(W, feed_dict={X: x_data, y: y_data}))
    print('Learnt factors:', sess.run(V, feed_dict={X: x_data, y: y_data}))

Full-data version: uses the ua.base / ua.test splits of the MovieLens 100K dataset.


from scipy.sparse import csr
import pandas as pd
import numpy as np
import tensorflow as tf  # TensorFlow 1.x API (tf.placeholder / tf.Session)


def vectorize_dic(dic, ix=None, p=None, n=0, g=0):
    """
    dic -- dictionary of feature lists. Keys are the name of features
    ix -- index generator (default None)
    p -- dimension of feature space (number of columns in the sparse matrix) (default None)
    n -- number of rows of the resulting sparse matrix
    g -- number of feature groups (here 2: users and items)
    """
    if ix is None:
        ix = dict()

    nz = n * g
    col_ix = np.empty(nz, dtype=int)

    i = 0
    numofUsers = 0
    flag = True
    for k, lis in dic.items():
        for t in range(len(lis)):
            if k == 'users':
                ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), lis[t] - 1)
            elif k == 'items':
                if flag == True:
                    numofUsers = len(ix)
                    flag = False
                ix[str(lis[t]) + str(k)] = lis[t] - 1 + numofUsers
            col_ix[i + t * g] = ix[str(lis[t]) + str(k)]
        i += 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)
    if p is None:
        p = len(ix)

    ixx = np.where(col_ix < p)
    # data, row_ind and col_ind satisfy the relationship a[row_ind[k], col_ind[k]] = data[k].
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix


def batcher(X_, y_=None, batch_size=-1):
    n_samples = X_.shape[0]
    if batch_size == -1:
        batch_size = n_samples
    if batch_size < 1:
        raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))

    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X_[i:upper_bound]
        if y_ is not None:
            ret_y = y_[i:upper_bound]
            yield (ret_x, ret_y)


cols = ['user', 'item', 'rating', 'timestamp']
train = pd.read_csv('ua.base', delimiter='\t', names=cols)
test = pd.read_csv('ua.test', delimiter='\t', names=cols)

x_train, ix = vectorize_dic({'users': train['user'].values,
                             'items': train['item'].values}, n=len(train.index), g=2)
x_test, ix = vectorize_dic({'users': test['user'].values,
                            'items': test['item'].values}, ix, x_train.shape[1], n=len(test.index), g=2)

y_train = train['rating'].values
y_test = test['rating'].values

x_train = x_train.todense()
x_test = x_test.todense()

n, p = x_train.shape

# number of latent factors
k = 10

x = tf.placeholder('float', [None, p])
y = tf.placeholder('float', [None, 1])

# bias, linear weights and interaction factors
w0 = tf.Variable(tf.zeros([1]))
w = tf.Variable(tf.zeros([p]))
v = tf.Variable(tf.random_normal([k, p], mean=0, stddev=0.01))

# y_hat = tf.Variable(tf.zeros([n,1]))

# linear part, shape n * 1
linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(w, x), 1, keepdims=True))

# pairwise interactions via the O(k*n) reformulation
pair_interactions = 0.5 * tf.reduce_sum(
    tf.subtract(
        tf.pow(
            tf.matmul(x, tf.transpose(v)), 2),
        tf.matmul(tf.pow(x, 2), tf.transpose(tf.pow(v, 2)))
    ), axis=1, keepdims=True)

y_hat = tf.add(linear_terms, pair_interactions)

lambda_w = tf.constant(0.001, name='lambda_w')
lambda_v = tf.constant(0.001, name='lambda_v')

l2_norm = tf.reduce_sum(
    tf.add(
        tf.multiply(lambda_w, tf.pow(w, 2)),
        tf.multiply(lambda_v, tf.pow(v, 2))
    )
)

error = tf.reduce_mean(tf.square(y - y_hat))
loss = tf.add(error, l2_norm)

train_op = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)

epochs = 10
batch_size = 1000

# Launch the graph
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(epochs):
        perm = np.random.permutation(x_train.shape[0])
        # iterate over batches
        for bX, bY in batcher(x_train[perm], y_train[perm], batch_size):
            _, t = sess.run([train_op, loss], feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)})
            print(t)

    print('MSE: ', sess.run(error, feed_dict={x: x_test.reshape(-1, p), y: y_test.reshape(-1, 1)}))
    print('Predictions:', sess.run(y_hat, feed_dict={x: x_test.reshape(-1, p), y: y_test.reshape(-1, 1)}))