1. 程式人生 > 其它 >論文復現——AutoRec: Autoencoders Meet Collaborative Filtering

論文復現——AutoRec: Autoencoders Meet Collaborative Filtering

《AutoRec: Autoencoders Meet Collaborative Filtering》是2015年Suvash等人發表在“The Web Conference”會議上的一篇論文,作者提出用自編碼器預測使用者對電影的評分。論文比較短,只有兩頁,可以說是深度學習在推薦系統領域應用的開端。

ABSTRACT

本文提出了一個新穎的基於自編碼器的協同過濾框架——AutoRec。實驗表明,AutoRec在Movielens資料集上的表現優於目前最好的方法(矩陣分解、受限玻爾茲曼機、LLORMA)。

THE AUTOREC MODEL

假設有\(m\)個使用者,\(n\)個商品,並且有使用者對商品的評分矩陣\(R\in \mathbb{R}^{m\times n}\)

,則使用者\(u\)對所有商品的評分可以用不完全的向量\(r^{(u)}={R_{u1},...,R_{u2}}\)表示(不完全意思是,\(r^{(u)}\)中的元素有的是真實的評分資料,有的是需要我們預測的)。自編碼器的作用就是將\(r^{(u)}\)作為輸入資料,經過編碼器將向量對映維一個低維的向量,然後通過解碼器重構輸出向量,使輸出向量趨近於輸入向量,同時能夠補全原始輸入向量中的缺失值。自編碼器模型可以表示為:

\[min\sum_{r\in S}^{}\left \| r-h(r;\theta ) \right \|^{2}_{2} \]

程式碼復現

完整程式碼及資料集已上傳至github


import os
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.utils.data as Data
import matplotlib.pyplot as plt

os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'


col_name = ["userid", "movieid", "rating", "timestrap"]
u1_base_path = "data/u1.base"
u1_base = pd.read_table(u1_base_path, sep='\t', header=None, names=col_name)
# print(u1_base.head(5))

u1_test_path = "data/u1.test"
u1_test = pd.read_table(u1_test_path, sep='\t', header=None, names=col_name)
# print(u1_test.head(5))


# 將資料轉換為 user-item 互動矩陣
def TranslateData(data):
    user_num = data.userid.nunique()  # 使用者的個數
    movie_num = 1682  # 電影個數(資料中標明的所有電影數)
    data_mat = np.zeros(user_num * movie_num).reshape((-1, movie_num)) + 3

    k = 0
    for i in range(data.shape[0]):
        data_mat[k][data.iloc[i, 1] - 1] = data.iloc[i, 2]
        if i > 0 and data.iloc[i, 0] != data.iloc[i - 1, 0]:
            k += 1

    return data_mat


class AutoRec(nn.Module):
    def __init__(self, input_num, hidden_num):
        super(AutoRec, self).__init__()
        self.input_num = input_num
        self.hidden_num = hidden_num
        self.encoder = nn.Linear(self.input_num, self.hidden_num, bias=True)
        self.relu = nn.ReLU()
        self.decoder = nn.Linear(self.hidden_num, self.input_num, bias=True)

    def forward(self, x):
        hidden = self.encoder(x)
        hidden = self.relu(hidden)
        out = self.decoder(hidden)

        return out


def GetData(data_mat):
    dataset = Data.TensorDataset(torch.tensor(data_mat, dtype=torch.float32),
                                 torch.zeros(data_mat.shape[0], 1).view(-1, 1))

    loader = Data.DataLoader(
        dataset=dataset,
        batch_size=64,
        shuffle=False
    )

    return loader


epochs = 100
input_num, hidden_num = 1682, 200
model = AutoRec(input_num, hidden_num)
learning_rate = 0.0003
optimizer = torch.optim.Adam([
        {'params': (p for name, p in model.named_parameters() if 'bias' not in name)},
        {'params': (p for name, p in model.named_parameters() if 'bias' in name), 'weight_decay': 0.}
    ], lr=learning_rate, weight_decay=0.001)
loss_func = torch.nn.MSELoss()
loss_train_set = []
loss_test_set = []


def run():
    train()
    draw(loss_train_set)


def train():
    train_data_mat = TranslateData(u1_base)
    r = train_data_mat[0]
    train_loader = GetData(train_data_mat)

    for epoch in range(epochs):
        rmse_loss = 0
        for step, (X, y) in enumerate(train_loader):
            out = model(X)
            rmse_loss = torch.sqrt(loss_func(out, X))
            rmse_loss.backward()
            optimizer.step()
        loss_train_set.append(rmse_loss)

        if epoch % 100 == 0:
            print("epoch %d" % (epoch + 1))
    test()


def test():
    test_data_mat = TranslateData(u1_test)
    test_loader = GetData(test_data_mat)
    with torch.no_grad():
        rmse_loss = 0
        for step, (X, y) in enumerate(test_loader):
            out = model(X)
            rmse_loss += torch.sqrt(loss_func(out, X))
        print("test_loss: %f" % (rmse_loss / test_data_mat.shape[0]))


def draw(loss_train_set):
    x = [i for i in range(len(loss_train_set))]
    plt.plot(x, loss_train_set, label="Training loss")
    plt.xlabel("epochs")
    plt.ylabel("rmse")
    plt.legend()
    plt.show()


if __name__ == "__main__":
    run()