Surprise——Python推薦系統庫
阿新 • • 發佈:2018-11-17
轉載自:https://blog.csdn.net/mycafe_/article/details/79146764
Surprise
在推薦系統的建模過程中,我們將用到python庫 Surprise(Simple Python RecommendatIon System Engine),是scikit系列中的一個(很多同學用過scikit-learn和scikit-image等庫)。Surprise的User Guide有詳細的解釋和說明
簡單易用,同時支援多種推薦演算法:
- 基礎演算法/baseline algorithms
- 基於近鄰方法(協同過濾)/neighborhood methods
- 矩陣分解方法/matrix factorization-based (SVD, PMF, SVD++, NMF)
演算法類名 | 說明 |
---|---|
random_pred.NormalPredictor | Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal. |
baseline_only.BaselineOnly | Algorithm predicting the baseline estimate for given user and item. |
knns.KNNBasic | A basic collaborative filtering algorithm. |
knns.KNNWithMeans | A basic collaborative filtering algorithm, taking into account the mean ratings of each user. |
knns.KNNBaseline | A basic collaborative filtering algorithm taking into account a baseline rating. |
matrix_factorization.SVD | The famous SVD algorithm, as popularized by Simon Funk during the Netflix Prize. |
matrix_factorization.SVDpp | The SVD++ algorithm, an extension of SVD taking into account implicit ratings. |
matrix_factorization.NMF | A collaborative filtering algorithm based on Non-negative Matrix Factorization. |
slope_one.SlopeOne | A simple yet accurate collaborative filtering algorithm. |
co_clustering.CoClustering | A collaborative filtering algorithm based on co-clustering. |
其中基於近鄰的方法(協同過濾)可以設定不同的度量準則。
相似度度量標準 | 度量標準說明 |
---|---|
cosine | Compute the cosine similarity between all pairs of users (or items). |
msd | Compute the Mean Squared Difference similarity between all pairs of users (or items). |
pearson | Compute the Pearson correlation coefficient between all pairs of users (or items). |
pearson_baseline | Compute the (shrunk) Pearson correlation coefficient between all pairs of users (or items) using baselines for centering instead of means. |
支援不同的評估準則
評估準則 | 準則說明 |
---|---|
rmse | Compute RMSE (Root Mean Squared Error). |
mae | Compute MAE (Mean Absolute Error). |
fcp | Compute FCP (Fraction of Concordant Pairs). |
使用示例
基本使用方法如下
# Any of the recommendation algorithms mentioned above can be used here
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf
# NOTE(review): evaluate/print_perf and Dataset.split belong to the
# pre-1.1 Surprise API (replaced by model_selection.cross_validate) —
# confirm the installed version before running.
# Load the built-in MovieLens dataset (prompts to download ml-100k,
# one of the classic public recommender-system datasets)
data = Dataset.load_builtin('ml-100k')
# k-fold cross-validation (k=3)
data.split(n_folds=3)
# Try SVD matrix factorization
algo = SVD()
# Evaluate the model on the dataset
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
# Print the results
print_perf(perf)
載入自己的資料集方法
# Load a custom dataset from a local file.
# FIX(review): the original snippet used os, Reader and Dataset without
# importing them anywhere in the snippet; imports added so it runs standalone.
import os
from surprise import Reader, Dataset

# Path of the data file
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
# Tell the reader what the text format looks like
reader = Reader(line_format='user item rating timestamp', sep='\t')
# Load the data
data = Dataset.load_from_file(file_path, reader=reader)
# Split into 5 folds by hand (for cross-validation; pre-1.1 Surprise API)
data.split(n_folds=5)
演算法調參(讓推薦系統有更好的效果)
這裡實現的演算法所用的優化方法無外乎是SGD等,因此也有一些超引數會影響最後的結果,我們同樣可以用sklearn中常用到的網格搜尋交叉驗證(GridSearchCV)來選擇最優的引數。簡單的例子如下所示:
# Hyper-parameter tuning with grid-search cross-validation.
# FIX(review): the original snippet used GridSearch/SVD/Dataset without
# imports; added so it runs standalone. GridSearch is the pre-1.1 Surprise
# API (replaced by model_selection.GridSearchCV).
from surprise import SVD, Dataset, GridSearch

# Parameter grid to search over
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
# Grid search with cross-validation
grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'FCP'])
# Find the best parameters on the dataset
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)
grid_search.evaluate(data)
# Report the tuned parameter sets
# Best RMSE score
print(grid_search.best_score['RMSE'])
# >>> 0.96117566386
# Parameters that gave the best RMSE
print(grid_search.best_params['RMSE'])
# >>> {'reg_all': 0.4, 'lr_all': 0.005, 'n_epochs': 10}
# Best FCP score
print(grid_search.best_score['FCP'])
# >>> 0.702279736531
# Parameters that gave the best FCP
print(grid_search.best_params['FCP'])
# >>> {'reg_all': 0.6, 'lr_all': 0.005, 'n_epochs': 10}
在自己的資料集上訓練模型
首先載入資料
# Load a custom (music playlist) dataset from a local text file.
import os
from surprise import Reader, Dataset
# Path of the data file
# NOTE(review): 'suprise' in the filename looks like a typo for 'surprise',
# but it must match the actual file on disk — left untouched.
file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# Describe the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# Read the ratings from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# Split into 5 folds (pre-1.1 Surprise API)
music_data.split(n_folds=5)
使用不同的推薦系統演算法進行建模比較
### Compare several recommender algorithms on the same dataset.
# FIX(review): the original ran eight copy-pasted evaluate() stanzas and
# called print_perf at the end without ever importing it — NameError.
# One import line and one loop reproduce the same sequence of calls:
# evaluate() itself reports per-fold RMSE/MAE for every algorithm, and
# print_perf is applied to the last result (NMF), as before.
from surprise import (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                      KNNBaseline, SVD, SVDpp, NMF, evaluate, print_perf)

# NormalPredictor, BaselineOnly, basic/means/baseline CF, SVD, SVD++, NMF —
# evaluated in the same order as the original snippets.
for algo_cls in (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                 KNNBaseline, SVD, SVDpp, NMF):
    algo = algo_cls()
    perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])
# Print the metrics of the last evaluation, exactly as the original did
print_perf(perf)
建模和儲存模型
1.用協同過濾構建模型並進行預測
1.1 movielens的例子
# Quick MovieLens demo: 3-fold cross-validation of SVD.
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf

# Grab the bundled MovieLens ml-100k ratings.
data = Dataset.load_builtin('ml-100k')
# Carve the ratings into three folds for cross-validation.
data.split(n_folds=3)
# Funk-style SVD matrix factorization.
algo = SVD()
# Score the model on every fold (RMSE and MAE).
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
# Show the per-fold and mean metrics.
print_perf(perf)
"""
以下的程式段告訴大家如何在協同過濾演算法建模以後,根據一個item取回相似度最高的item,主要是用到algo.get_neighbors()這個函式
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import os
import io
from surprise import KNNBaseline
from surprise import Dataset
def read_item_names(file_name=None):
    """Build the two-way mapping between MovieLens item ids and titles.

    Parses a ``u.item``-style file: '|'-separated, raw item id in column 0,
    title in column 1, latin-1 ('ISO-8859-1') encoded.

    FIX(review): the pasted original had its entire body flush-left
    (a SyntaxError); indentation restored. The hard-coded path is now the
    default of an optional parameter — backward compatible.

    Args:
        file_name: path to the item file; defaults to the ml-100k
            ``u.item`` under ``~/.surprise_data`` (the original location).

    Returns:
        (rid_to_name, name_to_rid): dicts mapping raw item id -> title
        and title -> raw item id.
    """
    if file_name is None:
        file_name = (os.path.expanduser('~') +
                     '/.surprise_data/ml-100k/ml-100k/u.item')
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]
    return rid_to_name, name_to_rid
# First, fit the algorithm to compute item-item similarities
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
# Item-based (user_based=False) shrunk Pearson-baseline similarity
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
# NOTE(review): train() is the pre-1.1 Surprise API (renamed fit()) —
# confirm the installed version.
algo.train(trainset)
# Build the movie-name <-> movie-id mappings
rid_to_name, name_to_rid = read_item_names()
# Retrieve the inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)
print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)
1.2 音樂預測的例子
from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import io
from surprise import KNNBaseline
from surprise import Dataset
import cPickle as pickle
# 重建歌單id到歌單名的對映字典
id_name_dic = pickle.load(open("popular_playlist.pkl","rb"))
print("載入歌單id到歌單名的對映字典完成...")
# 重建歌單名到歌單id的對映字典
name_id_dic = {}
for playlist_id in id_name_dic:
name_id_dic[id_name_dic[playlist_id]] = playlist_id
print("載入歌單名到歌單id的對映字典完成...")
file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# 指定檔案格式
reader = Reader(line_format='user item rating timestamp', sep=',')
# 從檔案讀取資料
music_data = Dataset.load_from_file(file_path, reader=reader)
# 計算歌曲和歌曲之間的相似度
print("構建資料集...")
trainset = music_data.build_full_trainset()
#sim_options = {'name': 'pearson_baseline', 'user_based': False}
- current_playlist => 歌單名
- playlist_id => 歌單id(網易給的歌單id)
- playlist_inner_id => 內部id(對所有歌單id重新從1開始編碼)
print("開始訓練模型...")
#sim_options = {'user_based': False}
#algo = KNNBaseline(sim_options=sim_options)
algo = KNNBaseline()
algo.train(trainset)
current_playlist = name_id_dic.keys()[39]
print(current_playlist)
# 取出近鄰
playlist_id = name_id_dic[current_playlist]
print(playlist_id)
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print(playlist_inner_id)
playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)
# 把歌曲id轉成歌曲名字
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id)
for inner_id in playlist_neighbors)
playlist_neighbors = (id_name_dic[playlist_id]
for playlist_id in playlist_neighbors)
print()
print("和歌單 《", current_playlist, "》 最接近的10個歌單為:\n")
for playlist in playlist_neighbors:
print(playlist)
2.用SVD矩陣分解進行預測
### 使用SVD++
### Use SVD++
# FIX(review): os and Reader were used without imports in the original
# snippet; imports added so it runs standalone.
import os
from surprise import SVDpp, evaluate
from surprise import Dataset, Reader

file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# Describe the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# Read the ratings from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# Build the full trainset and fit the model
algo = SVDpp()
trainset = music_data.build_full_trainset()
# NOTE(review): train() is the pre-1.1 Surprise API (renamed fit())
algo.train(trainset)