Surprise庫的相關實踐(推薦系統)
阿新 • • 發佈:2019-01-09
from surprise import KNNBasic, SVD
from surprise import Dataset
from surprise import evaluate, print_perf

# http://surprise.readthedocs.io/en/stable/index.html
# http://files.grouplens.org/datasets/movielens/ml-100k-README.txt

# Load the movielens-100k dataset (download it if needed),
# and split it into 3 folds for cross-validation.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

# We'll use the famous KNNBasic algorithm.
algo = KNNBasic()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)
列印結果:
Evaluating RMSE, MAE of algorithm KNNBasic. ------------ Fold 1 Computing the msd similarity matrix... Done computing similarity matrix. RMSE: 0.9867 MAE: 0.7792 ------------ Fold 2 Computing the msd similarity matrix... Done computing similarity matrix. RMSE: 0.9884 MAE: 0.7811 ------------ Fold 3 Computing the msd similarity matrix... Done computing similarity matrix. RMSE: 0.9896 MAE: 0.7826 ------------ ------------ Mean RMSE: 0.9883 Mean MAE : 0.7810 ------------ ------------ Fold 1 Fold 2 Fold 3 Mean RMSE 0.9867 0.9884 0.9896 0.9883 MAE 0.7792 0.7811 0.7826 0.7810
from surprise import GridSearch

# Hyper-parameter grid for SVD: every combination below is evaluated.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}
grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'FCP'])

# 3-fold cross-validation over MovieLens-100k for each combination.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)
grid_search.evaluate(data)
列印結果:
Running grid search for the following parameter combinations: {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4} {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6} {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4} {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6} {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4} {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6} {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4} {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}
Resulsts: {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4} {'RMSE': 0.9969328477745982, 'FCP': 0.683368400695696} ---------- {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6} {'RMSE': 1.0033151605930943, 'FCP': 0.6867249347580507} ---------- {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4} {'RMSE': 0.9734942565850515, 'FCP': 0.6940454873982795} ---------- {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6} {'RMSE': 0.9823131855683238, 'FCP': 0.6944827981040061} ---------- {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4} {'RMSE': 0.977887292257368, 'FCP': 0.6923914815948694} ---------- {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6} {'RMSE': 0.9862324957086702, 'FCP': 0.69290504024308} ---------- {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4} {'RMSE': 0.9636592234524777, 'FCP': 0.6981147216456689} ---------- {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6} {'RMSE': 0.9734215277751971, 'FCP': 0.6982590050003091} ----------
# For each measure, report the best score and the parameter
# combination that produced it (same print order as before:
# RMSE score, RMSE params, FCP score, FCP params).
for measure in ('RMSE', 'FCP'):
    print(grid_search.best_score[measure])
    print(grid_search.best_params[measure])
列印結果:
0.9636592234524777 {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4} 0.6982590050003091 {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}
import pandas as pd

# Collect every tested combination and its scores into a DataFrame
# for side-by-side inspection.
results_df = pd.DataFrame.from_dict(grid_search.cv_results)
results_df
列印結果:
params | scores | |
---|---|---|
0 | {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4} | {'RMSE': 0.9969328477745982, 'FCP': 0.68336840... |
1 | {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6} | {'RMSE': 1.0033151605930943, 'FCP': 0.68672493... |
2 | {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4} | {'RMSE': 0.9734942565850515, 'FCP': 0.69404548... |
3 | {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6} | {'RMSE': 0.9823131855683238, 'FCP': 0.69448279... |
4 | {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4} | {'RMSE': 0.977887292257368, 'FCP': 0.692391481... |
5 | {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6} | {'RMSE': 0.9862324957086702, 'FCP': 0.69290504... |
6 | {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4} | {'RMSE': 0.9636592234524777, 'FCP': 0.69811472... |
7 | {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6} | {'RMSE': 0.9734215277751971, 'FCP': 0.69825900... |
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import os
import io
from surprise import KNNBaseline
from surprise import Dataset
def read_item_names(file_name='u.item'):
    """Build mappings between MovieLens raw item ids and movie titles.

    Parses a MovieLens ``u.item``-style file in which every line is
    ``<raw id>|<title>|...`` (pipe-separated, Latin-1 encoded).

    Args:
        file_name: path to the item file. Defaults to ``'u.item'`` in the
            current directory, matching the original script's behavior.

    Returns:
        A ``(rid_to_name, name_to_rid)`` tuple of dicts mapping
        raw id -> title and title -> raw id.
    """
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            # Strip the trailing newline first so that a line with only
            # two fields does not leave '\n' glued onto the title.
            fields = line.rstrip('\n').split('|')
            rid_to_name[fields[0]] = fields[1]
            name_to_rid[fields[1]] = fields[0]
    return rid_to_name, name_to_rid
# Use the whole MovieLens-100k dataset (no hold-out split) as one trainset.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()  # convert to the standard rating-matrix form

# Item-based neighborhood model with Pearson-baseline similarity.
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.train(trainset)
rid_to_name, name_to_rid = read_item_names()

# First find the movie's raw id from its title.
# NOTE(review): despite the variable name, the movie looked up here is
# 'Now and Then (1995)', not Toy Story.
toy_story_raw_id = name_to_rid['Now and Then (1995)']
toy_story_raw_id
列印結果:'1053'
# Translate the raw id into the inner id used inside the rating matrix.
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
toy_story_inner_id
列印結果:
961
# Retrieve the 5 items closest to this movie under the trained similarity.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=5)
toy_story_neighbors
列印結果:
[291, 82, 366, 528, 179]
# Map the inner ids back to raw movie ids, then raw ids to titles.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print()
# Fixed message: k=5 neighbors were requested, and the queried movie is
# 'Now and Then (1995)' (the original said "10 nearest ... Toy Story").
print('The 5 nearest neighbors of Now and Then (1995) are:')
for movie in toy_story_neighbors:
    print(movie)
列印結果:
The 10 nearest neighbors of Toy Story are: While You Were Sleeping (1995) Batman (1989) Dave (1993) Mrs. Doubtfire (1993) Groundhog Day (1993)