
Hands-on with the Surprise Library (Recommender Systems)

from surprise import KNNBasic,SVD
from surprise import Dataset
from surprise import evaluate, print_perf
# http://surprise.readthedocs.io/en/stable/index.html
# http://files.grouplens.org/datasets/movielens/ml-100k-README.txt

# Load the movielens-100k dataset (download it if needed),
# and split it into 3 folds for cross-validation.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

# We'll use the famous KNNBasic algorithm.
algo = KNNBasic()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

print_perf(perf)

Output:

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9867
MAE:  0.7792
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9884
MAE:  0.7811
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9896
MAE:  0.7826
------------
------------
Mean RMSE: 0.9883
Mean MAE : 0.7810
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.9867  0.9884  0.9896  0.9883  
MAE     0.7792  0.7811  0.7826  0.7810  
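
Note that evaluate, print_perf and Dataset.split used above belong to the pre-1.1 Surprise API and were removed in later releases. A minimal sketch of the same 3-fold evaluation with the current model_selection API (assuming Surprise >= 1.1; cross_validate prints a per-fold summary much like print_perf):

from surprise import KNNBasic, Dataset
from surprise.model_selection import cross_validate

# Load the built-in movielens-100k dataset (downloaded on first use).
data = Dataset.load_builtin('ml-100k')

# 3-fold cross-validation of KNNBasic, reporting RMSE and MAE.
cross_validate(KNNBasic(), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)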

from surprise import GridSearch

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'FCP'])
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

grid_search.evaluate(data)

Output:

Running grid search for the following parameter combinations:
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}
Results:
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}
{'RMSE': 0.9969328477745982, 'FCP': 0.683368400695696}
----------
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}
{'RMSE': 1.0033151605930943, 'FCP': 0.6867249347580507}
----------
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}
{'RMSE': 0.9734942565850515, 'FCP': 0.6940454873982795}
----------
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}
{'RMSE': 0.9823131855683238, 'FCP': 0.6944827981040061}
----------
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}
{'RMSE': 0.977887292257368, 'FCP': 0.6923914815948694}
----------
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}
{'RMSE': 0.9862324957086702, 'FCP': 0.69290504024308}
----------
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
{'RMSE': 0.9636592234524777, 'FCP': 0.6981147216456689}
----------
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}
{'RMSE': 0.9734215277751971, 'FCP': 0.6982590050003091}
----------

# best RMSE score
print(grid_search.best_score['RMSE'])

# combination of parameters that gave the best RMSE score
print(grid_search.best_params['RMSE'])


# best FCP score
print(grid_search.best_score['FCP'])


# combination of parameters that gave the best FCP score
print(grid_search.best_params['FCP'])

Output:

0.9636592234524777
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
0.6982590050003091
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}

import pandas as pd  

results_df = pd.DataFrame.from_dict(grid_search.cv_results)
results_df

Output:

                                              params                                              scores
0   {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}  {'RMSE': 0.9969328477745982, 'FCP': 0.68336840...
1   {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}  {'RMSE': 1.0033151605930943, 'FCP': 0.68672493...
2   {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}  {'RMSE': 0.9734942565850515, 'FCP': 0.69404548...
3   {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}  {'RMSE': 0.9823131855683238, 'FCP': 0.69448279...
4  {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}  {'RMSE': 0.977887292257368, 'FCP': 0.692391481...
5  {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}  {'RMSE': 0.9862324957086702, 'FCP': 0.69290504...
6  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}  {'RMSE': 0.9636592234524777, 'FCP': 0.69811472...
7  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}  {'RMSE': 0.9734215277751971, 'FCP': 0.6982590...
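
GridSearch and Dataset.split were likewise dropped from newer Surprise releases in favour of model_selection.GridSearchCV. A minimal sketch of the same search with that API (assuming Surprise >= 1.1; note that the measure keys are lower-case there):

from surprise import SVD, Dataset
from surprise.model_selection import GridSearchCV
import pandas as pd

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

# 3-fold cross-validated grid search over the SVD hyperparameters.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'fcp'], cv=3)
gs.fit(Dataset.load_builtin('ml-100k'))

print(gs.best_score['rmse'], gs.best_params['rmse'])  # best RMSE and the parameters that produced it
print(gs.best_score['fcp'], gs.best_params['fcp'])    # best FCP and the parameters that produced it

# As above, the per-combination results can be inspected as a DataFrame.
results_df = pd.DataFrame(gs.cv_results)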

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os
import io

from surprise import KNNBaseline
from surprise import Dataset


def read_item_names():
    """Read the u.item file from the MovieLens 100-k dataset and return two
    mappings: raw movie id -> movie name, and movie name -> raw movie id."""

    file_name = 'u.item'  # assumes u.item from ml-100k sits in the working directory
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid
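
The function above assumes u.item sits in the working directory. If the built-in ml-100k dataset has already been downloaded by load_builtin, the file can instead be read from Surprise's cache directory; the exact sub-path below is an assumption based on the default cache layout:

import os
from surprise import get_dataset_dir

# Default cache location is ~/.surprise_data; the ml-100k archive unpacks to ml-100k/ml-100k/.
u_item_path = os.path.join(get_dataset_dir(), 'ml-100k', 'ml-100k', 'u.item')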



data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()  # build the full trainset (the complete user-item rating matrix, no hold-out split)
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.train(trainset)  # train() is the pre-1.1 Surprise API; newer releases use algo.fit(trainset)
rid_to_name, name_to_rid = read_item_names()

# Look up the raw id of the query movie. The variable names keep the "toy_story"
# prefix from the official Surprise example, but the query here is 'Now and Then (1995)'.
toy_story_raw_id = name_to_rid['Now and Then (1995)']
toy_story_raw_id
Output:

'1053'

toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
toy_story_inner_id  # the movie's inner id in the trainset

Output:

961

toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=5)  # the 5 items closest to this movie
toy_story_neighbors

Output:

[291, 82, 366, 528, 179]

toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)  # map inner ids back to raw movie ids
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)  # then map raw ids to movie names

print()
print('The 5 nearest neighbors of Now and Then (1995) are:')
for movie in toy_story_neighbors:
    print(movie)

Output:

The 5 nearest neighbors of Now and Then (1995) are:
While You Were Sleeping (1995)
Batman (1989)
Dave (1993)
Mrs. Doubtfire (1993)
Groundhog Day (1993)
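
The whole lookup chain (title -> raw id -> inner id -> neighbor inner ids -> titles) can be wrapped into a small helper; a sketch reusing only the objects built above (the function name is made up for illustration):

def similar_movies(title, k=5):
    """Return the titles of the k nearest neighbors of the given movie."""
    inner_id = algo.trainset.to_inner_iid(name_to_rid[title])  # title -> raw id -> inner id
    neighbors = algo.get_neighbors(inner_id, k=k)               # k closest items by similarity
    return [rid_to_name[algo.trainset.to_raw_iid(iid)]          # inner id -> raw id -> title
            for iid in neighbors]

print(similar_movies('Now and Then (1995)', k=5))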