【知識發現】隱語義模型LFM演算法python實現(二)
阿新 • • 發佈:2019-01-26
http://blog.csdn.net/fjssharpsword/article/details/78015956
基於該篇文章中的程式碼優化,主要是在生成負樣例上提高執行速度,程式碼參考如下:
# -*- coding: utf-8 -*-
"""Latent Factor Model (LFM) recommender.

Created on 2017-10-16.  Trains user/item latent factors by stochastic
gradient descent on implicit-feedback data (positives = rated items,
negatives = popular unrated items) and evaluates top-k recommendations
by recall, precision, coverage and popularity.
"""
import math
import time
from math import exp

import numpy as np
import pandas as pd


class LFM:
    def __init__(self, lclass, iters, alpha, lamda, topk, ratio, traindata):
        self.lclass = lclass      # number of latent classes (affects quality)
        self.iters = iters        # SGD iterations; best count found by tuning
        self.alpha = alpha        # gradient-descent step size
        self.lamda = lamda        # L2 regularization coefficient
        self.topk = topk          # recommend top-k items
        self.ratio = ratio        # negative/positive sample ratio (biggest quality lever)
        self.traindata = traindata

    def getUserPositiveItem(self, userid):
        """Return the list of items the user has rated (positive samples)."""
        traindata = self.traindata
        series = traindata[traindata['userid'] == userid]['itemid']
        return list(series.values)

    def getUserNegativeItem(self, userid):
        """Sample negative items for a user: the most popular items the user
        has NOT rated, up to ratio * (#positive items)."""
        traindata = self.traindata
        itemLen = self.itemLen
        userItemlist = set(traindata[traindata['userid'] == userid]['itemid'])
        negativeItemList = []
        count = self.ratio * len(userItemlist)  # number of negatives to draw
        # itemLen is sorted by popularity (descending), so the most popular
        # unrated items are taken first.  (.index replaces the removed
        # Series.iteritems(); the value was never used.)
        for key in itemLen.index:
            if count == 0:
                break
            if key in userItemlist:
                continue
            negativeItemList.append(key)
            count -= 1
        return negativeItemList

    def initUserItem(self, userid):
        """Build {item: label} for one user: 1 = positive, 0 = negative."""
        itemDict = {}
        for item in self.getUserPositiveItem(userid):
            itemDict[item] = 1
        for item in self.getUserNegativeItem(userid):
            itemDict[item] = 0
        return itemDict

    def initModel(self):
        """Initialize latent matrices p (users x classes) and q (classes x
        items) with uniform [0, 1) values, compute item popularity, and build
        the labelled sample set.  Returns (p, q, userItem)."""
        traindata = self.traindata
        lclass = self.lclass
        userID = list(set(traindata['userid'].values))
        self.userID = userID
        itemID = list(set(traindata['itemid'].values))
        self.itemID = itemID
        # Popularity (rating count) per item, sorted descending.
        itemCount = [len(traindata[traindata['itemid'] == item]['userid'])
                     for item in itemID]
        self.itemLen = pd.Series(itemCount, index=itemID).sort_values(ascending=False)
        arrayp = np.random.rand(len(userID), lclass)
        arrayq = np.random.rand(lclass, len(itemID))
        p = pd.DataFrame(arrayp, columns=range(lclass), index=userID)
        q = pd.DataFrame(arrayq, columns=itemID, index=range(lclass))
        userItem = [{userid: self.initUserItem(userid)} for userid in userID]
        return p, q, userItem

    def sigmod(self, x):
        """Logistic function: squash an interest score into (0, 1)."""
        return 1.0 / (1 + exp(-x))

    def lfmPredict(self, p, q, userID, itemID):
        """Predicted interest of userID in itemID under factors p, q."""
        # .loc replaces the removed DataFrame.ix accessor; a plain dot
        # product replaces the deprecated np.mat matrices.
        score = float(np.dot(p.loc[userID].values, q[itemID].values))
        return self.sigmod(score)

    def latenFactorModel(self):
        """Train p and q by SGD, minimizing squared error with L2
        regularization.  Returns the trained (p, q)."""
        lclass = self.lclass
        iters = self.iters
        alpha = self.alpha
        lamda = self.lamda
        p, q, userItem = self.initModel()
        for step in range(iters):
            for user in userItem:
                for userID, samples in user.items():
                    for itemID, rui in samples.items():
                        eui = rui - self.lfmPredict(p, q, userID, itemID)
                        for f in range(lclass):
                            # .loc[row, col] avoids chained assignment
                            # (p[f][userID]), which silently fails under
                            # pandas copy-on-write.
                            p.loc[userID, f] += alpha * (eui * q.loc[f, itemID] - lamda * p.loc[userID, f])
                            q.loc[f, itemID] += alpha * (eui * p.loc[userID, f] - lamda * q.loc[f, itemID])
            alpha *= 0.9  # decay the learning rate each iteration
        return p, q

    def recommend(self, userid, p, q):
        """Return a Series of the top-k items by predicted interest."""
        itemID = self.itemID
        predictList = [self.lfmPredict(p, q, userid, itemid) for itemid in itemID]
        series = pd.Series(predictList, index=itemID)
        return series.sort_values(ascending=False)[:self.topk]

    def recallAndPrecision(self, p, q):
        """Return (recall, precision) of top-k recommendations over all users."""
        traindata = self.traindata
        hit = 0
        recall = 0
        precision = 0
        for userid in self.userID:
            # Use a set of values: `in` on a pandas Series tests the INDEX,
            # not the values, which broke the original hit count.
            trueItem = set(traindata[traindata['userid'] == userid]['itemid'])
            preItem = list(self.recommend(userid, p, q).index)
            for item in preItem:
                if item in trueItem:
                    hit += 1
            recall += len(trueItem)
            precision += len(preItem)
        return (hit / (recall * 1.0), hit / (precision * 1.0))

    def coverage(self, p, q):
        """Fraction of distinct rated items appearing in any recommendation list."""
        traindata = self.traindata
        recommend_items = set()
        all_items = set()
        for userid in self.userID:
            for item in traindata[traindata['userid'] == userid]['itemid']:
                all_items.add(item)
            for item in self.recommend(userid, p, q).index:
                recommend_items.add(item)
        return len(recommend_items) / (len(all_items) * 1.0)

    def popularity(self, p, q):
        """Average log-popularity of recommended items (novelty proxy)."""
        itemLen = self.itemLen
        ret = 0
        n = 0
        for userid in self.userID:
            for item in self.recommend(userid, p, q).index:
                ret += math.log(1 + itemLen[item])
                n += 1
        return ret / (n * 1.0)


if __name__ == "__main__":
    # time.perf_counter replaces time.clock, removed in Python 3.8.
    start = time.perf_counter()
    df_sample = pd.read_csv("D:\\tmp\\ratings.csv",
                            names=['userid', 'itemid', 'ratings'], header=0)
    traindata = df_sample[['userid', 'itemid']]
    # Grid-search the two parameters with the biggest quality impact:
    # negative-sample ratio and latent class count.
    for ratio in [1, 2, 3, 5, 10, 20]:
        for lclass in [5, 10, 20, 30, 50]:
            lfm = LFM(lclass, 2, 0.02, 0.01, 10, ratio, traindata)
            p, q = lfm.latenFactorModel()
            print("%3s%20s%20s%20s%20s%20s" % ('ratio', 'lclass', "recall", 'precision', 'coverage', 'popularity'))
            recall, precision = lfm.recallAndPrecision(p, q)
            coverage = lfm.coverage(p, q)
            popularity = lfm.popularity(p, q)
            print("%3d%20d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (ratio, lclass, recall * 100, precision * 100, coverage * 100, popularity))
    end = time.perf_counter()
    print('finish all in %s' % str(end - start))
關注三點:
1)效能受正負樣例比率、隱類數量影響最大,要訓練出一個最佳引數。
2)對於梯度下降的收斂條件,即迭代次數,限定步長為0.02,迭代次數n要訓練出一個最佳值。
3)對於增量資料的訓練:儲存p、q矩陣,對於增量樣本集,可以在p、q基礎上訓練,有待實踐驗證,避免每次全量訓練耗費效能。