PersonalRank學習的一些記錄
阿新 • • 發佈:2018-12-23
想學點推薦的東西,學了點PersonalRank演算法,網上資料挺多,做了些實踐後寫了下面的一些程式碼,只作為以後複習的資料:
不得不提到自己看過的一個部落格,挺好,至少我看懂了在幹嘛。
https://www.cnblogs.com/zhangchaoyang/articles/5470763.html
import numpy as np
import pymysql


class LoadDB():
    """Export the user/goods "like" relation stored in MySQL as a 0/1 matrix.

    The matrix is the symmetric adjacency matrix of the bipartite graph whose
    vertices are ``user_<id>`` and ``goods_<id>``; it is written to
    ``model.csv`` with a header row naming every vertex.
    """

    def __init__(self):
        pass

    def readFromDB(self):
        """Read the like relation from the database and export it to CSV.

        Connects to the local MySQL server, builds the adjacency matrix from
        the ``rate_lys`` table of the ``cherishmall`` schema and hands it to
        :meth:`write2CSV`.  Errors raised while querying or exporting are
        printed rather than propagated.  The connection is always closed.

        :return: None
        """
        # Keyword arguments: positional connect() arguments are not accepted
        # by every PyMySQL release.
        db = pymysql.connect(host="localhost", user="root", password="admin")
        try:
            cursor = db.cursor()
            cursor.execute("use cherishmall;")
            try:
                modelM, PointList = self._buildModel(cursor)
                # Model is complete: dump it to CSV.
                self.write2CSV(modelM, PointList)
            except Exception as e:
                print("exception", e)
        finally:
            # Close the connection on every path, including early failures.
            db.close()

    def _buildModel(self, cursor):
        """Query users, goods and likes; return ``(adjacency_matrix, vertex_names)``."""
        # All distinct users.
        cursor.execute("select user_id_lys from rate_lys group by user_id_lys")
        userRows = cursor.fetchall()
        userVertices = ['user_' + str(row[0]) for row in userRows]

        # All distinct goods.
        cursor.execute("select goods_id_lys from rate_lys group by goods_id_lys")
        goodsRows = cursor.fetchall()
        goodsVertices = ['goods_' + str(row[0]) for row in goodsRows]

        # Vertex order: every user first, then every goods item.
        PointList = userVertices + goodsVertices

        # Name -> column lookup (first occurrence wins, like list.index) so
        # each liked item is located in O(1) instead of a linear scan.
        columnOf = {}
        for position, name in enumerate(PointList):
            columnOf.setdefault(name, position)

        size = len(PointList)
        M = np.zeros((size, size))
        for rowIndex, userRow in enumerate(userRows):
            # Parameterized query: the driver escapes the user id.
            cursor.execute(
                "select goods_id_lys from rate_lys where user_id_lys=%s",
                (userRow[0],),
            )
            for liked in cursor.fetchall():
                M[rowIndex, columnOf['goods_' + str(liked[0])]] = 1

        # Only user -> goods entries were filled in; adding the transpose
        # makes the adjacency matrix symmetric (goods -> user edges).
        return M + M.T, PointList

    def write2CSV(self, modelM, PointList):
        """Write the matrix model to ``model.csv``.

        The first CSV row holds the vertex names, the remaining rows hold the
        matrix.  I/O errors are printed rather than propagated.

        :param modelM: adjacency matrix
        :param PointList: meaning of each row/column (vertex names)
        :return: None
        """
        header = np.array(PointList)
        # Stacking the names on top turns the whole array into strings, which
        # is why everything is written with fmt='%s'.
        writeModel = np.vstack((header, modelM))
        try:
            np.savetxt('model.csv', writeModel, fmt='%s', delimiter=',')
        except Exception as e:
            print(e)


if __name__ == "__main__":
    LoadDB().readFromDB()
import numpy as np
import json
import datetime
from numpy.linalg import solve
import pymysql
import pickle
import time
import csv
from scipy.sparse.linalg import gmres, lgmres
from scipy.sparse import csr_matrix
from functools import cmp_to_key


class LoadModel():
    """PersonalRank recommender driven by a 0/1 adjacency matrix stored as CSV."""

    def __init__(self, csvFilePath, alpha):
        """Load the model file and remember the walk probability.

        :param csvFilePath: path of the CSV file holding the 0/1 model
        :param alpha: probability of continuing the random walk at each step
        """
        self.init(csvFilePath, alpha)

    def init(self, csvFilePath, alpha):
        """Read the model file and set the attributes needed for prediction.

        :param csvFilePath: path of the CSV file holding the 0/1 model
        :param alpha: probability of continuing the random walk at each step
        :return: None
        """
        vertex, model = self.readFromCSV(csvFilePath)
        # Meaning of each matrix row/column (vertex names).
        self.vertex = vertex
        # Row-normalized model matrix.
        self.M = model
        # Probability of walking on from a vertex; 1 - alpha is the
        # probability of staying at the current vertex.
        self.alpha = alpha

    def readFromCSV(self, csvFilePath):
        """Read the model from a CSV file.

        The first row is taken as the vertex names; the remaining rows are
        parsed as a float matrix and row-normalized by :meth:`doChangeM`.

        :param csvFilePath: path of the CSV file
        :return: tuple ``(vertex_names, probability_matrix)``
        """
        with open(csvFilePath, 'r', newline='') as f:
            # Only the header row is needed here; an empty file gives [].
            vertex = next(csv.reader(f), [])
        # Passing the path lets loadtxt open and close the file itself;
        # ndmin=2 keeps a one-vertex model two-dimensional.
        modelM = np.loadtxt(csvFilePath, dtype='float', delimiter=",",
                            skiprows=1, ndmin=2)
        return (vertex, self.doChangeM(modelM))

    def doChangeM(self, M):
        """Row-normalize a 0/1 model into a probability model.

        Every row is divided by its sum.  A row whose sum is zero is left as
        all zeros instead of being turned into NaN.

        :param M: 0/1 model (2-D array-like)
        :return: the normalized model as ``numpy.matrix``
        """
        probabilities = np.array(M, dtype=float)
        rowSums = probabilities.sum(axis=1, keepdims=True)
        # Divide in place, skipping rows whose sum is zero.
        np.divide(probabilities, rowSums, out=probabilities,
                  where=rowSums != 0)
        return np.matrix(probabilities)

    def predict(self, vertex, M, alpha):
        """Compute PersonalRank scores for walks started at every vertex.

        Inverts ``I - alpha * M.T`` once; column ``j`` of the inverse holds
        the scores of a walk rooted at vertex ``j``.  Each column is divided
        by its sum and rounded to three decimals.

        :param vertex: meaning of each row/column (vertex names)
        :param M: probability model
        :param alpha: probability of leaving a vertex at each step
        :return: tuple ``(all_results, seconds_spent_inverting)`` where
                 ``all_results`` maps each start vertex to a dict of
                 ``{vertex: score}`` ordered by descending score
        """
        n = M.shape[0]
        A = np.eye(n) - alpha * M.T
        begin = time.perf_counter()
        D = np.asarray(np.linalg.inv(A))
        # Elapsed time of the inversion (difference, not a timestamp).
        useTime = time.perf_counter() - begin

        allDict = {}
        for j in range(n):
            column = D[:, j]
            total = column.sum()  # used for normalization
            ranked = sorted(zip(vertex, column), key=lambda pair: pair[1],
                            reverse=True)
            allDict[vertex[j]] = {
                name: float('%.3f' % (score / total))
                for name, score in ranked
            }
        return (allDict, useTime)

    def analysis(self, vertex, allDict, M, N):
        """Reduce the raw scores to a top-N goods list per user.

        Keeps only walks that start at a user vertex, drops user vertices
        from each score table, and keeps the N highest-scoring entries.
        Goods the user already likes are not filtered out.

        :param vertex: meaning of each row/column (currently unused)
        :param allDict: ``{start_vertex: {vertex: score}}`` from ``predict``
        :param M: probability model (currently unused)
        :param N: number of recommendations to keep per user
        :return: ``{user_vertex: [(goods_vertex, score), ...]}``
        """
        result = {}
        for startName, scores in allDict.items():
            if 'user' not in startName:
                continue
            goodsScores = [(name, value) for name, value in scores.items()
                           if 'user' not in name]
            if not goodsScores:
                continue
            goodsScores.sort(key=lambda pair: pair[1], reverse=True)
            # Slicing returns everything when fewer than N entries exist.
            result[startName] = goodsScores[:N]
        return result

    def writeRecomResult(self, recomResult, recommendedResultOBJPath):
        """Persist the recommendation result.

        The object is pickled to ``recommendedResultOBJPath`` and a
        human-readable copy is written next to it with a ``.txt`` suffix.

        :param recomResult: recommendation result object
        :param recommendedResultOBJPath: path of the pickle file
        :return: None
        """
        with open(recommendedResultOBJPath, 'wb') as p_file:
            pickle.dump(recomResult, p_file)
        with open(recommendedResultOBJPath + '.txt', 'w',
                  encoding='utf-8') as f:
            f.writelines(userKey + "::" + str(recomList) + "\n"
                         for userKey, recomList in recomResult.items())

    def readRecomResult(self, recommendedResultOBJPath):
        """Load a recommendation result previously written with pickle.

        :param recommendedResultOBJPath: path of the pickle file
        :return: the recommendation result object
        """
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # files this program produced itself.
        with open(recommendedResultOBJPath, 'rb') as p_file:
            return pickle.load(p_file)

    def upload2DB(self, recomResult):
        """Insert the final recommendations into the ``recommended_lys`` table.

        One row is inserted per user; the goods list is stored as a JSON
        string.  All rows are committed together; on any error the message
        is printed and the transaction is rolled back.  The connection is
        always closed.

        :param recomResult: ``{user_vertex: [(goods_vertex, score), ...]}``
        :return: None
        """
        # Keyword arguments: positional connect() arguments are not accepted
        # by every PyMySQL release.
        db = pymysql.connect(host="localhost", user="root", password="admin")
        try:
            cursor = db.cursor()
            cursor.execute("use cherishmall;")
            try:
                for userKey, recomList in recomResult.items():
                    jsonStr = json.dumps(
                        [{"goodId": goodsId, "num": num}
                         for goodsId, num in recomList])
                    nowTime = datetime.datetime.now().strftime(
                        "%Y-%m-%d %H:%M:%S")
                    # Vertex names look like "user_<id>".
                    userid = int(userKey.split("_")[1])
                    # Parameterized insert: the driver quotes and escapes the
                    # values, including the quotes inside the JSON string.
                    cursor.execute(
                        "insert into recommended_lys"
                        "(user_id_lys,goods_id_lys,add_time_lys) "
                        "values(%s,%s,%s)",
                        (userid, jsonStr, nowTime),
                    )
                db.commit()
            except Exception as e:
                print("Exception", e)
                db.rollback()
        finally:
            db.close()


if __name__ == "__main__":
    loadModel = LoadModel('model.csv', 0.8)
    # To recompute the recommendations instead of reusing the stored ones:
    # allDict = loadModel.predict(loadModel.vertex, loadModel.M, loadModel.alpha)[0]
    # recomResult = loadModel.analysis(loadModel.vertex, allDict, loadModel.M, 5)
    # loadModel.writeRecomResult(recomResult, 'recommendedResultOBJ')
    recomResult = loadModel.readRecomResult("recommendedResultOBJ")
    print(recomResult)
    loadModel.upload2DB(recomResult)
資料庫
對應的二部圖:
注:A、B、C分別為user_1、user_2、user_3、user_4 ; a、b、c、d分別為goods_1、goods_2、goods_3、goods_4