谷歌pagerank演算法
阿新 • • 發佈:2020-12-23
谷歌pagerank演算法
資料描述
- Directed graph (each unordered pair of nodes is saved once): web-Google.txt
- Webgraph from the Google programming contest, 2002
- Nodes: 875713 Edges: 5105039
- 每行代表一條有向邊,格式為:FromNodeId ToNodeId(以空白字元分隔)
程式碼
- 輸出:PageRank 分數最高的前 100 個節點(node)及其分數
# PageRank over the Google web graph ("web-Google.txt").
#
# Reads a whitespace-separated edge list (one "FromNodeId ToNodeId" pair per
# line), runs 10 update rounds and prints the 100 highest-scoring node ids
# together with their scores.
import numpy as np
from graphUtil import *

# Fraction of a node's rank that is passed along its out-links each round.
P = 0.8
# Node count used for the initial rank (1 / N) and for spreading the leftover
# rank mass; the dataset description reports 875713 nodes.
N = 875713
# Capacity of every id-indexed table below; node ids are used directly as
# indices, so this must be larger than the largest node id in the input.
BIG_N = 930000
base = (1 / N) * 0.2
init_value = -1
# allNodeList[i] == -1: i is not a node id seen in the input; >= 0: i is a node id.
allNodeList = [init_value] * BIG_N
# nodeTable[i] == [node id or -1, out-degree, target_1, target_2, ...]
nodeTable = [[] for i in range(BIG_N)]
my_old_row = np.zeros(BIG_N)     # ranks at the start of the current round
my_new_row = np.zeros(BIG_N)     # rank mass received through in-links this round
my_future_row = np.zeros(BIG_N)  # ranks after adding the redistributed leftover
id2rank = dict()


def preprocess():
    """Load the edge list into the module-level tables and seed the ranks.

    Reads "web-Google.txt" from the current working directory. Every data
    line must hold exactly two integer node ids separated by whitespace.
    Blank lines and lines starting with '#' (the header block of the SNAP
    download) are skipped.

    Side effects: fills ``allNodeList`` and ``nodeTable`` and sets
    ``my_old_row[i] = 1 / N`` for every node id seen in the file. The tables
    are appended to, so this is meant to be called once.

    Raises:
        ValueError: if a data line does not contain exactly two integers.
        IndexError: if a node id is not smaller than ``BIG_N``.
    """
    # Slot 0 holds the node id (-1 until the node appears as a source),
    # slot 1 the out-degree; targets are appended from slot 2 onwards.
    for row in nodeTable:
        row.extend((-1, 0))

    with open("web-Google.txt", "r") as f:
        for line in f:
            fields = line.split()
            # Skip blank lines and '#' header/comment lines.
            if not fields or fields[0].startswith("#"):
                continue
            sourceNodeID, targetNodeID = fields
            sourceNodeID = int(sourceNodeID)
            targetNodeID = int(targetNodeID)

            allNodeList[sourceNodeID] = sourceNodeID
            allNodeList[targetNodeID] = targetNodeID

            row = nodeTable[sourceNodeID]
            if row[0] == -1:
                row[0] = sourceNodeID
            row[1] = row[1] + 1
            row.append(targetNodeID)

    for i in range(BIG_N):
        if allNodeList[i] >= 0:
            my_old_row[i] = 1 / N


if __name__ == '__main__':
    preprocess()

    for m_iter in range(10):
        # Push P * rank(i) / out_degree(i) along every out-link of i.
        for i in range(BIG_N):
            if allNodeList[i] < 0:
                continue
            out_links = nodeTable[i][2:]
            if not out_links:
                # No out-links: nothing to push (and out-degree is 0).
                continue
            share = P * my_old_row[i] / nodeTable[i][1]
            for j in out_links:
                my_new_row[j] = my_new_row[j] + share

        S = my_new_row.sum()
        print("S: ", S)

        # Whatever mass was not passed along links (1 - S) is spread evenly,
        # (1 - S) / N per node, then the buffers are reset for the next round.
        leftover = (1 - S) / N
        for k in range(BIG_N):
            if allNodeList[k] >= 0:
                my_future_row[k] = my_new_row[k] + leftover
                my_old_row[k] = my_future_row[k]
                my_new_row[k] = 0
        print("range ", m_iter, "finish")

    for k in range(BIG_N):
        if allNodeList[k] >= 0:
            id2rank[k] = my_old_row[k]

    # Highest score first; ties are broken by the larger node id.
    final_result = sorted(
        id2rank.items(), key=lambda kv: (kv[1], kv[0]), reverse=True
    )
    for key, value in final_result[:100]:
        print(key, " ", value)