1. 程式人生 > 實用技巧 > 谷歌 PageRank 演算法

谷歌pagerank演算法

谷歌pagerank演算法

資料描述

  • Directed graph (each unordered pair of nodes is saved once): web-Google.txt
  • Webgraph from the Google programming contest, 2002
  • Nodes: 875713 Edges: 5105039
  • 每行一條有向邊，格式：FromNodeId ToNodeId（以空白分隔）

程式碼

  • 輸出：PageRank 分數（score）最高的前 100 個節點（node）
import numpy as np
from graphUtil import *

# Weight applied to the rank a node passes along its out-links on each update.
P = 0.8
# Node count used for the uniform initial rank and for redistributing the
# rank mass that is not passed along links.
N = 875713
# Size of every per-node table. Node ids index these tables directly, so this
# must be larger than the biggest node id in the input.
BIG_N = 930000
# Small configuration for experimenting with a toy graph:
# N = 4
# BIG_N = 8
# Per-node constant term; 0.2 corresponds to 1 - P for P = 0.8.
# Not referenced by the code visible in this file.
base = (1 / N) * 0.2

init_value = -1
# allNodeList[i] == -1: i is not a valid node id; >= 0: i is a node id.
# Sized from BIG_N (not a literal) so it always matches the other tables.
allNodeList = [init_value] * BIG_N
# One independent list per slot; preprocess() fills each as
# [node_id_or_-1, out_degree, target, target, ...].
nodeTable = [[] for _ in range(BIG_N)]

# Rank vector from the previous iteration.
my_old_row = np.zeros(BIG_N)
# Accumulator for the rank received through links in the current iteration.
my_new_row = np.zeros(BIG_N)
# Link-propagated rank plus the evenly redistributed remainder.
my_future_row = np.zeros(BIG_N)
# Final mapping node id -> rank, filled after the last iteration.
id2rank = dict()

# pre process
def preprocess():
    """Load the edge list from ``web-Google.txt`` and seed the rank vector.

    The module-level tables are filled in place:

    * ``nodeTable[i]`` becomes ``[owner, out_degree, target, target, ...]``
      where ``owner`` is ``i`` once ``i`` has appeared as a source and ``-1``
      otherwise.
    * ``allNodeList[i]`` is set to ``i`` for every id seen as a source or as
      a target; all other slots keep their initial value.
    * ``my_old_row[i]`` is set to ``1 / N`` for every id that was seen.

    Blank lines and lines whose first token starts with ``#`` are ignored.

    Raises:
        ValueError: a data line does not consist of exactly two integers.
        IndexError: a node id does not fit into the preallocated tables.
    """
    # Every row starts with two header slots: [0] owner id (-1 until the
    # node shows up as a source) and [1] out-degree counter.
    for row in nodeTable:
        row.extend((-1, 0))

    with open("web-Google.txt", "r") as f:
        for line in f:
            fields = line.split()
            # Skip blanks and '#' comment lines instead of failing to unpack
            # them (the published dataset file presumably starts with such a
            # header -- confirm against the actual input).
            if not fields or fields[0].startswith("#"):
                continue
            # Strict two-way unpacking keeps malformed data lines an error.
            source_text, target_text = fields
            sourceNodeID = int(source_text)
            targetNodeID = int(target_text)

            # Both endpoints are valid nodes, even if one never links out.
            allNodeList[sourceNodeID] = sourceNodeID
            allNodeList[targetNodeID] = targetNodeID

            row = nodeTable[sourceNodeID]
            row[0] = sourceNodeID  # only ever -1 or this id, so safe to set
            row[1] += 1
            row.append(targetNodeID)

    # Start from a uniform distribution over the nodes that actually exist.
    initial_rank = 1 / N
    for i in range(BIG_N):
        if allNodeList[i] >= 0:
            my_old_row[i] = initial_rank


if __name__ == '__main__':
    # Script entry point: load the graph, run ten rank-update rounds, then
    # print the 100 highest-ranked node ids with their scores.
    preprocess()

    # Ids that occurred in the edge list; every other table slot is padding.
    valid_ids = [k for k in range(BIG_N) if allNodeList[k] >= 0]

    for m_iter in range(10):
        # Each node hands P times its current rank, split evenly, to the
        # nodes it links to.
        for src in valid_ids:
            targets = nodeTable[src][2:]
            if not targets:
                # No out-links: nothing to hand out (and no division needed).
                continue
            share = P * my_old_row[src] / nodeTable[src][1]
            for dst in targets:
                my_new_row[dst] = my_new_row[dst] + share

        # Rank mass that actually travelled along links this round.
        S = my_new_row.sum()
        print("S: ", S)

        # Spread the remaining mass evenly so the ranks keep summing to the
        # same total, then roll the vectors over for the next round.
        correction = (1 - S) / N
        for k in valid_ids:
            my_future_row[k] = my_new_row[k] + correction
            my_old_row[k] = my_future_row[k]
            my_new_row[k] = 0
        print("range ", m_iter, "finish")

    id2rank.update((k, my_old_row[k]) for k in valid_ids)

    # Highest rank first; ties are broken by the larger node id first.
    ranked = sorted(id2rank.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    for key, value in ranked[:100]:
        print(key, " ", value)