'''
Created on Aug 18, 2010

@author: dwmclary
'''
__author__ = "D. McClary (dan.mcclary@northwestern.edu)"
from .. import hdmc
from ..hdmc import hdfs
import hadoop_config as config
import networkx as nx
import os
import sys
import string
from GraphLoader import GraphLoader

def page_rank(G, name=None, max_iterations=10):
    # Write the initial per-node rank mass (1/|V|) to a local side file that
    # the streaming jobs read.
    os.system("echo " + str(1.0/float(len(G))) + " > rank_mass")
    wd = config.GraphReduce_location
    # Start every node with rank 1/|V|, keyed by the node's string name.
    ranks = dict(zip(map(str, G.nodes()), [1.0/len(G)]*len(G)))
    # Wrap the graph so it can be serialized as an adjacency list for HDFS.
    G = GraphLoader(G)
    if name:
        G.write_adjlist(name)
    else:
        G.write_adjlist("pbfs_input.adjlist")
    hdfs_handle = G.graph_handle.split("/")[-1]
    # Stage the adjacency list in HDFS as the input of the first iteration.
    hdfs.rm(hdfs_handle+"/page_rank")
    hdfs.copyToHDFS(G.graph_handle, hdfs_handle+"/page_rank/part-00000")
    ranking = parallel_page_rank(G, hdfs_handle, ranks, 0, max_iterations)
    return ranking
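# Note: the parsing in parallel_page_rank below assumes the streaming reducers
# emit plain-text lines of the form "<node> ... pr:<rank>" for ranked nodes and
# "#lost_mass: <value>" for dangling-node mass. This contract is inferred from
# the string handling in this module; the mapper/reducer scripts themselves are
# not shown here.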

def parallel_page_rank(G, hdfs_handle, old_ranks, iterations, max_iterations):
    # First pass: distribute each node's rank mass to its neighbors.
    hdfs.rm("PPR")
    hadoop_call = hdmc.build_generic_hadoop_call("PageRank_mapper.py", "PageRank_reducer.py", hdfs_handle+"/page_rank", "PPR", ["rank_mass"])
    hdmc.execute_and_wait(hadoop_call)

    # Sum the rank mass lost to dangling nodes across all output part files.
    listing = hdfs.ls("PPR/part*")["stdout"].rstrip().split("\n")
    lost_mass = 0.0
    for entry in listing:
        last_part = entry.split("part-")
        data = hdfs.cat("PPR/part-"+last_part[-1])["stdout"].split("\n")
        for line in data:
            line = line.strip().split()
            if "#lost_mass:" in line:
                lost_mass += float(line[1])
    os.system("echo " + str(lost_mass) + " > lost_mass")

    # Second pass: run the lost-mass job to fold the dangling mass back into
    # the node ranks.
    hdfs.rm(hdfs_handle+"/page_rank/part*")
    hdfs.mv("PPR/part*", hdfs_handle+"/page_rank/")
    hdfs.rm("PPR")
    hadoop_call = hdmc.build_generic_hadoop_call("LostMass_mapper.py", "LostMass_reducer.py", hdfs_handle+"/page_rank", "PPR", ["rank_mass", "lost_mass"])
    hdmc.execute_and_wait(hadoop_call)

    # Collect the updated ranks from every part file of the second job's output.
    listing = hdfs.ls("PPR/part*")["stdout"].rstrip().split("\n")
    ranks = {}
    rank_sum = 0.0
    for entry in listing:
        last_part = entry.split("part-")
        data = hdfs.cat("PPR/part-"+last_part[-1])["stdout"].split("\n")
        for line in data:
            pr_value = line.strip().split("pr:")
            if len(pr_value) > 1:
                rank = float(pr_value[-1])
                node = pr_value[0].split()[0]
                ranks[node] = rank
                rank_sum += rank

    # Converged once no node's rank moved by more than 1e-4 since the last iteration.
    converged = True
    for key in ranks.keys():
        if abs(ranks[key] - old_ranks[key]) > 0.0001:
            converged = False
            break

    iterations += 1

    # Promote this iteration's output to be the next iteration's input.
    hdfs.rm(hdfs_handle+"/page_rank/part*")
    hdfs.mv("PPR/part*", hdfs_handle+"/page_rank/")
    hdfs.rm("PPR")

    if not converged and iterations < max_iterations:
        return parallel_page_rank(G, hdfs_handle, ranks, iterations, max_iterations)
    else:
        return ranks
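
# --- Usage sketch (illustrative; assumes a configured hdmc/Hadoop setup) ---
# The snippet below is an assumption about how this module is driven, not part
# of the original file: it builds a small NetworkX graph and ranks it, assuming
# the streaming scripts (PageRank_mapper.py, PageRank_reducer.py,
# LostMass_mapper.py, LostMass_reducer.py) are available to Hadoop and HDFS is
# reachable through the hdfs helpers.
#
#     import networkx as nx
#     G = nx.gnp_random_graph(100, 0.05, directed=True)
#     ranks = page_rank(G, name="demo.adjlist", max_iterations=20)
#     top10 = sorted(ranks.items(), key=lambda kv: kv[1], reverse=True)[:10]
#     print top10   # Python 2 era module, so print is a statement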