'''
Created on Aug 18, 2010

@author: dwmclary
'''
__author__ = "D. McClary (dan.mcclary@northwestern.edu)"
from .. import hdmc
from ..hdmc import hdfs
import hadoop_config as config
import networkx as nx
import os
import sys
import string
from GraphLoader import GraphLoader

def page_rank(G, name=None, max_iterations=10):
    '''Compute PageRank in parallel for the networkx graph G using Hadoop streaming.'''
    # Seed every node with an initial rank mass of 1/N; the value is written to a
    # local file that is shipped to the streaming jobs as a supporting file.
    os.system("echo "+str(1.0/float(len(G))) + " > rank_mass")
    wd = config.GraphReduce_location
    ranks = dict(zip(map(str, G.nodes()), [1.0/len(G)]*len(G)))
    # Serialize the graph as an adjacency list and stage it in HDFS.
    G = GraphLoader(G)
    if name:
        G.write_adjlist(name)
    else:
        G.write_adjlist("pbfs_input.adjlist")
    hdfs_handle = G.graph_handle.split("/")[-1]
    hdfs.rm(hdfs_handle+"/page_rank")
    hdfs.copyToHDFS(G.graph_handle, hdfs_handle+"/page_rank/part-00000")
    # Iterate MapReduce passes until the ranks converge or the limit is reached.
    ranking = parallel_page_rank(G, hdfs_handle, ranks, 0, max_iterations)
    return ranking

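# Example usage (illustrative sketch only): this assumes a working Hadoop/HDFS
# installation reachable through the hdmc helpers, and that the PageRank_* and
# LostMass_* streaming scripts are available where the config expects them.
# The graph and the name "ba_example" are made up for the example.
#
#   import networkx as nx
#   G = nx.barabasi_albert_graph(100, 3)
#   ranks = page_rank(G, name="ba_example", max_iterations=20)
#   top_nodes = sorted(ranks.items(), key=lambda kv: kv[1], reverse=True)[:10]
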
def parallel_page_rank(G, hdfs_handle, old_ranks, iterations, max_iterations):
    '''Run one parallel PageRank iteration over the graph stored in HDFS,
    recursing until the ranks converge or max_iterations is reached.'''
    # Run the PageRank streaming job over the current adjacency/rank data.
    hdfs.rm("PPR")
    hadoop_call = hdmc.build_generic_hadoop_call("PageRank_mapper.py", "PageRank_reducer.py", hdfs_handle+"/page_rank", "PPR", ["rank_mass"])
    hdmc.execute_and_wait(hadoop_call)

    # Accumulate the rank mass lost at dangling nodes across all output parts,
    # then write the total to a local file for the redistribution job.
    listing = hdfs.ls("PPR/part*")["stdout"].rstrip().split("\n")
    lost_mass = 0.0
    for entry in listing:
        last_part = entry.split("part-")
        data = hdfs.cat("PPR/part-"+last_part[-1])["stdout"].split("\n")
        for line in data:
            line = line.strip().split()
            if "#lost_mass:" in line:
                lost_mass += float(line[1])
    os.system("echo " + str(lost_mass) + " > lost_mass")

    # Replace the previous iteration's data with the new output, then run the
    # lost-mass redistribution job.
    hdfs.rm(hdfs_handle+"/page_rank/part*")
    hdfs.mv("PPR/part*", hdfs_handle+"/page_rank/")
    hdfs.rm("PPR")
    hadoop_call = hdmc.build_generic_hadoop_call("LostMass_mapper.py", "LostMass_reducer.py", hdfs_handle+"/page_rank", "PPR", ["rank_mass", "lost_mass"])
    hdmc.execute_and_wait(hadoop_call)

    # Parse the updated ranks from the redistribution job's output parts.
    listing = hdfs.ls("PPR/part*")["stdout"].rstrip().split("\n")
    ranks = {}
    rank_sum = 0.0
    for entry in listing:
        last_part = entry.split("part-")
        data = hdfs.cat("PPR/part-"+last_part[-1])["stdout"].split("\n")
        for line in data:
            pr_value = line.strip().split("pr:")
            if len(pr_value) > 1:
                rank = float(pr_value[-1])
                node = pr_value[0].split()[0]
                ranks[node] = rank
                rank_sum += rank

    # The iteration has converged when no node's rank moved by more than 1e-4.
    converged = True
    for key in ranks.keys():
        if abs(ranks[key] - old_ranks[key]) > 0.0001:
            converged = False
            break

    iterations += 1

    # Promote the new ranks to the canonical page_rank directory in HDFS.
    hdfs.rm(hdfs_handle+"/page_rank/part*")
    hdfs.mv("PPR/part*", hdfs_handle+"/page_rank/")
    hdfs.rm("PPR")

    # Recurse until convergence or the iteration limit is reached.
    if not converged and iterations < max_iterations:
        return parallel_page_rank(G, hdfs_handle, ranks, iterations, max_iterations)
    else:
        return ranks

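# For reference, a rough single-machine sketch of the update the two streaming
# jobs above are assumed to perform: one PageRank step in which rank mass from
# dangling nodes is collected and redistributed uniformly. This helper is
# illustrative only, is not called by the code above, and the damping factor
# d=0.85 is an assumption rather than a value read from the mapper scripts.
def _local_page_rank_step(G, ranks, d=0.85):
    '''Illustrative local PageRank step with dangling-mass redistribution.'''
    n = float(len(G))
    new_ranks = dict.fromkeys(ranks, 0.0)
    lost_mass = 0.0
    for node in G.nodes():
        neighbors = list(G.neighbors(node))
        if neighbors:
            # Spread this node's rank mass evenly over its out-neighbors.
            share = ranks[str(node)] / len(neighbors)
            for nbr in neighbors:
                new_ranks[str(nbr)] += share
        else:
            # Dangling node: its rank mass would otherwise be lost.
            lost_mass += ranks[str(node)]
    for node in new_ranks:
        new_ranks[node] = (1.0 - d) / n + d * (new_ranks[node] + lost_mass / n)
    return new_ranks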