
Source Code for Module ziggy.GraphReduce.gr_link_analysis

'''
Created on Aug 18, 2010

@author: dwmclary
'''
__author__ = "D. McClary (dan.mcclary@northwestern.edu)"
from .. import hdmc
from ..hdmc import hdfs
import hadoop_config as config
import networkx as nx
import os
import sys
import string
from GraphLoader import GraphLoader

def page_rank(G, name=None, max_iterations=10):
    '''Compute PageRank in parallel for the networkx graph G.'''
    # Seed every node with uniform rank mass 1/|V| and record it for the mappers.
    os.system("echo " + str(1.0/float(len(G))) + " > rank_mass")
    wd = config.GraphReduce_location
    ranks = dict(zip(map(str, G.nodes()), [1.0/len(G)]*len(G)))
    # Write the graph as an adjacency list and stage it in HDFS.
    G = GraphLoader(G)
    if name:
        G.write_adjlist(name)
    else:
        G.write_adjlist("pbfs_input.adjlist")
    hdfs_handle = G.graph_handle.split("/")[-1]
    hdfs.rm(hdfs_handle+"/page_rank")
    hdfs.copyToHDFS(G.graph_handle, hdfs_handle+"/page_rank/part-00000")
    ranking = parallel_page_rank(G, hdfs_handle, ranks, 0, max_iterations)
    return ranking

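Assuming a Hadoop/HDFS installation configured as hadoop_config expects, a call to page_rank might look like the sketch below. The example graph, the adjacency-list name "ba_demo.adjlist", and the reading of the result as a dict from node label (string) to rank value are illustrative assumptions, not part of this module.

import networkx as nx
from ziggy.GraphReduce import gr_link_analysis

# Build a small test graph and rank it with at most 10 parallel iterations.
G = nx.barabasi_albert_graph(100, 3)
ranks = gr_link_analysis.page_rank(G, name="ba_demo.adjlist", max_iterations=10)

# Show the five highest-ranked nodes; keys are node labels as strings.
top = sorted(ranks.items(), key=lambda kv: kv[1], reverse=True)[:5]
print(top)
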
def parallel_page_rank(G, hdfs_handle, old_ranks, iterations, max_iterations):
    '''Run parallel PageRank iterations over the HDFS copy of G until the ranks
    converge or max_iterations is reached; returns a dict of node label to rank.'''
    # One PageRank iteration as a Hadoop streaming job.
    hdfs.rm("PPR")
    hadoop_call = hdmc.build_generic_hadoop_call("PageRank_mapper.py", "PageRank_reducer.py",
                                                 hdfs_handle+"/page_rank", "PPR", ["rank_mass"])
    hdmc.execute_and_wait(hadoop_call)
    # Accumulate the rank mass lost at dangling nodes across all output partitions.
    listing = hdfs.ls("PPR/part*")["stdout"].rstrip().split("\n")
    lost_mass = 0.0
    for entry in listing:
        last_part = entry.split("part-")
        data = hdfs.cat("PPR/part-"+last_part[-1])["stdout"].split("\n")
        for line in data:
            line = line.strip().split()
            if "#lost_mass:" in line:
                lost_mass += float(line[1])
    os.system("echo " + str(lost_mass) + " > lost_mass")
    # Copy the output to the input, then redistribute the lost mass.
    hdfs.rm(hdfs_handle+"/page_rank/part*")
    hdfs.mv("PPR/part*", hdfs_handle+"/page_rank/")
    hdfs.rm("PPR")
    hadoop_call = hdmc.build_generic_hadoop_call("LostMass_mapper.py", "LostMass_reducer.py",
                                                 hdfs_handle+"/page_rank", "PPR", ["rank_mass", "lost_mass"])
    hdmc.execute_and_wait(hadoop_call)

    # Parse the updated ranks (lines containing "pr:<rank>") from the lost-mass
    # job's output, reusing the partition names from the first listing.
    ranks = {}
    rank_sum = 0.0
    for entry in listing:
        last_part = entry.split("part-")
        data = hdfs.cat("PPR/part-"+last_part[-1])["stdout"].split("\n")
        for line in data:
            pr_value = line.strip().split("pr:")
            if len(pr_value) > 1:
                rank = float(pr_value[-1])
                node = pr_value[0].split()[0]
                ranks[node] = rank
                rank_sum += rank

    # Converged when no node's rank moved by more than 1e-4 since the last iteration.
    converged = True
    for key in ranks.keys():
        if abs(ranks[key] - old_ranks[key]) > 0.0001:
            converged = False
            break

    iterations += 1
    # Copy the output to the input for the next iteration.
    hdfs.rm(hdfs_handle+"/page_rank/part*")
    hdfs.mv("PPR/part*", hdfs_handle+"/page_rank/")
    hdfs.rm("PPR")

    if not converged and iterations < max_iterations:
        return parallel_page_rank(G, hdfs_handle, ranks, iterations, max_iterations)
    else:
        return ranks
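For reference, here is a minimal single-machine sketch of the per-iteration update the two streaming passes above (PageRank_mapper.py/PageRank_reducer.py followed by LostMass_mapper.py/LostMass_reducer.py) are assumed to implement: each node splits its rank evenly over its out-neighbors, rank held by dangling nodes is collected as lost mass, and the lost-mass pass folds that mass back in uniformly together with a teleport term. The damping factor alpha and the uniform redistribution are assumptions; the actual mapper and reducer scripts are not shown in this module.

def local_pagerank_iteration(G, ranks, alpha=0.85):
    '''In-memory sketch of one lost-mass PageRank iteration (assumed behavior).'''
    n = float(len(G))
    new_ranks = dict.fromkeys(G.nodes(), 0.0)
    lost_mass = 0.0
    for node in G.nodes():
        out = list(G.neighbors(node))
        if out:
            # Distribute this node's rank evenly over its out-neighbors.
            share = ranks[node] / len(out)
            for nbr in out:
                new_ranks[nbr] += share
        else:
            # Dangling node: its rank would otherwise leak out of the graph.
            lost_mass += ranks[node]
    # Teleport term plus uniform redistribution of the lost mass, damped by alpha.
    for node in new_ranks:
        new_ranks[node] = (1.0 - alpha) / n + alpha * (lost_mass / n + new_ranks[node])
    return new_ranks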