
Source Code for Module ziggy.GraphReduce.gr_link_analysis

'''
Created on Aug 18, 2010

@author: dwmclary
'''
__author__ = "D. McClary (dan.mcclary@northwestern.edu)"
from .. import hdmc
from ..hdmc import hdfs
import hadoop_config as config
import networkx as nx
import os
import sys
import string
from GraphLoader import GraphLoader

def page_rank(G, name=None, max_iterations=10):
    # Seed every node with uniform rank mass (1/|V|) and record it for the mappers.
    os.system("echo " + str(1.0/float(len(G))) + " > rank_mass")
    wd = config.GraphReduce_location
    ranks = dict(zip(map(str, G.nodes()), [1.0/len(G)]*len(G)))
    # Write the graph as an adjacency list and stage it on HDFS.
    G = GraphLoader(G)
    if name:
        G.write_adjlist(name)
    else:
        G.write_adjlist("pbfs_input.adjlist")
    hdfs_handle = G.graph_handle.split("/")[-1]
    hdfs.rm(hdfs_handle+"/page_rank")
    hdfs.copyToHDFS(G.graph_handle, hdfs_handle+"/page_rank/part-00000")
    # Iterate map/reduce passes until the ranks converge or max_iterations is reached.
    ranking = parallel_page_rank(G, hdfs_handle, ranks, 0, max_iterations)
    return ranking

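A minimal usage sketch (not part of the module source): it assumes a working Hadoop/HDFS installation reachable through the hdmc/hdfs helpers, the PageRank_mapper.py and PageRank_reducer.py streaming scripts available to GraphReduce, and networkx installed; the graph and the "example.adjlist" filename are hypothetical.

    import networkx as nx
    from ziggy.GraphReduce.gr_link_analysis import page_rank

    G = nx.barabasi_albert_graph(100, 3)          # any networkx graph
    ranks = page_rank(G, name="example.adjlist", max_iterations=20)
    # ranks maps node ids (as strings) to PageRank scores
    top5 = sorted(ranks.items(), key=lambda kv: kv[1], reverse=True)[:5]
    print(top5)
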
def parallel_page_rank(G, hdfs_handle, old_ranks, iterations, max_iterations):
    # One PageRank sweep: run the streaming job, collect the mass lost to
    # dangling nodes, redistribute it, then test the new ranks for convergence.
    hdfs.rm("PPR")
    hadoop_call = hdmc.build_generic_hadoop_call("PageRank_mapper.py", "PageRank_reducer.py",
                                                 hdfs_handle+"/page_rank", "PPR", ["rank_mass"])
    hdmc.execute_and_wait(hadoop_call)
    listing = hdfs.ls("PPR/part*")["stdout"].rstrip().split("\n")
    # Accumulate the lost rank mass reported across all output partitions.
    lost_mass = 0.0
    for entry in listing:
        last_part = entry.split("part-")
        data = hdfs.cat("PPR/part-"+last_part[-1])["stdout"].split("\n")
        for line in data:
            line = line.strip().split()
            if "#lost_mass:" in line:
                lost_mass += float(line[1])
    os.system("echo " + str(lost_mass) + " > lost_mass")
    # copy the output to the input
    hdfs.rm(hdfs_handle+"/page_rank/part*")
    hdfs.mv("PPR/part*", hdfs_handle+"/page_rank/")
    hdfs.rm("PPR")
    hadoop_call = hdmc.build_generic_hadoop_call("LostMass_mapper.py", "LostMass_reducer.py",
                                                 hdfs_handle+"/page_rank", "PPR", ["rank_mass", "lost_mass"])
    hdmc.execute_and_wait(hadoop_call)

    # Parse the redistributed ranks ("<node> ... pr:<value>") from every partition.
    rank_sum = 0.0
    ranks = {}
    for entry in listing:
        last_part = entry.split("part-")
        data = hdfs.cat("PPR/part-"+last_part[-1])["stdout"].split("\n")
        for line in data:
            pr_value = line.strip().split("pr:")
            if len(pr_value) > 1:
                rank = float(pr_value[-1])
                node = pr_value[0].split()[0]
                ranks[node] = rank
                rank_sum += rank

    # Converged when no node's rank moved by more than 1e-4 since the last pass.
    converged = True
    for key in ranks.keys():
        if abs(ranks[key] - old_ranks[key]) > 0.0001:
            converged = False
            break

    iterations += 1
    # copy the output to the input
    hdfs.rm(hdfs_handle+"/page_rank/part*")
    hdfs.mv("PPR/part*", hdfs_handle+"/page_rank/")
    hdfs.rm("PPR")

    if not converged and iterations < max_iterations:
        return parallel_page_rank(G, hdfs_handle, ranks, iterations, max_iterations)
    else:
        return ranks
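
On small graphs the distributed result can be sanity-checked against networkx's in-process PageRank. A rough comparison sketch, assuming the streaming mapper/reducer use the conventional 0.85 damping factor (node keys returned by page_rank are strings, so convert before comparing):

    import networkx as nx

    def roughly_matches_networkx(G, distributed_ranks, tol=0.01):
        # In-memory reference ranking; tolerance is deliberately loose.
        reference = nx.pagerank(G)
        return all(abs(distributed_ranks[str(n)] - pr) <= tol
                   for n, pr in reference.items())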