Package ziggy :: Package GraphReduce :: Module gr_path
[hide private]
[frames | no frames]

Source Code for Module ziggy.GraphReduce.gr_path

  1  ''' 
  2  Created on Aug 12, 2010 
  3   
  4  @author: dwmclary 
  5  ''' 
  6   
  7  __author__ = "D. McClary (dan.mcclary@northwestern.edu)" 
  8  __all__ = ['out_degree', 'in_degree', 'average_degree', 'average_out_degree', 'average_in_degree',\ 
  9            ' connected_components', 'num_connected_components',\ 
 10             'single_source_shortest_path', 'single_source_shortest_path_length',\ 
 11             'shortest_path', 'shorested_path_length', 'average_shortest_path_length'] 
 12  from .. import hdmc 
 13  from .. hdmc import hdfs 
 14  import hadoop_config as config 
 15  import networkx as nx 
 16  import os 
 17  import sys 
 18  import string 
 19  from GraphLoader import GraphLoader 
 20   
def connected_components(G, name=None, recompute=False):
    '''Compute the connected components for a networkx graph G.

    For every node, shortest_path() yields a dictionary keyed by the nodes
    that have a recorded path from it.  Key sets that share at least one node
    are merged until the remaining sets are pairwise disjoint.

    Arguments:
    G -- the networkx graph to analyse.
    name -- optional adjacency-list name, passed through to shortest_path().
    recompute -- passed through to shortest_path().

    Returns a list of components, each one a list of node keys.  The order
    of the components and of the nodes inside them is not significant.
    '''
    paths = shortest_path(G, None, None, name, recompute)
    components = []
    for reached in paths.values():
        merged = set(reached)
        disjoint = []
        # Absorb *every* component that overlaps the new set, not just the
        # first one, so two previously separate components that are bridged
        # by this node end up as a single component.
        for component in components:
            if merged.intersection(component):
                merged |= component
            else:
                disjoint.append(component)
        disjoint.append(merged)
        components = disjoint
    # A real list (not a lazy map object) so callers can take len() of it.
    return [list(component) for component in components]
36
def num_connected_components(G, name=None, recompute=False):
    '''Count the connected components of the networkx graph G.

    All arguments are handed unchanged to connected_components(); the
    result is the length of the list it returns.
    '''
    return len(connected_components(G, name, recompute))
41
def single_source_shortest_path(G, source, target=None, name=None, recompute=False):
    '''Compute the shortest path from source to a target or all other nodes in the networkx graph G.

    Arguments:
    G -- the networkx graph to search.
    source -- the node the search starts from.
    target -- optional node; when given only the path to it is returned.
    name -- optional adjacency-list name used to look up / store results.
    recompute -- when true, always run a fresh BFS instead of looking for a
                 stored result first.

    Returns the path recorded for target (None if there is none) when a
    target is given, otherwise a dictionary mapping each node with a
    non-empty path to that path.
    '''
    if recompute:
        # Pass the name along so the fresh result is stored for this graph
        # rather than under the default adjacency-list name.
        distance, path = bfs(G, source, name)
    else:
        distance, path = check_for_precomputed_bfs_result(G, name, source)

    # "is not None" so that falsy node ids (0, '') still count as a target.
    if target is not None:
        return path.get(target)

    # Build a filtered copy: deleting entries while iterating over the keys
    # raises RuntimeError on Python 3.
    return dict((node, hops) for node, hops in path.items() if len(hops) != 0)
60
def single_source_shortest_path_length(G, source, target=None, name=None, recompute=False):
    '''Compute the shortest path length from source to a target or all other nodes in the networkx graph G.

    Arguments:
    G -- the networkx graph to search.
    source -- the node the search starts from.
    target -- optional node; when given only its distance is returned.
    name -- optional adjacency-list name used to look up / store results.
    recompute -- when true, always run a fresh BFS instead of looking for a
                 stored result first.

    Returns the distance recorded for target (None if there is none) when a
    target is given, otherwise a dictionary mapping each node with a finite
    distance to that distance.
    '''
    if recompute:
        # Pass the name along so the fresh result is stored for this graph
        # rather than under the default adjacency-list name.
        distance, path = bfs(G, source, name)
    else:
        distance, path = check_for_precomputed_bfs_result(G, name, source)

    # "is not None" so that falsy node ids (0, '') still count as a target.
    if target is not None:
        # The looked-up distance is the result; previously it was computed
        # and then dropped.
        return distance.get(target)

    # Build a filtered copy: deleting entries while iterating over the keys
    # raises RuntimeError on Python 3.
    unreachable = float('inf')
    return dict((node, hops) for node, hops in distance.items() if hops != unreachable)
77
def single_source_average_shortest_path_length(G, source, target=None, name=None, recompute=False):
    '''Compute the average shortest path length from source over the nodes of the networkx graph G.

    Arguments:
    G -- the networkx graph to search.
    source -- the node the search starts from.
    target -- accepted for symmetry with the other single-source functions;
              it is not used by this function.
    name -- optional adjacency-list name used to look up / store results.
    recompute -- when true, always run a fresh BFS instead of looking for a
                 stored result first.

    Returns the mean of all finite distances reported for source, or 0.0
    when no finite distance was reported at all.
    '''
    if recompute:
        # Pass the name along so the fresh result is stored for this graph
        # rather than under the default adjacency-list name.
        distance, path = bfs(G, source, name)
    else:
        distance, path = check_for_precomputed_bfs_result(G, name, source)

    unreachable = float('inf')
    finite = [hops for hops in distance.values() if hops != unreachable]
    if not finite:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    # Start from 0.0 so the division is always a float division.
    return sum(finite, 0.0) / len(finite)
93
def shortest_path(G, source=None, target=None, name=None, recompute=False):
    '''Compute the shortest path from each node to all other nodes in the networkx graph G.
    A source and target can optionally be passed to limit the search.

    Arguments:
    G -- the networkx graph to search.
    source -- optional start node; when given only that node is searched.
    target -- optional end node, passed to single_source_shortest_path().
    name -- optional adjacency-list name, passed through.
    recompute -- passed through to single_source_shortest_path().

    Returns the result of single_source_shortest_path() for source when a
    source is given, otherwise a dictionary mapping every node of G to its
    single_source_shortest_path() result.
    '''
    # "is not None" so that falsy node ids (0, '') still count as a source.
    if source is not None:
        # The single-source result has to be returned; it used to be
        # computed and then thrown away.
        return single_source_shortest_path(G, source, target, name, recompute)

    paths = {}
    for node in G.nodes():
        paths[node] = single_source_shortest_path(G, node, target, name, recompute)
    return paths
105
def shortest_path_length(G, source=None, target=None, name=None, recompute=False):
    '''Compute the shortest path length from each node to all other nodes in the networkx graph G.
    A source and target can optionally be passed to limit the search.

    Arguments:
    G -- the networkx graph to search.
    source -- optional start node; when given only that node is searched.
    target -- optional end node, passed to
              single_source_shortest_path_length().
    name -- optional adjacency-list name, passed through.
    recompute -- passed through to single_source_shortest_path_length().

    Returns the result of single_source_shortest_path_length() for source
    when a source is given, otherwise a dictionary mapping every node of G
    to its single_source_shortest_path_length() result.
    '''
    # "is not None" so that falsy node ids (0, '') still count as a source.
    if source is not None:
        # Return lengths (not paths) and actually hand the result back; the
        # previous code called the path function and discarded its result.
        return single_source_shortest_path_length(G, source, target, name, recompute)

    distances = {}
    for node in G.nodes():
        distances[node] = single_source_shortest_path_length(G, node, target, name, recompute)
    return distances
117
def average_shortest_path_length(G, name=None, recompute=False):
    '''Compute the average shortest path length over all nodes in the networkx graph G.

    Arguments:
    G -- the networkx graph to analyse.
    name -- optional adjacency-list name, passed through.
    recompute -- passed through to
                 single_source_average_shortest_path_length().

    Returns the mean of single_source_average_shortest_path_length() taken
    over every node of G, or 0.0 when G has no nodes.
    '''
    per_node = [single_source_average_shortest_path_length(G, node, None, name, recompute)
                for node in G.nodes()]
    if not per_node:
        # Empty graph: nothing to average; avoid dividing by zero.
        return 0.0
    # Start from 0.0 so the division is always a float division.
    return sum(per_node, 0.0) / len(per_node)
127
def average_out_degree(G, name=None):
    '''Return the mean out-degree of the networkx graph G.

    The per-node out-degrees come from degree(G, name); their sum is
    divided by the number of entries.
    '''
    _, out_by_node = degree(G, name)
    counts = out_by_node.values()
    return float(sum(counts)) / len(counts)
133
def average_in_degree(G, name=None):
    '''Return the mean in-degree of the networkx graph G.

    The per-node in-degrees come from degree(G, name); their sum is
    divided by the number of entries.
    '''
    in_by_node, _ = degree(G, name)
    counts = in_by_node.values()
    return float(sum(counts)) / len(counts)
139
def average_degree(G, name=None):
    '''Return the average degree of the networkx graph G.

    The value is the sum of all in-degrees and out-degrees reported by
    degree(G, name), divided by the combined number of entries in the two
    dictionaries.
    NOTE(review): because both dictionaries contribute to the denominator
    this is the mean of in- and out-degree per node, not their sum per
    node -- confirm that this is the intended definition.
    '''
    in_by_node, out_by_node = degree(G, name)
    total_out = sum(out_by_node.values())
    total_in = sum(in_by_node.values())
    entries = float(len(out_by_node.values())) + float(len(in_by_node.values()))
    return (total_out + total_in) / entries
146
def out_degree(G, name=None):
    '''Return the out-degree of every node in the networkx graph G.

    This is the second element of the pair returned by degree(G, name).
    '''
    _, out_by_node = degree(G, name)
    return out_by_node
151
def in_degree(G, name=None):
    '''Return the in-degree of every node in the networkx graph G.

    This is the first element of the pair returned by degree(G, name).
    '''
    in_by_node, _ = degree(G, name)
    return in_by_node
156
def degree(G, name=None):
    '''Compute the in- and out-degree of each node in the networkx graph G.

    The graph is wrapped in a GraphLoader and written out as an adjacency
    list (under name, or "pbfs_input.adjlist" when no name is given).  That
    file is copied to HDFS as the single input part of the degree job, and
    parallel_degree() is run on it.

    Returns the pair (in_degrees, out_degrees) produced by
    parallel_degree().
    '''
    loader = GraphLoader(G)
    loader.write_adjlist(name if name else "pbfs_input.adjlist")

    # The HDFS directory is named after the last path component of the
    # local adjacency-list file.
    handle = loader.graph_handle.split("/")[-1]
    hdfs.rm(handle+"/degree")
    hdfs.copyToHDFS(loader.graph_handle, handle+"/degree/part-00000")

    in_counts, out_counts = parallel_degree(handle)
    return in_counts, out_counts
169
def bfs(G, source, name=None):
    '''Conduct a parallel BFS from the source node to all other reachable nodes in G.

    Arguments:
    G -- the networkx graph to search.
    source -- the start node; it is converted to a string.
    name -- optional name for the adjacency-list file; defaults to
            "pbfs_input.adjlist".

    Side effects: writes the source id to the local file "pbfs_source",
    writes the adjacency list through GraphLoader, and replaces the
    "<handle>/shortest_path/<source>" directory on HDFS.

    Returns the pair (distance, path) produced by parallel_bfs().
    '''
    source = str(source)
    # Write the file directly instead of shelling out to "echo": the id is
    # stored verbatim even if it contains whitespace or shell
    # metacharacters.  The trailing newline matches what echo produced.
    with open("pbfs_source", "w") as source_file:
        source_file.write(source + "\n")

    # Initial bound handed to parallel_bfs() for its progress check.
    inf_count = len(G)

    loader = GraphLoader(G)
    if name:
        loader.write_adjlist(name)
    else:
        loader.write_adjlist("pbfs_input.adjlist")

    # The HDFS directory is named after the last path component of the
    # local adjacency-list file.
    hdfs_handle = loader.graph_handle.split("/")[-1]
    hdfs.rm(hdfs_handle+"/shortest_path/"+source)
    hdfs.copyToHDFS(loader.graph_handle, hdfs_handle+"/shortest_path/"+source+"/part-00000")
    distance, path = parallel_bfs(source, hdfs_handle, inf_count)
    return distance, path
186
def parallel_degree(hdfs_handle):
    '''Run the Hadoop degree job on the adjacency list stored under hdfs_handle.

    The job reads "<hdfs_handle>/degree" and writes to the scratch
    directory "pdegree"; afterwards its part files replace the input part
    files and the scratch directory is removed.

    Returns the pair (in_degrees, out_degrees) read back by
    fetch_degree_from_hdfs().
    '''
    degree_dir = hdfs_handle+"/degree"

    # Start from a clean scratch directory, then run the job.
    hdfs.rm("pdegree")
    job = hdmc.build_generic_hadoop_call("Degree_mapper.py", "Degree_reducer.py", degree_dir, "pdegree", [])
    hdmc.execute_and_wait(job)

    # Swap the job output in for the job input.
    hdfs.rm(degree_dir+"/part*")
    hdfs.mv("pdegree/part*", degree_dir+"/")
    hdfs.rm("pdegree")

    in_counts, out_counts = fetch_degree_from_hdfs(hdfs_handle)
    return in_counts, out_counts
198 199 200
def parallel_bfs(source, hdfs_handle, old_inf_count):
    '''Compute shortest path from source to all nodes in parallel for the graph adjacency list stored in hdfs_handle.

    Arguments:
    source -- the start node id (converted to a string).
    hdfs_handle -- HDFS directory holding the graph; the job works on
                   "<hdfs_handle>/shortest_path/<source>".
    old_inf_count -- the "#inf_count" total of the previous round (the
                     caller's initial bound for the first round).

    Each round runs the PBFS Hadoop job, sums the "#inf_count:<n>" lines
    found in the tail of every output part file, and moves the output over
    the input.  Rounds repeat while that total is positive and still
    shrinking.  (Presumably the total is the number of nodes not yet
    reached -- confirm against PBFS_reducer.py.)

    Returns the pair (results, paths) read back by fetch_sp_from_hdfs().
    '''
    source = str(source)
    job_output = "PBFS-src-"+source
    sp_dir = hdfs_handle+"/shortest_path/"+source

    # Iterate instead of recursing: one round is needed per BFS level, so a
    # recursive formulation can exceed the interpreter's recursion limit on
    # graphs with a very long shortest path.
    while True:
        hdfs.rm(job_output)
        hadoop_call = hdmc.build_generic_hadoop_call("PBFS_mapper.py", "PBFS_reducer.py", sp_dir, job_output, ["pbfs_source"])
        hdmc.execute_and_wait(hadoop_call)

        inf_count = 0
        listing = hdfs.ls(job_output+"/part*")["stdout"].rstrip().split("\n")
        for entry in listing:
            if "part-" not in entry:
                # Not a part file (e.g. an empty listing line); nothing to tail.
                continue
            part_suffix = entry.split("part-")[-1]
            tail = hdfs.tail(job_output+"/part-"+part_suffix)["stdout"].split("\n")
            for line in tail:
                fields = line.rstrip().split(":")
                # Require a value after the marker so a bare "#inf_count"
                # line cannot raise IndexError.
                if len(fields) > 1 and fields[0] == "#inf_count":
                    inf_count += int(fields[1])

        # copy the output to the input
        hdfs.rm(sp_dir+"/part*")
        hdfs.mv(job_output+"/part*", sp_dir+"/")
        hdfs.rm(job_output)

        if not (inf_count > 0 and old_inf_count > inf_count):
            break
        old_inf_count = inf_count

    results, paths = fetch_sp_from_hdfs(hdfs_handle, source)
    return results, paths
227
def fetch_sp_from_hdfs(hdfs_handle, source):
    '''Fetch shortest path results from HDFS.

    Arguments:
    hdfs_handle -- HDFS directory holding the graph.
    source -- the start node id as a string; results are read from
              "<hdfs_handle>/shortest_path/<source>/part*".

    Lines that are blank or start with "#" are skipped.  For every other
    line the first whitespace-separated token before "d:" is the node key,
    the first token after "d:" is its distance, and the text after the last
    "path:" is a comma-separated path.

    Returns the pair (results, paths): results maps node key to distance as
    a float, paths maps node key to a list of path entries with empty
    entries dropped.
    '''
    results = {}
    paths = {}
    output = hdfs.cat(hdfs_handle+"/shortest_path/"+source+"/part*")["stdout"].split("\n")
    for raw in output:
        record = raw.rstrip()
        # Skip blank / whitespace-only lines and "#" marker lines.
        if not record or record[0] == "#":
            continue
        distance_parts = record.split("d:")
        node = distance_parts[0].split()[0]
        results[node] = float(distance_parts[1].split()[0])
        # str.strip via a list comprehension works on Python 2 and 3
        # (string.strip no longer exists on 3, and map() is lazy there).
        hops = [hop.strip() for hop in record.split("path:")[-1].split(",")]
        paths[node] = [hop for hop in hops if hop != '']
    return results, paths
244
def fetch_degree_from_hdfs(hdfs_handle):
    '''Read the degree job results stored under hdfs_handle on HDFS.

    Every non-empty line of "<hdfs_handle>/degree/part*" that does not
    start with "#" is split on whitespace.  The first token is the node
    key; the positions of the "in:" and "out:" tokens delimit the two
    counted ranges.

    NOTE(review): both slices start at their marker token, so each count is
    one more than the number of tokens that follow the marker -- confirm
    against the output format of Degree_reducer.py.

    Returns the pair (in_degrees, out_degrees), each a dictionary keyed by
    node key.
    '''
    in_degrees, out_degrees = {}, {}
    raw = hdfs.cat(hdfs_handle+"/degree/part*")["stdout"]
    for record in raw.split("\n"):
        if len(record) == 0 or record[0] == "#":
            continue
        tokens = record.split()
        node = tokens[0]
        in_at = tokens.index("in:")
        out_at = tokens.index("out:")
        in_total = len(tokens[in_at:out_at])
        out_total = len(tokens[out_at:])
        in_degrees[node] = in_total
        out_degrees[node] = out_total
    return in_degrees, out_degrees
262
def check_for_precomputed_degree_result(G, name):
    '''Look for stored degree results for the networkx graph G.

    name is the adjacency-list name to look under; "pbfs_input.adjlist" is
    used when it is empty.  G itself is not read by this function.

    Returns the pair (in_degrees, out_degrees) from
    fetch_degree_from_hdfs(), or (None, None) when an AttributeError is
    raised while listing or fetching.
    '''
    adjlist_name = name if name else "pbfs_input.adjlist"
    try:
        # The listing is not inspected; the expression is evaluated only so
        # that a "stdout" value without .split() (e.g. None) raises
        # AttributeError and selects the fallback below.
        hdfs.ls(adjlist_name+'/degree')["stdout"].split("\n")
        in_by_node, out_by_node = fetch_degree_from_hdfs(adjlist_name)
    except AttributeError:
        in_by_node, out_by_node = None, None
    return in_by_node, out_by_node
274 275
def check_for_precomputed_bfs_result(G, name, source):
    '''Check to see if shortest path has been computed for the networkx graph G.

    Arguments:
    G -- the networkx graph; only used when a BFS has to be run.
    name -- adjacency-list name to look under; "pbfs_input.adjlist" is used
            when it is empty.
    source -- the start node (converted to a string).

    The listing of "<name>/shortest_path" is searched for an entry whose
    last path component equals the source id.  If one exists the stored
    result is fetched, otherwise bfs() is run.

    Returns the pair (distance, path).
    '''
    if not name:
        name = "pbfs_input.adjlist"
    # Listing entries are strings, and bfs() stores results under
    # str(source), so compare (and fetch) with the string form.
    source = str(source)

    # NOTE(review): the degree variant of this check guards against a
    # "stdout" value without .split(); treat such a value as an empty
    # listing here -- confirm what hdfs.ls returns for a missing path.
    listing = hdfs.ls(name+'/shortest_path')["stdout"] or ""
    result_exists = any(line.rstrip().split("/")[-1] == source
                        for line in listing.split("\n"))

    if result_exists:
        distance, path = fetch_sp_from_hdfs(name, source)
    else:
        # Pass the name along so the result is stored where the lookup
        # above will find it next time.
        distance, path = bfs(G, source, name)
    return distance, path
292