Package ziggy :: Package GraphReduce :: Module gr_path
[hide private]
[frames] | [no frames]

Source Code for Module ziggy.GraphReduce.gr_path

  1  ''' 
  2  Created on Aug 12, 2010 
  3   
  4  @author: dwmclary 
  5  ''' 
  6   
  7  __author__ = "D. McClary (dan.mcclary@northwestern.edu)" 
  8  __all__ = ['out_degree', 'in_degree', 'average_degree', 'average_out_degree', 'average_in_degree',\ 
  9            ' connected_components', 'num_connected_components',\ 
 10             'single_source_shortest_path', 'single_source_shortest_path_length',\ 
 11             'shortest_path', 'shorested_path_length', 'average_shortest_path_length'] 
 12  from .. import hdmc 
 13  from .. hdmc import hdfs 
 14  import hadoop_config as config 
 15  import networkx as nx 
 16  import os 
 17  import sys 
 18  import string 
 19  from GraphLoader import GraphLoader 
 20   
def connected_components(G, name = None, recompute=False):
    """Return the connected components of G as a list of node lists.

    Components are inferred from the all-pairs shortest-path results:
    two nodes end up in the same component when their reachability sets
    overlap.
    """
    all_paths = shortest_path(G, None, None, name, recompute)
    groups = []
    for node in all_paths.keys():
        reachable = all_paths[node].keys()
        merged = False
        for idx, group in enumerate(groups):
            # overlap with an existing component: fold this node's
            # reachable set into it
            if len(group.intersection(reachable)) > 0:
                groups[idx] = group.union(reachable)
                merged = True
                break
        if not merged:
            groups.append(set(reachable))
    return map(list, groups)
35
def num_connected_components(G, name=None, recompute=False):
    """Return the number of connected components of G."""
    return len(connected_components(G, name, recompute))
39
def single_source_shortest_path(G, source, target=None, name=None, recompute=False):
    """Return shortest paths from `source` computed by the parallel BFS.

    If `target` is given, return the path to `target` (None when
    unreachable).  Otherwise return a dict mapping each reachable node
    to its path, with empty (unreachable) entries pruned.
    """
    if not recompute:
        distance, path = check_for_precomputed_bfs_result(G, name, source)
    else:
        distance, path = bfs(G, source)

    if target:
        try:
            return path[target]
        except KeyError:
            # target not reachable from source
            return None
    # Prune nodes with no path.  BUG FIX: iterate over a snapshot of the
    # keys -- the original deleted from the dict while iterating the live
    # keys() view, which raises RuntimeError on Python 3.
    for key in list(path.keys()):
        if len(path[key]) == 0:
            del path[key]
    return path
57
def single_source_shortest_path_length(G, source, target=None, name=None, recompute=False):
    """Return shortest-path lengths from `source` computed by the parallel BFS.

    If `target` is given, return its distance (None when unreachable).
    Otherwise return a dict mapping node -> distance with infinite
    (unreachable) entries pruned.
    """
    if not recompute:
        distance, path = check_for_precomputed_bfs_result(G, name, source)
    else:
        distance, path = bfs(G, source)
    if target:
        try:
            # BUG FIX: the original assigned distance[target] to a local
            # and fell through, so every targeted query returned None.
            return distance[target]
        except KeyError:
            return None
    # BUG FIX: iterate a snapshot of the keys; deleting while iterating
    # the live keys() view breaks on Python 3.
    for key in list(distance.keys()):
        if distance[key] == float('inf'):
            del distance[key]
    return distance
73
def single_source_average_shortest_path_length(G, source, target=None, name=None, recompute=False):
    """Return the mean finite shortest-path distance from `source`.

    Unreachable nodes (distance inf) are excluded from the average.
    """
    if not recompute:
        distance, path = check_for_precomputed_bfs_result(G, name, source)
    else:
        distance, path = bfs(G, source)

    finite = [d for d in distance.values() if d != float('inf')]
    # 0.0 start value keeps the division a float division, as before
    return sum(finite, 0.0) / len(finite)
88
def shortest_path(G, source=None, target=None, name=None, recompute=False):
    """Return shortest paths in G.

    With `source`, delegate to single_source_shortest_path.  Without it,
    return a dict mapping every node to its single-source path dict.
    """
    if source:
        # BUG FIX: the original called the single-source routine but
        # discarded its result, returning None for every sourced query.
        return single_source_shortest_path(G, source, target, name, recompute)
    paths = {}
    for n in G.nodes():
        paths[n] = single_source_shortest_path(G, n, target, name, recompute)
    return paths
98
def shortest_path_length(G, source=None, target=None, name=None, recompute=False):
    """Return shortest-path lengths in G.

    With `source`, delegate to single_source_shortest_path_length.
    Without it, return a dict mapping every node to its single-source
    distance dict.
    """
    if source:
        # BUG FIX: the original called single_source_shortest_path (the
        # path variant, not the length variant) AND discarded the result.
        return single_source_shortest_path_length(G, source, target, name, recompute)
    distances = {}
    for n in G.nodes():
        distances[n] = single_source_shortest_path_length(G, n, target, name, recompute)
    return distances
108
def average_shortest_path_length(G,name=None, recompute=False):
    """Return the mean, over all nodes, of each node's average
    shortest-path length to its reachable nodes."""
    per_node = [single_source_average_shortest_path_length(G, n, None, name, recompute)
                for n in G.nodes()]
    # 0.0 start value keeps the division a float division, as before
    return sum(per_node, 0.0) / len(per_node)
116
def average_out_degree(G, name=None):
    """Return the mean out-degree of G (via the Hadoop degree job)."""
    in_d, out_d = degree(G, name)
    counts = out_d.values()
    return float(sum(counts)) / len(counts)
121
def average_in_degree(G, name=None):
    """Return the mean in-degree of G (via the Hadoop degree job)."""
    in_d, out_d = degree(G, name)
    counts = in_d.values()
    return float(sum(counts)) / len(counts)
126
def average_degree(G, name=None):
    """Return the mean degree of G over both in- and out-degree entries."""
    in_d, out_d = degree(G, name)
    total = sum(in_d.values()) + sum(out_d.values())
    count = float(len(in_d.values())) + float(len(out_d.values()))
    return total / count
132
def out_degree(G, name=None):
    """Return the out-degree dict of G computed by the Hadoop degree job."""
    return degree(G, name)[1]
136
def in_degree(G, name=None):
    """Return the in-degree dict of G computed by the Hadoop degree job."""
    return degree(G, name)[0]
140
def degree(G, name=None):
    """Stage G's adjacency list on HDFS and run the parallel degree job.

    Returns a ``(in_degree, out_degree)`` pair of dicts keyed by node.
    """
    loader = GraphLoader(G)
    if name:
        loader.write_adjlist(name)
    else:
        loader.write_adjlist("pbfs_input.adjlist")
    hdfs_handle = loader.graph_handle.split("/")[-1]
    # clear any stale staging data, then copy the fresh adjacency list up
    hdfs.rm(hdfs_handle + "/degree")
    hdfs.copyToHDFS(loader.graph_handle, hdfs_handle + "/degree/part-00000")
    return parallel_degree(hdfs_handle)
152
def bfs(G, source, name=None):
    """Run a parallel BFS from `source` over G via Hadoop streaming.

    Stages the graph's adjacency list on HDFS under a per-source
    shortest_path directory and returns ``(distance, path)`` dicts.
    """
    source = str(source)
    # Write the BFS source-node file for the streaming job directly.
    # BUG FIX: the original shelled out via os.system("echo ... > ...")
    # -- an unquoted shell command for a one-line file write.  Writing
    # the file in-process produces the same content (echo appends '\n')
    # without invoking a shell.  (Also removed the unused local read of
    # config.GraphReduce_location.)
    with open("pbfs_source", "w") as src_file:
        src_file.write(source + "\n")
    inf_count = len(G)  # initial upper bound on unreachable-node count
    loader = GraphLoader(G)
    if name:
        loader.write_adjlist(name)
    else:
        loader.write_adjlist("pbfs_input.adjlist")
    hdfs_handle = loader.graph_handle.split("/")[-1]
    # clear any stale result for this source, then stage the adjacency list
    hdfs.rm(hdfs_handle + "/shortest_path/" + source)
    hdfs.copyToHDFS(loader.graph_handle,
                    hdfs_handle + "/shortest_path/" + source + "/part-00000")
    distance, path = parallel_bfs(source, hdfs_handle, inf_count)
    return distance, path
168
def parallel_degree(hdfs_handle):
    """Launch the Hadoop streaming degree job for `hdfs_handle` and
    return ``(in_degree, out_degree)`` dicts parsed from its output."""
    hdfs.rm("pdegree")
    job = hdmc.build_generic_hadoop_call("Degree_mapper.py", "Degree_reducer.py",
                                         hdfs_handle+"/degree", "pdegree", [])
    hdmc.execute_and_wait(job)
    # move the job output back over the input so later calls can reuse it
    hdfs.rm(hdfs_handle+"/degree/part*")
    hdfs.mv("pdegree/part*", hdfs_handle+"/degree/")
    hdfs.rm("pdegree")
    return fetch_degree_from_hdfs(hdfs_handle)
179 180 181
def parallel_bfs(source, hdfs_handle, old_inf_count):
    """Run one round of the Hadoop-streaming BFS and recurse until the
    frontier stops expanding.

    `old_inf_count` is the unreachable-node count from the previous
    round; when the new count stops shrinking (or hits zero) the
    iteration has converged and the result is read back from HDFS.
    Returns ``(distance, path)`` dicts.
    """
    # remove any leftover output dir from a prior run of this source
    hdfs.rm("PBFS-src-"+str(source))
    hadoop_call = hdmc.build_generic_hadoop_call("PBFS_mapper.py", "PBFS_reducer.py", hdfs_handle+"/shortest_path/"+source, "PBFS-src-"+str(source), ["pbfs_source"])
    hdmc.execute_and_wait(hadoop_call)
    listing = hdfs.ls("PBFS-src-"+str(source)+"/part*")["stdout"].rstrip().split("\n")
    inf_count = 0
    # Each reducer part file ends with a "#inf_count:<n>" trailer line;
    # sum them across parts to get this round's unreachable-node count.
    for entry in listing:
        last_part = entry.split("part-")
        tail = hdfs.tail("PBFS-src-"+str(source)+"/part-"+last_part[-1])["stdout"].split("\n")

        for line in tail:
            tail_entry = line.rstrip().split(":")
            if len(tail_entry) > 0:
                if tail_entry[0] == "#inf_count":
                    inf_count += int(tail_entry[1])

    # copy the output to the input (next round -- or the final fetch --
    # reads from the shortest_path staging directory)
    hdfs.rm(hdfs_handle+"/shortest_path/"+source+"/part*")
    hdfs.mv("PBFS-src-"+str(source)+"/part*", hdfs_handle+"/shortest_path/"+source+"/")
    hdfs.rm("PBFS-src-"+str(source))
    # recurse while some nodes are still unreached AND the count is
    # still decreasing; otherwise the BFS has converged
    if inf_count > 0 and old_inf_count > inf_count:
        results, paths = parallel_bfs(source, hdfs_handle, inf_count)
    else:
        results, paths = fetch_sp_from_hdfs(hdfs_handle, source)
    return results, paths
207
def fetch_sp_from_hdfs(hdfs_handle, source):
    """Parse the BFS job output for `source` into ``(distance, path)`` dicts.

    Non-comment output lines carry a node label, a ``d:`` distance
    marker and a ``path:`` marker followed by a comma-separated node
    list.  Lines starting with ``#`` are skipped.
    """
    results = {}
    paths = {}
    output = hdfs.cat(hdfs_handle+"/shortest_path/"+source+"/part*")["stdout"].split("\n")
    for line in output:
        if len(line) == 0 or line[0] == "#":
            continue
        stripped = line.rstrip()
        dist_parts = stripped.split("d:")
        path_parts = stripped.split("path:")
        node = dist_parts[0].split()[0]
        results[node] = float(dist_parts[1].split()[0])
        hops = [h.strip() for h in path_parts[-1].split(",")]
        # drop the single empty token produced by an empty path field
        if '' in hops:
            hops.remove('')
        paths[node] = hops
    return results, paths
223
def fetch_degree_from_hdfs(hdfs_handle):
    """Parse the degree job output into ``(in_degree, out_degree)`` dicts.

    Non-comment output lines look like
    ``<node> ... in: <n1> <n2> ... out: <m1> <m2> ...``.

    NOTE(review): the span lengths below include the ``in:``/``out:``
    marker tokens themselves, so each count is one higher than the
    number of listed neighbours.  Preserved as-is; confirm against the
    Degree_reducer.py output format before changing.
    """
    in_degrees = {}
    out_degrees = {}
    output = hdfs.cat(hdfs_handle+"/degree/part*")["stdout"].split("\n")
    for line in output:
        if len(line) == 0 or line[0] == "#":
            continue
        tokens = line.split()
        node = tokens[0]
        in_marker = tokens.index("in:")
        out_marker = tokens.index("out:")
        # token-span lengths, marker included (see NOTE above); equal to
        # len(tokens[in_marker:out_marker]) / len(tokens[out_marker:])
        in_degrees[node] = out_marker - in_marker
        out_degrees[node] = len(tokens) - out_marker
    return in_degrees, out_degrees
240
def check_for_precomputed_degree_result(G, name):
    """Fetch a previously computed degree result for `name` from HDFS.

    Returns ``(in_degree, out_degree)`` dicts, or ``(None, None)`` when
    no usable result is found.
    """
    if not name:
        name = "pbfs_input.adjlist"
    try:
        # the ls call doubles as an existence probe -- presumably it
        # yields something without "stdout" when the result is missing,
        # raising the AttributeError caught below (TODO confirm)
        listing = hdfs.ls(name+'/degree')["stdout"].split("\n")
        in_d, out_d = fetch_degree_from_hdfs(name)
    except AttributeError:
        in_d = None
        out_d = None
    return in_d, out_d
251 252
def check_for_precomputed_bfs_result(G, name, source):
    """Return ``(distance, path)`` for `source`, reusing a cached BFS
    result on HDFS when one exists, otherwise running a fresh BFS."""
    if not name:
        name = "pbfs_input.adjlist"
    # each listing entry names a per-source result directory; its last
    # path component is the source node label
    listing = hdfs.ls(name+'/shortest_path')["stdout"].split("\n")
    cached = any(source == entry.rstrip().split("/")[-1] for entry in listing)
    if cached:
        return fetch_sp_from_hdfs(name, source)
    return bfs(G, source)
268