1 '''
2 Created on Aug 12, 2010
3
4 @author: dwmclary
5 '''
6
7 __author__ = "D. McClary (dan.mcclary@northwestern.edu)"
# Public names exported by a star-import of this module. Each entry must match
# a module-level name exactly: a padded or misspelled entry makes
# ``from <module> import *`` raise AttributeError.
__all__ = [
    'out_degree', 'in_degree', 'average_degree', 'average_out_degree', 'average_in_degree',
    'connected_components', 'num_connected_components',
    'single_source_shortest_path', 'single_source_shortest_path_length',
    'shortest_path', 'shortest_path_length', 'average_shortest_path_length',
]
12 from .. import hdmc
13 from .. hdmc import hdfs
14 import hadoop_config as config
15 import networkx as nx
16 import os
17 import sys
18 import string
19 from GraphLoader import GraphLoader
20
22 '''Compute the connected components for a networkx graph G'''
23 paths = shortest_path(G, None, None, name, recompute)
24 components = []
25 for p in paths.keys():
26 found = False
27 for c in components:
28 if len(c.intersection(paths[p].keys())) > 0:
29 c_index = components.index(c)
30 components[c_index] = c.union(paths[p].keys())
31 found = True
32 break
33 if not found:
34 components.append(set(paths[p].keys()))
35 return map(list, components)
36
38 '''Compute the number of connected components for the networkx graph G.'''
39 components = connected_components(G, name, recompute)
40 return len(components)
41
43 '''Compute the shortest path from source to a target or all other nodes in the networkx graph G.'''
44 if not recompute:
45 distance, path = check_for_precomputed_bfs_result(G, name, source)
46 else:
47 distance, path = bfs(G, source)
48
49 if target:
50 try:
51 target_path = path[target]
52 return target_path
53 except KeyError:
54 return None
55 else:
56 for key in path.keys():
57 if len(path[key]) == 0:
58 del path[key]
59 return path
60
62 '''Compute the shortest path length from source to a target or all other nodes in the networkx graph G.'''
63 if not recompute:
64 distance, path = check_for_precomputed_bfs_result(G, name, source)
65 else:
66 distance, path = bfs(G, source)
67 if target:
68 try:
69 target_distance = distance[target]
70 except KeyError:
71 return None
72 else:
73 for key in distance.keys():
74 if distance[key] == float('inf'):
75 del distance[key]
76 return distance
77
79 '''Compute the average shortest path length from source to a target or all other nodes in the networkx graph G.'''
80 sum = 0.0
81 count = 0
82 if not recompute:
83 distance, path = check_for_precomputed_bfs_result(G, name, source)
84 else:
85 distance, path = bfs(G, source)
86
87 for key in distance.keys():
88 if distance[key] != float('inf'):
89 sum += distance[key]
90 count += 1
91
92 return sum/count
93
def shortest_path(G, source=None, target=None, name=None, recompute=False):
    '''Compute shortest paths in the networkx graph G.

    With a source, delegate to single_source_shortest_path and return its
    result for that source (and optional target). Without a source, run
    single_source_shortest_path from every node in G.nodes() and return a
    dict keyed by the starting node.

    G         -- graph to search; only G.nodes() is used directly here.
    source    -- optional start node; None means "every node".
    target    -- optional end node, forwarded unchanged.
    name      -- forwarded unchanged (presumably the stored adjacency-list
                 name; confirm against the BFS helpers).
    recompute -- forwarded unchanged (presumably forces a fresh BFS rather
                 than reusing a stored result; confirm against the helpers).
    '''
    # Compare against None so falsy node labels such as 0 still count as a source.
    if source is not None:
        # Return the delegated result; `paths` only exists on the all-nodes branch.
        return single_source_shortest_path(G, source, target, name, recompute)

    paths = {}
    for n in G.nodes():
        paths[n] = single_source_shortest_path(G, n, target, name, recompute)
    return paths
105
107 '''Compute the shortest path length from each node to all other nodes in the networkx graph G.
108 A source and target can optionally be passed to limit the search.'''
109 if source:
110 single_source_shortest_path(G, source, target, name, recompute)
111 else:
112 distances = {}
113 for n in G.nodes():
114 this_distance = single_source_shortest_path_length(G, n, target, name, recompute)
115 distances[n] = this_distance
116 return distances
117
119 '''Compute the average shortest path length from each node to all other nodes in the networkx graph G.
120 '''
121 sum = 0.0
122 count = 0
123 for n in G.nodes():
124 sum += single_source_average_shortest_path_length(G, n, None, name, recompute)
125 count += 1
126 return sum/count
127
129 '''Compute the average out-degree for the networkx graph G.'''
130 in_d, out_d = degree(G, name)
131 average_out = float(sum(out_d.values()))/len(out_d.values())
132 return average_out
133
135 '''Compute the average in-degree for the networkx graph G.'''
136 in_d, out_d = degree(G, name)
137 average_in = float(sum(in_d.values()))/len(in_d.values())
138 return average_in
139
141 '''Compute the average degree for the networkx graph G.'''
142 in_d, out_d = degree(G, name)
143 average_out = sum(out_d.values())
144 average_in = sum(in_d.values())
145 return (average_out+average_in)/(float(len(out_d.values()))+float(len(in_d.values())))
146
148 '''Compute the out-degree for each node in the networkx graph G.'''
149 in_d, out_d = degree(G, name)
150 return out_d
151
153 '''Compute the in-degree for each node in the networkx graph G.'''
154 in_d, out_d = degree(G, name)
155 return in_d
156
169
def bfs(G, source, name=None):
    '''Conduct a parallel BFS from the source node to all other reachable nodes in G.

    G      -- graph to search; wrapped in a GraphLoader to write its adjacency list.
    source -- start node; converted with str() before use.
    name   -- optional adjacency-list name; "pbfs_input.adjlist" is used when
              it is empty or None.

    Returns the (distance, path) pair produced by parallel_bfs.
    '''
    source = str(source)
    # Write the source id directly instead of through a shell `echo`, so node
    # labels containing shell metacharacters cannot alter or break the command.
    with open("pbfs_source", "w") as source_file:
        source_file.write(source + "\n")

    # Node count is taken before G is rebound to its GraphLoader wrapper; it is
    # handed to parallel_bfs as the starting count.
    inf_count = len(G)
    G = GraphLoader(G)
    if name:
        G.write_adjlist(name)
    else:
        G.write_adjlist("pbfs_input.adjlist")

    # HDFS locations are keyed by the last component of graph_handle
    # (presumably the path of the adjacency list written above -- confirm).
    hdfs_handle = G.graph_handle.split("/")[-1]
    sp_dir = hdfs_handle + "/shortest_path/" + source
    # Remove any earlier result for this source, then seed the job input with the graph.
    hdfs.rm(sp_dir)
    hdfs.copyToHDFS(G.graph_handle, sp_dir + "/part-00000")

    distance, path = parallel_bfs(source, hdfs_handle, inf_count)
    return distance, path
186
188 '''Compute node degree in parallel for the graph adjacency list stored in hdfs_handle.'''
189 hdfs.rm("pdegree")
190 hadoop_call = hdmc.build_generic_hadoop_call("Degree_mapper.py", "Degree_reducer.py", hdfs_handle+"/degree", "pdegree", [])
191 hdmc.execute_and_wait(hadoop_call)
192
193 hdfs.rm(hdfs_handle+"/degree/part*")
194 hdfs.mv("pdegree/part*", hdfs_handle+"/degree/")
195 hdfs.rm("pdegree")
196 in_d, out_d = fetch_degree_from_hdfs(hdfs_handle)
197 return in_d, out_d
198
199
200
202 '''Compute shortest path from source to all nodes in parallel for the graph adjacency list stored in hdfs_handle.'''
203 hdfs.rm("PBFS-src-"+str(source))
204 hadoop_call = hdmc.build_generic_hadoop_call("PBFS_mapper.py", "PBFS_reducer.py", hdfs_handle+"/shortest_path/"+source, "PBFS-src-"+str(source), ["pbfs_source"])
205 hdmc.execute_and_wait(hadoop_call)
206 listing = hdfs.ls("PBFS-src-"+str(source)+"/part*")["stdout"].rstrip().split("\n")
207 inf_count = 0
208 for entry in listing:
209 last_part = entry.split("part-")
210 tail = hdfs.tail("PBFS-src-"+str(source)+"/part-"+last_part[-1])["stdout"].split("\n")
211
212 for line in tail:
213 tail_entry = line.rstrip().split(":")
214 if len(tail_entry) > 0:
215 if tail_entry[0] == "#inf_count":
216 inf_count += int(tail_entry[1])
217
218
219 hdfs.rm(hdfs_handle+"/shortest_path/"+source+"/part*")
220 hdfs.mv("PBFS-src-"+str(source)+"/part*", hdfs_handle+"/shortest_path/"+source+"/")
221 hdfs.rm("PBFS-src-"+str(source))
222 if inf_count > 0 and old_inf_count > inf_count:
223 results, paths = parallel_bfs(source, hdfs_handle, inf_count)
224 else:
225 results, paths = fetch_sp_from_hdfs(hdfs_handle, source)
226 return results, paths
227
229 '''Fetch shortest path results from HDFS.'''
230 results = {}
231 paths = {}
232 output = hdfs.cat(hdfs_handle+"/shortest_path/"+source+"/part*")["stdout"].split("\n")
233 for r in output:
234 if len(r) > 0:
235 if r[0] != "#":
236 o = r.rstrip().split("d:")
237 p = r.rstrip().split("path:")
238 nodes = o[0].split()
239 results[nodes[0]] = float(o[1].split()[0])
240 paths[nodes[0]] = map(string.strip, p[-1].split(","))
241 if '' in paths[nodes[0]]:
242 paths[nodes[0]].remove('')
243 return results, paths
244
246 '''Fetch degree results from HDFS.'''
247 in_degrees = {}
248 out_degrees = {}
249 output = hdfs.cat(hdfs_handle+"/degree/part*")["stdout"].split("\n")
250 for r in output:
251 if len(r) > 0:
252 if r[0] != "#":
253 entry = r.split()
254 key = entry[0]
255 in_index = entry.index("in:")
256 out_index = entry.index("out:")
257 in_count = len(entry[in_index:out_index])
258 out_count = len(entry[out_index:])
259 in_degrees[key] = in_count
260 out_degrees[key] = out_count
261 return in_degrees, out_degrees
262
264 '''Check to see if degree has been computed for the networkx graph G.'''
265 if not name:
266 name = "pbfs_input.adjlist"
267 try:
268 listing = hdfs.ls(name+'/degree')["stdout"].split("\n")
269 in_d, out_d = fetch_degree_from_hdfs(name)
270 except AttributeError:
271 in_d= None
272 out_d = None
273 return in_d, out_d
274
275
277 '''Check to see if shortest path has been computed for the networkx graph G.'''
278
279 if not name:
280 name = "pbfs_input.adjlist"
281 listing = hdfs.ls(name+'/shortest_path')["stdout"].split("\n")
282 result_exists = False
283 for line in listing:
284 entry = line.rstrip().split("/")[-1]
285 if source == entry:
286 result_exists = True
287 if result_exists:
288 distance, path = fetch_sp_from_hdfs(name, source)
289 else:
290 distance, path = bfs(G, source)
291 return distance, path
292