'''
Created on Aug 12, 2010

@author: dwmclary
'''

__author__ = "D. McClary (dan.mcclary@northwestern.edu)"
__all__ = ['out_degree', 'in_degree', 'average_degree',
           'average_out_degree', 'average_in_degree',
           'connected_components', 'num_connected_components',
           'single_source_shortest_path', 'single_source_shortest_path_length',
           'shortest_path', 'shortest_path_length', 'average_shortest_path_length']
from .. import hdmc
from ..hdmc import hdfs
import hadoop_config as config
import networkx as nx
import os
import sys
from GraphLoader import GraphLoader
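# Signature reconstructed from __all__ and the body's use of name/recompute.
def connected_components(G, name=None, recompute=False):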
    # Union the key-sets of the per-source path dictionaries: two sources
    # belong to the same component when their reachable sets overlap.
    paths = shortest_path(G, None, None, name, recompute)
    components = []
    for p in paths.keys():
        found = False
        for c in components:
            if len(c.intersection(paths[p].keys())) > 0:
                c_index = components.index(c)
                components[c_index] = c.union(paths[p].keys())
                found = True
                break
        if not found:
            components.append(set(paths[p].keys()))
    return [list(c) for c in components]
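# Reconstructed from __all__: counts the components found above.
def num_connected_components(G, name=None, recompute=False):
    return len(connected_components(G, name, recompute))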
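# Signature reconstructed from __all__ and the body below.
def single_source_shortest_path(G, source, target=None, name=None, recompute=False):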
    if not recompute:
        distance, path = check_for_precomputed_bfs_result(G, name, source)
    else:
        distance, path = bfs(G, source)

    if target:
        try:
            return path[target]
        except KeyError:
            return None
    else:
        # Drop nodes with empty paths (unreachable from the source);
        # iterate over a copy of the keys so deletion is safe.
        for key in list(path.keys()):
            if len(path[key]) == 0:
                del path[key]
        return path
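# Signature reconstructed from __all__ and the body below.
def single_source_shortest_path_length(G, source, target=None, name=None, recompute=False):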
    if not recompute:
        distance, path = check_for_precomputed_bfs_result(G, name, source)
    else:
        distance, path = bfs(G, source)
    if target:
        try:
            return distance[target]
        except KeyError:
            return None
    else:
        # Drop unreachable nodes (infinite distance); iterate over a copy
        # of the keys so deletion is safe.
        for key in list(distance.keys()):
            if distance[key] == float('inf'):
                del distance[key]
        return distance
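# Name and signature reconstructed: this block averages the finite BFS
# distances from a single source, so a hypothetical single-source name is
# used here rather than one of the exported names in __all__.
def single_source_average_shortest_path_length(G, source, name=None, recompute=False):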
    total = 0.0
    count = 0
    if not recompute:
        distance, path = check_for_precomputed_bfs_result(G, name, source)
    else:
        distance, path = bfs(G, source)

    # Average only over reachable nodes (finite distances).
    for key in distance.keys():
        if distance[key] != float('inf'):
            total += distance[key]
            count += 1

    return total/count
def shortest_path(G, source=None, target=None, name=None, recompute=False):
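    # Body reconstructed: the original lines are elided. This sketch
    # delegates to the single-source routine, computing all-pairs paths
    # (as connected_components expects) when no source is given.
    if source is not None:
        return single_source_shortest_path(G, source, target, name, recompute)
    paths = {}
    for n in G:
        paths[n] = single_source_shortest_path(G, n, target, name, recompute)
    return paths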
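# Signature reconstructed from __all__ and the body below.
def average_out_degree(G, name=None):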
    in_d, out_d = degree(G, name)
    average_out = float(sum(out_d.values()))/len(out_d.values())
    return average_out
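# Signature reconstructed from __all__ and the body below.
def average_in_degree(G, name=None):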
    in_d, out_d = degree(G, name)
    average_in = float(sum(in_d.values()))/len(in_d.values())
    return average_in
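# Signature reconstructed from __all__ and the body below.
def average_degree(G, name=None):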
    in_d, out_d = degree(G, name)
    # Sum of all degrees divided by the number of degree entries.
    out_total = sum(out_d.values())
    in_total = sum(in_d.values())
    return (out_total + in_total)/(float(len(out_d.values())) + float(len(in_d.values())))
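# Signature reconstructed from __all__ and the body below.
def out_degree(G, name=None):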
    in_d, out_d = degree(G, name)
    return out_d
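# Signature reconstructed from __all__ and the body below.
def in_degree(G, name=None):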
    in_d, out_d = degree(G, name)
    return in_d
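# Reconstructed helper: the original definition is elided. The degree
# accessors above all call degree(G, name); this minimal sketch only
# reuses a degree result already stored in HDFS, while the original
# presumably also launched the degree MapReduce job when none existed.
def degree(G, name=None):
    return check_for_precomputed_degree(name)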
def bfs(G, source, name=None):
    source = str(source)
    # Record the BFS source in the side file shipped with the MapReduce job.
    with open("pbfs_source", "w") as f:
        f.write(source + "\n")
    wd = config.GraphReduce_location
    # Initially every node counts as unreached (infinite distance).
    inf_count = len(G)
    G = GraphLoader(G)
    if name:
        G.write_adjlist(name)
    else:
        G.write_adjlist("pbfs_input.adjlist")
    hdfs_handle = G.graph_handle.split("/")[-1]
    hdfs.rm(hdfs_handle+"/shortest_path/"+source)
    hdfs.copyToHDFS(G.graph_handle, hdfs_handle+"/shortest_path/"+source+"/part-00000")
    distance, path = parallel_bfs(source, hdfs_handle, inf_count)
    return distance, path
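# Signature reconstructed from the call sites: bfs() seeds old_inf_count
# with len(G), and the recursion below passes the updated count.
def parallel_bfs(source, hdfs_handle, old_inf_count):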
    hdfs.rm("PBFS-src-"+str(source))
    hadoop_call = hdmc.build_generic_hadoop_call("PBFS_mapper.py", "PBFS_reducer.py", hdfs_handle+"/shortest_path/"+source, "PBFS-src-"+str(source), ["pbfs_source"])
    hdmc.execute_and_wait(hadoop_call)
    listing = hdfs.ls("PBFS-src-"+str(source)+"/part*")["stdout"].rstrip().split("\n")
    inf_count = 0
    # Tally how many nodes are still unreached from the "#inf_count" lines
    # each reducer appends to its part file.
    for entry in listing:
        last_part = entry.split("part-")
        tail = hdfs.tail("PBFS-src-"+str(source)+"/part-"+last_part[-1])["stdout"].split("\n")

        for line in tail:
            tail_entry = line.rstrip().split(":")
            if len(tail_entry) > 1 and tail_entry[0] == "#inf_count":
                inf_count += int(tail_entry[1])

    # Promote this round's output to be the next round's input.
    hdfs.rm(hdfs_handle+"/shortest_path/"+source+"/part*")
    hdfs.mv("PBFS-src-"+str(source)+"/part*", hdfs_handle+"/shortest_path/"+source+"/")
    hdfs.rm("PBFS-src-"+str(source))
    # Iterate while the unreached count is still shrinking; otherwise the
    # frontier is exhausted and the result can be read back.
    if inf_count > 0 and old_inf_count > inf_count:
        results, paths = parallel_bfs(source, hdfs_handle, inf_count)
    else:
        results, paths = fetch_sp_from_hdfs(hdfs_handle, source)
    return results, paths
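# Signature reconstructed from the call sites in parallel_bfs and
# check_for_precomputed_bfs_result.
def fetch_sp_from_hdfs(hdfs_handle, source):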
    results = {}
    paths = {}
    output = hdfs.cat(hdfs_handle+"/shortest_path/"+source+"/part*")["stdout"].split("\n")
    for r in output:
        # Skip blank lines and "#"-prefixed counter lines.
        if len(r) > 0 and r[0] != "#":
            # Each record carries a distance after "d:" and a
            # comma-separated path after "path:".
            o = r.rstrip().split("d:")
            p = r.rstrip().split("path:")
            nodes = o[0].split()
            results[nodes[0]] = float(o[1].split()[0])
            paths[nodes[0]] = [s.strip() for s in p[-1].split(",")]
            if '' in paths[nodes[0]]:
                paths[nodes[0]].remove('')
    return results, paths
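# Signature reconstructed from its call site below.
def fetch_degree_from_hdfs(hdfs_handle):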
    in_degrees = {}
    out_degrees = {}
    output = hdfs.cat(hdfs_handle+"/degree/part*")["stdout"].split("\n")
    for r in output:
        # Skip blank lines and "#"-prefixed counter lines.
        if len(r) > 0 and r[0] != "#":
            # Degree counts come from the token spans beginning at the
            # "in:" and "out:" markers of each record.
            entry = r.split()
            key = entry[0]
            in_index = entry.index("in:")
            out_index = entry.index("out:")
            in_count = len(entry[in_index:out_index])
            out_count = len(entry[out_index:])
            in_degrees[key] = in_count
            out_degrees[key] = out_count
    return in_degrees, out_degrees
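# Name reconstructed by analogy with check_for_precomputed_bfs_result;
# returns (None, None) when no degree output exists in HDFS.
def check_for_precomputed_degree(name=None):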
    if not name:
        name = "pbfs_input.adjlist"
    try:
        listing = hdfs.ls(name+'/degree')["stdout"].split("\n")
        in_d, out_d = fetch_degree_from_hdfs(name)
    except AttributeError:
        # No listing came back, so no precomputed result exists.
        in_d = None
        out_d = None
    return in_d, out_d
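# Signature reconstructed from the call sites in the shortest-path
# functions above.
def check_for_precomputed_bfs_result(G, name, source):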
    if not name:
        name = "pbfs_input.adjlist"
    listing = hdfs.ls(name+'/shortest_path')["stdout"].split("\n")
    result_exists = False
    # A precomputed result is stored under a directory named for the source.
    for line in listing:
        entry = line.rstrip().split("/")[-1]
        if source == entry:
            result_exists = True
    if result_exists:
        distance, path = fetch_sp_from_hdfs(name, source)
    else:
        distance, path = bfs(G, source)
    return distance, path