'''
Module for running Monte Carlo and other batch jobs on a Hadoop instance.
The module allows for the submission of scripts (and supporting files)
to a Hadoop MapReduce cluster for batch execution. Default operation runs
the submitted script for the specified number of iterations on the configured
Hadoop instance. By supplying an additional reducer script, data generated in
the batch process can be reduced/filtered/processed before it is written to HDFS
and made available to the user.

WARNING: Piped UNIX commands tend to fail when used as mappers and reducers. Instead,
write a Bash or Python script.

Created on Jul 28, 2010

@author: dwmclary
'''

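# Example usage (illustrative sketch only -- "simulate.py", the output path, and the
# argument string are hypothetical, and the module is assumed to be importable as hdmc):
#
#   import hdmc
#   # run simulate.py ~10000 times across the cluster and block until done
#   hdmc.submit_inline("simulate.py", "/home/user/results/pi_estimates",
#                      iterations=10000, arguments="--samples 1000000")
#
# The collected mapper output is downloaded from HDFS into the file named by
# output_data_name once the job completes.
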
import hadoop_config as config
from hdfs import hdfs_access as hdfs
import shlex
import subprocess
import sys
import os
import stat
from code_generator import CodeGenerator

def make_checkpointing_filter():
    '''Generates a python script which filters checkpointing results from HDMC.'''
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("line = line.strip()\n")
    c.write("try:\n")
    c.indent()
    c.write("key, value = line.split('==HDMC_CHECKPOINT==')\n")
    c.dedent()
    c.write("except ValueError:\n")
    c.indent()
    c.write("key='moredata'\n")
    c.write("value=line\n")
    c.dedent()
    c.write("yield key, value\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("seen_keys = []\n")
    c.write("data = read_input(sys.stdin)\n")
    c.write("for key, value in data:\n")
    c.indent()
    c.write("if key=='moredata':\n")
    c.indent()
    c.write("print value\n")
    c.dedent()
    c.write("elif len(value)>0 and key not in seen_keys:\n")
    c.indent()
    c.write("seen_keys.append(key)\n")
    c.write("print value\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("checkpoint_filter.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx checkpoint_filter.py")

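# For reference, the reducer emitted by make_checkpointing_filter() above is
# (reconstructed from the c.write() calls) essentially:
#
#     #! /usr/bin/env python
#     import sys
#
#     def read_input(file):
#         for line in file:
#             line = line.strip()
#             try:
#                 key, value = line.split('==HDMC_CHECKPOINT==')
#             except ValueError:
#                 key='moredata'
#                 value=line
#             yield key, value
#
#     def main():
#         seen_keys = []
#         data = read_input(sys.stdin)
#         for key, value in data:
#             if key=='moredata':
#                 print value
#             elif len(value)>0 and key not in seen_keys:
#                 seen_keys.append(key)
#                 print value
#
#     if __name__ == "__main__":
#         main()
#
# i.e. it keeps the first non-empty result per checkpoint name and passes
# un-checkpointed lines through unchanged.
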
def make_checkpointing_frame(script, checkpoint_names, checkpoint_dir, arguments="", debug=False):
    '''Generates a python script which, given a list of files to be processed,
    executes the specified script over the files in parallel via MapReduce.'''
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("from glob import glob\n")
    c.write("import sys, os, subprocess, shlex, random, time, re\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("wait_counter = 1\n")
    c.write("time.sleep(random.random())\n")

    c.write("all_checkpoints = "+str(checkpoint_names)+"\n")
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("while len(final_checkpoints) < len(all_checkpoints):\n")
    c.indent()
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("while this_checkpoint_end in current_checkpoints:\n")
    c.indent()
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("if len(final_checkpoints) == len(all_checkpoints):\n")
    c.indent()
    c.write("exit()\n")
    c.dedent()
    c.dedent()
    c.write("\n")
    c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
    c.write("cmd = "+cmd+"+['./'+this_checkpoint]\n")
    c.write("p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("output = output.strip()\n")
        c.write("if len(output) > 0:\n")
        c.indent()
        c.write("print this_checkpoint+'==HDMC_CHECKPOINT=='+ output\n")
        c.dedent()
        c.write("if len(error.strip()) > 0:\n")
        c.indent()
        c.write("os.system('rm "+checkpoint_dir+"'+'/'+this_checkpoint)\n")
        c.write("exit(1)\n")
        c.dedent()
        c.write("else:\n")
        c.indent()
        c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.dedent()
        c.write("os.system('rm "+checkpoint_dir+"/'+this_checkpoint+'_start')\n")
    else:
        c.write("print output.strip(),error.strip()\n")
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("checkpoint_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx checkpoint_frame.py")

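# Each successful run of the wrapped script causes the generated mapper to emit
# one record of the form
#
#     <checkpoint name>==HDMC_CHECKPOINT==<script stdout>
#
# for example (hypothetical): "input_004.dat==HDMC_CHECKPOINT==42.7".
# checkpoint_filter.py later keeps the first such line per checkpoint name.
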
def make_frame(script, arguments="", iterations=1, debug=False):
    '''Generates a basic python frame for running a batch job on a MapReduce cluster.'''
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys, os, subprocess, shlex, random\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
    c.write("for i in range("+str(iterations/config.num_map_tasks)+"):\n")
    c.indent()
    c.write("p = subprocess.Popen("+cmd+", stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("print output\n")
    else:
        c.write("print output,error\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx frame.py")

def get_output_hdfs_name(output_data_name):
    '''Given the full path to a file or directory, returns its HDFS equivalent.'''
    output_path = output_data_name.split("/")
    return output_path[len(output_path)-1]


def build_hadoop_call(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments=None, debug=False):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance. The function also generates the necessary execution frame.'''

    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming,
                   '-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]

    hadoop_call += ['-mapper', "frame.py"]

    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    file_list = ["-file", script]
    file_list += ["-file", "./frame.py"]
    if reduction_script:
        file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    make_frame(script, arguments, iterations, debug)
    return hadoop_call

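# The call array built above corresponds to a streaming invocation of roughly the
# following shape (the hadoop binary and streaming jar paths are placeholders for
# whatever hadoop_config provides):
#
#   hadoop jar hadoop-streaming.jar \
#       -input dummy -output <output name> \
#       -mapper frame.py -reducer NONE \
#       -file <script> -file ./frame.py [-file <supporting file> ...]
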
def build_checkpoint_call(script, output_data_name, supporting_file_list, reduction_script=None, arguments=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance. Expects checkpoint_frame.py to have been generated
    already (see make_checkpointing_frame).'''

    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming, '-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]

    hadoop_call += ['-mapper', "checkpoint_frame.py"]

    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    file_list = ["-file", script]
    file_list += ["-file", "./checkpoint_frame.py"]
    if reduction_script:
        file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call


def build_generic_hadoop_call(mapper, reducer, input, output, supporting_file_list=None, num_mappers=None, num_reducers=None, key_comparator=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance.'''

    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming]

    if num_mappers:
        hadoop_call += ["-D", "mapred.map.tasks="+str(num_mappers)]
    if num_reducers:
        hadoop_call += ["-D", "mapred.reduce.tasks="+str(num_reducers)]
    if key_comparator:
        hadoop_call += ["-D", "mapreduce.partition.keycomparator.options="+key_comparator]

    hadoop_call += ['-input', input, '-output', output]

    hadoop_call += ['-mapper', mapper]
    if reducer != "NONE":
        hadoop_call += ['-reducer', reducer]
    else:
        hadoop_call += ['-reducer', 'NONE']

    if reducer not in ["NONE", "aggregate"]:
        file_list = ["-file", mapper, "-file", reducer]
    else:
        file_list = ["-file", mapper]

    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call


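# A minimal usage sketch (mapper name and HDFS paths are hypothetical):
#
#   call = build_generic_hadoop_call("wordcount_map.py", "aggregate",
#                                    "books/", "wordcounts",
#                                    num_reducers=4)
#   execute_and_wait(call)
#
# which expands to: hadoop jar <streaming jar> -D mapred.reduce.tasks=4
# -input books/ -output wordcounts -mapper wordcount_map.py -reducer aggregate
# -file wordcount_map.py
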
def execute(hadoop_call):
    '''Nonblocking execution of the given call array.'''
    p = subprocess.Popen(hadoop_call)

def execute_and_wait(hadoop_call):
    '''Blocking execution of the given call array.'''
    p = subprocess.Popen(hadoop_call)
    sts = p.wait()
    return sts


def create_dummy_data():
    '''Creates a piece of dummy map input data in HDFS. This is necessary because
    Hadoop streaming requires input for mapping tasks.'''
    f = open("dummy", "w")
    print >> f, "dummy data"
    f.close()
    hdfs.copyToHDFS("dummy", "dummy")


def load_data_to_hdfs(input_data_file):
    '''Loads a data file to HDFS. For future use.'''
    input_path = input_data_file.split("/")
    hdfs_filename = input_path[len(input_path)-1]
    hdfs.copyToHDFS(input_data_file, hdfs_filename)

def download_hdfs_data(output_data_name):
    '''Given a full path, downloads an output directory from HDFS to the specified location.'''
    output_path = output_data_name.split("/")
    hdfs_filename = output_path[-1]
    f = open(output_data_name, "w")
    print >> f, hdfs.cat(hdfs_filename+"/part*")["stdout"]
    f.close()


def print_hdfs_data(output_data_name):
    '''Given a full path, prints the output of all parts of an HDFS directory.'''
    output_path = output_data_name.split("/")
    hdfs_filename = output_path[-1]
    print hdfs.cat(hdfs_filename+"/part*")["stdout"]


def set_checkpoint_directory(output_data_name):
    '''Creates a checkpoint directory for parallel file processing. This directory
    is always named hdmc_checkpoints and is created in the shared temporary space
    for the current user.'''
    output_path = output_data_name.split("/")
    output_path.pop()
    output_dir = config.shared_tmp_space+os.getlogin()
    try:
        os.mkdir(output_dir)
        os.system('chmod 777 '+ output_dir)
    except OSError:
        pass
    cwd = os.getcwd()
    os.chdir(output_dir)
    os.system("rm -rf hdmc_checkpoints")
    os.system("mkdir hdmc_checkpoints")
    os.system("chmod 777 hdmc_checkpoints")
    os.chdir(cwd)
    return output_dir+"/hdmc_checkpoints"

def get_checkpoint_names(file_list):
    '''Given a list of file or command names, produces checkpoint names by taking
    the last member of the array generated by splitting on /.'''
    checkpoints = []
    for f in file_list:
        path = f.split("/")
        checkpoints.append(path[-1])
    return checkpoints

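# For example (hypothetical paths):
#   get_checkpoint_names(["/data/runs/input_01.dat", "/data/runs/input_02.dat"])
#   == ["input_01.dat", "input_02.dat"]
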
def submit(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False):
    '''Submits a script to a MapReduce cluster as a non-blocking job and collects output
    in output_data_name. Supporting filenames can be passed
    as a list, as can a reducing/filtering script. Arguments to the submitted script
    should be passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug)
    execute(hadoop_call)


def submit_inline(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False):
    '''Submits a script to a MapReduce cluster as a blocking job and collects output
    in output_data_name. Supporting filenames can be passed
    as a list, as can a reducing/filtering script. Arguments to the submitted script
    should be passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug)
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)
    cleanup()


def submit_checkpoint_inline(script, output_data_name, file_list, reduction_script=None, arguments="", debug=False):
    '''Submits a script to a MapReduce cluster for
    parallel operation on a number of files. An optional reducer script can be
    applied as well, but should filter the map results by splitting file output
    on ==HDMC_CHECKPOINT==. Arguments to the submitted script
    should be passed as a string. Blocking.'''
    create_dummy_data()
    checkpoint_dir = set_checkpoint_directory(output_data_name)
    checkpoints = get_checkpoint_names(file_list)
    make_checkpointing_frame(script, checkpoints, checkpoint_dir, arguments, debug)
    if not reduction_script:
        reduction_script = "checkpoint_filter.py"
        make_checkpointing_filter()
    hadoop_call = build_checkpoint_call(script, output_data_name, file_list, reduction_script, arguments)
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)
    cleanup()
    return checkpoints


def submit_checkpoint(script, output_data_name, file_list, reduction_script=None, arguments="", debug=False):
    '''Submits a script to a MapReduce cluster for
    parallel operation on a number of files. An optional reducer script can be
    applied as well, but should filter the map results by splitting file output
    on ==HDMC_CHECKPOINT==. Arguments to the submitted script
    should be passed as a string. Non-blocking.'''
    create_dummy_data()
    checkpoint_dir = set_checkpoint_directory(output_data_name)
    checkpoints = get_checkpoint_names(file_list)
    make_checkpointing_frame(script, checkpoints, checkpoint_dir, arguments, debug)
    if not reduction_script:
        reduction_script = "checkpoint_filter.py"
        make_checkpointing_filter()
    hadoop_call = build_checkpoint_call(script, output_data_name, file_list, reduction_script, arguments)
    execute(hadoop_call)
    return checkpoints

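# Checkpoint-mode usage sketch (file names and the processing script are hypothetical):
#
#   files = ["/data/runs/input_01.dat", "/data/runs/input_02.dat"]
#   submit_checkpoint_inline("process_one.sh", "/home/user/results/combined",
#                            files, arguments="--threshold 0.5")
#
# Each map task repeatedly claims an unprocessed file through the shared checkpoint
# directory, runs process_one.sh with that file name appended as its final argument,
# and the default checkpoint_filter.py reducer keeps one result line per input file.
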
def cleanup():
    '''Remove files generated by HDMC.'''
    if os.path.isfile("frame.py"):
        os.remove("frame.py")
    if os.path.isfile("checkpoint_frame.py"):
        os.remove("checkpoint_frame.py")
    if os.path.isfile("checkpoint_filter.py"):
        os.remove("checkpoint_filter.py")
    if os.path.isfile("dummy"):
        os.remove("dummy")