'''
Module for running Monte Carlo and other batch jobs on a Hadoop instance.

The module allows for the submission of scripts (and supporting files)
to a Hadoop MapReduce cluster for batch execution. By default, the submitted
script is run for the specified number of iterations on the configured
Hadoop instance. By supplying an additional reducer script, data generated in
the batch process can be reduced/filtered/processed before it is written to
HDFS and made available to the user.

WARNING: Piped UNIX commands tend to fail when used as mappers and reducers.
Instead, write a bash or Python script.

Created on Jul 28, 2010

@author: dwmclary
'''
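
# A minimal usage sketch (not part of the module): assuming this module is
# importable as hdmc, and that pi_estimate.sh and mean_reducer.py are
# hypothetical user-supplied scripts, a batch job might look like:
#
#   import hdmc
#   hdmc.submit_inline("./pi_estimate.sh", "/home/user/pi_results",
#                      iterations=1000, reduction_script="./mean_reducer.py")
#
# submit() is the non-blocking variant; submit_inline() waits for the job to
# finish and downloads the HDFS output to the given path.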

import hadoop_config as config
from hdfs import hdfs_access as hdfs
import shlex
import subprocess
import sys
import os
import stat
from code_generator import CodeGenerator
26
28 c = CodeGenerator()
29 c.begin()
30 c.write("#! /usr/bin/env python\n\n")
31 c.write("import sys\n\n")
32 c.write("def read_input(file):\n")
33 c.indent()
34 c.write("for line in file:\n")
35 c.indent()
36 c.write("line = line.strip()\n")
37 c.write("try:\n")
38 c.indent()
39 c.write("key, value = line.split('==HDMC_CHECKPOINT==')\n")
40 c.dedent()
41 c.write("except ValueError:\n")
42 c.indent()
43 c.write("key='moredata'\n")
44 c.write("value=line\n")
45 c.dedent()
46 c.write("yield key, value\n")
47 c.dedent()
48 c.dedent()
49 c.write("\n\n")
50 c.write("def main():\n")
51 c.indent()
52 c.write("seen_keys = []\n")
53 c.write("data = read_input(sys.stdin)\n")
54 c.write("for key, value in data:\n")
55 c.indent()
56 c.write("if key=='moredata':\n")
57 c.indent()
58 c.write("print value\n")
59 c.dedent()
60 c.write("elif len(value)>0 and key not in seen_keys:\n")
61 c.indent()
62 c.write("seen_keys.append(key)\n")
63 c.write("print value\n")
64 c.dedent()
65 c.dedent()
66 c.dedent()
67 c.write("\n\n")
68 c.write('if __name__ == \"__main__\":\n')
69 c.indent()
70 c.write("main()\n")
71 c.write("\n")
72
73 frame_file = open("checkpoint_filter.py", "w")
74 print >> frame_file, c.end()
75 frame_file.close()
76 os.system("chmod a+rwx checkpoint_filter.py")
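
# For reference, the generated checkpoint_filter.py keeps only the first value
# seen for each checkpoint key and passes unkeyed lines through unchanged.  An
# illustrative (hypothetical) run:
#
#   stdin:   file_a==HDMC_CHECKPOINT==result for file_a
#            file_a==HDMC_CHECKPOINT==a duplicate result for file_a
#            a line with no checkpoint marker
#   stdout:  result for file_a
#            a line with no checkpoint marker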

def make_checkpoint_frame(script, checkpoint_dir, checkpoint_names, arguments="", debug=False):
    '''Generates a python script which, given a list of files to be processed,
    executes the specified script over the files in parallel via MapReduce.'''

    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("from glob import glob\n")
    c.write("import sys, os, subprocess, shlex, random, time, re\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("wait_counter = 1\n")
    c.write("time.sleep(random.random())\n")
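
    # The generated mapper claims work via checkpoint files: it picks a random
    # name from checkpoint_names, skips names that already have a '<name>_end'
    # marker in checkpoint_dir, touches '<name>_start', runs the script on that
    # file, and (when debug is False) touches '<name>_end' on success.  Mappers
    # loop until every name has an '_end' marker.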
    c.write("all_checkpoints = "+str(checkpoint_names)+"\n")
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("while len(final_checkpoints) < len(all_checkpoints):\n")
    c.indent()
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("while this_checkpoint_end in current_checkpoints:\n")
    c.indent()
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("if len(final_checkpoints) == len(all_checkpoints):\n")
    c.indent()
    c.write("exit()\n")
    c.dedent()
    c.dedent()
    c.write("\n")
    c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
    c.write("cmd = "+cmd+"+['./'+this_checkpoint]\n")
    c.write("p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("output = output.strip()\n")
        c.write("if len(output) > 0:\n")
        c.indent()
        c.write("print this_checkpoint+'==HDMC_CHECKPOINT=='+ output\n")
        c.dedent()
        c.write("if len(error.strip()) > 0:\n")
        c.indent()
        c.write("os.system('rm "+checkpoint_dir+"'+'/'+this_checkpoint)\n")
        c.write("exit(1)\n")
        c.dedent()
        c.write("else:\n")
        c.indent()
        c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.dedent()
        c.write("os.system('rm "+checkpoint_dir+"/'+this_checkpoint+'_start')\n")
    else:
        c.write("print output.strip(),error.strip()\n")
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("checkpoint_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx checkpoint_frame.py")

def make_frame(script, arguments="", iterations=1, debug=False):
    '''Generates a basic python frame for running a batch job on a MapReduce cluster.'''
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys, os, subprocess, shlex, random\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
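    # The generation-time division below splits the total iteration count across
    # config.num_map_tasks map tasks; each mapper runs the script
    # iterations/config.num_map_tasks times (integer division, so requesting
    # fewer iterations than map tasks yields an empty loop).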
    c.write("for i in range("+str(iterations/config.num_map_tasks)+"):\n")
    c.indent()
    c.write("p = subprocess.Popen("+cmd+", stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("print output\n")
    else:
        c.write("print output,error\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx frame.py")

def get_output_hdfs_name(output_data_name):
    '''Given the full path to a file or directory, returns its HDFS equivalent.'''
    output_path = output_data_name.split("/")
    return output_path[-1]


def build_hadoop_call(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments=None, debug=False):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance. The function also generates the necessary execution frame.'''

    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming,
                   '-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]

    hadoop_call += ['-mapper', "frame.py"]

    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    file_list = ["-file", script]
    file_list += ["-file", "./frame.py"]
    if reduction_script:
        file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    make_frame(script, arguments, iterations, debug)
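    # For illustration only: with script='./sim.py' and output_data_name
    # '/home/user/results' (hypothetical values) and no reduction script, the
    # returned list resembles
    #   [config.hadoop, 'jar', config.hadoop_streaming,
    #    '-input', 'dummy', '-output', 'results',
    #    '-mapper', 'frame.py', '-reducer', 'NONE',
    #    '-file', './sim.py', '-file', './frame.py']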
    return hadoop_call

def build_checkpoint_call(script, output_data_name, supporting_file_list, reduction_script=None, arguments=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance. Unlike build_hadoop_call, this function does not
    generate the execution frame; checkpoint_frame.py must be created beforehand.'''

    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming, '-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]

    hadoop_call += ['-mapper', "checkpoint_frame.py"]

    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    file_list = ["-file", script]
    file_list += ["-file", "./checkpoint_frame.py"]
    if reduction_script:
        file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call


def build_generic_hadoop_call(mapper, reducer, input, output, supporting_file_list=None, num_mappers=None, num_reducers=None, key_comparator=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance.'''

    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming]

    if num_mappers:
        hadoop_call += ["-D", "mapred.map.tasks="+str(num_mappers)]
    if num_reducers:
        hadoop_call += ["-D", "mapred.reduce.tasks="+str(num_reducers)]
    if key_comparator:
        hadoop_call += ["-D", "mapreduce.partition.keycomparator.options="+key_comparator]

    hadoop_call += ['-input', input, '-output', output]

    hadoop_call += ['-mapper', mapper]
    if reducer != "NONE":
        hadoop_call += ['-reducer', reducer]
    else:
        hadoop_call += ['-reducer', 'NONE']

    if reducer not in ["NONE", "aggregate"]:
        file_list = ["-file", mapper, "-file", reducer]
    else:
        file_list = ["-file", mapper]

    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call
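
# A hypothetical example of driving an ordinary streaming job with the helpers
# above (mapper.py, reducer.py, and the HDFS paths are placeholders):
#
#   call = build_generic_hadoop_call("mapper.py", "reducer.py",
#                                    "input_dir", "output_dir",
#                                    num_reducers=4)
#   status = execute_and_wait(call)
#
# Passing reducer="NONE" runs a map-only job, and reducer="aggregate" uses the
# Hadoop aggregate package, in which case no reducer file is shipped via -file.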

def execute(hadoop_call):
    '''Nonblocking execution of the given call array.'''
    p = subprocess.Popen(hadoop_call)


def execute_and_wait(hadoop_call):
    '''Blocking execution of the given call array.'''
    p = subprocess.Popen(hadoop_call)
    sts = p.wait()
    return sts


def create_dummy_data():
    '''Creates a piece of dummy map input data in HDFS. This is necessary because
    Hadoop streaming requires input for mapping tasks.'''
    f = open("dummy", "w")
    print >> f, "dummy data"
    f.close()
    hdfs.copyToHDFS("dummy", "dummy")


def load_data_to_hdfs(input_data_file):
    '''Loads a data file to HDFS. For future use.'''
    input_path = input_data_file.split("/")
    hdfs_filename = input_path[-1]
    hdfs.copyToHDFS(input_data_file, hdfs_filename)

def download_hdfs_data(output_data_name):
    '''Given a full path, downloads an output directory from HDFS to the specified location.'''
    output_path = output_data_name.split("/")
    hdfs_filename = output_path[-1]
    f = open(output_data_name, "w")
    print >> f, hdfs.cat(hdfs_filename+"/part*")["stdout"]
    f.close()

def print_hdfs_data(output_data_name):
    '''Given a full path, prints the output of all parts of an HDFS directory.'''
    output_path = output_data_name.split("/")
    hdfs_filename = output_path[-1]
    print hdfs.cat(hdfs_filename+"/part*")["stdout"]


def make_checkpoint_dir(output_data_name):
    '''Creates a checkpoint directory for parallel file processing. The directory
    is always named hdmc_checkpoints and is created inside config.shared_tmp_space
    under the current user's login name.'''
    output_path = output_data_name.split("/")
    output_path.pop()
    output_dir = config.shared_tmp_space+os.getlogin()
    try:
        os.mkdir(output_dir)
        os.system('chmod 777 '+ output_dir)
    except OSError:
        pass
    cwd = os.getcwd()
    os.chdir(output_dir)
    os.system("rm -rf hdmc_checkpoints")
    os.system("mkdir hdmc_checkpoints")
    os.system("chmod 777 hdmc_checkpoints")
    os.chdir(cwd)
    return output_dir+"/hdmc_checkpoints"

def make_checkpoints(file_list):
    '''Returns the basename of each file in file_list; these serve as checkpoint names.'''
    checkpoints = []
    for f in file_list:
        path = f.split("/")
        checkpoints.append(path[-1])
    return checkpoints


def submit(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False):
    '''Submits a script to a MapReduce cluster as a non-blocking job and collects
    output in output_data_name. Supporting filenames can be passed as a list, as
    can a reducing/filtering script. Arguments to the submitted script should be
    passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug)
    execute(hadoop_call)


def submit_inline(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False):
    '''Submits a script to a MapReduce cluster as a blocking job and collects
    output in output_data_name. Supporting filenames can be passed as a list, as
    can a reducing/filtering script. Arguments to the submitted script should be
    passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug)
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)


def submit_checkpoint(script, output_data_name, file_list, reduction_script=None, arguments="", debug=False):