
Source Code for Module ziggy.hdmc.hdmc

'''
Module for running Monte Carlo and other batch jobs on a Hadoop instance.
The module allows for the submission of scripts (and supporting files)
to a Hadoop MapReduce cluster for batch execution.  Default operation runs
the submitted script for the specified number of iterations on the configured
Hadoop instance.  By supplying an additional reducer script, data generated in
the batch process can be reduced/filtered/processed before it is written to HDFS
and made available to the user.

WARNING: Piped UNIX commands tend to fail when used as mappers and reducers.
Instead, write a Bash or Python script.

Created on Jul 28, 2010

@author: dwmclary
'''

import hadoop_config as config
from hdfs import hdfs_access as hdfs
import shlex
import subprocess
import sys
import os
import stat
from code_generator import CodeGenerator

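As a quick orientation, here is a minimal usage sketch of the workflow described above. The script and output names (roll_dice.py, tally.py, dice_results) are hypothetical, and a working hadoop_config is assumed; this example is not part of the module source.

# Hypothetical usage sketch -- not part of the module.
# Run ./roll_dice.py 1000 times across the cluster, block until the job
# finishes, and copy the collected output back to ./dice_results.
from ziggy.hdmc import hdmc

hdmc.submit_inline("./roll_dice.py", "dice_results", iterations=1000)

# The same job, with the mapper output post-processed by a reducer script
# before it is written to HDFS.
hdmc.submit_inline("./roll_dice.py", "dice_results", iterations=1000,
                   reduction_script="./tally.py")
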
def make_checkpointing_filter():
    '''Generates a python script which filters checkpointing results from HDMC.'''
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("line = line.strip()\n")
    c.write("try:\n")
    c.indent()
    c.write("key, value = line.split('==HDMC_CHECKPOINT==')\n")
    c.dedent()
    c.write("except ValueError:\n")
    c.indent()
    c.write("key='moredata'\n")
    c.write("value=line\n")
    c.dedent()
    c.write("yield key, value\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("seen_keys = []\n")
    c.write("data = read_input(sys.stdin)\n")
    c.write("for key, value in data:\n")
    c.indent()
    c.write("if key=='moredata':\n")
    c.indent()
    c.write("print value\n")
    c.dedent()
    c.write("elif len(value)>0 and key not in seen_keys:\n")
    c.indent()
    c.write("seen_keys.append(key)\n")
    c.write("print value\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("checkpoint_filter.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx checkpoint_filter.py")

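For reference, the script this function writes to checkpoint_filter.py expands to roughly the following (assuming CodeGenerator emits four-space indentation). It echoes unkeyed lines and keeps only the first non-empty result for each checkpoint key.

#! /usr/bin/env python

import sys

def read_input(file):
    for line in file:
        line = line.strip()
        try:
            key, value = line.split('==HDMC_CHECKPOINT==')
        except ValueError:
            key='moredata'
            value=line
        yield key, value


def main():
    seen_keys = []
    data = read_input(sys.stdin)
    for key, value in data:
        if key=='moredata':
            print value
        elif len(value)>0 and key not in seen_keys:
            seen_keys.append(key)
            print value


if __name__ == "__main__":
    main()
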
def make_checkpointing_frame(script, checkpoint_names, checkpoint_dir, arguments="", debug=False):
    '''Generates a python script which, given a list of files to be processed,
    executes the specified script over the files in parallel via MapReduce.'''

    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("from glob import glob\n")
    c.write("import sys, os, subprocess, shlex, random, time, re\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("wait_counter = 1\n")
    c.write("time.sleep(random.random())\n")
    #choose a checkpoint
    c.write("all_checkpoints = "+str(checkpoint_names)+"\n")
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("while len(final_checkpoints) < len(all_checkpoints):\n")
    c.indent()
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("while this_checkpoint_end in current_checkpoints:\n")
    c.indent()
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("if len(final_checkpoints) == len(all_checkpoints):\n")
    c.indent()
    c.write("exit()\n")
    c.dedent()
    c.dedent()
    c.write("\n")
    c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
    c.write("cmd = "+cmd+"+['./'+this_checkpoint]\n")
    c.write("p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("output = output.strip()\n")
        c.write("if len(output) > 0:\n")
        c.indent()
        c.write("print this_checkpoint+'==HDMC_CHECKPOINT=='+ output\n")
        c.dedent()
        c.write("if len(error.strip()) > 0:\n")
        c.indent()
        c.write("os.system('rm "+checkpoint_dir+"'+'/'+this_checkpoint)\n")
        c.write("exit(1)\n")
        c.dedent()
        c.write("else:\n")
        c.indent()
        c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.dedent()
        c.write("os.system('rm "+checkpoint_dir+"/'+this_checkpoint+'_start')\n")
    else:
        c.write("print output.strip(),error.strip()\n")
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("checkpoint_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx checkpoint_frame.py")

def make_frame(script, arguments="", iterations=1, debug=False):
    '''Generates a basic python frame for running a batch job on a MapReduce cluster.'''
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys, os, subprocess, shlex, random\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
    c.write("for i in range("+str(iterations/config.num_map_tasks)+"):\n")
    c.indent()
    c.write("p = subprocess.Popen("+cmd+", stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("print output\n")
    else:
        c.write("print output,error\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx frame.py")

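For illustration, calling make_frame with a hypothetical script ./sim.py, no arguments, iterations=1000, and config.num_map_tasks equal to 10 writes roughly the following frame.py; each map task then runs the script 1000/10 = 100 times (integer division) and streams its stdout back as mapper output.

#! /usr/bin/env python

import sys, os, subprocess, shlex, random

def main():
    os.system('chmod a+rwx sim.py')
    for i in range(100):
        p = subprocess.Popen(['./sim.py'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = p.communicate()
        sts = p.wait()
        print output


if __name__ == "__main__":
    main()
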
def get_output_hdfs_name(output_data_name):
    '''Given the full path to a file or directory, returns its HDFS equivalent.'''
    output_path = output_data_name.split("/")
    return output_path[len(output_path)-1]

def build_hadoop_call(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments=None, debug=False):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance.  The function also generates the necessary execution frame.'''
    # I/O setup
    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming,
                   '-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]
    #mapper name
    hadoop_call += ['-mapper', "frame.py"]

    #set the reducer
    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    #build the supporting file list
    file_list = ["-file", script]
    file_list += ["-file", "./frame.py"]
    if reduction_script:
        file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    make_frame(script, arguments, iterations, debug)
    return hadoop_call

def build_checkpoint_call(script, output_data_name, supporting_file_list, reduction_script=None, arguments=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance.  The checkpoint execution frame must be generated
    separately (see make_checkpointing_frame).'''
    # I/O setup
    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming,
                   '-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]
    #mapper name
    hadoop_call += ['-mapper', "checkpoint_frame.py"]

    #set the reducer
    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    #build the supporting file list
    file_list = ["-file", script]
    file_list += ["-file", "./checkpoint_frame.py"]
    if reduction_script:
        file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call

def build_generic_hadoop_call(mapper, reducer, input, output, supporting_file_list=None, num_mappers=None, num_reducers=None, key_comparator=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance.'''
    # I/O setup
    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming]

    #process mapper, reducer, and key comparator options
    if num_mappers:
        hadoop_call += ["-D", "mapred.map.tasks="+str(num_mappers)]
    if num_reducers:
        hadoop_call += ["-D", "mapred.reduce.tasks="+str(num_reducers)]
    if key_comparator:
        hadoop_call += ["-D", "mapreduce.partition.keycomparator.options="+key_comparator]

    hadoop_call += ['-input', input, '-output', output]

    #set mapper and reducer
    hadoop_call += ['-mapper', mapper]
    if reducer != "NONE":
        hadoop_call += ['-reducer', reducer]
    else:
        hadoop_call += ['-reducer', 'NONE']

    #build the supporting file list
    if reducer not in ["NONE", "aggregate"]:
        file_list = ["-file", mapper, "-file", reducer]
    else:
        file_list = ["-file", mapper]

    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call

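As a concrete illustration, with hypothetical config values (config.hadoop = '/usr/bin/hadoop', config.hadoop_streaming = '/usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar') and hypothetical mapper/reducer scripts, build_generic_hadoop_call('wc_map.py', 'wc_reduce.py', 'books', 'counts', num_reducers=4) returns a call array equivalent to:

['/usr/bin/hadoop', 'jar', '/usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar',
 '-D', 'mapred.reduce.tasks=4',
 '-input', 'books', '-output', 'counts',
 '-mapper', 'wc_map.py', '-reducer', 'wc_reduce.py',
 '-file', 'wc_map.py', '-file', 'wc_reduce.py']

This list is what execute() or execute_and_wait() hands directly to subprocess.Popen.
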
def execute(hadoop_call):
    '''Nonblocking execution of the given call array'''
    p = subprocess.Popen(hadoop_call)

def execute_and_wait(hadoop_call):
    '''Blocking execution of the given call array'''
    p = subprocess.Popen(hadoop_call)
    sts = p.wait()
    return sts

def create_dummy_data():
    '''Creates a piece of dummy map input data in HDFS.  This is necessary because
    Hadoop streaming requires input for mapping tasks.'''
    f = open("dummy", "w")
    print >> f, "dummy data"
    f.close()
    hdfs.copyToHDFS("dummy", "dummy")

def load_data_to_hfds(input_data_file):
    '''Loads a data file to HDFS.  For future use.'''
    input_path = input_data_file.split("/")
    hdfs_filename = input_path[len(input_path)-1]
    hdfs.copyToHDFS(input_data_file, hdfs_filename)

def download_hdfs_data(output_data_name):
    '''Given a full path, downloads an output directory from HDFS to the specified location.'''
    output_path = output_data_name.split("/")
    hdfs_filename = output_path[-1]
    f = open(output_data_name, "w")
    print >> f, hdfs.cat(hdfs_filename+"/part*")["stdout"]
    f.close()

def set_checkpoint_directory(output_data_name):
    '''Creates a checkpoint directory for parallel file processing.  This directory
    is always named hdmc_checkpoints and is created under the shared temporary space
    (config.shared_tmp_space) for the current user.'''
    output_path = output_data_name.split("/")
    output_path.pop()
    output_dir = config.shared_tmp_space+os.getlogin()
    try:
        os.mkdir(output_dir)
        os.system('chmod 777 '+ output_dir)
    except OSError:
        pass
    cwd = os.getcwd()
    os.chdir(output_dir)
    os.system("rm -rf hdmc_checkpoints")
    os.system("mkdir hdmc_checkpoints")
    os.system("chmod 777 hdmc_checkpoints")
    os.chdir(cwd)
    return output_dir+"/hdmc_checkpoints"

def get_checkpoint_names(file_list):
    '''Given a list of file or command names, produces checkpoint names by taking
    the last member of the array generated by splitting on /.'''
    checkpoints = []
    for f in file_list:
        path = f.split("/")
        checkpoints.append(path[-1])
    return checkpoints

def submit(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False):
    '''Submits a script as a non-blocking job to a MapReduce cluster and collects output
    in output_data_name.  Supporting filenames can be passed
    as a list, as can a reducing/filtering script.  Arguments to the submitted script
    should be passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug)
    execute(hadoop_call)

def submit_inline(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False):
    '''Submits a script as a blocking job to a MapReduce cluster and collects output
    in output_data_name.  Supporting filenames can be passed
    as a list, as can a reducing/filtering script.  Arguments to the submitted script
    should be passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug)
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)
    cleanup()

def submit_checkpoint_inline(script, output_data_name, file_list, reduction_script=None, arguments="", debug=False):
    '''Submits a script to a MapReduce cluster for
    parallel operation on a number of files.  An optional reducer script can be
    applied as well, but should filter the map results by splitting file output
    on ==HDMC_CHECKPOINT==.  Arguments to the submitted script
    should be passed as a string.  Blocking.'''
    create_dummy_data()
    checkpoint_dir = set_checkpoint_directory(output_data_name)
    checkpoints = get_checkpoint_names(file_list)
    make_checkpointing_frame(script, checkpoints, checkpoint_dir, arguments, debug)
    if not reduction_script:
        reduction_script = "checkpoint_filter.py"
        make_checkpointing_filter()
    hadoop_call = build_checkpoint_call(script, output_data_name, file_list, reduction_script, arguments)
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)
    cleanup()
    return checkpoints

def submit_checkpoint(script, output_data_name, file_list, reduction_script=None, arguments="", debug=False):
    '''Submits a script to a MapReduce cluster for
    parallel operation on a number of files.  An optional reducer script can be
    applied as well, but should filter the map results by splitting file output
    on ==HDMC_CHECKPOINT==.  Arguments to the submitted script
    should be passed as a string.  Non-blocking.'''
    create_dummy_data()
    checkpoint_dir = set_checkpoint_directory(output_data_name)
    checkpoints = get_checkpoint_names(file_list)
    make_checkpointing_frame(script, checkpoints, checkpoint_dir, arguments, debug)
    if not reduction_script:
        reduction_script = "checkpoint_filter.py"
        make_checkpointing_filter()
    hadoop_call = build_checkpoint_call(script, output_data_name, file_list, reduction_script, arguments)
    execute(hadoop_call)
    return checkpoints

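A usage sketch of the checkpointed path follows; the file and script names are hypothetical and the example is not part of the module. Because the generated checkpoint frame appends the claimed checkpoint name to the command, the submitted script should treat its final argument as the file to process.

# Hypothetical example -- not part of the module.
# Run ./parse_log.sh once per input file, in parallel across the cluster,
# blocking until all files have been processed.
files = ["logs/day1.log", "logs/day2.log", "logs/day3.log"]
checkpoints = hdmc.submit_checkpoint_inline("./parse_log.sh", "parsed_logs", files)
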
def cleanup():
    '''Remove files generated by HDMC.'''
    if os.path.isfile("frame.py"):
        os.remove("frame.py")
    if os.path.isfile("checkpoint_frame.py"):
        os.remove("checkpoint_frame.py")
    if os.path.isfile("checkpoint_filter.py"):
        os.remove("checkpoint_filter.py")
    if os.path.isfile("dummy"):
        os.remove("dummy")