
Source Code for Module ziggy.hdmc.hdmc

'''
Module for running Monte Carlo and other batch jobs on a Hadoop instance.
The module allows for the submission of scripts (and supporting files)
to a Hadoop MapReduce cluster for batch execution.  Default operation runs
the submitted script for the specified number of iterations on the configured
Hadoop instance.  By supplying an additional reducer script, data generated in
the batch process can be reduced/filtered/processed before it is written to HDFS
and made available to the user.

WARNING: Piped UNIX commands tend to fail when used as mappers and reducers.
Instead, write a bash or Python script.

Created on Jul 28, 2010

@author: dwmclary
'''

import hadoop_config as config
from hdfs import hdfs_access as hdfs
import shlex
import subprocess
import sys
import os
import stat
from code_generator import CodeGenerator

def make_checkpointing_filter():
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("line = line.strip()\n")
    c.write("try:\n")
    c.indent()
    c.write("key, value = line.split('==HDMC_CHECKPOINT==')\n")
    c.dedent()
    c.write("except ValueError:\n")
    c.indent()
    c.write("key='moredata'\n")
    c.write("value=line\n")
    c.dedent()
    c.write("yield key, value\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("seen_keys = []\n")
    c.write("data = read_input(sys.stdin)\n")
    c.write("for key, value in data:\n")
    c.indent()
    c.write("if key=='moredata':\n")
    c.indent()
    c.write("print value\n")
    c.dedent()
    c.write("elif len(value)>0 and key not in seen_keys:\n")
    c.indent()
    c.write("seen_keys.append(key)\n")
    c.write("print value\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == \"__main__\":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("checkpoint_filter.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx checkpoint_filter.py")

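# Illustrative sketch (not part of the generated script): checkpoint_filter.py is used
# as a reducer.  Mapper lines carrying the '==HDMC_CHECKPOINT==' marker are de-duplicated
# by checkpoint name; unmarked lines pass straight through.  Given the hypothetical
# reducer input
#
#   chunk_a==HDMC_CHECKPOINT==result 0.41
#   chunk_a==HDMC_CHECKPOINT==result 0.41
#   some free-form output line
#
# the filter would print
#
#   result 0.41
#   some free-form output line
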
def make_checkpointing_frame(script, checkpoint_names, checkpoint_dir, arguments="", debug=False):
    '''Generates a python script which, given a list of files to be processed,
    executes the specified script over the files in parallel via MapReduce.'''

    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("from glob import glob\n")
    c.write("import sys, os, subprocess, shlex, random, time, re\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("wait_counter = 1\n")
    c.write("time.sleep(random.random())\n")
    # choose a checkpoint
    c.write("all_checkpoints = "+str(checkpoint_names)+"\n")
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("while len(final_checkpoints) < len(all_checkpoints):\n")
    c.indent()
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("while this_checkpoint_end in current_checkpoints:\n")
    c.indent()
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("if len(final_checkpoints) == len(all_checkpoints):\n")
    c.indent()
    c.write("exit()\n")
    c.dedent()
    c.dedent()
    c.write("\n")
    c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
    c.write("cmd = "+cmd+"+['./'+this_checkpoint]\n")
    c.write("p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("output = output.strip()\n")
        c.write("if len(output) > 0:\n")
        c.indent()
        c.write("print this_checkpoint+'==HDMC_CHECKPOINT=='+ output\n")
        c.dedent()
        c.write("if len(error.strip()) > 0:\n")
        c.indent()
        c.write("os.system('rm "+checkpoint_dir+"'+'/'+this_checkpoint)\n")
        c.write("exit(1)\n")
        c.dedent()
        c.write("else:\n")
        c.indent()
        c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.dedent()
        c.write("os.system('rm "+checkpoint_dir+"/'+this_checkpoint+'_start')\n")
    else:
        c.write("print output.strip(),error.strip()\n")
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == \"__main__\":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("checkpoint_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx checkpoint_frame.py")

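# Rough outline of the script generated above (a behavioural sketch, not a verbatim
# rendering): each mapper instance repeatedly picks a random checkpoint name, skips
# names that already have an '_end' marker in checkpoint_dir, touches '<name>_start',
# runs the user script with './<name>' appended as its final argument, prefixes any
# stdout with '<name>==HDMC_CHECKPOINT==' so the filter above can de-duplicate it, and
# touches '<name>_end' on success (the frame exits with status 1 if the script reported
# errors).  The loop ends once every checkpoint has an '_end' marker.
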
def make_frame(script, arguments="", iterations=1, debug=False):
    '''Generates a basic python frame for running a batch job on a MapReduce cluster.'''
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys, os, subprocess, shlex, random\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
    c.write("for i in range("+str(iterations/config.num_map_tasks)+"):\n")
    c.indent()
    c.write("p = subprocess.Popen("+cmd+", stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("print output\n")
    else:
        c.write("print output,error\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == \"__main__\":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx frame.py")

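# For illustration only: with script='./my_sim.py' and arguments='--trials 50' (both
# hypothetical), iterations=1000, and config.num_map_tasks assumed to be 100, the
# generated frame.py would read roughly:
#
#   #! /usr/bin/env python
#
#   import sys, os, subprocess, shlex, random
#
#   def main():
#       os.system('chmod a+rwx my_sim.py')
#       for i in range(10):
#           p = subprocess.Popen(['./my_sim.py', '--trials', '50'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#           output, error = p.communicate()
#           sts = p.wait()
#           print output
#
#   if __name__ == "__main__":
#       main()
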
def get_output_hdfs_name(output_data_name):
    '''Given the full path to a file or directory, returns its HDFS equivalent.'''
    output_path = output_data_name.split("/")
    return output_path[len(output_path)-1]

def build_hadoop_call(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments=None, debug=False):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance.  The function also generates the necessary execution frame.'''
    # I/O setup
    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming,
                   '-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]
    # mapper name
    hadoop_call += ['-mapper', "frame.py"]

    # set the reducer
    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    # build the supporting file list
    file_list = ["-file", script]
    file_list += ["-file", "./frame.py"]
    if reduction_script:
        file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    make_frame(script, arguments, iterations, debug)
    return hadoop_call

def build_checkpoint_call(script, output_data_name, supporting_file_list, reduction_script=None, arguments=None):
    '''Builds a call array suitable for subprocess.Popen which submits a checkpointed streaming
    job to the configured MapReduce instance.  The execution frame (checkpoint_frame.py) is
    generated separately by make_checkpointing_frame.'''
    # I/O setup
    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming, '-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]
    # mapper name
    hadoop_call += ['-mapper', "checkpoint_frame.py"]

    # set the reducer
    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    # build the supporting file list
    file_list = ["-file", script]
    file_list += ["-file", "./checkpoint_frame.py"]
    if reduction_script:
        file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call

def build_generic_hadoop_call(mapper, reducer, input, output, supporting_file_list=None, num_mappers=None, num_reducers=None, key_comparator=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance.'''
    # I/O setup
    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming]

    # process mapper, reducer, and key comparator options
    if num_mappers:
        hadoop_call += ["-D", "mapred.map.tasks="+str(num_mappers)]
    if num_reducers:
        hadoop_call += ["-D", "mapred.reduce.tasks="+str(num_reducers)]
    if key_comparator:
        hadoop_call += ["-D", "mapreduce.partition.keycomparator.options="+key_comparator]

    hadoop_call += ['-input', input, '-output', output]

    # set mapper and reducer
    hadoop_call += ['-mapper', mapper]
    if reducer != "NONE":
        hadoop_call += ['-reducer', reducer]
    else:
        hadoop_call += ['-reducer', 'NONE']

    # build the supporting file list
    if reducer not in ["NONE", "aggregate"]:
        file_list = ["-file", mapper, "-file", reducer]
    else:
        file_list = ["-file", mapper]

    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call

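# Illustrative example (script and path names hypothetical): a call such as
#
#   build_generic_hadoop_call("wc_map.py", "wc_reduce.py", "books", "counts", num_reducers=4)
#
# returns a list equivalent to the command line
#
#   <config.hadoop> jar <config.hadoop_streaming> -D mapred.reduce.tasks=4 \
#       -input books -output counts -mapper wc_map.py -reducer wc_reduce.py \
#       -file wc_map.py -file wc_reduce.py
#
# which can then be handed to execute() or execute_and_wait() below.
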
def execute(hadoop_call):
    '''Nonblocking execution of the given call array.'''
    p = subprocess.Popen(hadoop_call)

def execute_and_wait(hadoop_call):
    '''Blocking execution of the given call array.'''
    p = subprocess.Popen(hadoop_call)
    sts = p.wait()
    return sts

def create_dummy_data():
    '''Creates a piece of dummy map input data in HDFS.  This is necessary because
    Hadoop streaming requires input for mapping tasks.'''
    f = open("dummy", "w")
    print >> f, "dummy data"
    f.close()
    hdfs.copyToHDFS("dummy", "dummy")

def load_data_to_hfds(input_data_file):
    '''Loads a data file to HDFS.  For future use.'''
    input_path = input_data_file.split("/")
    hdfs_filename = input_path[len(input_path)-1]
    hdfs.copyToHDFS(input_data_file, hdfs_filename)

def download_hdfs_data(output_data_name):
    '''Given a full path, downloads an output directory from HDFS to the specified location.'''
    output_path = output_data_name.split("/")
    hdfs_filename = output_path[-1]
    f = open(output_data_name, "w")
    print >> f, hdfs.cat(hdfs_filename+"/part*")["stdout"]
    f.close()

def set_checkpoint_directory(output_data_name):
    '''Creates a checkpoint directory for parallel file processing.  This directory
    is always named hdmc_checkpoints and is created in the configured shared temporary
    space under a per-user directory.'''
    output_path = output_data_name.split("/")
    output_path.pop()
    output_dir = config.shared_tmp_space+os.getlogin()
    try:
        os.mkdir(output_dir)
        os.system('chmod 777 '+ output_dir)
    except OSError:
        pass
    cwd = os.getcwd()
    os.chdir(output_dir)
    os.system("rm -rf hdmc_checkpoints")
    os.system("mkdir hdmc_checkpoints")
    os.system("chmod 777 hdmc_checkpoints")
    os.chdir(cwd)
    return output_dir+"/hdmc_checkpoints"

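# For example (illustrative values): with config.shared_tmp_space = '/shared/tmp/' and a
# user named 'alice', the checkpoints live in /shared/tmp/alice/hdmc_checkpoints; the
# running frames then create marker files such as chunk_a_start and chunk_a_end inside it.
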
def get_checkpoint_names(file_list):
    '''Returns the trailing path component of each entry in file_list; these serve as checkpoint names.'''
    checkpoints = []
    for f in file_list:
        path = f.split("/")
        checkpoints.append(path[-1])
    return checkpoints

def submit(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False):
    '''Submits a script as a non-blocking job to a MapReduce cluster and collects output
    in output_data_name.  Supporting filenames can be passed
    as a list, as can a reducing/filtering script.  Arguments to the submitted script
    should be passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug)
    execute(hadoop_call)

def submit_inline(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False):
    '''Submits a script as a blocking job to a MapReduce cluster and collects output
    in output_data_name.  Supporting filenames can be passed
    as a list, as can a reducing/filtering script.  Arguments to the submitted script
    should be passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug)
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)

def submit_checkpoint_inline(script, output_data_name, file_list, reduction_script=None, arguments="", debug=False):
    '''Submits a blocking checkpointed job which runs script over each file in file_list
    in parallel and collects the filtered output in output_data_name.'''
    create_dummy_data()
    checkpoint_dir = set_checkpoint_directory(output_data_name)
    checkpoints = get_checkpoint_names(file_list)
    make_checkpointing_frame(script, checkpoints, checkpoint_dir, arguments, debug)
    if not reduction_script:
        reduction_script = "checkpoint_filter.py"
        make_checkpointing_filter()
    hadoop_call = build_checkpoint_call(script, output_data_name, file_list, reduction_script, arguments)
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)
    return checkpoints

def submit_checkpoint(script, output_data_name, file_list, reduction_script=None, arguments="", debug=False):
    '''Non-blocking variant of submit_checkpoint_inline: submits the checkpointed job and
    returns the checkpoint names without waiting for completion.'''
    create_dummy_data()
    checkpoint_dir = set_checkpoint_directory(output_data_name)
    checkpoints = get_checkpoint_names(file_list)
    make_checkpointing_frame(script, checkpoints, checkpoint_dir, arguments, debug)
    if not reduction_script:
        reduction_script = "checkpoint_filter.py"
        make_checkpointing_filter()
    hadoop_call = build_checkpoint_call(script, output_data_name, file_list, reduction_script, arguments)
    execute(hadoop_call)
    return checkpoints
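
A minimal usage sketch (script names, paths, and counts are hypothetical; assumes hadoop_config points at a working cluster and the submitted scripts are self-contained executables):

    from ziggy.hdmc import hdmc

    # Blocking Monte Carlo run: execute my_sim.py 10000 times across the
    # cluster's map slots and collect its stdout in ./sim_results.
    hdmc.submit_inline("./my_sim.py", "sim_results", iterations=10000,
                       arguments="--trials 50")

    # Checkpointed run: each mapper task repeatedly claims an unprocessed file,
    # runs process_chunk.py on it, and the de-duplicated output lands in
    # ./chunk_results.
    files = ["/data/chunk_a", "/data/chunk_b", "/data/chunk_c"]
    hdmc.submit_checkpoint_inline("./process_chunk.py", "chunk_results", files)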