Package gridmap :: Module job

Source Code for Module gridmap.job

  1  # -*- coding: utf-8 -*- 
  2   
  3  # Written (W) 2008-2012 Christian Widmer 
  4  # Written (W) 2008-2010 Cheng Soon Ong 
  5  # Written (W) 2012-2013 Daniel Blanchard, dblanchard@ets.org 
  6  # Copyright (C) 2008-2012 Max-Planck-Society, 2012-2013 ETS 
  7   
  8  # This file is part of Grid Map. 
  9   
 10  # Grid Map is free software: you can redistribute it and/or modify 
 11  # it under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation, either version 3 of the License, or 
 13  # (at your option) any later version. 
 14   
 15  # Grid Map is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with Grid Map.  If not, see <http://www.gnu.org/licenses/>. 
 22   
 23  """ 
 24  This module provides wrappers that simplify submission and collection of jobs, 
 25  in a more 'pythonic' fashion. 
 26   
 27  @author: Christian Widmer 
 28  @author: Cheng Soon Ong 
 29  @author: Dan Blanchard (dblanchard@ets.org) 
 30  """ 
 31   
 32  from __future__ import absolute_import, print_function, unicode_literals 
 33   
 34  import inspect 
 35  import os 
 36  import subprocess 
 37  import sys 
 38  import traceback 
 39  import uuid 
 40  from socket import gethostname 
 41  from time import sleep 
 42   
 43  from drmaa import Session 
 44  from drmaa.errors import InvalidJobException 
 45  from redis import StrictRedis 
 46  from redis.exceptions import ConnectionError as RedisConnectionError 
 47   
 48  from gridmap.data import clean_path, zload_db, zsave_db 
 49   
 50  # Python 2.x backward compatibility 
 51  if sys.version_info < (3, 0): 
 52      range = xrange 
 53   
 54   
 55  #### Global settings #### 
 56  # Redis settings 
 57  REDIS_DB = int(os.getenv('GRID_MAP_REDIS_DB', '2')) 
 58  REDIS_PORT = int(os.getenv('GRID_MAP_REDIS_PORT', '7272')) 
 59   
 60  # Is mem_free configured properly on the cluster? 
 61  USE_MEM_FREE = 'True' == os.getenv('GRID_MAP_USE_MEM_FREE', 'False') 
 62   
 63  # Which queue should we use by default 
 64  DEFAULT_QUEUE = os.getenv('GRID_MAP_DEFAULT_QUEUE', 'all.q') 
class Job(object):
    """
    Central entity that wraps a function and its data. Basically, a job consists
    of a function, its argument list, its keyword list and a field "ret" which
    is filled, when the execute method gets called.

    @note: This can only be used to wrap picklable functions (i.e., those that
           are defined at the module or class level).
    """

    # Fixed attribute set: no per-instance __dict__ is created.
    __slots__ = ('_f', 'args', 'jobid', 'kwlist', 'cleanup', 'ret', 'exception',
                 'environment', 'replace_env', 'working_dir', 'num_slots',
                 'mem_free', 'white_list', 'path', 'uniq_id', 'name', 'queue')

    def __init__(self, f, args, kwlist=None, cleanup=True, mem_free="1G",
                 name='gridmap_job', num_slots=1, queue=DEFAULT_QUEUE):
        """
        Initializes a new Job.

        @param f: a function, which should be executed.
        @type f: function
        @param args: argument list of function f
        @type args: list
        @param kwlist: dictionary of keyword arguments for f
        @type kwlist: dict
        @param cleanup: flag that determines the cleanup of input and log file
        @type cleanup: boolean
        @param mem_free: Estimate of how much memory this job will need (for
                         scheduling)
        @type mem_free: C{basestring}
        @param name: Name to give this job
        @type name: C{basestring}
        @param num_slots: Number of slots this job should use.
        @type num_slots: C{int}
        @param queue: SGE queue to schedule job on.
        @type queue: C{basestring}
        """

        self.path = None
        self._f = None
        # Assigning through the property also fills in self.path.
        self.function = f
        self.args = args
        # -1 until the grid engine assigns a real id at submission time.
        self.jobid = -1
        self.kwlist = kwlist if kwlist is not None else {}
        self.cleanup = cleanup
        self.ret = None
        self.environment = None
        self.replace_env = False
        self.working_dir = os.getcwd()
        self.num_slots = num_slots
        self.mem_free = mem_free
        self.white_list = []
        # Spaces are replaced because the name is passed on the "-N" option.
        self.name = name.replace(' ', '_')
        self.queue = queue

    @property
    def function(self):
        ''' Function this job will execute. '''
        return self._f

    @function.setter
    def function(self, f):
        """
        Setter for function that carefully takes care of the namespace,
        avoiding __main__ as a module. Also records the directory of the
        function's source file in C{self.path} ('' if it cannot be found).

        @param f: The function this job should execute.
        @type f: function
        """

        m = inspect.getmodule(f)
        try:
            self.path = clean_path(os.path.dirname(os.path.abspath(
                inspect.getsourcefile(f))))
        except TypeError:
            # No source file can be determined for f.
            self.path = ''

        # If the module is unknown (getmodule returned None) or is not
        # __main__, the function can be stored as is.
        if m is None or m.__name__ != "__main__":
            self._f = f

        else:

            # determine real module name
            mn = os.path.splitext(os.path.basename(m.__file__))[0]

            # make sure module is present
            __import__(mn)

            # get module
            mod = sys.modules[mn]

            # set function from module
            self._f = getattr(mod, f.__name__)

    def execute(self):
        """
        Executes function f with given arguments
        and writes return value to field ret.
        If an exception is encountered during execution, ret will
        contain the exception object itself and the traceback is printed.
        Input data is removed after execution to save space.
        """
        try:
            self.ret = self.function(*self.args, **self.kwlist)
        except Exception as exception:
            self.ret = exception
            traceback.print_exc()
        del self.args
        del self.kwlist

    @property
    def native_specification(self):
        """
        Build the option string handed to the grid engine for this job.

        Each option is only added when the corresponding attribute is set;
        mem_free additionally requires USE_MEM_FREE, and "-pe smp" is only
        requested for more than one slot.

        @return: The options, each preceded by a space (possibly empty).
        @rtype: C{basestring}
        """

        ret = ""

        if self.name:
            ret += " -N {0}".format(self.name)
        if self.mem_free and USE_MEM_FREE:
            ret += " -l mem_free={0}".format(self.mem_free)
        if self.num_slots and self.num_slots > 1:
            ret += " -pe smp {0}".format(self.num_slots)
        if self.white_list:
            ret += " -l h={0}".format('|'.join(self.white_list))
        if self.queue:
            ret += " -q {0}".format(self.queue)

        return ret
195
def _submit_jobs(jobs, uniq_id, temp_dir='/scratch', white_list=None,
                 quiet=True):
    """
    Method used to send a list of jobs onto the cluster.

    @param jobs: list of jobs to be executed
    @type jobs: C{list} of L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: List of acceptable nodes to use for scheduling job. If
                       None, all are used.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: The session contact string and the list of job ids, in the same
             order as C{jobs}.
    @rtype: C{tuple}
    """

    session = Session()
    session.initialize()
    jobids = []

    # Make sure the session is closed even if a submission fails part-way.
    try:
        for job_num, job in enumerate(jobs):
            # set job white list
            job.white_list = white_list

            # append jobs
            jobid = _append_job_to_session(session, job, uniq_id, job_num,
                                           temp_dir=temp_dir, quiet=quiet)
            jobids.append(jobid)

        # The contact string must be read before the session is closed.
        sid = session.contact
    finally:
        session.exit()

    return (sid, jobids)
234
def _append_job_to_session(session, job, uniq_id, job_num, temp_dir='/scratch/',
                           quiet=True):
    """
    For an active session, append new job based on information stored in job
    object. Also sets job.jobid to the ID of the job on the grid.

    @param session: The current DRMAA session with the grid engine.
    @type session: C{Session}
    @param job: The Job to add to the queue.
    @type job: L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param job_num: The row in the table to store/retrieve data on. This is only
                    non-zero for jobs created via grid_map.
    @type job_num: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: The id assigned to the job by the grid engine.
    """

    jt = session.createJobTemplate()

    # Always release the template, even when configuring or running it fails.
    try:
        if job.environment and job.replace_env:
            # only consider defined env vars
            jt.jobEnvironment = job.environment

        elif job.environment:
            # Shell env vars overridden by the job's own. Merge into a copy so
            # the environment of the submitting process is left untouched.
            env = dict(os.environ)
            env.update(job.environment)
            jt.jobEnvironment = env

        else:
            # only consider env vars from shell
            jt.jobEnvironment = os.environ

        # Run module using python -m to avoid ImportErrors when unpickling jobs
        jt.remoteCommand = sys.executable
        jt.args = ['-m', 'gridmap.runner', '{0}'.format(uniq_id),
                   '{0}'.format(job_num), job.path, temp_dir, gethostname()]
        jt.nativeSpecification = job.native_specification
        jt.outputPath = ":" + temp_dir
        jt.errorPath = ":" + temp_dir

        jobid = session.runJob(jt)
    finally:
        session.deleteJobTemplate(jt)

    # set job fields that depend on the jobid assigned by grid engine
    job.jobid = jobid

    if not quiet:
        print('Your job {0} has been submitted with id {1}'.format(job.name,
                                                                   jobid),
              file=sys.stderr)

    return jobid
300
def _collect_jobs(sid, jobids, joblist, redis_server, uniq_id,
                  temp_dir='/scratch/', wait=True):
    """
    Gather the output of every job in joblist from the database.

    If the output of a job cannot be loaded, diagnostic information about
    that job is written to stderr and the process exits with status 2.
    Outputs that are exceptions are reported on stderr but still returned.

    @param sid: session identifier
    @type sid: string returned by cluster
    @param jobids: list of job identifiers returned by the cluster
    @type jobids: list of strings
    @param joblist: The jobs that were submitted, in the same order as jobids.
    @type joblist: C{list} of L{Job}
    @param redis_server: Open connection to the database where the results will
                         be stored.
    @type redis_server: L{StrictRedis}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Wait for jobs to finish?
    @type wait: bool

    @return: One output per job, in the order of joblist.
    @rtype: C{list}
    """

    def _err(*parts):
        # All diagnostics of this function go to stderr.
        print(*parts, file=sys.stderr)

    # The ids handed in must line up with the ids stored on the jobs.
    for position, expected_id in enumerate(jobids):
        assert(expected_id == joblist[position].jobid)

    # Re-attach to the DRMAA session that submitted the jobs.
    with Session(sid) as session:

        timeout = (Session.TIMEOUT_WAIT_FOREVER if wait
                   else Session.TIMEOUT_NO_WAIT)

        # Block (or not) until the jobs are done; keep their status info.
        session.synchronize(jobids, timeout, False)

        results = []
        for position, job in enumerate(joblist):

            grid_id = jobids[position]
            stdout_log = os.path.join(temp_dir, job.name + '.o' + grid_id)
            stderr_log = os.path.join(temp_dir, job.name + '.e' + grid_id)

            # Exit status and other status info about the job
            job_info = session.wait(job.jobid, timeout)

            try:
                output = zload_db(redis_server,
                                  'output_{0}'.format(uniq_id),
                                  position)
            except Exception as detail:
                _err("Error while unpickling output for gridmap job {1} "
                     "stored with key output_{0}_{1}".format(uniq_id,
                                                             position))
                _err("This usually happens when a job has crashed before "
                     "writing its output to the database.")
                _err("\nHere is some information about the problem job:")
                _err("stdout:", stdout_log)
                _err("stderr:", stderr_log)
                if job_info.hasExited:
                    _err("Exit status: {0}".format(job_info.exitStatus))
                if job_info.hasSignal:
                    _err("Terminating signal: {0}".format(
                        job_info.terminatedSignal))
                    _err("Core dumped: {0}".format(job_info.hasCoreDump))
                _err("Job aborted before it ran: {0}".format(
                    job_info.wasAborted))
                _err("Job resources: {0}".format(job_info.resourceUsage))
                try:
                    _err("Job SGE status: {0}".format(
                        session.jobStatus(job.jobid)))
                except InvalidJobException:
                    pass
                _err("Unpickling exception: {0}".format(detail))
                sys.exit(2)

            # Report jobs whose result is an exception.
            if isinstance(output, Exception):
                _err("Exception encountered in job with log file:")
                _err(stdout_log)
                _err(output)
                _err()

            results.append(output)

    return results
395
def process_jobs(jobs, temp_dir='/scratch/', wait=True, white_list=None,
                 quiet=True):
    """
    Take a list of jobs and process them on the cluster.

    @param jobs: The jobs to run.
    @type jobs: C{list} of L{Job}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Should we wait for jobs to finish? (Should only be false if the
                 function you're running doesn't return anything)
    @type wait: C{bool}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: The output of each job, in the order of C{jobs}.
    @rtype: C{list}
    """
    # Create new connection to Redis database with pickled jobs
    redis_server = StrictRedis(host=gethostname(), db=REDIS_DB, port=REDIS_PORT)

    # Check if Redis server is launched, and spawn it if not.
    try:
        # A string is stored instead of a bool: newer redis-py releases only
        # accept bytes, strings and numbers as values.
        redis_server.set('connection_test', 'True')
    except RedisConnectionError:
        pid_file = os.path.join(temp_dir, 'redis{0}.pid'.format(REDIS_PORT))
        config = 'daemonize yes\npidfile {0}\nport {1}\n'.format(pid_file,
                                                                 REDIS_PORT)
        # The null device must be opened for writing, since it receives the
        # server's stdout and stderr.
        with open(os.devnull, 'w') as null_file:
            redis_process = subprocess.Popen(['redis-server', '-'],
                                             stdout=null_file,
                                             stdin=subprocess.PIPE,
                                             stderr=null_file)
            # The pipe is binary, so the configuration has to be encoded.
            redis_process.stdin.write(config.encode('utf-8'))
            redis_process.stdin.close()
        # Wait for things to get started
        sleep(5)

    # Generate random name for keys
    uniq_id = uuid.uuid4()

    # Save jobs to database
    for job_id, job in enumerate(jobs):
        zsave_db(job, redis_server, 'job_{0}'.format(uniq_id), job_id)

    # Submit jobs to cluster
    sids, jobids = _submit_jobs(jobs, uniq_id, white_list=white_list,
                                temp_dir=temp_dir, quiet=quiet)

    # Reconnect and retrieve outputs
    job_outputs = _collect_jobs(sids, jobids, jobs, redis_server, uniq_id,
                                temp_dir=temp_dir, wait=wait)

    # Make sure we have enough output
    assert(len(jobs) == len(job_outputs))

    # Remove this run's keys from the server. DEL needs at least one key, so
    # skip the call when nothing matches (e.g. an empty job list).
    for prefix in ('job', 'output'):
        stale_keys = redis_server.keys('{0}_{1}_*'.format(prefix, uniq_id))
        if stale_keys:
            redis_server.delete(*stale_keys)
    return job_outputs
458
#####################################################################
# MapReduce Interface
#####################################################################
def grid_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
             num_slots=1, temp_dir='/scratch/', white_list=None,
             queue=DEFAULT_QUEUE, quiet=True):
    """
    Run f once per entry of args_list on the cluster and return the results.

    An entry that is a C{list} is used as the positional arguments of one
    call; any other entry is passed as the single argument of its call.

    @note: This can only be used with picklable functions (i.e., those that are
           defined at the module or class level).

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Cleanup flag that is handed to every created job.
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling).
    @type mem_free: C{basestring}
    @param name: Base name to give each job (the job's index is appended)
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: Whatever process_jobs returns for the created jobs.
    """

    # Build one job per entry of args_list.
    jobs = []
    for job_num, args in enumerate(args_list):
        # Wrap anything that is not already a list into a one-element list.
        call_args = args if isinstance(args, list) else [args]
        jobs.append(Job(f, call_args, cleanup=cleanup, mem_free=mem_free,
                        name='{0}{1}'.format(name, job_num),
                        num_slots=num_slots, queue=queue))

    # Hand the jobs over to the cluster and pass the results straight back.
    return process_jobs(jobs, temp_dir=temp_dir, white_list=white_list,
                        quiet=quiet)
511
def pg_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
           num_slots=1, temp_dir='/scratch/', white_list=None,
           queue=DEFAULT_QUEUE, quiet=True):
    """
    @deprecated: This function has been renamed grid_map. Calling it emits a
                 C{DeprecationWarning} and forwards all arguments to grid_map.

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Cleanup flag that is passed on to grid_map.
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling).
    @type mem_free: C{basestring}
    @param name: Base name to give each job (will have a number added to end)
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: The result of grid_map for the same arguments.
    """
    # Imported locally so this block does not depend on the module's imports.
    import warnings

    # stacklevel=2 attributes the warning to the caller of pg_map.
    warnings.warn('pg_map is deprecated; use grid_map instead.',
                  DeprecationWarning, stacklevel=2)

    return grid_map(f, args_list, cleanup=cleanup, mem_free=mem_free, name=name,
                    num_slots=num_slots, temp_dir=temp_dir,
                    white_list=white_list, queue=queue, quiet=quiet)
549