Package gridmap :: Module job

Source Code for Module gridmap.job

# -*- coding: utf-8 -*-

# Written (W) 2008-2012 Christian Widmer
# Written (W) 2008-2010 Cheng Soon Ong
# Written (W) 2012-2013 Daniel Blanchard, dblanchard@ets.org
# Copyright (C) 2008-2012 Max-Planck-Society, 2012-2013 ETS

# This file is part of Grid Map.

# Grid Map is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# Grid Map is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with Grid Map.  If not, see <http://www.gnu.org/licenses/>.
 23  """ 
 24  This module provides wrappers that simplify submission and collection of jobs, 
 25  in a more 'pythonic' fashion. 
 26   
 27  @author: Christian Widmer 
 28  @author: Cheng Soon Ong 
 29  @author: Dan Blanchard (dblanchard@ets.org) 
 30   
 31  @var REDIS_DB: The index of the database to select on the Redis server; can be 
 32                 overriden by setting the GRID_MAP_REDIS_DB environment variable. 
 33  @var REDIS_PORT: The port of the Redis server to use; can be overriden by 
 34                   setting the GRID_MAP_REDIS_PORT environment variable. 
 35  @var USE_MEM_FREE: Does your cluster support specifying how much memory a job 
 36                     will use via mem_free? Can be overriden by setting the 
 37                     GRID_MAP_USE_MEM_FREE environment variable. 
 38  @var DEFAULT_QUEUE: The default job scheduling queue to use; can be overriden 
 39                      via the GRID_MAP_DEFAULT_QUEUE environment variable. 
 40  """ 

from __future__ import absolute_import, print_function, unicode_literals

import inspect
import os
import subprocess
import sys
import traceback
import uuid
from socket import gethostname
from time import sleep

from drmaa import Session
from drmaa.errors import InvalidJobException
from redis import StrictRedis
from redis.exceptions import ConnectionError as RedisConnectionError

from gridmap.data import clean_path, zload_db, zsave_db

# Python 2.x backward compatibility
if sys.version_info < (3, 0):
    range = xrange


#### Global settings ####
# Redis settings
REDIS_DB = int(os.getenv('GRID_MAP_REDIS_DB', '2'))

REDIS_PORT = int(os.getenv('GRID_MAP_REDIS_PORT', '7272'))

# Is mem_free configured properly on the cluster?
USE_MEM_FREE = 'TRUE' == os.getenv('GRID_MAP_USE_MEM_FREE', 'False').upper()

# Which queue should we use by default?
DEFAULT_QUEUE = os.getenv('GRID_MAP_DEFAULT_QUEUE', 'all.q')
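
Because these settings are read once, at import time, any overrides must be in
the environment before gridmap is imported. A minimal sketch (the variable
names are the ones documented above; the values are illustrative):

    import os

    # Overrides must precede the import; the module reads these variables at
    # import time, so later changes to the environment have no effect.
    os.environ['GRID_MAP_REDIS_PORT'] = '7273'
    os.environ['GRID_MAP_USE_MEM_FREE'] = 'True'
    os.environ['GRID_MAP_DEFAULT_QUEUE'] = 'short.q'

    from gridmap.job import DEFAULT_QUEUE, REDIS_PORT, USE_MEM_FREE
    assert (REDIS_PORT, USE_MEM_FREE, DEFAULT_QUEUE) == (7273, True, 'short.q')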


class Job(object):
    """
    Central entity that wraps a function and its data. Basically, a job
    consists of a function, its argument list, its keyword list and a field
    "ret" which is filled when the execute method gets called.

    @note: This can only be used to wrap picklable functions (i.e., those that
           are defined at the module or class level).
    """

    __slots__ = ('_f', 'args', 'jobid', 'kwlist', 'cleanup', 'ret', 'exception',
                 'environment', 'replace_env', 'working_dir', 'num_slots',
                 'mem_free', 'white_list', 'path', 'uniq_id', 'name', 'queue')

    def __init__(self, f, args, kwlist=None, cleanup=True, mem_free="1G",
                 name='gridmap_job', num_slots=1, queue=DEFAULT_QUEUE):
        """
        Initializes a new Job.

        @param f: a function, which should be executed.
        @type f: function
        @param args: argument list of function f
        @type args: list
        @param kwlist: dictionary of keyword arguments for f
        @type kwlist: dict
        @param cleanup: flag that determines whether input and log files are
                        cleaned up
        @type cleanup: C{bool}
        @param mem_free: Estimate of how much memory this job will need (for
                         scheduling)
        @type mem_free: C{basestring}
        @param name: Name to give this job
        @type name: C{basestring}
        @param num_slots: Number of slots this job should use.
        @type num_slots: C{int}
        @param queue: SGE queue to schedule job on.
        @type queue: C{basestring}
        """
        self.path = None
        self._f = None
        self.function = f
        self.args = args
        self.jobid = -1
        self.kwlist = kwlist if kwlist is not None else {}
        self.cleanup = cleanup
        self.ret = None
        self.environment = None
        self.replace_env = False
        self.working_dir = os.getcwd()
        self.num_slots = num_slots
        self.mem_free = mem_free
        self.white_list = []
        self.name = name.replace(' ', '_')
        self.queue = queue

    @property
    def function(self):
        ''' Function this job will execute. '''
        return self._f

    @function.setter
    def function(self, f):
        """
        Setter for function that carefully resolves the function's module,
        avoiding __main__ so that the job can be unpickled on the worker.
        """
        m = inspect.getmodule(f)
        try:
            self.path = clean_path(os.path.dirname(os.path.abspath(
                inspect.getsourcefile(f))))
        except TypeError:
            self.path = ''

        # if module is not __main__, all is good
        if m.__name__ != "__main__":
            self._f = f
        else:
            # determine real module name
            mn = os.path.splitext(os.path.basename(m.__file__))[0]

            # make sure module is present
            __import__(mn)

            # get module
            mod = sys.modules[mn]

            # set function from module
            self._f = getattr(mod, f.__name__)

    def execute(self):
        """
        Executes function f with the given arguments and writes the return
        value to the ret field. If an exception is encountered during
        execution, ret will contain the exception object instead.
        Input data is removed after execution to save space.
        """
        try:
            self.ret = self.function(*self.args, **self.kwlist)
        except Exception as exception:
            self.ret = exception
            traceback.print_exc()
        del self.args
        del self.kwlist

    @property
    def native_specification(self):
        """
        The native (qsub-style) resource specification string for this job,
        built from its name, memory, slot, host white list, and queue settings.
        """
        ret = ""

        if self.name:
            ret += " -N {0}".format(self.name)
        if self.mem_free and USE_MEM_FREE:
            ret += " -l mem_free={0}".format(self.mem_free)
        if self.num_slots and self.num_slots > 1:
            ret += " -pe smp {0}".format(self.num_slots)
        if self.white_list:
            ret += " -l h={0}".format('|'.join(self.white_list))
        if self.queue:
            ret += " -q {0}".format(self.queue)

        return ret
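
A small sketch of the Job wrapper on its own, independent of any cluster. The
wrapped function must be picklable, so a module-level function such as
operator.add works; execute() is the method that eventually runs on the worker
node, and native_specification shows the qsub-style flags the job would
request:

    from operator import add  # module-level, so it survives pickling

    job = Job(add, [2, 3], name='demo job', num_slots=2)
    # mem_free only appears in the flags when USE_MEM_FREE is enabled
    print(job.native_specification)  # e.g. " -N demo_job -pe smp 2 -q all.q"

    job.execute()   # runs add(2, 3); stores the result (or exception) in ret
    print(job.ret)  # -> 5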

def _submit_jobs(jobs, uniq_id, temp_dir='/scratch', white_list=None,
                 quiet=True):
    """
    Submit a list of jobs to the cluster.

    @param jobs: list of jobs to be executed
    @type jobs: C{list} of L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: List of acceptable nodes to use for scheduling job. If
                       None, all are used.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: Tuple of the DRMAA session contact string and the list of job IDs.
    """
    session = Session()
    session.initialize()
    jobids = []

    for job_num, job in enumerate(jobs):
        # set job white list
        job.white_list = white_list

        # append job to session
        jobid = _append_job_to_session(session, job, uniq_id, job_num,
                                       temp_dir=temp_dir, quiet=quiet)
        jobids.append(jobid)

    sid = session.contact
    session.exit()

    return (sid, jobids)

def _append_job_to_session(session, job, uniq_id, job_num, temp_dir='/scratch/',
                           quiet=True):
    """
    For an active session, append a new job based on the information stored in
    the job object. Also sets job.jobid to the ID assigned by the grid engine.

    @param session: The current DRMAA session with the grid engine.
    @type session: C{Session}
    @param job: The Job to add to the queue.
    @type job: L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param job_num: The row in the table to store/retrieve data on. This is only
                    non-zero for jobs created via grid_map.
    @type job_num: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """
    jt = session.createJobTemplate()

    # fetch env vars from shell
    shell_env = os.environ

    if job.environment and job.replace_env:
        # only consider defined env vars
        jt.jobEnvironment = job.environment
    elif job.environment and not job.replace_env:
        # overlay the defined env vars on a copy of the shell environment
        # (copy first, so that we do not mutate os.environ itself)
        env = dict(shell_env)
        env.update(job.environment)
        jt.jobEnvironment = env
    else:
        # only consider env vars from shell
        jt.jobEnvironment = shell_env

    # Run module using python -m to avoid ImportErrors when unpickling jobs
    jt.remoteCommand = sys.executable
    jt.args = ['-m', 'gridmap.runner', '{0}'.format(uniq_id),
               '{0}'.format(job_num), job.path, temp_dir, gethostname()]
    jt.nativeSpecification = job.native_specification
    jt.outputPath = ":" + temp_dir
    jt.errorPath = ":" + temp_dir

    jobid = session.runJob(jt)

    # set job fields that depend on the jobid assigned by grid engine
    job.jobid = jobid

    if not quiet:
        print('Your job {0} has been submitted with id {1}'.format(job.name,
                                                                   jobid),
              file=sys.stderr)

    session.deleteJobTemplate(jt)

    return jobid

def _collect_jobs(sid, jobids, joblist, redis_server, uniq_id,
                  temp_dir='/scratch/', wait=True):
    """
    Collect the results from the jobids and return a list of job outputs.

    @param sid: session identifier
    @type sid: string returned by cluster
    @param jobids: list of job identifiers returned by the cluster
    @type jobids: list of strings
    @param joblist: list of L{Job} objects that were submitted, in the same
                    order as jobids.
    @type joblist: C{list} of L{Job}
    @param redis_server: Open connection to the database where the results will
                         be stored.
    @type redis_server: L{StrictRedis}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Wait for jobs to finish?
    @type wait: C{bool}
    """
    for ix in range(len(jobids)):
        assert jobids[ix] == joblist[ix].jobid

    # Open DRMAA session as context manager
    with Session(sid) as session:

        if wait:
            drmaaWait = Session.TIMEOUT_WAIT_FOREVER
        else:
            drmaaWait = Session.TIMEOUT_NO_WAIT

        # Wait for jobs to finish
        session.synchronize(jobids, drmaaWait, False)

        # attempt to collect results
        job_output_list = []
        for ix, job in enumerate(joblist):

            log_stdout_fn = os.path.join(temp_dir, job.name + '.o' + jobids[ix])
            log_stderr_fn = os.path.join(temp_dir, job.name + '.e' + jobids[ix])

            # Get the exit status and other status info about the job
            job_info = session.wait(job.jobid, drmaaWait)

            try:
                job_output = zload_db(redis_server,
                                      'output_{0}'.format(uniq_id),
                                      ix)
            except Exception as detail:
                print(("Error while unpickling output for gridmap job {1} " +
                       "stored with key output_{0}_{1}").format(uniq_id, ix),
                      file=sys.stderr)
                print("This usually happens when a job has crashed before " +
                      "writing its output to the database.",
                      file=sys.stderr)
                print("\nHere is some information about the problem job:",
                      file=sys.stderr)
                print("stdout:", log_stdout_fn, file=sys.stderr)
                print("stderr:", log_stderr_fn, file=sys.stderr)
                if job_info.hasExited:
                    print("Exit status: {0}".format(job_info.exitStatus),
                          file=sys.stderr)
                if job_info.hasSignal:
                    print(("Terminating signal: " +
                           "{0}").format(job_info.terminatedSignal),
                          file=sys.stderr)
                    print("Core dumped: {0}".format(job_info.hasCoreDump),
                          file=sys.stderr)
                print(("Job aborted before it ran: " +
                       "{0}").format(job_info.wasAborted),
                      file=sys.stderr)
                print("Job resources: {0}".format(job_info.resourceUsage),
                      file=sys.stderr)
                try:
                    print(("Job SGE status: " +
                           "{0}").format(session.jobStatus(job.jobid)),
                          file=sys.stderr)
                except InvalidJobException:
                    pass
                print("Unpickling exception: {0}".format(detail),
                      file=sys.stderr)
                sys.exit(2)

            # print exceptions
            if isinstance(job_output, Exception):
                print("Exception encountered in job with log file:",
                      file=sys.stderr)
                print(log_stdout_fn, file=sys.stderr)
                print(job_output, file=sys.stderr)
                print(file=sys.stderr)

            job_output_list.append(job_output)

    return job_output_list

def process_jobs(jobs, temp_dir='/scratch/', wait=True, white_list=None,
                 quiet=True):
    """
    Take a list of jobs and process them on the cluster.

    @param jobs: List of L{Job} objects to execute.
    @type jobs: C{list} of L{Job}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Should we wait for jobs to finish? (Should only be false if the
                 function you're running doesn't return anything)
    @type wait: C{bool}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: List of the jobs' return values, in the same order as C{jobs}.
    """
    # Create new connection to Redis database with pickled jobs
    redis_server = StrictRedis(host=gethostname(), db=REDIS_DB, port=REDIS_PORT)

    # Check if Redis server is launched, and spawn it if not.
    try:
        redis_server.set('connection_test', True)
    except RedisConnectionError:
        # open /dev/null for writing, since it receives stdout and stderr
        with open('/dev/null', 'w') as null_file:
            redis_process = subprocess.Popen(['redis-server', '-'],
                                             stdout=null_file,
                                             stdin=subprocess.PIPE,
                                             stderr=null_file)
            # encode the config so this works on both Python 2 and 3, where
            # the pipe expects bytes
            redis_process.stdin.write('''daemonize yes
pidfile {0}
port {1}
'''.format(os.path.join(temp_dir,
                        'redis{0}.pid'.format(REDIS_PORT)),
           REDIS_PORT).encode('utf-8'))
            redis_process.stdin.close()
        # Wait for things to get started
        sleep(5)

    # Generate random name for keys
    uniq_id = uuid.uuid4()

    # Save jobs to database
    for job_id, job in enumerate(jobs):
        zsave_db(job, redis_server, 'job_{0}'.format(uniq_id), job_id)

    # Submit jobs to cluster
    sids, jobids = _submit_jobs(jobs, uniq_id, white_list=white_list,
                                temp_dir=temp_dir, quiet=quiet)

    # Reconnect and retrieve outputs
    job_outputs = _collect_jobs(sids, jobids, jobs, redis_server, uniq_id,
                                temp_dir=temp_dir, wait=wait)

    # Make sure we have enough output
    assert len(jobs) == len(job_outputs)

    # Delete this run's keys from the Redis server
    redis_server.delete(*redis_server.keys('job_{0}_*'.format(uniq_id)))
    redis_server.delete(*redis_server.keys('output_{0}_*'.format(uniq_id)))
    return job_outputs
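
A sketch of this lower-level entry point: build Job objects directly when
per-job settings differ, then hand them to process_jobs. It assumes a working
DRMAA/SGE setup, a redis-server binary on the PATH (or an already-running
server), and that the calling module is importable on the worker nodes; cube
is a hypothetical function:

    from gridmap.job import Job, process_jobs

    def cube(x):  # must be defined at module level to be picklable
        return x ** 3

    # unlike grid_map, each job here can get its own name and memory estimate
    jobs = [Job(cube, [x], mem_free='2G', name='cube_{0}'.format(x))
            for x in range(4)]
    results = process_jobs(jobs, temp_dir='/scratch/', quiet=False)
    assert results == [0, 1, 8, 27]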

#####################################################################
# MapReduce Interface
#####################################################################
def grid_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
             num_slots=1, temp_dir='/scratch/', white_list=None,
             queue=DEFAULT_QUEUE, quiet=True):
    """
    Maps a function onto the cluster.

    @note: This can only be used with picklable functions (i.e., those that are
           defined at the module or class level).

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Should we remove the stdout and stderr temporary files for
                    each job when we're done? (They are left in place if there's
                    an error.)
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling). (Only honored when USE_MEM_FREE is enabled;
                     see the GRID_MAP_USE_MEM_FREE environment variable.)
    @type mem_free: C{basestring}
    @param name: Base name to give each job (will have a number appended to
                 the end)
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: List of the results of mapping f onto args_list.
    """
    # construct jobs
    jobs = [Job(f, [args] if not isinstance(args, list) else args,
                cleanup=cleanup, mem_free=mem_free,
                name='{0}{1}'.format(name, job_num), num_slots=num_slots,
                queue=queue)
            for job_num, args in enumerate(args_list)]

    # process jobs
    job_results = process_jobs(jobs, temp_dir=temp_dir, white_list=white_list,
                               quiet=quiet)

    return job_results
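
A minimal usage sketch for grid_map, assuming a working DRMAA installation and
that the script lives in an importable module (so the function setter above
can re-import it on the worker); triple is illustrative:

    # hypothetical contents of triple_demo.py
    from gridmap.job import grid_map

    def triple(x):
        return 3 * x

    if __name__ == '__main__':
        # one cluster job per element of args_list
        print(grid_map(triple, [1, 2, 3], name='triple', quiet=False))
        # -> [3, 6, 9]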

def pg_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
           num_slots=1, temp_dir='/scratch/', white_list=None,
           queue=DEFAULT_QUEUE, quiet=True):
    """
    @deprecated: This function has been renamed grid_map.

    @see: L{grid_map} for full parameter documentation.
    """
    return grid_map(f, args_list, cleanup=cleanup, mem_free=mem_free, name=name,
                    num_slots=num_slots, temp_dir=temp_dir,
                    white_list=white_list, queue=queue, quiet=quiet)