Package gridmap :: Module job

Source Code for Module gridmap.job

  1  # -*- coding: utf-8 -*- 
  2   
  3  # Written (W) 2008-2012 Christian Widmer 
  4  # Written (W) 2008-2010 Cheng Soon Ong 
  5  # Written (W) 2012-2013 Daniel Blanchard, dblanchard@ets.org 
  6  # Copyright (C) 2008-2012 Max-Planck-Society, 2012-2013 ETS 
  7   
  8  # This file is part of Grid Map. 
  9   
 10  # Grid Map is free software: you can redistribute it and/or modify 
 11  # it under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation, either version 3 of the License, or 
 13  # (at your option) any later version. 
 14   
 15  # Grid Map is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with Grid Map.  If not, see <http://www.gnu.org/licenses/>. 
 22   
 23  """ 
 24  This module provides wrappers that simplify submission and collection of jobs, 
 25  in a more 'pythonic' fashion. 
 26   
 27  @author: Christian Widmer 
 28  @author: Cheng Soon Ong 
 29  @author: Dan Blanchard (dblanchard@ets.org) 
 30  """ 
 31   
from __future__ import absolute_import, print_function, unicode_literals

import inspect
import os
import subprocess
import sys
import traceback
import uuid
import warnings
from socket import gethostname
from time import sleep

import drmaa
from redis import StrictRedis
from redis.exceptions import ConnectionError as RedisConnectionError

from gridmap.data import clean_path, zload_db, zsave_db
 48   
 49  # Python 2.x backward compatibility 
 50  if sys.version_info < (3, 0): 
 51      range = xrange 
 52   
 53   
 54  #### Global settings #### 
 55  # Redis settings 
 56  REDIS_DB = 2 
 57  REDIS_PORT = 7272 
 58   
 59  # Is mem_free configured properly on the cluster? 
 60  USE_MEM_FREE = False 
 61   
 62  # Which queue should we use by default 
 63  DEFAULT_QUEUE = 'all.q' 
class Job(object):
    """
    Central entity that wraps a function and its data. Basically, a job consists
    of a function, its argument list, its keyword list and a field "ret" which
    is filled, when the execute method gets called.

    @note: This can only be used to wrap picklable functions (i.e., those that
           are defined at the module or class level).
    """

    # __slots__ keeps per-instance memory small when many jobs are created.
    # NOTE(review): 'exception' is declared here but never assigned in this
    # class -- possibly a leftover from an earlier design; confirm before
    # removing.
    __slots__ = ('_f', 'args', 'jobid', 'kwlist', 'cleanup', 'ret', 'exception',
                 'environment', 'replace_env', 'working_dir', 'num_slots',
                 'mem_free', 'white_list', 'path', 'uniq_id', 'name', 'queue')

    def __init__(self, f, args, kwlist=None, cleanup=True, mem_free="1G",
                 name='gridmap_job', num_slots=1, queue=DEFAULT_QUEUE):
        """
        Initializes a new Job.

        @param f: a function, which should be executed.
        @type f: function
        @param args: argument list of function f
        @type args: list
        @param kwlist: dictionary of keyword arguments for f
        @type kwlist: dict
        @param cleanup: flag that determines the cleanup of input and log file
        @type cleanup: boolean
        @param mem_free: Estimate of how much memory this job will need (for
                         scheduling)
        @type mem_free: C{basestring}
        @param name: Name to give this job
        @type name: C{basestring}
        @param num_slots: Number of slots this job should use.
        @type num_slots: C{int}
        @param queue: SGE queue to schedule job on.
        @type queue: C{basestring}
        """

        self.path = None
        self._f = None
        # assignment goes through the 'function' property below, which
        # re-resolves functions defined in __main__ so they stay picklable
        self.function = f
        self.args = args
        self.jobid = -1          # grid engine job ID; -1 until submitted
        self.kwlist = kwlist if kwlist is not None else {}
        self.cleanup = cleanup
        self.ret = None          # filled with f's result (or exception) by execute()
        self.environment = None  # optional dict of env vars for the remote job
        self.replace_env = False # if True, environment replaces (not extends) shell env
        self.working_dir = os.getcwd()
        self.num_slots = num_slots
        self.mem_free = mem_free
        self.white_list = []     # nodes this job may be scheduled on (empty = any)
        self.uniq_id = None
        # SGE job names are used in file names, so spaces are replaced
        self.name = name.replace(' ', '_')
        self.queue = queue

    @property
    def function(self):
        ''' Function this job will execute. '''
        return self._f

    @function.setter
    def function(self, f):
        """
        setter for function that carefully takes care of
        namespace, avoiding __main__ as a module
        """

        m = inspect.getmodule(f)
        try:
            self.path = clean_path(os.path.dirname(os.path.abspath(
                inspect.getsourcefile(f))))
        except TypeError:
            # getsourcefile raises TypeError for built-in/C functions,
            # which have no source file
            self.path = ''

        # if module is not __main__, all is good
        if m.__name__ != "__main__":
            self._f = f

        else:

            # determine real module name
            mn = os.path.splitext(os.path.basename(m.__file__))[0]

            # make sure module is present
            __import__(mn)

            # get module
            mod = sys.modules[mn]

            # set function from module
            self._f = getattr(mod, f.__name__)

    def execute(self):
        """
        Executes function f with given arguments
        and writes return value to field ret.
        If an exception is encountered during execution, ret will
        contain a pickled version of it.
        Input data is removed after execution to save space.
        """
        try:
            self.ret = self.function(*self.args, **self.kwlist)
        except Exception as exception:
            # store the exception as the result so the collector can
            # detect and report it; keep the traceback in the job's log
            self.ret = exception
            traceback.print_exc()
        # free the (potentially large) input data whether or not f succeeded
        del self.args
        del self.kwlist

    @property
    def native_specification(self):
        """
        define python-style getter
        """

        ret = ""

        if self.name:
            ret += " -N {0}".format(self.name)
        # mem_free is only requested when the cluster has it configured
        # (see USE_MEM_FREE at module level)
        if self.mem_free and USE_MEM_FREE:
            ret += " -l mem_free={0}".format(self.mem_free)
        if self.num_slots and self.num_slots > 1:
            ret += " -pe smp {0}".format(self.num_slots)
        if self.white_list:
            ret += " -l h={0}".format('|'.join(self.white_list))
        if self.queue:
            ret += " -q {0}".format(self.queue)

        return ret
def _submit_jobs(jobs, uniq_id, temp_dir='/scratch', white_list=None,
                 quiet=True):
    """
    Send a list of jobs to the cluster via a fresh DRMAA session.

    @param jobs: list of jobs to be executed
    @type jobs: c{list} of L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: List of acceptable nodes to use for scheduling job. If
                       None, all are used.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """

    grid_session = drmaa.Session()
    grid_session.initialize()

    submitted_ids = []
    for position, current_job in enumerate(jobs):
        # restrict scheduling to the white-listed nodes (if any)
        current_job.white_list = white_list

        # queue the job and remember the ID the grid engine assigned
        submitted_ids.append(_append_job_to_session(grid_session, current_job,
                                                    uniq_id, position,
                                                    temp_dir=temp_dir,
                                                    quiet=quiet))

    # keep the contact string so the collector can re-attach to this session
    contact = grid_session.contact
    grid_session.exit()

    return (contact, submitted_ids)
def _append_job_to_session(session, job, uniq_id, job_num, temp_dir='/scratch/',
                           quiet=True):
    """
    For an active session, append new job based on information stored in job
    object. Also sets job.jobid to the ID of the job on the grid.

    @param session: The current DRMAA session with the grid engine.
    @type session: C{drmaa.Session}
    @param job: The Job to add to the queue.
    @type job: L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param job_num: The row in the table to store/retrieve data on. This is only
                    non-zero for jobs created via grid_map.
    @type job_num: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """

    jt = session.createJobTemplate()

    # fetch env vars from shell
    shell_env = os.environ

    if job.environment and job.replace_env:
        # only consider defined env vars
        jt.jobEnvironment = job.environment

    elif job.environment and not job.replace_env:
        # shell env vars, overridden by the job-specific ones. Copy first:
        # updating os.environ directly would mutate the submitting process's
        # environment as a side effect.
        env = dict(shell_env)
        env.update(job.environment)
        jt.jobEnvironment = env

    else:
        # only consider env vars from shell
        jt.jobEnvironment = shell_env

    # Run module using python -m to avoid ImportErrors when unpickling jobs.
    # Use sys.executable rather than '/usr/bin/env/python': /usr/bin/env is a
    # file, not a directory, so that path can never exist; sys.executable also
    # guarantees the job runs under the same interpreter that submitted it.
    jt.remoteCommand = sys.executable
    jt.args = ['-m', 'gridmap.runner', '{0}'.format(uniq_id),
               '{0}'.format(job_num), job.path, temp_dir, gethostname()]
    jt.nativeSpecification = job.native_specification
    # leading ':' means "no host part" in DRMAA output path syntax
    jt.outputPath = ":" + temp_dir
    jt.errorPath = ":" + temp_dir

    jobid = session.runJob(jt)

    # set job fields that depend on the jobid assigned by grid engine
    job.jobid = jobid

    if not quiet:
        print('Your job {0} has been submitted with id {1}'.format(job.name,
                                                                   jobid),
              file=sys.stderr)

    session.deleteJobTemplate(jt)

    return jobid
def _collect_jobs(sid, jobids, joblist, redis_server, uniq_id,
                  temp_dir='/scratch/', wait=True):
    """
    Collect the results from the jobids, returns a list of job outputs.

    @param sid: session identifier
    @type sid: string returned by cluster
    @param jobids: list of job identifiers returned by the cluster
    @type jobids: list of strings
    @param joblist: The submitted jobs, in the same order as jobids.
    @type joblist: C{list} of L{Job}
    @param redis_server: Open connection to the database where the results will
                         be stored.
    @type redis_server: L{StrictRedis}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Wait for jobs to finish?
    @type wait: Boolean, defaults to True
    """

    # sanity check: the result order must line up with the submitted jobs
    for ix in range(len(jobids)):
        assert(jobids[ix] == joblist[ix].jobid)

    s = drmaa.Session()
    s.initialize(sid)

    if wait:
        drmaaWait = drmaa.Session.TIMEOUT_WAIT_FOREVER
    else:
        drmaaWait = drmaa.Session.TIMEOUT_NO_WAIT

    s.synchronize(jobids, drmaaWait, True)
    # print("success: all jobs finished", file=sys.stderr)
    s.exit()

    # attempt to collect results
    job_output_list = []
    for ix, job in enumerate(joblist):

        log_stdout_fn = os.path.join(temp_dir, job.name + '.o' + jobids[ix])
        log_stderr_fn = os.path.join(temp_dir, job.name + '.e' + jobids[ix])

        try:
            job_output = zload_db(redis_server, 'output{0}'.format(uniq_id),
                                  ix)
        except Exception as detail:
            # Key description matches the zload_db call above
            # ('output{uniq_id}' table, entry ix).
            print(("Error while unpickling output for gridmap job {1} " +
                   "stored with key output{0}").format(uniq_id, ix),
                  file=sys.stderr)
            print("This could be caused by a problem with the cluster " +
                  "environment, imports or environment variables.",
                  file=sys.stderr)
            print(("Try running `python -m gridmap.runner {0} {1} {2} {3} " +
                   "{4}` to see if your job crashed before writing its " +
                   "output.").format(uniq_id,
                                     ix,
                                     job.path,
                                     temp_dir,
                                     gethostname()),
                  file=sys.stderr)
            print("Check log files for more information: ", file=sys.stderr)
            print("stdout:", log_stdout_fn, file=sys.stderr)
            print("stderr:", log_stderr_fn, file=sys.stderr)
            # keep all diagnostics on stderr (this line previously went to
            # stdout, unlike every other message here)
            print("Exception: {0}".format(detail), file=sys.stderr)
            sys.exit(2)

        # print exceptions
        if isinstance(job_output, Exception):
            print("Exception encountered in job with log file:",
                  file=sys.stderr)
            print(log_stdout_fn, file=sys.stderr)
            print(job_output, file=sys.stderr)
            print(file=sys.stderr)

        job_output_list.append(job_output)

    return job_output_list
def process_jobs(jobs, temp_dir='/scratch/', wait=True, white_list=None,
                 quiet=True):
    """
    Take a list of jobs and process them on the cluster.

    @param jobs: The jobs to run.
    @type jobs: C{list} of L{Job}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Should we wait for jobs to finish? (Should only be false if the
                 function you're running doesn't return anything)
    @type wait: C{bool}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """
    # Create new connection to Redis database with pickled jobs
    redis_server = StrictRedis(host=gethostname(), db=REDIS_DB, port=REDIS_PORT)

    # Check if Redis server is launched, and spawn it if not.
    try:
        redis_server.set('connection_test', True)
    except RedisConnectionError:
        # Open the null device for *writing* -- a read-only handle (the
        # previous behavior) makes the child's writes to stdout/stderr fail.
        with open(os.devnull, 'w') as null_file:
            redis_process = subprocess.Popen(['redis-server', '-'],
                                             stdout=null_file,
                                             stdin=subprocess.PIPE,
                                             stderr=null_file)
            # Popen pipes are byte streams, so the config (a unicode literal
            # in this module) must be encoded before writing.
            redis_config = '''daemonize yes
pidfile {0}
port {1}
'''.format(os.path.join(temp_dir,
                        'redis{0}.pid'.format(REDIS_PORT)),
           REDIS_PORT)
            redis_process.stdin.write(redis_config.encode('utf-8'))
            redis_process.stdin.close()
        # Wait for things to get started
        sleep(5)

    # Generate random name for keys
    uniq_id = uuid.uuid4()

    # Save jobs to database
    for job_id, job in enumerate(jobs):
        zsave_db(job, redis_server, 'job{0}'.format(uniq_id), job_id)

    # Submit jobs to cluster
    sids, jobids = _submit_jobs(jobs, uniq_id, white_list=white_list,
                                temp_dir=temp_dir, quiet=quiet)

    # Reconnect and retrieve outputs
    job_outputs = _collect_jobs(sids, jobids, jobs, redis_server, uniq_id,
                                temp_dir=temp_dir, wait=wait)

    # Make sure we have enough output
    assert(len(jobs) == len(job_outputs))

    # Delete this run's job and output keys from the Redis server.
    # redis-py's delete() raises when called with no arguments, so guard
    # against patterns that match nothing.
    job_keys = redis_server.keys('job{0}_*'.format(uniq_id))
    if job_keys:
        redis_server.delete(*job_keys)
    output_keys = redis_server.keys('output{0}_*'.format(uniq_id))
    if output_keys:
        redis_server.delete(*output_keys)
    return job_outputs
#####################################################################
# MapReduce Interface
#####################################################################
def grid_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
             num_slots=1, temp_dir='/scratch/', white_list=None,
             queue=DEFAULT_QUEUE, quiet=True):
    """
    Maps a function onto the cluster.
    @note: This can only be used with picklable functions (i.e., those that are
           defined at the module or class level).

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Should we remove the stdout and stderr temporary files for
                    each job when we're done? (They are left in place if there's
                    an error.)
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling). (Not currently used, because our cluster does
                     not have that setting enabled.)
    @type mem_free: C{basestring}
    @param name: Base name to give each job (will have a number add to end)
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """

    # build one Job per entry in args_list; scalar entries are wrapped in a
    # single-element argument list
    jobs = []
    for job_num, job_args in enumerate(args_list):
        wrapped_args = job_args if isinstance(job_args, list) else [job_args]
        jobs.append(Job(f, wrapped_args, cleanup=cleanup, mem_free=mem_free,
                        name='{0}{1}'.format(name, job_num),
                        num_slots=num_slots, queue=queue))

    # hand the batch off to the cluster and return the collected results
    return process_jobs(jobs, temp_dir=temp_dir, white_list=white_list,
                        quiet=quiet)
def pg_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
           num_slots=1, temp_dir='/scratch/', white_list=None,
           queue=DEFAULT_QUEUE, quiet=True):
    """
    @deprecated: This function has been renamed grid_map.

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Should we remove the stdout and stderr temporary files for
                    each job when we're done? (They are left in place if there's
                    an error.)
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling). (Not currently used, because our cluster does
                     not have that setting enabled.)
    @type mem_free: C{basestring}
    @param name: Base name to give each job (will have a number add to end)
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """
    # Actually emit the deprecation the docstring promises, pointing at the
    # caller's line via stacklevel=2.
    warnings.warn('pg_map has been renamed grid_map; please update your code.',
                  DeprecationWarning, stacklevel=2)
    return grid_map(f, args_list, cleanup=cleanup, mem_free=mem_free, name=name,
                    num_slots=num_slots, temp_dir=temp_dir,
                    white_list=white_list, queue=queue, quiet=quiet)