1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 This module provides wrappers that simplify submission and collection of jobs,
25 in a more 'pythonic' fashion.
26
27 @author: Christian Widmer
28 @author: Cheng Soon Ong
29 @author: Dan Blanchard (dblanchard@ets.org)
30 """
31
32 from __future__ import absolute_import, print_function, unicode_literals
33
34 import inspect
35 import os
36 import subprocess
37 import sys
38 import traceback
39 import uuid
40 from socket import gethostname
41 from time import sleep
42
43 from drmaa import Session
44 from drmaa.errors import InvalidJobException
45 from redis import StrictRedis
46 from redis.exceptions import ConnectionError as RedisConnectionError
47
48 from gridmap.data import clean_path, zload_db, zsave_db
49
50
# Python 2 compatibility: bind the lazy ``xrange`` to the name ``range`` so the
# loops in this module iterate the same way on both major versions.
if sys.version_info < (3, 0):
    range = xrange  # noqa: F821 -- this line only runs on Python 2

# Redis database number and port used when connecting to (or starting) the
# Redis server; both can be overridden through the environment.
REDIS_DB = int(os.getenv('GRID_MAP_REDIS_DB', '2'))
REDIS_PORT = int(os.getenv('GRID_MAP_REDIS_PORT', '7272'))

# Boolean flag: True only when GRID_MAP_USE_MEM_FREE is exactly 'True'.
USE_MEM_FREE = os.getenv('GRID_MAP_USE_MEM_FREE', 'False') == 'True'

# Queue assigned to jobs when the caller does not name one explicitly.
DEFAULT_QUEUE = os.getenv('GRID_MAP_DEFAULT_QUEUE', 'all.q')
65
66
class Job(object):
    """
    Central entity that wraps a function and its data. Basically, a job consists
    of a function, its argument list, its keyword list and a field "ret" which
    is filled, when the execute method gets called.

    @note: This can only be used to wrap picklable functions (i.e., those that
           are defined at the module or class level).
    """

    __slots__ = ('_f', 'args', 'jobid', 'kwlist', 'cleanup', 'ret', 'exception',
                 'environment', 'replace_env', 'working_dir', 'num_slots',
                 'mem_free', 'white_list', 'path', 'uniq_id', 'name', 'queue')

    def __init__(self, f, args, kwlist=None, cleanup=True, mem_free="1G",
                 name='gridmap_job', num_slots=1, queue=DEFAULT_QUEUE):
        """
        Initializes a new Job.

        @param f: a function, which should be executed.
        @type f: function
        @param args: argument list of function f
        @type args: list
        @param kwlist: dictionary of keyword arguments for f
        @type kwlist: dict
        @param cleanup: flag that determines the cleanup of input and log file
        @type cleanup: boolean
        @param mem_free: Estimate of how much memory this job will need (for
                         scheduling)
        @type mem_free: C{basestring}
        @param name: Name to give this job
        @type name: C{basestring}
        @param num_slots: Number of slots this job should use.
        @type num_slots: C{int}
        @param queue: SGE queue to schedule job on.
        @type queue: C{basestring}
        """

        self.path = None
        self._f = None
        # Goes through the ``function`` property setter, which also fills in
        # ``self.path`` from the location of f's source file.
        self.function = f
        self.args = args
        self.jobid = -1
        self.kwlist = kwlist if kwlist is not None else {}
        self.cleanup = cleanup
        self.ret = None
        self.environment = None
        self.replace_env = False
        self.working_dir = os.getcwd()
        self.num_slots = num_slots
        self.mem_free = mem_free
        self.white_list = []
        # Spaces in the job name are replaced by underscores.
        self.name = name.replace(' ', '_')
        self.queue = queue

    @property
    def function(self):
        ''' Function this job will execute. '''
        return self._f

    @function.setter
    def function(self, f):
        """
        setter for function that carefully takes care of
        namespace, avoiding __main__ as a module

        @param f: The function this job should execute.
        @type f: function
        """

        m = inspect.getmodule(f)
        try:
            self.path = clean_path(os.path.dirname(os.path.abspath(
                inspect.getsourcefile(f))))
        except TypeError:
            # inspect.getsourcefile raised TypeError for f: fall back to an
            # empty path.
            self.path = ''

        if m.__name__ != "__main__":
            # f already lives in a properly named module; store it unchanged.
            self._f = f
        else:
            # f was defined in the running script. Re-import that script under
            # its file-derived module name and take the function from there,
            # so the stored reference is not bound to "__main__".
            mn = os.path.splitext(os.path.basename(m.__file__))[0]
            __import__(mn)
            mod = sys.modules[mn]
            self._f = getattr(mod, f.__name__)

    def execute(self):
        """
        Executes function f with given arguments
        and writes return value to field ret.
        If an exception is encountered during execution, ret will
        contain the exception object and the traceback is printed.
        Input data is removed after execution to save space.
        """
        try:
            self.ret = self.function(*self.args, **self.kwlist)
        except Exception as exception:
            self.ret = exception
            traceback.print_exc()
        # Drop the inputs regardless of the outcome.
        del self.args
        del self.kwlist
174
175 @property
195
196
def _submit_jobs(jobs, uniq_id, temp_dir='/scratch', white_list=None,
                 quiet=True):
    """
    Method used to send a list of jobs onto the cluster.

    @param jobs: list of jobs to be executed
    @type jobs: c{list} of L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: List of acceptable nodes to use for scheduling job. If
                       None, all are used.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: Tuple of the session contact string and the list of job ids, in
             the same order as C{jobs}.
    """

    session = Session()
    session.initialize()
    try:
        jobids = []
        for job_num, job in enumerate(jobs):
            # Every job in the batch gets the same node white list.
            job.white_list = white_list

            jobids.append(_append_job_to_session(session, job, uniq_id,
                                                 job_num, temp_dir=temp_dir,
                                                 quiet=quiet))

        # The contact string has to be read before the session is closed.
        sid = session.contact
    finally:
        # Close the session even when a submission fails, so it is not leaked.
        session.exit()

    return (sid, jobids)
234
# NOTE(review): the signature line was missing from the reviewed copy of this
# file. It is reconstructed from the docstring and from the call in
# _submit_jobs; the default values are an assumption -- confirm them.
def _append_job_to_session(session, job, uniq_id, job_num, temp_dir='/scratch/',
                           quiet=True):
    """
    For an active session, append new job based on information stored in job
    object. Also sets job.jobid to the ID of the job on the grid.

    @param session: The current DRMAA session with the grid engine.
    @type session: C{Session}
    @param job: The Job to add to the queue.
    @type job: L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param job_num: The row in the table to store/retrieve data on. This is only
                    non-zero for jobs created via grid_map.
    @type job_num: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: The id the grid assigned to the submitted job.
    """

    jt = session.createJobTemplate()
    try:
        if job.environment and job.replace_env:
            # The job's environment completely replaces the shell environment.
            jt.jobEnvironment = job.environment
        else:
            # Start from a *copy* of the shell environment so that merging the
            # job's variables never modifies this process's own os.environ.
            env = dict(os.environ)
            if job.environment:
                env.update(job.environment)
            jt.jobEnvironment = env

        # Run gridmap.runner with the same interpreter as this process.
        jt.remoteCommand = sys.executable
        jt.args = ['-m', 'gridmap.runner', '{0}'.format(uniq_id),
                   '{0}'.format(job_num), job.path, temp_dir, gethostname()]
        jt.nativeSpecification = job.native_specification
        jt.outputPath = ":" + temp_dir
        jt.errorPath = ":" + temp_dir

        jobid = session.runJob(jt)
    finally:
        # Release the template even when submission fails.
        session.deleteJobTemplate(jt)

    # Remember the grid's id on the job object.
    job.jobid = jobid

    if not quiet:
        print('Your job {0} has been submitted with id {1}'.format(job.name,
                                                                   jobid),
              file=sys.stderr)

    return jobid
300
301
def _collect_jobs(sid, jobids, joblist, redis_server, uniq_id,
                  temp_dir='/scratch/', wait=True):
    """
    Gather the output of every submitted job from the database.

    @param sid: session identifier
    @type sid: string returned by cluster
    @param jobids: list of job identifiers returned by the cluster
    @type jobids: list of strings
    @param joblist: the submitted jobs, in the same order as C{jobids}
    @type joblist: C{list} of L{Job}
    @param redis_server: Open connection to the database where the results will
                         be stored.
    @type redis_server: L{StrictRedis}
    @param uniq_id: The unique suffix of the database keys for this batch.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Wait for jobs to finish?
    @type wait: bool

    @return: list with one output per job, in job order. If an output cannot
             be loaded, diagnostics are printed to stderr and the process
             exits with status 2.
    """

    # Sanity check: the ids handed in must be the ones stored on the jobs.
    for idx, expected_id in enumerate(jobids):
        assert(expected_id == joblist[idx].jobid)

    # Re-attach to the session the jobs were submitted in.
    with Session(sid) as session:

        timeout = (Session.TIMEOUT_WAIT_FOREVER if wait
                   else Session.TIMEOUT_NO_WAIT)

        # Block (or not) until the whole batch is done; the third argument
        # is passed as False exactly as before.
        session.synchronize(jobids, timeout, False)

        outputs = []
        for idx, job in enumerate(joblist):

            # Paths of the job's stdout/stderr logs, used in diagnostics only.
            stdout_log = os.path.join(temp_dir, job.name + '.o' + jobids[idx])
            stderr_log = os.path.join(temp_dir, job.name + '.e' + jobids[idx])

            info = session.wait(job.jobid, timeout)

            try:
                output = zload_db(redis_server, 'output_{0}'.format(uniq_id),
                                  idx)
            except Exception as err:
                # The output could not be loaded: report everything known
                # about the job, then abort.
                print("Error while unpickling output for gridmap job {1} "
                      "stored with key output_{0}_{1}".format(uniq_id, idx),
                      file=sys.stderr)
                print("This usually happens when a job has crashed before "
                      "writing its output to the database.",
                      file=sys.stderr)
                print("\nHere is some information about the problem job:",
                      file=sys.stderr)
                print("stdout:", stdout_log, file=sys.stderr)
                print("stderr:", stderr_log, file=sys.stderr)
                if info.hasExited:
                    print("Exit status: {0}".format(info.exitStatus),
                          file=sys.stderr)
                if info.hasSignal:
                    print("Terminating signal: {0}".format(
                        info.terminatedSignal), file=sys.stderr)
                    print("Core dumped: {0}".format(info.hasCoreDump),
                          file=sys.stderr)
                print("Job aborted before it ran: {0}".format(info.wasAborted),
                      file=sys.stderr)
                print("Job resources: {0}".format(info.resourceUsage),
                      file=sys.stderr)
                try:
                    print("Job SGE status: {0}".format(
                        session.jobStatus(job.jobid)), file=sys.stderr)
                except InvalidJobException:
                    pass
                print("Unpickling exception: {0}".format(err),
                      file=sys.stderr)
                sys.exit(2)

            # An exception stored as the output is reported but still returned
            # to the caller as that job's result.
            if isinstance(output, Exception):
                print("Exception encountered in job with log file:",
                      file=sys.stderr)
                print(stdout_log, file=sys.stderr)
                print(output, file=sys.stderr)
                print(file=sys.stderr)

            outputs.append(output)

    return outputs
395
396
def process_jobs(jobs, temp_dir='/scratch/', wait=True, white_list=None,
                 quiet=True):
    """
    Take a list of jobs and process them on the cluster.

    @param jobs: The jobs to run.
    @type jobs: C{list} of L{Job}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Should we wait for jobs to finish? (Should only be false if the
                 function you're running doesn't return anything)
    @type wait: C{bool}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: list with one output per job, in job order.
    """

    redis_server = StrictRedis(host=gethostname(), db=REDIS_DB, port=REDIS_PORT)

    # Probe the connection; if no server answers, start one on this host.
    try:
        # A string is stored rather than a bool: newer redis-py releases
        # refuse bool values.
        redis_server.set('connection_test', 'True')
    except RedisConnectionError:
        pid_file = os.path.join(temp_dir, 'redis{0}.pid'.format(REDIS_PORT))
        config = 'daemonize yes\npidfile {0}\nport {1}\n'.format(pid_file,
                                                                 REDIS_PORT)
        # The null device must be opened for writing, because it receives the
        # child's stdout and stderr.
        with open(os.devnull, 'w') as null_file:
            redis_process = subprocess.Popen(['redis-server', '-'],
                                             stdout=null_file,
                                             stdin=subprocess.PIPE,
                                             stderr=null_file)
            # The stdin pipe is binary, so the configuration is sent as bytes.
            redis_process.stdin.write(config.encode('utf-8'))
            redis_process.stdin.close()
        # Give the server time to come up before it is used.
        sleep(5)

    # Unique suffix that keeps this batch's keys apart from other batches.
    uniq_id = uuid.uuid4()

    # Store every job in the database under its position in the list.
    for job_id, job in enumerate(jobs):
        zsave_db(job, redis_server, 'job_{0}'.format(uniq_id), job_id)

    sids, jobids = _submit_jobs(jobs, uniq_id, white_list=white_list,
                                temp_dir=temp_dir, quiet=quiet)

    job_outputs = _collect_jobs(sids, jobids, jobs, redis_server, uniq_id,
                                temp_dir=temp_dir, wait=wait)

    assert(len(jobs) == len(job_outputs))

    # Remove this batch's keys. delete() is skipped when nothing matches,
    # because calling it without any key is an error.
    for prefix in ('job', 'output'):
        stale_keys = redis_server.keys('{0}_{1}_*'.format(prefix, uniq_id))
        if stale_keys:
            redis_server.delete(*stale_keys)

    return job_outputs
458
459
460
461
462
def grid_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
             num_slots=1, temp_dir='/scratch/', white_list=None,
             queue=DEFAULT_QUEUE, quiet=True):
    """
    Maps a function onto the cluster: one job is created per entry of
    C{args_list} and the whole batch is handed to L{process_jobs}.

    @note: This can only be used with picklable functions (i.e., those that are
           defined at the module or class level).

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f. An entry that is not
                      itself a list is wrapped in a one-element list.
    @type args_list: C{list}
    @param cleanup: Value of the cleanup flag given to every job.
    @type cleanup: C{bool}
    @param mem_free: Memory estimate given to every job.
    @type mem_free: C{basestring}
    @param name: Base name to give each job (the job's index is appended)
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: Whatever L{process_jobs} returns for the created jobs.
    """

    jobs = []
    for job_num, args in enumerate(args_list):
        # A bare (non-list) argument becomes a single-element argument list.
        job_args = args if isinstance(args, list) else [args]
        jobs.append(Job(f, job_args, cleanup=cleanup, mem_free=mem_free,
                        name='{0}{1}'.format(name, job_num),
                        num_slots=num_slots, queue=queue))

    return process_jobs(jobs, temp_dir=temp_dir, white_list=white_list,
                        quiet=quiet)
511
512
def pg_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
           num_slots=1, temp_dir='/scratch/', white_list=None,
           queue=DEFAULT_QUEUE, quiet=True):
    """
    @deprecated: This function has been renamed grid_map.

    Every argument is forwarded unchanged to L{grid_map}, and its result is
    returned.

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Forwarded to L{grid_map}.
    @type cleanup: C{bool}
    @param mem_free: Forwarded to L{grid_map}.
    @type mem_free: C{basestring}
    @param name: Base name to give each job
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: The return value of L{grid_map}.
    """
    options = dict(cleanup=cleanup, mem_free=mem_free, name=name,
                   num_slots=num_slots, temp_dir=temp_dir,
                   white_list=white_list, queue=queue, quiet=quiet)
    return grid_map(f, args_list, **options)
549