1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 This module provides wrappers that simplify submission and collection of jobs,
25 in a more 'pythonic' fashion.
26
27 @author: Christian Widmer
28 @author: Cheng Soon Ong
29 @author: Dan Blanchard (dblanchard@ets.org)
30 """
31
32 from __future__ import absolute_import, print_function, unicode_literals
33
34 import inspect
35 import os
36 import subprocess
37 import sys
38 import traceback
39 import uuid
40 from socket import gethostname
41 from time import sleep
42
43 import drmaa
44 from redis import StrictRedis
45 from redis.exceptions import ConnectionError as RedisConnectionError
46
47 from gridmap.data import clean_path, zload_db, zsave_db
48
49
# Python 2 compatibility: make ``range`` lazy there, as it already is on 3.
if sys.version_info[0] < 3:
    range = xrange


# Redis database index and port used by process_jobs to reach (or launch) the
# result-store server.
REDIS_DB = 2
REDIS_PORT = 7272

# NOTE(review): presumably toggles whether mem_free is sent to the scheduler;
# the consumer of this flag is not in this part of the file -- confirm.
USE_MEM_FREE = False

# Queue used when a job does not name one explicitly.
DEFAULT_QUEUE = 'all.q'
64
65
66 -class Job(object):
67 """
68 Central entity that wraps a function and its data. Basically, a job consists
69 of a function, its argument list, its keyword list and a field "ret" which
70 is filled, when the execute method gets called.
71
72 @note: This can only be used to wrap picklable functions (i.e., those that
73 are defined at the module or class level).
74 """
75
76 __slots__ = ('_f', 'args', 'jobid', 'kwlist', 'cleanup', 'ret', 'exception',
77 'environment', 'replace_env', 'working_dir', 'num_slots',
78 'mem_free', 'white_list', 'path', 'uniq_id', 'name', 'queue')
79
80 - def __init__(self, f, args, kwlist=None, cleanup=True, mem_free="1G",
81 name='gridmap_job', num_slots=1, queue=DEFAULT_QUEUE):
82 """
83 Initializes a new Job.
84
85 @param f: a function, which should be executed.
86 @type f: function
87 @param args: argument list of function f
88 @type args: list
89 @param kwlist: dictionary of keyword arguments for f
90 @type kwlist: dict
91 @param cleanup: flag that determines the cleanup of input and log file
92 @type cleanup: boolean
93 @param mem_free: Estimate of how much memory this job will need (for
94 scheduling)
95 @type mem_free: C{basestring}
96 @param name: Name to give this job
97 @type name: C{basestring}
98 @param num_slots: Number of slots this job should use.
99 @type num_slots: C{int}
100 @param queue: SGE queue to schedule job on.
101 @type queue: C{basestring}
102 """
103
104 self.path = None
105 self._f = None
106 self.function = f
107 self.args = args
108 self.jobid = -1
109 self.kwlist = kwlist if kwlist is not None else {}
110 self.cleanup = cleanup
111 self.ret = None
112 self.environment = None
113 self.replace_env = False
114 self.working_dir = os.getcwd()
115 self.num_slots = num_slots
116 self.mem_free = mem_free
117 self.white_list = []
118 self.uniq_id = None
119 self.name = name.replace(' ', '_')
120 self.queue = queue
121
122 @property
124 ''' Function this job will execute. '''
125 return self._f
126
127 @function.setter
129 """
130 setter for function that carefully takes care of
131 namespace, avoiding __main__ as a module
132 """
133
134 m = inspect.getmodule(f)
135 try:
136 self.path = clean_path(os.path.dirname(os.path.abspath(
137 inspect.getsourcefile(f))))
138 except TypeError:
139 self.path = ''
140
141
142 if m.__name__ != "__main__":
143 self._f = f
144
145 else:
146
147
148 mn = os.path.splitext(os.path.basename(m.__file__))[0]
149
150
151 __import__(mn)
152
153
154 mod = sys.modules[mn]
155
156
157 self._f = getattr(mod, f.__name__)
158
160 """
161 Executes function f with given arguments
162 and writes return value to field ret.
163 If an exception is encountered during execution, ret will
164 contain a pickled version of it.
165 Input data is removed after execution to save space.
166 """
167 try:
168 self.ret = self.function(*self.args, **self.kwlist)
169 except Exception as exception:
170 self.ret = exception
171 traceback.print_exc()
172 del self.args
173 del self.kwlist
174
175 @property
195
196
def _submit_jobs(jobs, uniq_id, temp_dir='/scratch', white_list=None,
                 quiet=True):
    """
    Send a list of jobs to the cluster through a single DRMAA session.

    @param jobs: list of jobs to be executed
    @type jobs: C{list} of L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: List of acceptable nodes to use for scheduling job. If
                       None, all are used.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @returns: Tuple of the session contact string and the list of job IDs, in
              the same order as C{jobs}.
    @rtype: C{tuple}
    """
    session = drmaa.Session()
    session.initialize()

    # Everything after a successful initialize() runs under try/finally so the
    # session is always closed; a session left open after a failed submission
    # would prevent any later initialize() in this process.
    try:
        jobids = []
        for job_num, job in enumerate(jobs):
            # Every job in the batch is restricted to the same set of nodes.
            job.white_list = white_list

            # The job's position in the list doubles as its row in the database.
            jobid = _append_job_to_session(session, job, uniq_id, job_num,
                                           temp_dir=temp_dir, quiet=quiet)
            jobids.append(jobid)

        # The contact string must be read while the session is still open.
        sid = session.contact
    finally:
        session.exit()

    return (sid, jobids)
234
238 """
239 For an active session, append new job based on information stored in job
240 object. Also sets job.job_id to the ID of the job on the grid.
241
242 @param session: The current DRMAA session with the grid engine.
243 @type session: C{drmaa.Session}
244 @param job: The Job to add to the queue.
245 @type job: L{Job}
246 @param uniq_id: The unique suffix for the tables corresponding to this job
247 in the database.
248 @type uniq_id: C{basestring}
249 @param job_num: The row in the table to store/retrieve data on. This is only
250 non-zero for jobs created via grid_map.
251 @type job_num: C{int}
252 @param temp_dir: Local temporary directory for storing output for an
253 individual job.
254 @type temp_dir: C{basestring}
255 @param quiet: When true, do not output information about the jobs that have
256 been submitted.
257 @type quiet: C{bool}
258 """
259
260 jt = session.createJobTemplate()
261
262
263 shell_env = os.environ
264
265 if job.environment and job.replace_env:
266
267 jt.jobEnvironment = job.environment
268
269 elif job.environment and not job.replace_env:
270
271 env = shell_env
272 env.update(job.environment)
273 jt.jobEnvironment = env
274
275 else:
276
277 jt.jobEnvironment = shell_env
278
279
280 jt.remoteCommand = '/usr/bin/env/python'
281 jt.args = ['-m', 'gridmap.runner', '{0}'.format(uniq_id),
282 '{0}'.format(job_num), job.path, temp_dir, gethostname()]
283 jt.nativeSpecification = job.native_specification
284 jt.outputPath = ":" + temp_dir
285 jt.errorPath = ":" + temp_dir
286
287 jobid = session.runJob(jt)
288
289
290 job.jobid = jobid
291
292 if not quiet:
293 print('Your job {0} has been submitted with id {1}'.format(job.name,
294 jobid),
295 file=sys.stderr)
296
297 session.deleteJobTemplate(jt)
298
299 return jobid
300
301
def _collect_jobs(sid, jobids, joblist, redis_server, uniq_id,
                  temp_dir='/scratch/', wait=True):
    """
    Collect the results for the given job IDs and return them as a list.

    If the output of a job cannot be loaded from the database, diagnostic
    information is written to stderr and the process exits with status 2.

    @param sid: session identifier
    @type sid: string returned by cluster
    @param jobids: list of job identifiers returned by the cluster
    @type jobids: list of strings
    @param joblist: The submitted jobs, in the same order as C{jobids}.
    @type joblist: C{list} of L{Job}
    @param redis_server: Open connection to the database where the results will
                         be stored.
    @type redis_server: L{StrictRedis}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Wait for jobs to finish?
    @type wait: Boolean, defaults to True

    @returns: The output stored for each job, in the order of C{joblist}. An
              entry is an C{Exception} instance when that is what the job
              stored as its output.
    @rtype: C{list}
    """
    # Sanity check: the IDs must line up with the jobs they were issued for.
    for ix, jobid in enumerate(jobids):
        assert jobid == joblist[ix].jobid

    if wait:
        drmaa_wait = drmaa.Session.TIMEOUT_WAIT_FOREVER
    else:
        drmaa_wait = drmaa.Session.TIMEOUT_NO_WAIT

    # Re-attach to the session the jobs were submitted under and always close
    # it again, even when synchronize() raises.
    session = drmaa.Session()
    session.initialize(sid)
    try:
        session.synchronize(jobids, drmaa_wait, True)
    finally:
        session.exit()

    job_output_list = []
    for ix, job in enumerate(joblist):
        # Names of the per-job stdout/stderr logs, reported to the user when
        # something went wrong.
        log_stdout_fn = os.path.join(temp_dir, job.name + '.o' + jobids[ix])
        log_stderr_fn = os.path.join(temp_dir, job.name + '.e' + jobids[ix])

        try:
            job_output = zload_db(redis_server, 'output{0}'.format(uniq_id),
                                  ix)
        except Exception as detail:
            # All diagnostics go to stderr so they stay together and do not
            # pollute the caller's stdout.
            print(("Error while unpickling output for gridmap job {1} " +
                   "stored with key output{0}_{1}").format(uniq_id, ix),
                  file=sys.stderr)
            print("This could be caused by a problem with the cluster " +
                  "environment, imports or environment variables.",
                  file=sys.stderr)
            print(("Try running `python -m gridmap.runner {0} {1} {2} {3} " +
                   "{4}` to see if your job crashed before writing its " +
                   "output.").format(uniq_id,
                                     ix,
                                     job.path,
                                     temp_dir,
                                     gethostname()),
                  file=sys.stderr)
            print("Check log files for more information: ", file=sys.stderr)
            print("stdout:", log_stdout_fn, file=sys.stderr)
            print("stderr:", log_stderr_fn, file=sys.stderr)
            print("Exception: {0}".format(detail), file=sys.stderr)
            sys.exit(2)

        # A job that failed stores the exception as its output; report it but
        # still hand it back to the caller.
        if isinstance(job_output, Exception):
            print("Exception encountered in job with log file:",
                  file=sys.stderr)
            print(log_stdout_fn, file=sys.stderr)
            print(job_output, file=sys.stderr)
            print(file=sys.stderr)

        job_output_list.append(job_output)

    return job_output_list
378
379
def process_jobs(jobs, temp_dir='/scratch/', wait=True, white_list=None,
                 quiet=True):
    """
    Take a list of jobs and process them on the cluster.

    The jobs are stored in a Redis database on this host (a server is started
    if none is reachable), submitted to the grid, and their outputs are read
    back from the database once collected.

    @param jobs: The jobs to run.
    @type jobs: C{list} of L{Job}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Should we wait for jobs to finish? (Should only be false if the
                 function you're running doesn't return anything)
    @type wait: C{bool}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @returns: The output of each job, in the order of C{jobs}.
    @rtype: C{list}
    """
    redis_server = StrictRedis(host=gethostname(), db=REDIS_DB, port=REDIS_PORT)

    # Probe the connection; if nothing is listening, launch our own server.
    try:
        # A string is stored rather than a bool because recent redis-py
        # releases refuse bool values.
        redis_server.set('connection_test', 'True')
    except RedisConnectionError:
        pid_file = os.path.join(temp_dir, 'redis{0}.pid'.format(REDIS_PORT))
        redis_config = 'daemonize yes\npidfile {0}\nport {1}\n'.format(
            pid_file, REDIS_PORT)
        # The null device must be opened for writing to be usable as the
        # child's stdout/stderr.
        with open(os.devnull, 'w') as null_file:
            # 'redis-server -' reads its configuration from stdin.
            redis_process = subprocess.Popen(['redis-server', '-'],
                                             stdout=null_file,
                                             stdin=subprocess.PIPE,
                                             stderr=null_file)
            # The pipe is binary, so the configuration has to be encoded.
            redis_process.stdin.write(redis_config.encode('utf-8'))
            redis_process.stdin.close()

        # Give the server a moment to come up before talking to it.
        sleep(5)

    # Unique suffix that keeps this batch's keys apart from any other batch.
    uniq_id = uuid.uuid4()

    # Store the jobs so that the workers can fetch them by index.
    for job_id, job in enumerate(jobs):
        zsave_db(job, redis_server, 'job{0}'.format(uniq_id), job_id)

    sids, jobids = _submit_jobs(jobs, uniq_id, white_list=white_list,
                                temp_dir=temp_dir, quiet=quiet)

    job_outputs = _collect_jobs(sids, jobids, jobs, redis_server, uniq_id,
                                temp_dir=temp_dir, wait=wait)

    assert len(jobs) == len(job_outputs)

    # Remove this batch's keys. DEL with no key arguments is an error, so skip
    # a prefix that matched nothing (e.g. an empty job list).
    for prefix in ('job', 'output'):
        stale_keys = redis_server.keys('{0}{1}_*'.format(prefix, uniq_id))
        if stale_keys:
            redis_server.delete(*stale_keys)

    return job_outputs
441
442
443
444
445
def grid_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
             num_slots=1, temp_dir='/scratch/', white_list=None,
             queue=DEFAULT_QUEUE, quiet=True):
    """
    Run a function once per entry of an argument list, as jobs on the cluster.

    One L{Job} is created per entry of C{args_list}; the jobs are then handed
    to L{process_jobs} and its result is returned.

    @note: This can only be used with picklable functions (i.e., those that are
           defined at the module or class level).

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f. An entry that is a list
                      is used as the argument list of one call; any other
                      entry is passed as the single argument of that call.
    @type args_list: C{list}
    @param cleanup: Cleanup flag handed to every job that is created.
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling).
    @type mem_free: C{basestring}
    @param name: Base name for the jobs; each job's index in C{args_list} is
                 appended to it.
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @returns: Whatever L{process_jobs} returns for the created jobs.
    """
    jobs = []
    for job_num, args in enumerate(args_list):
        # Only genuine lists are unpacked; everything else counts as a single
        # positional argument.
        if isinstance(args, list):
            job_args = args
        else:
            job_args = [args]

        job_name = '{0}{1}'.format(name, job_num)
        jobs.append(Job(f, job_args, cleanup=cleanup, mem_free=mem_free,
                        name=job_name, num_slots=num_slots, queue=queue))

    return process_jobs(jobs, temp_dir=temp_dir, white_list=white_list,
                        quiet=quiet)
494
495
def pg_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
           num_slots=1, temp_dir='/scratch/', white_list=None,
           queue=DEFAULT_QUEUE, quiet=True):
    """
    Backward-compatible alias of L{grid_map}; every argument is forwarded
    unchanged and the result of L{grid_map} is returned.

    @deprecated: This function has been renamed grid_map.

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Cleanup flag, forwarded to L{grid_map}.
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling).
    @type mem_free: C{basestring}
    @param name: Base name to give each job.
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @returns: The return value of L{grid_map}.
    """
    # Gather the options once so the delegation itself stays a one-liner.
    forwarded = dict(cleanup=cleanup, mem_free=mem_free, name=name,
                     num_slots=num_slots, temp_dir=temp_dir,
                     white_list=white_list, queue=queue, quiet=quiet)
    return grid_map(f, args_list, **forwarded)
532