import os
import time
import datetime
import traceback
import xml.dom.minidom

from starcluster import utils
from starcluster import static
from starcluster import exception
from starcluster.balancers import LoadBalancer
from starcluster.logger import log


SGE_STATS_DIR = os.path.join(static.STARCLUSTER_CFG_DIR, 'sge')
DEFAULT_STATS_DIR = os.path.join(SGE_STATS_DIR, '%s')
DEFAULT_STATS_FILE = os.path.join(DEFAULT_STATS_DIR, 'sge-stats.csv')
21 """
22 SunGridEngine stats parser
23 """
    jobstat_cachesize = 200
    hosts = []
    jobs = []
    jobstats = jobstat_cachesize * [None]
    max_job_id = 0
    _default_fields = ["JB_job_number", "state", "JB_submission_time",
                       "queue_name", "slots", "tasks"]

    @property
    def first_job_id(self):
        if not self.jobs:
            return
        return int(self.jobs[0]['JB_job_number'])

    @property
    def last_job_id(self):
        if not self.jobs:
            return
        return int(self.jobs[-1]['JB_job_number'])

45 """
46 this function parses qhost -xml output and makes a neat array
47 takes in a string, so we can pipe in output from ssh.exec('qhost -xml')
48 """
        self.hosts = []
        doc = xml.dom.minidom.parseString(string)
        for h in doc.getElementsByTagName("host"):
            name = h.getAttribute("name")
            hash = {"name": name}
            for stat in h.getElementsByTagName("hostvalue"):
                for hvalue in stat.childNodes:
                    attr = stat.attributes['name'].value
                    val = ""
                    if hvalue.nodeType == xml.dom.minidom.Node.TEXT_NODE:
                        val = hvalue.data
                    hash[attr] = val
            if hash['name'] != u'global':
                self.hosts.append(hash)
        return self.hosts

66 """
67 This method parses qstat -xml output and makes a neat array
68 """
        if fields is None:
            fields = self._default_fields
        self.jobs = []
        doc = xml.dom.minidom.parseString(string)
        for job in doc.getElementsByTagName("job_list"):
            jstate = job.getAttribute("state")
            hash = {"job_state": jstate}
            for tag in fields:
                es = job.getElementsByTagName(tag)
                for node in es:
                    for node2 in node.childNodes:
                        if node2.nodeType == xml.dom.minidom.Node.TEXT_NODE:
                            hash[tag] = node2.data

            if 'tasks' in hash and hash['tasks'].find('-') > 0:
                self.job_multiply(hash)
            else:
                self.jobs.append(hash)
        return self.jobs

90 """
91 this function deals with sge jobs with a task range, ie qsub -t 1-20:1
92 makes 20 jobs. self.jobs needs to represent that it is 20 jobs instead
93 of just 1.
94 """
        sz_range = hash['tasks']
        dashpos = sz_range.find('-')
        colpos = sz_range.find(':')
        start = int(sz_range[0:dashpos])
        fin = int(sz_range[dashpos + 1:colpos])
        gran = int(sz_range[colpos + 1:len(sz_range)])
        log.debug("start = %d, fin = %d, granularity = %d, sz_range = %s" %
                  (start, fin, gran, sz_range))
        # the range is inclusive, so 1-20:1 expands to 20 tasks
        num_jobs = (fin - start) / gran + 1
        log.debug("This job expands to %d tasks" % num_jobs)
        self.jobs.extend([hash] * num_jobs)

108 """
109 Takes the SGE qacct formatted time and makes a datetime tuple
110 format is:
111 Tue Jul 13 16:24:03 2010
112 """
113 return datetime.datetime.strptime(qacct, "%a %b %d %H:%M:%S %Y")
114
116 """
117 This method parses qacct -j output and makes a neat array and
118 calculates some statistics.
119 Takes the string to parse, and a datetime object of the remote
120 host's current time.
121 """
        job_id = None
        qd = None
        start = None
        end = None
        counter = 0
        lines = string.split('\n')
        for l in lines:
            l = l.strip()
            if l.find('jobnumber') != -1:
                job_id = int(l[13:])
            if l.find('qsub_time') != -1:
                qd = self.qacct_to_datetime_tuple(l[13:])
            if l.find('start_time') != -1:
                if l.find('-/-') > 0:
                    start = dtnow
                else:
                    start = self.qacct_to_datetime_tuple(l[13:])
            if l.find('end_time') != -1:
                if l.find('-/-') > 0:
                    end = dtnow
                else:
                    end = self.qacct_to_datetime_tuple(l[13:])
            if l.find('==========') != -1:
                if qd is not None:
                    self.max_job_id = job_id
                    hash = {'queued': qd, 'start': start, 'end': end}
                    self.jobstats[job_id % self.jobstat_cachesize] = hash
                    counter = counter + 1
                qd = None
                start = None
                end = None
        # the delimiter precedes each record, so flush the final record
        if qd is not None:
            self.max_job_id = job_id
            hash = {'queued': qd, 'start': start, 'end': end}
            self.jobstats[job_id % self.jobstat_cachesize] = hash
            counter = counter + 1
        log.debug("added %d new jobs" % counter)
        log.debug("There are %d items in the jobstats cache" %
                  len([j for j in self.jobstats if j is not None]))
        return self.jobstats

159 """
160 This function will return True if half of the queue is empty, False if
161 there are enough entries in it.
162 """
163 return self.max_job_id < (self.jobstat_cachesize * 0.3)
164
166 """
167 returns an array of the running jobs, values stored in dictionary
168 """
169 running = []
170 for j in self.jobs:
171 if j['job_state'] == u'running':
172 running.append(j)
173 return running
174
176 """
177 returns an array of the queued jobs, values stored in dictionary
178 """
179 queued = []
180 for j in self.jobs:
181 if j['job_state'] == u'pending':
182 queued.append(j)
183 return queued
184
186 """
187 returns a count of the hosts in the cluster
188 """
189
190 return len(self.hosts)
191
193 """
194 returns a count of total slots available in this cluser
195 """
196 slots = 0
197 for h in self.hosts:
198 if h['num_proc'] == '-':
199 h['num_proc'] = 0
200 slots = slots + int(h['num_proc'])
201 return slots
202
204 """
205 returns the number of slots per host.
206 If for some reason the cluster is inconsistent, this will return -1
207 for example, if you have m1.large and m1.small in the same cluster
208 """
209 total = self.count_total_slots()
210 if self.hosts[0][u'num_proc'] == '-':
211 self.hosts[0][u'num_proc'] = 0
212 single = int(self.hosts[0][u'num_proc'])
213 if (total != (single * len(self.hosts))):
214 log.error("ERROR: Number of slots not consistent across cluster")
215 return -1
216 return single
217
219 """
220 This returns the age of the oldest job in the queue
221 """
222 for j in self.jobs:
223 if 'JB_submission_time' in j:
224 st = j['JB_submission_time']
225 dt = utils.iso_to_datetime_tuple(st)
226 return dt
227
228
230 """
231 This function returns true if the node is currently working on a task,
232 or false if the node is currently idle.
233 """
234 nodename = node.alias
235 for j in self.jobs:
236 if 'queue_name' in j:
237 qn = j['queue_name']
238 if qn.find(nodename) > 0:
239 log.debug("Node %s is working" % node.id)
240 return True
241 log.debug("Node %s is IDLE" % node.id)
242 return False
243
245 """
246 returns the number of slots requested for the given job id
247 returns -1 if job_id is invalid
248 """
249 ujid = unicode(job_id)
250 for j in self.jobs:
251 if j['JB_job_number'] == ujid:
252 return int(j['slots'])
253 return -1
254
    def avg_job_duration(self):
        """
        Returns the average duration (in seconds) of the jobs in the
        jobstats cache, or 0 if the cache is empty
        """
        count = 0
        total_seconds = 0
        for job in self.jobstats:
            if job is not None:
                delta = job['end'] - job['start']
                # include full days so multi-day jobs are counted
                total_seconds += delta.days * 86400 + delta.seconds
                count = count + 1
        if count == 0:
            return 0
        else:
            return total_seconds / count

    def avg_wait_time(self):
        """
        Returns the average time (in seconds) that jobs in the jobstats
        cache spent queued before starting, or 0 if the cache is empty
        """
        count = 0
        total_seconds = 0
        for job in self.jobstats:
            if job is not None:
                delta = job['start'] - job['queued']
                total_seconds += delta.days * 86400 + delta.seconds
                count = count + 1
        if count == 0:
            return 0
        else:
            return total_seconds / count

282 """
283 returns true if the cluster is processing the first job,
284 False otherwise
285 """
286 if len(self.jobs) > 0 and self.jobs[0]['JB_job_number'] != u'1':
287 log.info("ON THE FIRST JOB")
288 return True
289 return False
290
292 """
293 returns an array containing the loads on each host in cluster
294 """
295 loads = []
296 for h in self.hosts:
297 if h['load_avg'] == '-':
298 h['load_avg'] = 0
299 loads.append(h['load_avg'])
300 return loads
301
    def _add(self, x, y):
        return float(x) + float(y)
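
    def get_all_stats(self):
        """
        Collects the stats written to the CSV by write_stats_to_csv.
        The exact field order below is an assumption; it only needs to
        stay consistent between runs for the CSV to be useful.
        """
        now = datetime.datetime.utcnow()
        bits = []
        # time of the snapshot, cluster size, and queue counts
        bits.append(now)
        bits.append(self.count_hosts())
        bits.append(len(self.get_running_jobs()))
        bits.append(len(self.get_queued_jobs()))
        bits.append(self.count_total_slots())
        # aggregate job statistics
        bits.append(self.avg_job_duration())
        bits.append(self.avg_wait_time())
        # average load across all hosts
        loads = self.get_loads()
        avg_load = 0.0
        if loads:
            avg_load = reduce(self._add, loads) / len(loads)
        bits.append(avg_load)
        return bits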

    def write_stats_to_csv(self, filename):
        """
        Writes important SGE stats to a CSV file, appending one line
        per call
        """
        bits = self.get_all_stats()
        try:
            f = open(filename, 'a')
            flat = ','.join(str(n) for n in bits) + '\n'
            f.write(flat)
            f.close()
        except IOError, e:
            raise exception.BaseException(str(e))
342
345 """
346 This class is able to query each SGE host and return load & queue
347 statistics
348
349 *** All times are in SECONDS unless otherwise specified ***
350
    The polling interval in seconds. Recommended: 60-300. Anything more
    frequent is very wasteful; the polling loop with the visualizer takes
    about 15 seconds.
    polling_interval = 60

    VERY IMPORTANT: Set this to the max number of nodes you're willing to
    have in your cluster. Try setting this to the default cluster size
    you'd ordinarily use.
    max_nodes = 5

    IMPORTANT: Set this to the longest time a job can wait before another
    host is added to the cluster to help. Recommended: 300-900 seconds
    (5-15 mins). Do not use a value less than 300 seconds because that is
    roughly how long an instance takes to start up.
    longest_allowed_queue_time = 900

    Keep this at 1 - your master, for now.
    min_nodes = 1

    This would allow the master to be killed when the queue empties.
    UNTESTED.
    allow_master_kill = False

    How many nodes to add per iteration. Setting it > 1 opens up the
    possibility of spending too much $$.
    add_nodes_per_iteration = 1

    Kill an instance after it has been up for X minutes. Do not kill it
    earlier, since you've already paid for that hour.
    kill_after = 45

    After adding a node, how long (in seconds) to wait for the instance
    to start new jobs.
    stabilization_time = 180

    Visualizer off by default. Start it with "starcluster loadbalance -p tag"
    plot_stats = False

    How many hours qacct should look back to gather past job data. Lower
    values minimize data transfer.
    lookback_window = 3
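
    An illustrative example (normally the balancer is launched via the
    "starcluster loadbalance" command, which builds it from your config):

        lb = SGELoadBalancer(interval=60, max_nodes=5, wait_time=900)
        lb.run(cluster)  # cluster: a started starcluster cluster object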
389 """
390
391 - def __init__(self, interval=60, max_nodes=None, wait_time=900,
392 add_pi=1, kill_after=45, stab=180, lookback_win=3,
393 min_nodes=1, allow_master_kill=False, plot_stats=False,
394 plot_output_dir=None, dump_stats=False, stats_file=None):
        self._cluster = None
        self._keep_polling = True
        self._visualizer = None
        self.__last_cluster_mod_time = datetime.datetime.utcnow()
        self.stat = None
        self.polling_interval = interval
        self.max_nodes = max_nodes
        self.longest_allowed_queue_time = wait_time
        self.add_nodes_per_iteration = add_pi
        self.kill_after = kill_after
        self.stabilization_time = stab
        self.lookback_window = lookback_win
        self.min_nodes = min_nodes
        self.allow_master_kill = allow_master_kill
        if self.longest_allowed_queue_time < 300:
            log.warn("The recommended wait_time should be >= 300 seconds "
                     "(it takes ~5 min to launch a new EC2 node)")
        self.dump_stats = dump_stats
        self.stats_file = stats_file
        self.plot_stats = plot_stats
        self.plot_output_dir = plot_output_dir
        if plot_stats:
            assert self.visualizer is not None

    @property
    def visualizer(self):
        if not self._visualizer:
            try:
                from starcluster.balancers.sge import visualizer
            except ImportError, e:
                log.error("Error importing visualizer:")
                log.error(str(e))
                log.error("check that matplotlib and numpy are installed and:")
                log.error("   $ python -c 'import matplotlib'")
                log.error("   $ python -c 'import numpy'")
                log.error("completes without error")
                raise exception.BaseException(
                    "Failed to load stats visualizer")
            self._visualizer = visualizer.SGEVisualizer(self.stats_file,
                                                        self.plot_output_dir)
        else:
            self._visualizer.stats_file = self.stats_file
            self._visualizer.pngpath = self.plot_output_dir
        return self._visualizer

    def _validate_dir(self, dirname, msg_prefix=None):
        if not os.path.isdir(dirname):
            msg = "'%s' is not a directory"
            if not os.path.exists(dirname):
                msg = "'%s' does not exist"
            if msg_prefix:
                msg = ' '.join([msg_prefix, msg])
            msg = msg % dirname
            raise exception.BaseException(msg)

    def _mkdir(self, directory, makedirs=False):
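        # Minimal sketch, assuming standard os.mkdir/os.makedirs
        # semantics: makedirs=True also creates missing parent dirs
        if not os.path.isdir(directory):
            try:
                if makedirs:
                    os.makedirs(directory)
                    log.info("Created directories %s" % directory)
                else:
                    os.mkdir(directory)
                    log.info("Created single directory %s" % directory)
            except OSError, e:
                raise exception.BaseException(str(e))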

    def get_remote_time(self):
        """
        Remotely executes 'date' on the master node and returns a
        datetime object with the master's current time, instead of
        fetching it from the local machine, which may be inaccurate.
        """
        cl = self._cluster
        date_str = '\n'.join(cl.master_node.ssh.execute('date'))
        return datetime.datetime.strptime(date_str,
                                          "%a %b %d %H:%M:%S UTC %Y")

475 """
476 this function takes the lookback window and creates a string
477 representation of the past few hours, to feed to qacct to
478 limit the data set qacct returns.
479 """
480 if self.stat.is_jobstats_empty():
481 log.info("Loading full job history")
482 temp_lookback_window = self.lookback_window * 60 * 60
483 else:
484 temp_lookback_window = self.polling_interval
485 log.debug("getting past %d seconds worth of job history" %
486 temp_lookback_window)
487 now = now - datetime.timedelta(seconds=temp_lookback_window + 1)
488 str = now.strftime("%Y%m%d%H%M")
489 return str
490

    def get_stats(self):
        """
        ssh's to the SGE master to get load & queue stats and feeds
        them to SGEStats, which parses the XML. This populates two
        arrays: one of hosts, where each host is a hash of its host
        information, and one of jobs, where each job is a hash of
        statistics such as the job name, priority, etc. Returns -1 if
        the stats could not be fetched.
        """
        log.debug("starting get_stats")
        master = self._cluster.master_node
        self.stat = SGEStats()

        qhostxml = ""
        qstatxml = ""
        qacct = ""
        try:
            now = self.get_remote_time()
            qatime = self.get_qatime(now)
            qacct_cmd = 'qacct -j -b ' + qatime
            qstat_cmd = 'qstat -q all.q -u \"*\" -xml'
            qhostxml = '\n'.join(master.ssh.execute('qhost -xml',
                                                    log_output=True,
                                                    source_profile=True))
            qstatxml = '\n'.join(master.ssh.execute(qstat_cmd,
                                                    log_output=True,
                                                    source_profile=True))
            qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=True,
                                                 ignore_exit_status=True,
                                                 source_profile=True))
        except Exception, e:
            log.error("Error occurred getting SGE stats via ssh. "
                      "Cluster terminated?")
            log.error(e)
            return -1
        log.debug("sizes: qhost: %d, qstat: %d, qacct: %d" %
                  (len(qhostxml), len(qstatxml), len(qacct)))
        self.stat.parse_qhost(qhostxml)
        self.stat.parse_qstat(qstatxml)
        self.stat.parse_qacct(qacct, now)

    def run(self, cluster):
        """
        This method will loop indefinitely, using
        SGELoadBalancer.get_stats() to get the cluster's status. It
        looks at the job queue and tries to decide whether to add or
        remove a node. It should later look at job durations (currently
        doesn't).
        """
        self._cluster = cluster
        if self.max_nodes is None:
            self.max_nodes = cluster.cluster_size
        use_default_stats_file = self.dump_stats and not self.stats_file
        use_default_plots_dir = self.plot_stats and not self.plot_output_dir
        if use_default_stats_file or use_default_plots_dir:
            self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag,
                        makedirs=True)
        if not self.stats_file:
            self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
        if not self.plot_output_dir:
            self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
        if not cluster.is_cluster_up():
            raise exception.ClusterNotRunning(cluster.cluster_tag)
        if self.dump_stats:
            if os.path.isdir(self.stats_file):
                raise exception.BaseException("stats file destination '%s' is"
                                              " a directory" %
                                              self.stats_file)
            sfdir = os.path.dirname(os.path.abspath(self.stats_file))
            self._validate_dir(sfdir, msg_prefix="stats file destination")
        if self.plot_stats:
            if os.path.isfile(self.plot_output_dir):
                raise exception.BaseException("plot output destination '%s' "
                                              "is a file" %
                                              self.plot_output_dir)
            self._validate_dir(self.plot_output_dir,
                               msg_prefix="plot output destination")
        raw = dict(__raw__=True)
        log.info("Starting load balancer...\n")
        log.info("Maximum cluster size: %d" % self.max_nodes, extra=raw)
        log.info("Minimum cluster size: %d" % self.min_nodes, extra=raw)
        log.info("Cluster growth rate: %d nodes/iteration\n" %
                 self.add_nodes_per_iteration, extra=raw)
        if self.dump_stats:
            log.info("Writing stats to file: %s" % self.stats_file)
        if self.plot_stats:
            log.info("Plotting stats to directory: %s" % self.plot_output_dir)
        while self._keep_polling:
            if not cluster.is_cluster_up():
                log.info("Waiting for all nodes to come up...")
                time.sleep(self.polling_interval)
                continue
            if self.get_stats() == -1:
                log.error("Failed to get stats. LoadBalancer is terminating")
                return
            log.info("Cluster size: %d" % len(self.stat.hosts), extra=raw)
            log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                     extra=raw)
            oldest_queued_job_age = self.stat.oldest_queued_job_age()
            if oldest_queued_job_age:
                log.info("Oldest queued job: %s" % oldest_queued_job_age,
                         extra=raw)
            log.info("Avg job duration: %d secs" %
                     self.stat.avg_job_duration(), extra=raw)
            log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                     extra=raw)
            log.info("Last cluster modification time: %s" %
                     self.__last_cluster_mod_time.strftime("%Y-%m-%d %X"),
                     extra=raw)

            self._eval_add_node()

            self._eval_remove_node()
            if self.dump_stats or self.plot_stats:
                self.stat.write_stats_to_csv(self.stats_file)

            if self.plot_stats:
                try:
                    self.visualizer.graph_all()
                except IOError, e:
                    raise exception.BaseException(str(e))

            log.info("Sleeping...(looping again in %d secs)\n" %
                     self.polling_interval)
            time.sleep(self.polling_interval)

    def has_cluster_stabilized(self):
        now = datetime.datetime.utcnow()
        elapsed = (now - self.__last_cluster_mod_time).seconds
        is_stabilized = not (elapsed < self.stabilization_time)
        if not is_stabilized:
            log.info("Cluster was modified less than %d seconds ago" %
                     self.stabilization_time)
            log.info("Waiting for cluster to stabilize...")
        return is_stabilized

627 """
628 This function uses the metrics available to it to decide whether to
629 add a new node to the cluster or not. It isn't able to add a node yet.
630 TODO: See if the recent jobs have taken more than 5 minutes (how
631 long it takes to start an instance)
632 """
        need_to_add = 0
        if len(self.stat.hosts) >= self.max_nodes:
            log.info("Not adding nodes: already at or above maximum (%d)" %
                     self.max_nodes)
            return 0
        qlen = len(self.stat.get_queued_jobs())
        sph = self.stat.slots_per_host()
        ts = self.stat.count_total_slots()
        # average duration of recently completed jobs
        avg_duration = self.stat.avg_job_duration()
        # estimated time to clear the queue with the current hosts
        ettc = avg_duration * qlen / len(self.stat.hosts)
        if qlen > ts:
            if not self.has_cluster_stabilized():
                return 0
            oldest_job_dt = self.stat.oldest_queued_job_age()
            now = self.get_remote_time()
            age_delta = now - oldest_job_dt
            if age_delta.seconds > self.longest_allowed_queue_time:
                log.info("A job has been waiting for %d sec, longer than "
                         "max %d" % (age_delta.seconds,
                                     self.longest_allowed_queue_time))
                need_to_add = qlen / sph
                if ettc < 600 and not self.stat.on_first_job():
                    log.warn("There is a possibility that the job queue is"
                             " shorter than 10 minutes in duration")
        max_add = self.max_nodes - len(self.stat.hosts)
        need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
        if need_to_add > 0:
            log.info("*** ADDING %d NODES at %s" %
                     (need_to_add, str(datetime.datetime.utcnow())))
            try:
                self._cluster.add_nodes(need_to_add)
            except Exception:
                log.error("Failed to add new host")
                log.debug(traceback.format_exc())
                return -1
            self.__last_cluster_mod_time = datetime.datetime.utcnow()
            log.info("Done adding nodes at %s" %
                     str(datetime.datetime.utcnow()))
        return need_to_add

679 """
680 This function uses the sge stats to decide whether or not to
681 remove a node from the cluster.
682 """
        qlen = len(self.stat.get_queued_jobs())
        if qlen == 0:
            if not self.has_cluster_stabilized():
                return 0
            if len(self.stat.hosts) > self.min_nodes:
                log.info("Checking to remove a node...")
                to_kill = self._find_node_for_removal()
                if not to_kill:
                    log.info("No nodes can be killed at this time")
                for n in to_kill:
                    if n.update() == "running":
                        log.info("***REMOVING NODE: %s (%s)" % (n.id,
                                                                n.dns_name))
                        try:
                            self._cluster.remove_node(n)
                        except Exception:
                            log.error("Failed to remove node %s" % n.alias)
                            log.debug(traceback.format_exc())
                            return -1
                        now = datetime.datetime.utcnow()
                        self.__last_cluster_mod_time = now
                    else:
                        log.error("Trying to kill dead node %s" % n.alias)
            else:
                log.info("Not removing nodes: already at or below minimum "
                         "(%d)" % self.min_nodes)

714 """
715 This function will find a suitable node to remove from the cluster.
716 The criteria for removal are:
717 1. The node must not be running any SGE job
718 2. The node must have been up for 50-60 minutes past its start time
719 3. The node must not be the master, or allow_master_kill=True
720 """
        nodes = self._cluster.running_nodes
        to_rem = []
        for node in nodes:
            if not self.allow_master_kill and node.is_master():
                log.debug("not removing master node")
                continue
            is_working = self.stat.is_node_working(node)
            mins_up = self._minutes_uptime(node) % 60
            if not is_working:
                log.info("Idle Node %s (%s) has been up for %d minutes "
                         "past the hour" % (node.id, node.alias, mins_up))
            if self.polling_interval > 300:
                self.kill_after = max(45,
                                      60 - (2 * self.polling_interval / 60))
            if not is_working and mins_up >= self.kill_after:
                to_rem.append(node)
        return to_rem

740 """
741 this function uses data available to boto to determine
742 how many total minutes this instance has been running. you can
743 mod (%) the return value with 60 to determine how many minutes
744 into a billable hour this node has been running.
745 """
746 dt = utils.iso_to_datetime_tuple(node.launch_time)
747 now = self.get_remote_time()
748 timedelta = now - dt
749 return timedelta.seconds / 60
750