import os
import time
import datetime
import traceback
import xml.dom.minidom

from starcluster import utils
from starcluster import static
from starcluster import exception
from starcluster.balancers import LoadBalancer
from starcluster.logger import log


SGE_STATS_DIR = os.path.join(static.STARCLUSTER_CFG_DIR, 'sge')
DEFAULT_STATS_DIR = os.path.join(SGE_STATS_DIR, '%s')
DEFAULT_STATS_FILE = os.path.join(DEFAULT_STATS_DIR, 'sge-stats.csv')


class SGEStats(object):
    """
    SunGridEngine stats parser
    """
    jobstat_cachesize = 200
    hosts = []
    jobs = []
    jobstats = jobstat_cachesize * [None]
    max_job_id = 0
    _default_fields = ["JB_job_number", "state", "JB_submission_time",
                       "queue_name", "slots", "tasks"]

    @property
    def first_job_id(self):
        if not self.jobs:
            return
        return int(self.jobs[0]['JB_job_number'])

    @property
    def last_job_id(self):
        if not self.jobs:
            return
        return int(self.jobs[-1]['JB_job_number'])

    def parse_qhost(self, string):
        """
        Parses `qhost -xml` output into a list of per-host dicts.
        Takes a string so the output of ssh.execute('qhost -xml') can be
        piped in directly.
        """
        self.hosts = []
        doc = xml.dom.minidom.parseString(string)
        for h in doc.getElementsByTagName("host"):
            name = h.getAttribute("name")
            hash = {"name": name}
            for stat in h.getElementsByTagName("hostvalue"):
                for hvalue in stat.childNodes:
                    attr = stat.attributes['name'].value
                    val = ""
                    if hvalue.nodeType == xml.dom.minidom.Node.TEXT_NODE:
                        val = hvalue.data
                    hash[attr] = val
            if hash['name'] != u'global':
                self.hosts.append(hash)
        return self.hosts

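    # Illustrative example (assumed, trimmed qhost output): an XML fragment
    # such as
    #
    #   <host name='node001'>
    #     <hostvalue name='num_proc'>2</hostvalue>
    #     <hostvalue name='load_avg'>0.01</hostvalue>
    #   </host>
    #
    # would be parsed into {'name': 'node001', 'num_proc': '2',
    # 'load_avg': '0.01'}; the synthetic 'global' host entry is skipped.
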
    def parse_qstat(self, string, fields=None):
        """
        Parses `qstat -xml` output into a list of per-job dicts.
        """
        if fields is None:
            fields = self._default_fields
        self.jobs = []
        doc = xml.dom.minidom.parseString(string)
        for job in doc.getElementsByTagName("job_list"):
            jstate = job.getAttribute("state")
            hash = {"job_state": jstate}
            for tag in fields:
                es = job.getElementsByTagName(tag)
                for node in es:
                    for node2 in node.childNodes:
                        if node2.nodeType == xml.dom.minidom.Node.TEXT_NODE:
                            hash[tag] = node2.data

            if 'tasks' in hash and hash['tasks'].find('-') > 0:
                self.job_multiply(hash)
            else:
                self.jobs.append(hash)
        return self.jobs

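    # Illustrative example (assumed field values): a pending job typically
    # parses into something like
    #   {'job_state': u'pending', 'JB_job_number': u'42', 'state': u'qw',
    #    'JB_submission_time': u'2010-07-13T16:24:03', 'slots': u'1'}
    # Jobs with a task range (e.g. tasks = '1-20:1') are expanded via
    # job_multiply() below.
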
    def job_multiply(self, hash):
        """
        Handles SGE array jobs with a task range, e.g. qsub -t 1-20:1
        creates 20 tasks. self.jobs needs to contain 20 entries for that
        job instead of just 1.
        """
        sz_range = hash['tasks']
        dashpos = sz_range.find('-')
        colpos = sz_range.find(':')
        start = int(sz_range[0:dashpos])
        fin = int(sz_range[dashpos + 1:colpos])
        gran = int(sz_range[colpos + 1:])
        log.debug("start = %d, fin = %d, granularity = %d, sz_range = %s." %
                  (start, fin, gran, sz_range))
        # the task range is inclusive, so 1-20:1 expands to 20 tasks
        num_jobs = (fin - start) / gran + 1
        log.debug("This job expands to %d tasks." % num_jobs)
        self.jobs.extend([hash] * num_jobs)

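    # For example, a 'tasks' value of '5-25:5' covers the task ids
    # 5, 10, 15, 20, 25: start=5, fin=25, gran=5, so
    # (25 - 5) / 5 + 1 = 5 copies of the job dict are appended.
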
    def qacct_to_datetime_tuple(self, qacct):
        """
        Converts the SGE qacct time format into a datetime object.
        The format is:
            Tue Jul 13 16:24:03 2010
        """
        return datetime.datetime.strptime(qacct, "%a %b %d %H:%M:%S %Y")

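    # e.g. qacct_to_datetime_tuple("Tue Jul 13 16:24:03 2010") returns
    # datetime.datetime(2010, 7, 13, 16, 24, 3).
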
    def parse_qacct(self, string, dtnow):
        """
        Parses `qacct -j` output into the jobstats cache and computes some
        per-job statistics.
        Takes the string to parse and a datetime object holding the remote
        host's current time.
        """
        job_id = None
        qd = None
        start = None
        end = None
        counter = 0
        lines = string.split('\n')
        for l in lines:
            l = l.strip()
            if l.find('jobnumber') != -1:
                job_id = int(l[13:])
            if l.find('qsub_time') != -1:
                qd = self.qacct_to_datetime_tuple(l[13:])
            if l.find('start_time') != -1:
                if l.find('-/-') > 0:
                    start = dtnow
                else:
                    start = self.qacct_to_datetime_tuple(l[13:])
            if l.find('end_time') != -1:
                if l.find('-/-') > 0:
                    end = dtnow
                else:
                    end = self.qacct_to_datetime_tuple(l[13:])
            if l.find('==========') != -1:
                if qd is not None:
                    self.max_job_id = job_id
                    hash = {'queued': qd, 'start': start, 'end': end}
                    self.jobstats[job_id % self.jobstat_cachesize] = hash
                qd = None
                start = None
                end = None
                counter = counter + 1
        log.debug("added %d new jobs." % counter)
        log.debug("There are %d items in the jobstats cache." %
                  len(self.jobstats))
        return self.jobstats

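    # The jobstats list is used as a circular cache keyed on the job id:
    # e.g. with jobstat_cachesize = 200, job 450 is stored in slot
    # 450 % 200 = 50, overwriting whatever older job occupied that slot.
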
    def is_jobstats_empty(self):
        """
        Returns True if the jobstats cache is mostly empty (the highest job
        id seen so far is below 30% of the cache size), False if there are
        enough entries in it.
        """
        return self.max_job_id < (self.jobstat_cachesize * 0.3)

    def get_running_jobs(self):
        """
        Returns a list of dicts for the jobs currently in the running state.
        """
        running = []
        for j in self.jobs:
            if j['job_state'] == u'running':
                running.append(j)
        return running

    def get_queued_jobs(self):
        """
        Returns a list of dicts for the jobs currently in the queued
        (pending) state.
        """
        queued = []
        for j in self.jobs:
            if j['job_state'] == u'pending':
                queued.append(j)
        return queued

    def count_hosts(self):
        """
        Returns the number of hosts in the cluster.
        """
        return len(self.hosts)

    def count_total_slots(self):
        """
        Returns the total number of slots available in this cluster.
        """
        slots = 0
        for h in self.hosts:
            if h['num_proc'] == '-':
                h['num_proc'] = 0
            slots = slots + int(h['num_proc'])
        return slots

    def slots_per_host(self):
        """
        Returns the number of slots per host.
        If the cluster is inconsistent (e.g. m1.large and m1.small hosts
        mixed in the same cluster), this returns -1.
        """
        total = self.count_total_slots()
        if self.hosts[0][u'num_proc'] == '-':
            self.hosts[0][u'num_proc'] = 0
        single = int(self.hosts[0][u'num_proc'])
        if total != (single * len(self.hosts)):
            log.error("ERROR: Number of slots not consistent across cluster")
            return -1
        return single

    def oldest_queued_job_age(self):
        """
        Returns the submission time (as a datetime) of the oldest job in
        the queue.
        """
        for j in self.jobs:
            if 'JB_submission_time' in j:
                st = j['JB_submission_time']
                dt = utils.iso_to_datetime_tuple(st)
                return dt

    def is_node_working(self, node):
        """
        Returns True if the node is currently working on a task, or False
        if the node is currently idle.
        """
        nodename = node.alias
        for j in self.jobs:
            if 'queue_name' in j:
                qn = j['queue_name']
                if qn.find(nodename) > 0:
                    log.debug("Node %s is working." % node.id)
                    return True
        log.debug("Node %s is IDLE." % node.id)
        return False

    def num_slots_for_job(self, job_id):
        """
        Returns the number of slots requested for the given job id, or -1
        if the job id is invalid.
        """
        ujid = unicode(job_id)
        for j in self.jobs:
            if j['JB_job_number'] == ujid:
                return int(j['slots'])
        return -1

    def avg_job_duration(self):
        count = 0
        total_seconds = 0
        for job in self.jobstats:
            if job is not None:
                delta = job['end'] - job['start']
                total_seconds = total_seconds + delta.seconds
                count = count + 1
        if count == 0:
            return 0
        else:
            return total_seconds / count

    def avg_wait_time(self):
        count = 0
        total_seconds = 0
        for job in self.jobstats:
            if job is not None:
                delta = job['start'] - job['queued']
                total_seconds = total_seconds + delta.seconds
                count = count + 1
        if count == 0:
            return 0
        else:
            return total_seconds / count

    def on_first_job(self):
        """
        Returns True if the cluster is processing the first job,
        False otherwise.
        """
        if len(self.jobs) > 0 and self.jobs[0]['JB_job_number'] == u'1':
            log.info("ON THE FIRST JOB")
            return True
        return False

    def get_loads(self):
        """
        Returns a list containing the load average of each host in the
        cluster.
        """
        loads = []
        for h in self.hosts:
            if h['load_avg'] == '-':
                h['load_avg'] = 0
            loads.append(h['load_avg'])
        return loads

    def _add(self, x, y):
        return float(x) + float(y)

    def write_stats_to_csv(self, filename):
        """
        Appends one line of important SGE stats to the CSV file `filename`.
        """
        bits = self.get_all_stats()
        try:
            f = open(filename, 'a')
            flat = ','.join(str(n) for n in bits) + '\n'
            f.write(flat)
            f.close()
        except IOError, e:
            raise exception.BaseException(str(e))



class SGELoadBalancer(LoadBalancer):
    """
    This class is able to query each SGE host and return load & queue
    statistics.

    *** All times are in SECONDS unless otherwise specified ***

    The polling interval in seconds. Recommended: 60-300. Anything more
    frequent is very wasteful; the polling loop with the visualizer takes
    about 15 seconds.
    polling_interval = 60

    VERY IMPORTANT: Set this to the max nodes you're willing to have in your
    cluster. Try setting this to the default cluster size you'd ordinarily
    use.
    max_nodes = 5

    IMPORTANT: Set this to the longest time a job can wait before another host
    is added to the cluster to help. Recommended: 300-900 seconds (5-15 mins).
    Do not use a value less than 300 seconds because that is roughly how long
    an instance takes to start up.
    longest_allowed_queue_time = 900

    Keep this at 1 (your master) for now.
    min_nodes = 1

    This would allow the master to be killed when the queue empties. UNTESTED.
    allow_master_kill = False

    How many nodes to add per iteration. Setting it > 1 opens up the
    possibility of spending too much money.
    add_nodes_per_iteration = 1

    Kill an instance only after it has been up for X minutes. Do not kill
    earlier, since you've already paid for that hour. (in minutes)
    kill_after = 45

    After adding a node, how long to wait for the instance to start new jobs.
    stabilization_time = 180

    Visualizer is off by default. Start it with "starcluster loadbalance -p tag"
    plot_stats = False

    How many hours qacct should look back to gather past job data. Lower
    values minimize data transfer.
    lookback_window = 3
    """

    def __init__(self, interval=60, max_nodes=5, wait_time=900,
                 add_pi=1, kill_after=45, stab=180, lookback_win=3,
                 min_nodes=1, allow_master_kill=False, plot_stats=False,
                 plot_output_dir=None, dump_stats=False, stats_file=None):
        self._cluster = None
        self._keep_polling = True
        self._visualizer = None
        self.__last_cluster_mod_time = datetime.datetime.utcnow()
        self.stat = None
        self.polling_interval = interval
        self.max_nodes = max_nodes
        self.longest_allowed_queue_time = wait_time
        self.add_nodes_per_iteration = add_pi
        self.kill_after = kill_after
        self.stabilization_time = stab
        self.lookback_window = lookback_win
        self.min_nodes = min_nodes
        self.allow_master_kill = allow_master_kill
        if self.longest_allowed_queue_time < 300:
            log.warn("The recommended wait_time should be >= 300 seconds "
                     "(it takes ~5 min to launch a new EC2 node)")
        self.dump_stats = dump_stats
        self.stats_file = stats_file
        self.plot_stats = plot_stats
        self.plot_output_dir = plot_output_dir
        if plot_stats:
            assert self.visualizer is not None

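    # A minimal usage sketch (illustrative, not executed here): assuming
    # 'cluster' is an already-started StarCluster cluster object, a balancer
    # polling every 60 seconds and growing to at most 10 nodes could be
    # driven like:
    #
    #     lb = SGELoadBalancer(interval=60, max_nodes=10, wait_time=900)
    #     lb.run(cluster)
    #
    # run() blocks, polling SGE on the master until interrupted.
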
    @property
    def visualizer(self):
        if not self._visualizer:
            try:
                from starcluster.balancers.sge import visualizer
            except ImportError, e:
                log.error("Error importing visualizer:")
                log.error(str(e))
                log.error("check that matplotlib and numpy are installed and:")
                log.error("   $ python -c 'import matplotlib'")
                log.error("   $ python -c 'import numpy'")
                log.error("completes without error")
                raise exception.BaseException(
                    "Failed to load stats visualizer")
            self._visualizer = visualizer.SGEVisualizer(self.stats_file,
                                                        self.plot_output_dir)
        else:
            self._visualizer.stats_file = self.stats_file
            self._visualizer.pngpath = self.plot_output_dir
        return self._visualizer

    def _validate_dir(self, dirname, msg_prefix=""):
        if not os.path.isdir(dirname):
            msg = "'%s' is not a directory"
            if not os.path.exists(dirname):
                msg = "'%s' does not exist"
            if msg_prefix:
                msg = ' '.join([msg_prefix, msg])
            msg = msg % dirname
            raise exception.BaseException(msg)

    def _mkdir(self, directory, makedirs=False):
        if not os.path.isdir(directory):
            try:
                if makedirs:
                    os.makedirs(directory)
                    log.info("Created directories %s" % directory)
                else:
                    os.mkdir(directory)
                    log.info("Created single directory %s" % directory)
            except IOError, e:
                raise exception.BaseException(str(e))

    def get_remote_time(self):
        """
        Remotely executes 'date' on the master node and returns a datetime
        object with the master's time, instead of using the local machine's
        clock, which may be inaccurate.
        """
        cl = self._cluster
        str = '\n'.join(cl.master_node.ssh.execute('date'))
        return datetime.datetime.strptime(str, "%a %b %d %H:%M:%S UTC %Y")

    def get_qatime(self, now):
        """
        Takes the lookback window and creates a string representation of
        the past few hours, to feed to qacct and limit the data set qacct
        returns.
        """
        if self.stat.is_jobstats_empty():
            log.info("Jobstats cache is not full. Pulling full job history.")
            temp_lookback_window = self.lookback_window * 60 * 60
        else:
            temp_lookback_window = self.polling_interval
        log.debug("getting past %d seconds worth of job history." %
                  temp_lookback_window)
        now = now - datetime.timedelta(seconds=temp_lookback_window + 1)
        str = now.strftime("%Y%m%d%H%M")
        return str

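    # For example, with lookback_window = 3 and an empty jobstats cache,
    # qacct is asked for the past 3 * 60 * 60 = 10800 seconds of history;
    # once the cache has data, only the last polling_interval seconds
    # (e.g. 60) are requested. The timestamp is formatted as YYYYMMDDhhmm
    # for qacct's -b option.
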
    def get_stats(self):
        """
        SSHes to the SGE master and gets load & queue stats, feeding them to
        SGEStats, which parses the XML. This populates self.stat with a list
        of hosts (each host a dict of host information) and a list of jobs
        (each job a dict of statistics such as name, priority, etc).
        """
        log.debug("starting get_stats")
        master = self._cluster.master_node
        self.stat = SGEStats()

        qhostxml = ""
        qstatxml = ""
        qacct = ""
        try:
            now = self.get_remote_time()
            qatime = self.get_qatime(now)
            qacct_cmd = 'source /etc/profile && qacct -j -b ' + qatime
            qstat_cmd = 'source /etc/profile && qstat -q all.q -u \"*\" -xml'
            qhostxml = '\n'.join(master.ssh.execute(
                'source /etc/profile && qhost -xml', log_output=True))
            qstatxml = '\n'.join(master.ssh.execute(qstat_cmd,
                                                    log_output=True))
            qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=True,
                                                 ignore_exit_status=True))
        except Exception, e:
            log.error("Error occurred getting SGE stats via ssh. "
                      "Cluster terminated?")
            log.error(e)
            return -1
        log.debug("sizes: qhost: %d, qstat: %d, qacct: %d." %
                  (len(qhostxml), len(qstatxml), len(qacct)))
        self.stat.parse_qhost(qhostxml)
        self.stat.parse_qstat(qstatxml)
        self.stat.parse_qacct(qacct, now)

    def run(self, cluster):
        """
        This method loops indefinitely, using SGELoadBalancer.get_stats()
        to get the cluster's status. It looks at the job queue and tries to
        decide whether to add or remove a node. It should later also look
        at job durations (currently it does not).
        """
        self._cluster = cluster
        use_default_stats_file = self.dump_stats and not self.stats_file
        use_default_plots_dir = self.plot_stats and not self.plot_output_dir
        if use_default_stats_file or use_default_plots_dir:
            self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True)
        if not self.stats_file:
            self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
        if not self.plot_output_dir:
            self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
        if not cluster.is_cluster_up():
            raise exception.ClusterNotRunning(cluster.cluster_tag)
        if self.dump_stats:
            if os.path.isdir(self.stats_file):
                raise exception.BaseException("stats file destination '%s' is"
                                              " a directory" % self.stats_file)
            sfdir = os.path.dirname(os.path.abspath(self.stats_file))
            self._validate_dir(sfdir, msg_prefix="stats file destination")
        if self.plot_stats:
            if os.path.isfile(self.plot_output_dir):
                raise exception.BaseException("plot output destination '%s' "
                                              "is a file" %
                                              self.plot_output_dir)
            self._validate_dir(self.plot_output_dir,
                               msg_prefix="plot output destination")
        if self.dump_stats:
            log.info("Writing stats to file: %s" % self.stats_file)
        if self.plot_stats:
            log.info("Plotting stats to directory: %s" % self.plot_output_dir)
        while self._keep_polling:
            if not cluster.is_cluster_up():
                log.info("Entire cluster is not up, nodes added/removed. "
                         "No Action.")
                time.sleep(self.polling_interval)
                continue
            if self.get_stats() == -1:
                log.error("Failed to get stats. LoadBalancer is terminating.")
                return
            log.info("Oldest job is from %s. # queued jobs = %d. # hosts = %d."
                     % (self.stat.oldest_queued_job_age(),
                        len(self.stat.get_queued_jobs()),
                        len(self.stat.hosts)))
            log.info("Avg job duration = %d sec, Avg wait time = %d sec." %
                     (self.stat.avg_job_duration(), self.stat.avg_wait_time()))

            self._eval_add_node()

            self._eval_remove_node()
            if self.dump_stats or self.plot_stats:
                self.stat.write_stats_to_csv(self.stats_file)

            if self.plot_stats:
                try:
                    self.visualizer.graph_all()
                except IOError, e:
                    raise exception.BaseException(str(e))

            log.info("Sleeping, looping again in %d seconds.\n" %
                     self.polling_interval)
            time.sleep(self.polling_interval)

    def _eval_add_node(self):
        """
        This method uses the metrics available to it to decide whether to
        add a new node to the cluster or not.
        TODO: See if the recent jobs have taken more than 5 minutes (how
        long it takes to start an instance).
        """
        need_to_add = 0
        if len(self.stat.hosts) >= self.max_nodes:
            log.info("Won't add another host, currently at max (%d)." %
                     self.max_nodes)
            return 0
        qlen = len(self.stat.get_queued_jobs())
        sph = self.stat.slots_per_host()
        ts = self.stat.count_total_slots()

        avg_duration = self.stat.avg_job_duration()

        ettc = avg_duration * qlen / len(self.stat.hosts)
        if qlen > ts:
            now = datetime.datetime.utcnow()
            mod_delta = (now - self.__last_cluster_mod_time).seconds
            if mod_delta < self.stabilization_time:
                log.info("Cluster change made less than %d seconds ago (%s)." %
                         (self.stabilization_time,
                          self.__last_cluster_mod_time))
                log.info("Not changing cluster size until cluster stabilizes.")
                return 0

            oldest_job_dt = self.stat.oldest_queued_job_age()
            now = self.get_remote_time()
            age_delta = now - oldest_job_dt
            if age_delta.seconds > self.longest_allowed_queue_time:
                log.info("A job has been waiting for %d sec, longer than "
                         "max %d." % (age_delta.seconds,
                                      self.longest_allowed_queue_time))
                need_to_add = qlen / sph
                if ettc < 600 and not self.stat.on_first_job():
                    log.warn("There is a possibility that the job queue is"
                             " shorter than 10 minutes in duration.")

        max_add = self.max_nodes - len(self.stat.hosts)
        need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
        if need_to_add > 0:
            log.info("*** ADDING %d NODES at %s." %
                     (need_to_add, str(datetime.datetime.utcnow())))
            try:
                self._cluster.add_nodes(need_to_add)
            except Exception:
                log.error("Failed to add new host.")
                log.debug(traceback.format_exc())
                return -1
            self.__last_cluster_mod_time = datetime.datetime.utcnow()
            log.info("Done adding nodes at %s." %
                     str(datetime.datetime.utcnow()))
        return need_to_add

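    # Illustrative numbers (assumed): with 20 queued jobs, 4 slots per host
    # and 8 total slots, qlen > ts holds, and once the oldest job has waited
    # longer than longest_allowed_queue_time the balancer wants
    # qlen / sph = 20 / 4 = 5 nodes; with add_nodes_per_iteration = 1 and
    # room below max_nodes, min(1, 5, max_add) = 1 node is actually added.
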
    def _eval_remove_node(self):
        """
        This method uses the SGE stats to decide whether or not to remove
        a node from the cluster.
        """
        qlen = len(self.stat.get_queued_jobs())
        if qlen == 0:
            now = datetime.datetime.utcnow()
            elapsed = (now - self.__last_cluster_mod_time).seconds
            if elapsed < self.stabilization_time:
                log.info("Cluster change made less than %d seconds ago (%s)." %
                         (self.stabilization_time,
                          self.__last_cluster_mod_time))
                log.info("Not changing cluster size until cluster stabilizes.")
                return 0

            if len(self.stat.hosts) > self.min_nodes:
                log.info("Checking to remove a node...")
                to_kill = self._find_node_for_removal()
                if not to_kill:
                    log.info("No nodes can be killed at this time.")

                for n in to_kill:
                    if n.update() == "running":
                        log.info("***KILLING NODE: %s (%s)." % (n.id,
                                                                n.dns_name))
                        try:
                            self._cluster.remove_node(n)
                        except Exception:
                            log.error("Failed to terminate node %s" % n.alias)
                            log.debug(traceback.format_exc())
                            return -1

                        now = datetime.datetime.utcnow()
                        self.__last_cluster_mod_time = now
                    else:
                        log.error("Trying to kill dead node %s" % n.alias)
            else:
                log.info("Can't remove a node, already at min (%d)." %
                         self.min_nodes)

    def _find_node_for_removal(self):
        """
        Finds suitable nodes to remove from the cluster.
        The criteria for removal are:
        1. The node must not be running any SGE job
        2. The node must have been up for 50-60 minutes past its start time
        3. The node must not be the master, or allow_master_kill must be True
        """
        nodes = self._cluster.running_nodes
        to_rem = []
        for node in nodes:
            if not self.allow_master_kill and node.is_master():
                log.debug("not removing master node")
                continue
            is_working = self.stat.is_node_working(node)
            mins_up = self._minutes_uptime(node) % 60
            if not is_working:
                log.info("Idle Node %s (%s) has been up for %d minutes "
                         "past the hour." % (node.id, node.alias, mins_up))
            if self.polling_interval > 300:
                self.kill_after = max(45,
                                      60 - (2 * self.polling_interval / 60))
            if not is_working and mins_up >= self.kill_after:
                to_rem.append(node)
        return to_rem

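    # Worked example (assumed timings): a node launched 112 minutes ago is
    # 112 % 60 = 52 minutes into its current billable hour; if it is idle
    # and 52 >= kill_after (default 45), it becomes a removal candidate.
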
    def _minutes_uptime(self, node):
        """
        Uses data available to boto to determine how many total minutes
        this instance has been running. You can mod (%) the return value
        by 60 to determine how many minutes into a billable hour this node
        has been running.
        """
        dt = utils.iso_to_datetime_tuple(node.launch_time)
        now = self.get_remote_time()
        timedelta = now - dt
        return timedelta.seconds / 60