import os
import time
import datetime
import traceback
import xml.dom.minidom

from starcluster import utils
from starcluster import static
from starcluster import exception
from starcluster.balancers import LoadBalancer
from starcluster.logger import log


SGE_STATS_DIR = os.path.join(static.STARCLUSTER_CFG_DIR, 'sge')
DEFAULT_STATS_DIR = os.path.join(SGE_STATS_DIR, '%s')
DEFAULT_STATS_FILE = os.path.join(DEFAULT_STATS_DIR, 'sge-stats.csv')
20 """
21 SunGridEngine stats parser
22 """
23 jobstat_cachesize = 200
24 hosts = []
25 jobs = []
26 jobstats = jobstat_cachesize * [None]
27 max_job_id = 0
28 _default_fields = ["JB_job_number", "state", "JB_submission_time",
29 "queue_name", "slots", "tasks"]
30
    @property
    def first_job_id(self):
        if not self.jobs:
            return
        return int(self.jobs[0]['JB_job_number'])

    @property
    def last_job_id(self):
        if not self.jobs:
            return
        return int(self.jobs[-1]['JB_job_number'])

44 """
45 this function parses qhost -xml output and makes a neat array
46 takes in a string, so we can pipe in output from ssh.exec('qhost -xml')
47 """
48 self.hosts = []
49 doc = xml.dom.minidom.parseString(string)
50 for h in doc.getElementsByTagName("host"):
51 name = h.getAttribute("name")
52 hash = {"name": name}
53 for stat in h.getElementsByTagName("hostvalue"):
54 for hvalue in stat.childNodes:
55 attr = stat.attributes['name'].value
56 val = ""
57 if hvalue.nodeType == xml.dom.minidom.Node.TEXT_NODE:
58 val = hvalue.data
59 hash[attr] = val
60 if hash['name'] != u'global':
61 self.hosts.append(hash)
62 return self.hosts
63
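    # Illustrative example (values assumed, not taken from real output): a
    # <host name="node001"> element whose <hostvalue name="num_proc"> text is
    # "2" ends up in self.hosts as {'name': u'node001', 'num_proc': u'2', ...};
    # the synthetic 'global' host that qhost always reports is skipped.
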
65 """
66 This method parses qstat -xml output and makes a neat array
67 """
68 if fields == None:
69 fields = self._default_fields
70 self.jobs = []
71 doc = xml.dom.minidom.parseString(string)
72 for job in doc.getElementsByTagName("job_list"):
73 jstate = job.getAttribute("state")
74 hash = {"job_state": jstate}
75 for tag in fields:
76 es = job.getElementsByTagName(tag)
77 for node in es:
78 for node2 in node.childNodes:
79 if node2.nodeType == xml.dom.minidom.Node.TEXT_NODE:
80 hash[tag] = node2.data
81
82 if 'tasks' in hash and hash['tasks'].find('-') > 0:
83 self.job_multiply(hash)
84 else:
85 self.jobs.append(hash)
86 return self.jobs
87
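    # Illustrative example (values assumed): parse_qstat() yields one entry per
    # job (or per task, via job_multiply()), e.g.
    # {'job_state': u'running', 'JB_job_number': u'1', 'slots': u'1',
    #  'queue_name': u'all.q@node001', ...}.
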
89 """
90 This function deals with sge jobs with a task range. For example,
91 'qsub -t 1-20:1' makes 20 jobs. self.jobs needs to represent that it is
92 20 jobs instead of just 1.
93 """
94 sz_range = hash['tasks']
95 dashpos = sz_range.find('-')
96 colpos = sz_range.find(':')
97 start = int(sz_range[0:dashpos])
98 fin = int(sz_range[dashpos + 1:colpos])
99 gran = int(sz_range[colpos + 1:len(sz_range)])
100 log.debug("start = %d, fin = %d, granularity = %d, sz_range = %s" %
101 (start, fin, gran, sz_range))
102 num_jobs = (fin - start) / gran
103 log.debug("This job expands to %d tasks" % num_jobs)
104 self.jobs.extend([hash] * num_jobs)
105
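    # For example (illustrative): a job submitted with 'qsub -t 1-20:1'
    # arrives with hash['tasks'] == u'1-20:1' and job_multiply() appends the
    # same hash to self.jobs 20 times.
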
107 """
108 Takes the SGE qacct formatted time and makes a datetime tuple
109 format is:
110 Tue Jul 13 16:24:03 2010
111 """
112 return datetime.datetime.strptime(qacct, "%a %b %d %H:%M:%S %Y")
113
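    # For example: qacct_to_datetime_tuple('Tue Jul 13 16:24:03 2010')
    # returns datetime.datetime(2010, 7, 13, 16, 24, 3).
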
115 """
116 This method parses qacct -j output and makes a neat array and
117 calculates some statistics.
118 Takes the string to parse, and a datetime object of the remote
119 host's current time.
120 """
121 job_id = None
122 qd = None
123 start = None
124 end = None
125 counter = 0
126 lines = string.split('\n')
127 for l in lines:
128 l = l.strip()
129 if l.find('jobnumber') != -1:
130 job_id = int(l[13:len(l)])
131 if l.find('qsub_time') != -1:
132 qd = self.qacct_to_datetime_tuple(l[13:len(l)])
133 if l.find('start_time') != -1:
134 if l.find('-/-') > 0:
135 start = dtnow
136 else:
137 start = self.qacct_to_datetime_tuple(l[13:len(l)])
138 if l.find('end_time') != -1:
139 if l.find('-/-') > 0:
140 end = dtnow
141 else:
142 end = self.qacct_to_datetime_tuple(l[13:len(l)])
143 if l.find('==========') != -1:
144 if qd != None:
145 self.max_job_id = job_id
146 hash = {'queued': qd, 'start': start, 'end': end}
147 self.jobstats[job_id % self.jobstat_cachesize] = hash
148 qd = None
149 start = None
150 end = None
151 counter = counter + 1
152 log.debug("added %d new jobs" % counter)
153 log.debug("There are %d items in the jobstats cache" %
154 len(self.jobstats))
155 return self.jobstats
156
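    # Note: jobstats is a fixed-size ring buffer indexed by
    # job_id % jobstat_cachesize, so only the most recent ~200 finished jobs
    # influence avg_job_duration() and avg_wait_time().
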
158 """
159 This function will return True if half of the queue is empty, False if
160 there are enough entries in it.
161 """
162 return self.max_job_id < (self.jobstat_cachesize * 0.3)
163
165 """
166 returns an array of the running jobs, values stored in dictionary
167 """
168 running = []
169 for j in self.jobs:
170 if j['job_state'] == u'running':
171 running.append(j)
172 return running
173
175 """
176 returns an array of the queued jobs, values stored in dictionary
177 """
178 queued = []
179 for j in self.jobs:
180 if j['job_state'] == u'pending':
181 queued.append(j)
182 return queued
183
185 """
186 returns a count of the hosts in the cluster
187 """
188
189 return len(self.hosts)
190
192 """
193 returns a count of total slots available in this cluser
194 """
195 slots = 0
196 for h in self.hosts:
197 if h['num_proc'] == '-':
198 h['num_proc'] = 0
199 slots = slots + int(h['num_proc'])
200 return slots
201
203 """
204 returns the number of slots per host.
205 If for some reason the cluster is inconsistent, this will return -1
206 for example, if you have m1.large and m1.small in the same cluster
207 """
208 total = self.count_total_slots()
209 if self.hosts[0][u'num_proc'] == '-':
210 self.hosts[0][u'num_proc'] = 0
211 single = int(self.hosts[0][u'num_proc'])
212 if (total != (single * len(self.hosts))):
213 log.error("ERROR: Number of slots not consistent across cluster")
214 return -1
215 return single
216
218 """
219 This returns the age of the oldest job in the queue
220 """
221 for j in self.jobs:
222 if 'JB_submission_time' in j:
223 st = j['JB_submission_time']
224 dt = utils.iso_to_datetime_tuple(st)
225 return dt
226
227
229 """
230 This function returns true if the node is currently working on a task,
231 or false if the node is currently idle.
232 """
233 nodename = node.alias
234 for j in self.jobs:
235 if 'queue_name' in j:
236 qn = j['queue_name']
237 if qn.find(nodename) > 0:
238 log.debug("Node %s is working" % node.id)
239 return True
240 log.debug("Node %s is IDLE" % node.id)
241 return False
242
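    # Note: SGE queue instance names look like 'all.q@node001', so matching
    # node.alias against each job's queue_name identifies the jobs currently
    # running on that node.
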
244 """
245 returns the number of slots requested for the given job id
246 returns -1 if job_id is invalid
247 """
248 ujid = unicode(job_id)
249 for j in self.jobs:
250 if j['JB_job_number'] == ujid:
251 return int(j['slots'])
252 return -1
253
    def avg_job_duration(self):
        """
        Returns the average run time in seconds of the jobs in the jobstats
        cache.
        """
        count = 0
        total_seconds = 0
        for job in self.jobstats:
            if job is not None:
                delta = job['end'] - job['start']
                total_seconds = total_seconds + delta.seconds
                count = count + 1
        if count == 0:
            return 0
        else:
            return total_seconds / count

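    # Note: only the seconds component of each timedelta is summed above, so
    # jobs that span more than a day are undercounted (timedelta.days is
    # ignored).
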
    def avg_wait_time(self):
        """
        Returns the average queue wait time in seconds of the jobs in the
        jobstats cache.
        """
        count = 0
        total_seconds = 0
        for job in self.jobstats:
            if job is not None:
                delta = job['start'] - job['queued']
                total_seconds = total_seconds + delta.seconds
                count = count + 1
        if count == 0:
            return 0
        else:
            return total_seconds / count

281 """
282 returns true if the cluster is processing the first job,
283 False otherwise
284 """
285 if len(self.jobs) > 0 and self.jobs[0]['JB_job_number'] != u'1':
286 log.info("ON THE FIRST JOB")
287 return True
288 return False
289
291 """
292 returns an array containing the loads on each host in cluster
293 """
294 loads = []
295 for h in self.hosts:
296 if h['load_avg'] == '-':
297 h['load_avg'] = 0
298 loads.append(h['load_avg'])
299 return loads
300
    def _add(self, x, y):
        return float(x) + float(y)

    def write_stats_to_csv(self, filename):
        """
        Write important SGE stats to a CSV file.
        Appends one line to the CSV.
        """
        bits = self.get_all_stats()
        try:
            f = open(filename, 'a')
            flat = ','.join(str(n) for n in bits) + '\n'
            f.write(flat)
            f.close()
        except IOError, e:
            raise exception.BaseException(str(e))
344 """
345 This class is able to query each SGE host and return load & queue
346 statistics
347
348 *** All times are in SECONDS unless otherwise specified ***
349
350 The polling interval in seconds. recommended: 60-300. any more frequent is
351 very wasteful. the polling loop with visualizer takes about 15 seconds.
352 polling_interval = 60
353
354 VERY IMPORTANT: Set this to the max nodes you're willing to have in your
355 cluster. Try setting this to the default cluster size you'd ordinarily
356 use.
357 max_nodes = 5
358
359 IMPORTANT: Set this to the longest time a job can wait before another host
360 is added to the cluster to help. Recommended: 300-900 seconds (5-15 mins).
361 Do not use a value less than 300 seconds because that is how long an
362 instance will take to start up.
363 longest_allowed_queue_time = 900
364
365 Keep this at 1 - your master, for now.
366 min_nodes = 1
367
368 This would allow the master to be killed when the queue empties. UNTESTED.
369 allow_master_kill = False
370
371 How many nodes to add per iteration. Setting it > 1 opens up possibility
372 of spending too much $$
373 add_nodes_per_iteration = 1
374
375 Kill an instance after it has been up for X minutes. Do not kill earlier,
376 since you've already paid for that hour. (in mins)
377 kill_after = 45
378
379 After adding a node, how long to wait for the instance to start new jobs
380 stabilization_time = 180
381
382 Visualizer off by default. Start it with "starcluster loadbalance -p tag"
383 plot_stats = False
384
385 How many hours qacct should look back to gather past job data. lower
386 values minimize data transfer
387 lookback_window = 3
388 """
389
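    # Illustrative usage (assuming `cluster` is a started starcluster
    # Cluster object):
    #
    #     lb = SGELoadBalancer(interval=60, max_nodes=10, wait_time=900)
    #     lb.run(cluster)
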
    def __init__(self, interval=60, max_nodes=None, wait_time=900,
                 add_pi=1, kill_after=45, stab=180, lookback_win=3,
                 min_nodes=1, allow_master_kill=False, plot_stats=False,
                 plot_output_dir=None, dump_stats=False, stats_file=None):
        self._cluster = None
        self._keep_polling = True
        self._visualizer = None
        self.__last_cluster_mod_time = datetime.datetime.utcnow()
        self.stat = None
        self.polling_interval = interval
        self.max_nodes = max_nodes
        self.longest_allowed_queue_time = wait_time
        self.add_nodes_per_iteration = add_pi
        self.kill_after = kill_after
        self.stabilization_time = stab
        self.lookback_window = lookback_win
        self.min_nodes = min_nodes
        self.allow_master_kill = allow_master_kill
        if self.longest_allowed_queue_time < 300:
            log.warn("The recommended wait_time should be >= 300 seconds "
                     "(it takes ~5 min to launch a new EC2 node)")
        self.dump_stats = dump_stats
        self.stats_file = stats_file
        self.plot_stats = plot_stats
        self.plot_output_dir = plot_output_dir
        if plot_stats:
            assert self.visualizer is not None

    @property
    def visualizer(self):
        if not self._visualizer:
            try:
                from starcluster.balancers.sge import visualizer
            except ImportError, e:
                log.error("Error importing visualizer:")
                log.error(str(e))
                log.error("check that matplotlib and numpy are installed and:")
                log.error("   $ python -c 'import matplotlib'")
                log.error("   $ python -c 'import numpy'")
                log.error("completes without error")
                raise exception.BaseException(
                    "Failed to load stats visualizer")
            self._visualizer = visualizer.SGEVisualizer(self.stats_file,
                                                        self.plot_output_dir)
        else:
            self._visualizer.stats_file = self.stats_file
            self._visualizer.pngpath = self.plot_output_dir
        return self._visualizer

    def _validate_dir(self, dirname, msg_prefix=None):
        if not os.path.isdir(dirname):
            msg = "'%s' is not a directory"
            if not os.path.exists(dirname):
                msg = "'%s' does not exist"
            if msg_prefix:
                msg = ' '.join([msg_prefix, msg])
            msg = msg % dirname
            raise exception.BaseException(msg)

    def _mkdir(self, directory, makedirs=False):
        if not os.path.isdir(directory):
            try:
                # create the directory (and any missing parents when
                # makedirs=True)
                if makedirs:
                    os.makedirs(directory)
                else:
                    os.mkdir(directory)
            except OSError, e:
                raise exception.BaseException(str(e))

464 """
465 this function remotely executes 'date' on the master node
466 and returns a datetime object with the master's time
467 instead of fetching it from local machine, maybe inaccurate.
468 """
469 cl = self._cluster
470 str = '\n'.join(cl.master_node.ssh.execute('date'))
471 return datetime.datetime.strptime(str, "%a %b %d %H:%M:%S UTC %Y")
472
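    # The master's `date` output is expected to look like
    # 'Tue Jul 13 16:24:03 UTC 2010' to match the strptime format above.
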
474 """
475 this function takes the lookback window and creates a string
476 representation of the past few hours, to feed to qacct to
477 limit the data set qacct returns.
478 """
479 if self.stat.is_jobstats_empty():
480 log.info("Loading full job history")
481 temp_lookback_window = self.lookback_window * 60 * 60
482 else:
483 temp_lookback_window = self.polling_interval
484 log.debug("getting past %d seconds worth of job history" %
485 temp_lookback_window)
486 now = now - datetime.timedelta(seconds=temp_lookback_window + 1)
487 str = now.strftime("%Y%m%d%H%M")
488 return str
489
490
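    # For example (illustrative): with the full 3-hour lookback and a remote
    # time of 2010-07-13 16:24:00 UTC, get_qatime() returns '201007131323',
    # which is passed to 'qacct -j -b' below.
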
492 """
493 this function will ssh to the SGE master and get load & queue stats.
494 it will feed these stats to SGEStats, which parses the XML.
495 it will return two arrays: one of hosts, each host has a hash with its
496 host information inside. The job array contains a hash for every job,
497 containing statistics about the job name, priority, etc.
498 """
499 log.debug("starting get_stats")
500 master = self._cluster.master_node
501 self.stat = SGEStats()
502
503 qhostxml = ""
504 qstatxml = ""
505 qacct = ""
506 try:
507 now = self.get_remote_time()
508 qatime = self.get_qatime(now)
509 qacct_cmd = 'qacct -j -b ' + qatime
510 qstat_cmd = 'qstat -q all.q -u \"*\" -xml'
511 qhostxml = '\n'.join(master.ssh.execute('qhost -xml',
512 log_output=True,
513 source_profile=True))
514 qstatxml = '\n'.join(master.ssh.execute(qstat_cmd,
515 log_output=True,
516 source_profile=True))
517 qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=True,
518 ignore_exit_status=True,
519 source_profile=True))
520 except Exception, e:
521 log.error("Error occurred getting SGE stats via ssh. "
522 "Cluster terminated?")
523 log.error(e)
524 return -1
525 log.debug("sizes: qhost: %d, qstat: %d, qacct: %d" %
526 (len(qhostxml), len(qstatxml), len(qacct)))
527 self.stat.parse_qhost(qhostxml)
528 self.stat.parse_qstat(qstatxml)
529 self.stat.parse_qacct(qacct, now)
530
    def run(self, cluster):
        """
        Loops indefinitely, using SGELoadBalancer.get_stats() to get the
        cluster's status. It looks at the job queue and tries to decide
        whether to add or remove a node. It should later look at job
        durations (currently doesn't).
        """
        self._cluster = cluster
        if self.max_nodes is None:
            self.max_nodes = cluster.cluster_size
        use_default_stats_file = self.dump_stats and not self.stats_file
        use_default_plots_dir = self.plot_stats and not self.plot_output_dir
        if use_default_stats_file or use_default_plots_dir:
            self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag,
                        makedirs=True)
        if not self.stats_file:
            self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
        if not self.plot_output_dir:
            self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
        if not cluster.is_cluster_up():
            raise exception.ClusterNotRunning(cluster.cluster_tag)
        if self.dump_stats:
            if os.path.isdir(self.stats_file):
                raise exception.BaseException("stats file destination '%s' is"
                                              " a directory" %
                                              self.stats_file)
            sfdir = os.path.dirname(os.path.abspath(self.stats_file))
            self._validate_dir(sfdir, msg_prefix="stats file destination")
        if self.plot_stats:
            if os.path.isfile(self.plot_output_dir):
                raise exception.BaseException("plot output destination '%s' "
                                              "is a file" %
                                              self.plot_output_dir)
            self._validate_dir(self.plot_output_dir,
                               msg_prefix="plot output destination")
        raw = dict(__raw__=True)
        log.info("Starting load balancer...\n")
        log.info("Maximum cluster size: %d" % self.max_nodes,
                 extra=raw)
        log.info("Minimum cluster size: %d" % self.min_nodes,
                 extra=raw)
        log.info("Cluster growth rate: %d nodes/iteration\n" %
                 self.add_nodes_per_iteration, extra=raw)
        if self.dump_stats:
            log.info("Writing stats to file: %s" % self.stats_file)
        if self.plot_stats:
            log.info("Plotting stats to directory: %s" % self.plot_output_dir)
        while self._keep_polling:
            if not cluster.is_cluster_up():
                log.info("Waiting for all nodes to come up...")
                time.sleep(self.polling_interval)
                continue
            if self.get_stats() == -1:
                log.error("Failed to get stats. LoadBalancer is terminating")
                return
            log.info("Cluster size: %d" % len(self.stat.hosts), extra=raw)
            log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                     extra=raw)
            oldest_queued_job_age = self.stat.oldest_queued_job_age()
            if oldest_queued_job_age:
                log.info("Oldest queued job: %s" % oldest_queued_job_age,
                         extra=raw)
            log.info("Avg job duration: %d secs" %
                     self.stat.avg_job_duration(), extra=raw)
            log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                     extra=raw)
            log.info("Last cluster modification time: %s" %
                     self.__last_cluster_mod_time.strftime("%Y-%m-%d %X"),
                     extra=raw)
            # evaluate whether nodes need to be added
            self._eval_add_node()
            # evaluate whether nodes need to be removed
            self._eval_remove_node()
            if self.dump_stats or self.plot_stats:
                self.stat.write_stats_to_csv(self.stats_file)
            # redraw the plots if the visualizer is enabled
            if self.plot_stats:
                try:
                    self.visualizer.graph_all()
                except IOError, e:
                    raise exception.BaseException(str(e))
            # sleep for the specified number of seconds
            log.info("Sleeping...(looping again in %d secs)\n" %
                     self.polling_interval)
            time.sleep(self.polling_interval)

    def has_cluster_stabilized(self):
        now = datetime.datetime.utcnow()
        elapsed = (now - self.__last_cluster_mod_time).seconds
        is_stabilized = not (elapsed < self.stabilization_time)
        if not is_stabilized:
            log.info("Cluster was modified less than %d seconds ago" %
                     self.stabilization_time)
            log.info("Waiting for cluster to stabilize...")
        return is_stabilized

626 """
627 This function uses the metrics available to it to decide whether to
628 add a new node to the cluster or not. It isn't able to add a node yet.
629 TODO: See if the recent jobs have taken more than 5 minutes (how
630 long it takes to start an instance)
631 """
632 need_to_add = 0
633 if len(self.stat.hosts) >= self.max_nodes:
634 log.info("Not adding nodes: already at or above maximum (%d)" %
635 self.max_nodes)
636 return 0
637 qlen = len(self.stat.get_queued_jobs())
638 sph = self.stat.slots_per_host()
639 ts = self.stat.count_total_slots()
640
641 avg_duration = self.stat.avg_job_duration()
642
643 ettc = avg_duration * qlen / len(self.stat.hosts)
644 if qlen > ts:
645 if not self.has_cluster_stabilized():
646 return 0
647
648
649 oldest_job_dt = self.stat.oldest_queued_job_age()
650 now = self.get_remote_time()
651 age_delta = now - oldest_job_dt
652 if age_delta.seconds > self.longest_allowed_queue_time:
653 log.info("A job has been waiting for %d sec, longer than "
654 "max %d" % (age_delta.seconds,
655 self.longest_allowed_queue_time))
656 need_to_add = qlen / sph
657 if ettc < 600 and not self.stat.on_first_job():
658 log.warn("There is a possibility that the job queue is"
659 " shorter than 10 minutes in duration")
660
661 max_add = self.max_nodes - len(self.stat.hosts)
662 need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
663 if need_to_add > 0:
664 log.info("*** ADDING %d NODES at %s" %
665 (need_to_add, str(datetime.datetime.utcnow())))
666 try:
667 self._cluster.add_nodes(need_to_add)
668 except Exception:
669 log.error("Failed to add new host")
670 log.debug(traceback.format_exc())
671 return -1
672 self.__last_cluster_mod_time = datetime.datetime.utcnow()
673 log.info("Done adding nodes at %s" %
674 str(datetime.datetime.utcnow()))
675 return need_to_add
676
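    # Worked example (illustrative): with 8 queued jobs, 2 slots per host and
    # a job waiting longer than longest_allowed_queue_time, need_to_add is
    # 8 / 2 = 4, which is then capped by add_nodes_per_iteration (1 by
    # default) and by the number of hosts still below max_nodes.
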
678 """
679 This function uses the sge stats to decide whether or not to
680 remove a node from the cluster.
681 """
682 qlen = len(self.stat.get_queued_jobs())
683 if qlen == 0:
684 if not self.has_cluster_stabilized():
685 return 0
686
687 if len(self.stat.hosts) > self.min_nodes:
688 log.info("Checking to remove a node...")
689 to_kill = self._find_node_for_removal()
690 if not to_kill:
691 log.info("No nodes can be killed at this time")
692
693 for n in to_kill:
694 if n.update() == "running":
695 log.info("***REMOVING NODE: %s (%s)" % (n.id,
696 n.dns_name))
697 try:
698 self._cluster.remove_node(n)
699 except Exception:
700 log.error("Failed to remove node %s" % n.alias)
701 log.debug(traceback.format_exc())
702 return -1
703
704 now = datetime.datetime.utcnow()
705 self.__last_cluster_mod_time = now
706 else:
707 log.error("Trying to kill dead node %s" % n.alias)
708 else:
709 log.info("Not removing nodes: already at or below minimum (%d)"
710 % self.min_nodes)
711
713 """
714 This function will find a suitable node to remove from the cluster.
715 The criteria for removal are:
716 1. The node must not be running any SGE job
717 2. The node must have been up for 50-60 minutes past its start time
718 3. The node must not be the master, or allow_master_kill=True
719 """
720 nodes = self._cluster.running_nodes
721 to_rem = []
722 for node in nodes:
723 if not self.allow_master_kill and node.is_master():
724 log.debug("not removing master node")
725 continue
726 is_working = self.stat.is_node_working(node)
727 mins_up = self._minutes_uptime(node) % 60
728 if not is_working:
729 log.info("Idle Node %s (%s) has been up for %d minutes "
730 "past the hour" % (node.id, node.alias, mins_up))
731 if self.polling_interval > 300:
732 self.kill_after = max(45,
733 60 - (2 * self.polling_interval / 60))
734 if not is_working and mins_up >= self.kill_after:
735 to_rem.append(node)
736 return to_rem
737
739 """
740 this function uses data available to boto to determine
741 how many total minutes this instance has been running. you can
742 mod (%) the return value with 60 to determine how many minutes
743 into a billable hour this node has been running.
744 """
745 dt = utils.iso_to_datetime_tuple(node.launch_time)
746 now = self.get_remote_time()
747 timedelta = now - dt
748 return timedelta.seconds / 60
749