
Source Code for Package starcluster.balancers.sge

#!/usr/bin/env python
import os
import time
import datetime
import traceback
import xml.dom.minidom

from starcluster import utils
from starcluster import static
from starcluster import exception
from starcluster.balancers import LoadBalancer
from starcluster.logger import log


SGE_STATS_DIR = os.path.join(static.STARCLUSTER_CFG_DIR, 'sge')
DEFAULT_STATS_DIR = os.path.join(SGE_STATS_DIR, '%s')
DEFAULT_STATS_FILE = os.path.join(DEFAULT_STATS_DIR, 'sge-stats.csv')
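# At run time the '%s' placeholder above is filled in with the cluster tag
# (see SGELoadBalancer.run below), e.g. for a cluster tagged 'mycluster':
#
#   DEFAULT_STATS_FILE % 'mycluster'
#   # -> <STARCLUSTER_CFG_DIR>/sge/mycluster/sge-stats.csv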


class SGEStats(object):
    """
    SunGridEngine stats parser
    """
    jobstat_cachesize = 200
    hosts = []
    jobs = []
    jobstats = jobstat_cachesize * [None]
    max_job_id = 0
    _default_fields = ["JB_job_number", "state", "JB_submission_time",
                       "queue_name", "slots", "tasks"]

    @property
    def first_job_id(self):
        if not self.jobs:
            return
        return int(self.jobs[0]['JB_job_number'])

    @property
    def last_job_id(self):
        if not self.jobs:
            return
        return int(self.jobs[-1]['JB_job_number'])

    def parse_qhost(self, string):
        """
        this function parses qhost -xml output and makes a neat array
        takes in a string, so we can pipe in output from
        ssh.exec('qhost -xml')
        """
        self.hosts = []  # clear the old hosts
        doc = xml.dom.minidom.parseString(string)
        for h in doc.getElementsByTagName("host"):
            name = h.getAttribute("name")
            hash = {"name": name}
            for stat in h.getElementsByTagName("hostvalue"):
                for hvalue in stat.childNodes:
                    attr = stat.attributes['name'].value
                    val = ""
                    if hvalue.nodeType == xml.dom.minidom.Node.TEXT_NODE:
                        val = hvalue.data
                    hash[attr] = val
            if hash['name'] != u'global':
                self.hosts.append(hash)
        return self.hosts

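    # A sketch of the shape parse_qhost() produces (values illustrative;
    # the u'global' pseudo-host that qhost always reports is filtered out):
    #
    #   >>> stats = SGEStats()
    #   >>> stats.parse_qhost(qhost_xml)  # qhost_xml from 'qhost -xml'
    #   [{'name': u'node001', 'num_proc': u'2', 'load_avg': u'0.01', ...},
    #    {'name': u'node002', 'num_proc': u'2', 'load_avg': u'0.15', ...}]
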
    def parse_qstat(self, string, fields=None):
        """
        This method parses qstat -xml output and makes a neat array
        """
        if fields is None:
            fields = self._default_fields
        self.jobs = []  # clear the old jobs
        doc = xml.dom.minidom.parseString(string)
        for job in doc.getElementsByTagName("job_list"):
            jstate = job.getAttribute("state")
            hash = {"job_state": jstate}
            for tag in fields:
                es = job.getElementsByTagName(tag)
                for node in es:
                    for node2 in node.childNodes:
                        if node2.nodeType == xml.dom.minidom.Node.TEXT_NODE:
                            hash[tag] = node2.data
            # grab the submit time on all jobs, the last job's val stays
            if 'tasks' in hash and hash['tasks'].find('-') > 0:
                self.job_multiply(hash)
            else:
                self.jobs.append(hash)
        return self.jobs

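    # A sketch of the shape parse_qstat() produces (values illustrative):
    #
    #   >>> stats.parse_qstat(qstat_xml)  # qstat_xml from 'qstat -xml'
    #   [{'job_state': u'running', 'JB_job_number': u'1', 'slots': u'1',
    #     'queue_name': u'all.q@node001', ...},
    #    {'job_state': u'pending', 'JB_job_number': u'2', 'slots': u'1',
    #     ...}]
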
    def job_multiply(self, hash):
        """
        this function deals with sge jobs with a task range, ie qsub -t
        1-20:1 makes 20 jobs. self.jobs needs to represent that it is 20
        jobs instead of just 1.
        """
        sz_range = hash['tasks']
        dashpos = sz_range.find('-')
        colpos = sz_range.find(':')
        start = int(sz_range[0:dashpos])
        fin = int(sz_range[dashpos + 1:colpos])
        gran = int(sz_range[colpos + 1:len(sz_range)])
        log.debug("start = %d, fin = %d, granularity = %d, sz_range = %s." %
                  (start, fin, gran, sz_range))
        # task ranges are inclusive, so 1-20:1 expands to 20 tasks
        num_jobs = (fin - start) / gran + 1
        log.debug("This job expands to %d tasks." % num_jobs)
        self.jobs.extend([hash] * num_jobs)

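    # For example, a job submitted with 'qsub -t 1-20:1' arrives with
    # tasks == u'1-20:1'; job_multiply() parses start=1, fin=20, gran=1
    # and appends 20 copies of the job's hash to self.jobs.
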
    def qacct_to_datetime_tuple(self, qacct):
        """
        Takes the SGE qacct formatted time and makes a datetime tuple
        format is:
        Tue Jul 13 16:24:03 2010
        """
        return datetime.datetime.strptime(qacct, "%a %b %d %H:%M:%S %Y")

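    # e.g. qacct_to_datetime_tuple('Tue Jul 13 16:24:03 2010')
    #      -> datetime.datetime(2010, 7, 13, 16, 24, 3)
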
    def parse_qacct(self, string, dtnow):
        """
        This method parses qacct -j output and makes a neat array and
        calculates some statistics.
        Takes the string to parse, and a datetime object of the remote
        host's current time.
        """
        job_id = None
        qd = None
        start = None
        end = None
        counter = 0
        lines = string.split('\n')
        for l in lines:
            l = l.strip()
            if l.find('jobnumber') != -1:
                job_id = int(l[13:len(l)])
            if l.find('qsub_time') != -1:
                qd = self.qacct_to_datetime_tuple(l[13:len(l)])
            if l.find('start_time') != -1:
                if l.find('-/-') > 0:
                    start = dtnow
                else:
                    start = self.qacct_to_datetime_tuple(l[13:len(l)])
            if l.find('end_time') != -1:
                if l.find('-/-') > 0:
                    end = dtnow
                else:
                    end = self.qacct_to_datetime_tuple(l[13:len(l)])
            if l.find('==========') != -1:
                if qd is not None:
                    self.max_job_id = job_id
                    hash = {'queued': qd, 'start': start, 'end': end}
                    self.jobstats[job_id % self.jobstat_cachesize] = hash
                qd = None
                start = None
                end = None
                counter = counter + 1
        log.debug("added %d new jobs." % counter)
        log.debug("There are %d items in the jobstats cache." %
                  len(self.jobstats))
        return self.jobstats

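    # parse_qacct() consumes record blocks of 'qacct -j' output such as
    # (trimmed, illustrative):
    #
    #   ==============================================================
    #   jobnumber    1
    #   qsub_time    Tue Jul 13 16:21:18 2010
    #   start_time   Tue Jul 13 16:24:03 2010
    #   end_time     Tue Jul 13 16:24:59 2010
    #
    # Jobs still running show '-/-' for start/end and are stamped with the
    # remote host's current time (dtnow) instead.
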
    def is_jobstats_empty(self):
        """
        This function will return True if the jobstats cache is less than
        30% full, False if there are enough entries in it.
        """
        if self.max_job_id < (self.jobstat_cachesize * 0.3):
            return True
        return False

    def get_running_jobs(self):
        """
        returns an array of the running jobs, values stored in dictionary
        """
        running = []
        for j in self.jobs:
            if j['job_state'] == u'running':
                running.append(j)
        return running

    def get_queued_jobs(self):
        """
        returns an array of the queued jobs, values stored in dictionary
        """
        queued = []
        for j in self.jobs:
            if j['job_state'] == u'pending':
                queued.append(j)
        return queued

    def count_hosts(self):
        """
        returns a count of the hosts in the cluster
        """
        # todo: throw an exception if hosts not initialized
        return len(self.hosts)

    def count_total_slots(self):
        """
        returns a count of total slots available in this cluster
        """
        slots = 0
        for h in self.hosts:
            if h['num_proc'] == '-':
                h['num_proc'] = 0
            slots = slots + int(h['num_proc'])
        return slots

    def slots_per_host(self):
        """
        returns the number of slots per host.
        If for some reason the cluster is inconsistent, this will return -1
        for example, if you have m1.large and m1.small in the same cluster
        """
        total = self.count_total_slots()
        if self.hosts[0][u'num_proc'] == '-':
            self.hosts[0][u'num_proc'] = 0
        single = int(self.hosts[0][u'num_proc'])
        if total != (single * len(self.hosts)):
            log.error("ERROR: Number of slots not consistent across cluster")
            return -1
        return single

    def oldest_queued_job_age(self):
        """
        This returns the age of the oldest job in the queue
        """
        for j in self.jobs:
            if 'JB_submission_time' in j:
                st = j['JB_submission_time']
                dt = utils.iso_to_datetime_tuple(st)
                return dt
        # todo: throw a "no queued jobs" exception

    def is_node_working(self, node):
        """
        This function returns true if the node is currently working on a
        task, or false if the node is currently idle.
        """
        nodename = node.alias
        for j in self.jobs:
            if 'queue_name' in j:
                qn = j['queue_name']
                if qn.find(nodename) > 0:
                    log.debug("Node %s is working." % node.id)
                    return True
        log.debug("Node %s is IDLE." % node.id)
        return False

    def num_slots_for_job(self, job_id):
        """
        returns the number of slots requested for the given job id
        returns -1 if job_id is invalid
        """
        ujid = unicode(job_id)
        for j in self.jobs:
            if j['JB_job_number'] == ujid:
                return int(j['slots'])
        return -1

    def avg_job_duration(self):
        count = 0
        total_seconds = 0
        for job in self.jobstats:
            if job is not None:
                delta = job['end'] - job['start']
                # include days so multi-day jobs aren't undercounted
                secs = delta.days * 86400 + delta.seconds
                total_seconds = total_seconds + secs
                count = count + 1
        if count == 0:
            return 0
        else:
            return total_seconds / count

    def avg_wait_time(self):
        count = 0
        total_seconds = 0
        for job in self.jobstats:
            if job is not None:
                delta = job['start'] - job['queued']
                # include days so multi-day waits aren't undercounted
                secs = delta.days * 86400 + delta.seconds
                total_seconds = total_seconds + secs
                count = count + 1
        if count == 0:
            return 0
        else:
            return total_seconds / count

    def on_first_job(self):
        """
        returns True if the cluster is processing the first job,
        False otherwise
        """
        if len(self.jobs) > 0 and self.jobs[0]['JB_job_number'] == u'1':
            log.info("ON THE FIRST JOB")
            return True
        return False

    def get_loads(self):
        """
        returns an array containing the loads on each host in cluster
        """
        loads = []
        for h in self.hosts:
            if h['load_avg'] == '-':
                h['load_avg'] = 0
            loads.append(h['load_avg'])
        return loads

    def _add(self, x, y):
        return float(x) + float(y)

    def get_all_stats(self):
        now = datetime.datetime.utcnow()
        bits = []
        # first field is the time
        bits.append(now)
        # second field is the number of hosts
        bits.append(self.count_hosts())
        # third field is # of running jobs
        bits.append(len(self.get_running_jobs()))
        # fourth field is # of queued jobs
        bits.append(len(self.get_queued_jobs()))
        # fifth field is total # slots
        bits.append(self.count_total_slots())
        # sixth field is average job duration
        bits.append(self.avg_job_duration())
        # seventh field is average job wait time
        bits.append(self.avg_wait_time())
        # last field is the average load across all hosts
        arr = self.get_loads()
        load_sum = float(reduce(self._add, arr))
        avg_load = load_sum / len(arr)
        bits.append(avg_load)
        return bits

    def write_stats_to_csv(self, filename):
        """
        Write important SGE stats to CSV file
        Appends one line to the CSV
        """
        bits = self.get_all_stats()
        try:
            f = open(filename, 'a')
            flat = ','.join(str(n) for n in bits) + '\n'
            f.write(flat)
            f.close()
        except IOError, e:
            raise exception.BaseException(str(e))

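    # Each call appends one line in the field order produced by
    # get_all_stats(), e.g. (illustrative values):
    #
    #   2010-07-13 16:24:03.118000,2,3,10,4,120,30,0.58
    #   (time, hosts, running, queued, slots, avg duration, avg wait,
    #    avg load)
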

class SGELoadBalancer(LoadBalancer):
    """
    This class is able to query each SGE host and return load & queue
    statistics

    *** All times are in SECONDS unless otherwise specified ***

    The polling interval in seconds. Recommended: 60-300; anything more
    frequent is very wasteful. The polling loop with the visualizer takes
    about 15 seconds.
    polling_interval = 60

    VERY IMPORTANT: Set this to the max nodes you're willing to have in your
    cluster. Try setting this to the default cluster size you'd ordinarily
    use.
    max_nodes = 5

    IMPORTANT: Set this to the longest time a job can wait before another
    host is added to the cluster to help. Recommended: 300-900 seconds
    (5-15 mins). Do not use a value less than 300 seconds because that is
    roughly how long an instance takes to start up.
    longest_allowed_queue_time = 900

    Keep this at 1 - your master - for now.
    min_nodes = 1

    This would allow the master to be killed when the queue empties.
    UNTESTED.
    allow_master_kill = False

    How many nodes to add per iteration. Setting it > 1 opens up the
    possibility of spending too much $$.
    add_nodes_per_iteration = 1

    Kill an instance after it has been up for X minutes. Do not kill
    earlier, since you've already paid for that hour. (in mins)
    kill_after = 45

    After adding a node, how long to wait for the instance to start new
    jobs.
    stabilization_time = 180

    Visualizer off by default. Start it with "starcluster loadbalance -p
    tag".
    plot_stats = False

    How many hours qacct should look back to gather past job data. Lower
    values minimize data transfer.
    lookback_window = 3
    """

    def __init__(self, interval=60, max_nodes=5, wait_time=900,
                 add_pi=1, kill_after=45, stab=180, lookback_win=3,
                 min_nodes=1, allow_master_kill=False, plot_stats=False,
                 plot_output_dir=None, dump_stats=False, stats_file=None):
        self._cluster = None
        self._keep_polling = True
        self._visualizer = None
        self.__last_cluster_mod_time = datetime.datetime.utcnow()
        self.stat = None
        self.polling_interval = interval
        self.max_nodes = max_nodes
        self.longest_allowed_queue_time = wait_time
        self.add_nodes_per_iteration = add_pi
        self.kill_after = kill_after
        self.stabilization_time = stab
        self.lookback_window = lookback_win
        self.min_nodes = min_nodes
        self.allow_master_kill = allow_master_kill
        if self.longest_allowed_queue_time < 300:
            log.warn("The recommended wait_time should be >= 300 seconds "
                     "(it takes ~5 min to launch a new EC2 node)")
        self.dump_stats = dump_stats
        self.stats_file = stats_file
        self.plot_stats = plot_stats
        self.plot_output_dir = plot_output_dir
        if plot_stats:
            assert self.visualizer is not None

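    # A minimal usage sketch, assuming `cluster` is an up-and-running
    # Cluster object obtained from starcluster's cluster module:
    #
    #   lb = SGELoadBalancer(interval=60, max_nodes=10, wait_time=900)
    #   lb.run(cluster)  # polls until interrupted or the cluster goes down
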
    @property
    def visualizer(self):
        if not self._visualizer:
            try:
                from starcluster.balancers.sge import visualizer
            except ImportError, e:
                log.error("Error importing visualizer:")
                log.error(str(e))
                log.error("check that matplotlib and numpy are installed "
                          "and:")
                log.error("   $ python -c 'import matplotlib'")
                log.error("   $ python -c 'import numpy'")
                log.error("completes without error")
                raise exception.BaseException(
                    "Failed to load stats visualizer")
            self._visualizer = visualizer.SGEVisualizer(self.stats_file,
                                                        self.plot_output_dir)
        else:
            self._visualizer.stats_file = self.stats_file
            self._visualizer.pngpath = self.plot_output_dir
        return self._visualizer

    def _validate_dir(self, dirname, msg_prefix=""):
        if not os.path.isdir(dirname):
            msg = "'%s' is not a directory"
            if not os.path.exists(dirname):
                msg = "'%s' does not exist"
            if msg_prefix:
                msg = ' '.join([msg_prefix, msg])
            msg = msg % dirname
            raise exception.BaseException(msg)

    def _mkdir(self, directory, makedirs=False):
        if not os.path.isdir(directory):
            if os.path.isfile(directory):
                raise exception.BaseException(
                    "'%s' is a file not a directory" % directory)
            try:
                if makedirs:
                    os.makedirs(directory)
                    log.info("Created directories %s." % directory)
                else:
                    os.mkdir(directory)
                    log.info("Created single directory %s." % directory)
            except IOError, e:
                raise exception.BaseException(str(e))

    def get_remote_time(self):
        """
        this function remotely executes 'date' on the master node
        and returns a datetime object with the master's time,
        rather than fetching it from the local machine, which may be
        inaccurate.
        """
        cl = self._cluster
        date_str = '\n'.join(cl.master_node.ssh.execute('date'))
        return datetime.datetime.strptime(date_str,
                                          "%a %b %d %H:%M:%S UTC %Y")

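    # e.g. the master's `date` typically prints
    # 'Tue Jul 13 16:24:03 UTC 2010', which the strptime() call above
    # turns into datetime.datetime(2010, 7, 13, 16, 24, 3).
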
    def get_qatime(self, now):
        """
        this function takes the lookback window and creates a string
        representation of the past few hours, to feed to qacct to
        limit the data set qacct returns.
        """
        if self.stat.is_jobstats_empty():
            log.info("Jobstats cache is not full. Pulling full job history.")
            temp_lookback_window = self.lookback_window * 60 * 60
        else:
            temp_lookback_window = self.polling_interval
        log.debug("getting past %d seconds worth of job history." %
                  temp_lookback_window)
        now = now - datetime.timedelta(seconds=temp_lookback_window + 1)
        return now.strftime("%Y%m%d%H%M")

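    # e.g. with a cold jobstats cache, lookback_window = 3, and a remote
    # time of Tue Jul 13 16:24:03 2010, this returns roughly
    # '201007131323', which run() passes to 'qacct -j -b' to bound the
    # query (illustrative; the exact value depends on `now`).
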

    # @print_timing
    def get_stats(self):
        """
        this function will ssh to the SGE master and get load & queue stats.
        it will feed these stats to SGEStats, which parses the XML and
        stores two arrays: one of hosts, each host with a hash containing
        its host information, and one of jobs, each job with a hash
        containing statistics about the job name, priority, etc.
        """
        log.debug("starting get_stats")
        master = self._cluster.master_node
        self.stat = SGEStats()

        qhostxml = ""
        qstatxml = ""
        qacct = ""
        try:
            now = self.get_remote_time()
            qatime = self.get_qatime(now)
            qacct_cmd = 'source /etc/profile && qacct -j -b ' + qatime
            qstat_cmd = 'source /etc/profile && qstat -q all.q -u \"*\" -xml'
            qhostxml = '\n'.join(master.ssh.execute(
                'source /etc/profile && qhost -xml', log_output=True))
            qstatxml = '\n'.join(master.ssh.execute(qstat_cmd,
                                                    log_output=True))
            qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=True,
                                                 ignore_exit_status=True))
        except Exception, e:
            log.error("Error occurred getting SGE stats via ssh. "
                      "Cluster terminated?")
            log.error(e)
            return -1
        log.debug("sizes: qhost: %d, qstat: %d, qacct: %d." %
                  (len(qhostxml), len(qstatxml), len(qacct)))
        self.stat.parse_qhost(qhostxml)
        self.stat.parse_qstat(qstatxml)
        self.stat.parse_qacct(qacct, now)

    def run(self, cluster):
        """
        This function will loop indefinitely, using
        SGELoadBalancer.get_stats() to get the cluster's status. It looks
        at the job queue and tries to decide whether to add or remove a
        node. It should later look at job durations (currently doesn't).
        """
        self._cluster = cluster
        use_default_stats_file = self.dump_stats and not self.stats_file
        use_default_plots_dir = self.plot_stats and not self.plot_output_dir
        if use_default_stats_file or use_default_plots_dir:
            self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag,
                        makedirs=True)
        if not self.stats_file:
            self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
        if not self.plot_output_dir:
            self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
        if not cluster.is_cluster_up():
            raise exception.ClusterNotRunning(cluster.cluster_tag)
        if self.dump_stats:
            if os.path.isdir(self.stats_file):
                raise exception.BaseException("stats file destination '%s' is"
                                              " a directory" %
                                              self.stats_file)
            sfdir = os.path.dirname(os.path.abspath(self.stats_file))
            self._validate_dir(sfdir, msg_prefix="stats file destination")
        if self.plot_stats:
            if os.path.isfile(self.plot_output_dir):
                raise exception.BaseException("plot output destination '%s' "
                                              "is a file" %
                                              self.plot_output_dir)
            self._validate_dir(self.plot_output_dir,
                               msg_prefix="plot output destination")
        if self.dump_stats:
            log.info("Writing stats to file: %s" % self.stats_file)
        if self.plot_stats:
            log.info("Plotting stats to directory: %s" % self.plot_output_dir)
        while self._keep_polling:
            if not cluster.is_cluster_up():
                log.info("Entire cluster is not up, nodes added/removed. "
                         "No Action.")
                time.sleep(self.polling_interval)
                continue
            if self.get_stats() == -1:
                log.error("Failed to get stats. LoadBalancer is terminating.")
                return
            log.info("Oldest job is from %s. # queued jobs = %d. "
                     "# hosts = %d." %
                     (self.stat.oldest_queued_job_age(),
                      len(self.stat.get_queued_jobs()),
                      len(self.stat.hosts)))
            log.info("Avg job duration = %d sec, Avg wait time = %d sec." %
                     (self.stat.avg_job_duration(),
                      self.stat.avg_wait_time()))
            # evaluate if nodes need to be added
            self._eval_add_node()
            # evaluate if nodes need to be removed
            self._eval_remove_node()
            if self.dump_stats or self.plot_stats:
                self.stat.write_stats_to_csv(self.stats_file)
            # call the visualizer
            if self.plot_stats:
                try:
                    self.visualizer.graph_all()
                except IOError, e:
                    raise exception.BaseException(str(e))
            # sleep for the specified number of seconds
            log.info("Sleeping, looping again in %d seconds.\n" %
                     self.polling_interval)
            time.sleep(self.polling_interval)

    def _eval_add_node(self):
        """
        This function uses the metrics available to it to decide whether to
        add a new node to the cluster or not.
        TODO: See if the recent jobs have taken more than 5 minutes (how
        long it takes to start an instance)
        """
        need_to_add = 0
        if len(self.stat.hosts) >= self.max_nodes:
            log.info("Won't add another host, currently at max (%d)." %
                     self.max_nodes)
            return 0
        qlen = len(self.stat.get_queued_jobs())
        sph = self.stat.slots_per_host()
        ts = self.stat.count_total_slots()
        # calculate job duration
        avg_duration = self.stat.avg_job_duration()
        # calculate estimated time to completion
        ettc = avg_duration * qlen / len(self.stat.hosts)
        if qlen > ts:
            now = datetime.datetime.utcnow()
            mod_delta = (now - self.__last_cluster_mod_time).seconds
            if mod_delta < self.stabilization_time:
                log.info("Cluster change made less than %d seconds ago (%s)."
                         % (self.stabilization_time,
                            self.__last_cluster_mod_time))
                log.info("Not changing cluster size until cluster "
                         "stabilizes.")
                return 0
            # there are more jobs queued than will be consumed with one
            # cycle of job processing from all nodes
            oldest_job_dt = self.stat.oldest_queued_job_age()
            now = self.get_remote_time()
            age_delta = now - oldest_job_dt
            if age_delta.seconds > self.longest_allowed_queue_time:
                log.info("A job has been waiting for %d sec, longer than "
                         "max %d." % (age_delta.seconds,
                                      self.longest_allowed_queue_time))
                need_to_add = qlen / sph
                if ettc < 600 and not self.stat.on_first_job():
                    log.warn("There is a possibility that the job queue is"
                             " shorter than 10 minutes in duration.")
                    #need_to_add = 0
        max_add = self.max_nodes - len(self.stat.hosts)
        need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
        if need_to_add > 0:
            log.info("*** ADDING %d NODES at %s." %
                     (need_to_add, str(datetime.datetime.utcnow())))
            try:
                self._cluster.add_nodes(need_to_add)
            except Exception:
                log.error("Failed to add new host.")
                log.debug(traceback.format_exc())
                return -1
            self.__last_cluster_mod_time = datetime.datetime.utcnow()
            log.info("Done adding nodes at %s." %
                     str(datetime.datetime.utcnow()))
        return need_to_add

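    # Worked example (illustrative numbers): with 10 queued jobs (qlen),
    # 2 slots per host (sph), and 4 total slots (ts), qlen > ts triggers
    # the wait-time check; if the oldest job has also waited longer than
    # longest_allowed_queue_time, need_to_add = 10 / 2 = 5, which is then
    # capped by add_nodes_per_iteration and by max_nodes.
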
    def _eval_remove_node(self):
        """
        This function uses the sge stats to decide whether or not to
        remove a node from the cluster.
        """
        qlen = len(self.stat.get_queued_jobs())
        if qlen == 0:
            now = datetime.datetime.utcnow()
            elapsed = (now - self.__last_cluster_mod_time).seconds
            if elapsed < self.stabilization_time:
                log.info("Cluster change made less than %d seconds ago (%s)."
                         % (self.stabilization_time,
                            self.__last_cluster_mod_time))
                log.info("Not changing cluster size until cluster "
                         "stabilizes.")
                return 0
            # if at 0, remove all nodes but master
            if len(self.stat.hosts) > self.min_nodes:
                log.info("Checking to remove a node...")
                to_kill = self._find_node_for_removal()
                if not to_kill:
                    log.info("No nodes can be killed at this time.")
                # kill the nodes returned
                for n in to_kill:
                    if n.update() == "running":
                        log.info("***KILLING NODE: %s (%s)." %
                                 (n.id, n.dns_name))
                        try:
                            self._cluster.remove_node(n)
                        except Exception:
                            log.error("Failed to terminate node %s" %
                                      n.alias)
                            log.debug(traceback.format_exc())
                            return -1
                        # successfully removed node
                        now = datetime.datetime.utcnow()
                        self.__last_cluster_mod_time = now
                    else:
                        log.error("Trying to kill dead node %s" % n.alias)
            else:
                log.info("Can't remove a node, already at min (%d)." %
                         self.min_nodes)

    def _find_node_for_removal(self):
        """
        This function will find a suitable node to remove from the cluster.
        The criteria for removal are:
        1. The node must not be running any SGE job
        2. The node must have been up for 50-60 minutes past its start time
        3. The node must not be the master, or allow_master_kill=True
        """
        nodes = self._cluster.running_nodes
        to_rem = []
        for node in nodes:
            if not self.allow_master_kill and node.is_master():
                log.debug("not removing master node")
                continue
            is_working = self.stat.is_node_working(node)
            mins_up = self._minutes_uptime(node) % 60
            if not is_working:
                log.info("Idle Node %s (%s) has been up for %d minutes "
                         "past the hour." % (node.id, node.alias, mins_up))
            if self.polling_interval > 300:
                self.kill_after = max(45,
                                      60 - (2 * self.polling_interval / 60))
            if not is_working and mins_up >= self.kill_after:
                to_rem.append(node)
        return to_rem

    def _minutes_uptime(self, node):
        """
        this function uses data available to boto to determine
        how many total minutes this instance has been running. you can
        mod (%) the return value with 60 to determine how many minutes
        into a billable hour this node has been running.
        """
        dt = utils.iso_to_datetime_tuple(node.launch_time)
        now = self.get_remote_time()
        timedelta = now - dt
        # include days so nodes up more than 24 hours report full uptime
        return timedelta.days * 1440 + timedelta.seconds / 60
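
    # e.g. a node launched 95 minutes ago returns 95; 95 % 60 == 35 means
    # the node is 35 minutes into its current billable hour, so with the
    # default kill_after = 45 an idle node would not yet be removed.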