Package starcluster :: Package balancers :: Package sge
[hide private]
[frames] | no frames]

Source Code for Package starcluster.balancers.sge

  1  #!/usr/bin/env python 
  2  import os 
  3  import time 
  4  import datetime 
  5  import traceback 
  6  import xml.dom.minidom 
  7   
  8  from starcluster import utils 
  9  from starcluster import static 
 10  from starcluster import exception 
 11  from starcluster.balancers import LoadBalancer 
 12  from starcluster.logger import log 
 13   
 14   
 15  SGE_STATS_DIR = os.path.join(static.STARCLUSTER_CFG_DIR, 'sge') 
 16  DEFAULT_STATS_DIR = os.path.join(SGE_STATS_DIR, '%s') 
 17  DEFAULT_STATS_FILE = os.path.join(DEFAULT_STATS_DIR, 'sge-stats.csv') 
18 19 20 -class SGEStats(object):
21 """ 22 SunGridEngine stats parser 23 """ 24 jobstat_cachesize = 200 25 hosts = [] 26 jobs = [] 27 jobstats = jobstat_cachesize * [None] 28 max_job_id = 0 29 _default_fields = ["JB_job_number", "state", "JB_submission_time", 30 "queue_name", "slots", "tasks"] 31 32 @property
33 - def first_job_id(self):
34 if not self.jobs: 35 return 36 return int(self.jobs[0]['JB_job_number'])
37 38 @property
39 - def last_job_id(self):
40 if not self.jobs: 41 return 42 return int(self.jobs[-1]['JB_job_number'])
43
44 - def parse_qhost(self, string):
45 """ 46 this function parses qhost -xml output and makes a neat array 47 takes in a string, so we can pipe in output from ssh.exec('qhost -xml') 48 """ 49 self.hosts = [] # clear the old hosts 50 doc = xml.dom.minidom.parseString(string) 51 for h in doc.getElementsByTagName("host"): 52 name = h.getAttribute("name") 53 hash = {"name": name} 54 for stat in h.getElementsByTagName("hostvalue"): 55 for hvalue in stat.childNodes: 56 attr = stat.attributes['name'].value 57 val = "" 58 if hvalue.nodeType == xml.dom.minidom.Node.TEXT_NODE: 59 val = hvalue.data 60 hash[attr] = val 61 if hash['name'] != u'global': 62 self.hosts.append(hash) 63 return self.hosts
64
65 - def parse_qstat(self, string, fields=None):
66 """ 67 This method parses qstat -xml output and makes a neat array 68 """ 69 if fields == None: 70 fields = self._default_fields 71 self.jobs = [] # clear the old jobs 72 doc = xml.dom.minidom.parseString(string) 73 for job in doc.getElementsByTagName("job_list"): 74 jstate = job.getAttribute("state") 75 hash = {"job_state": jstate} 76 for tag in fields: 77 es = job.getElementsByTagName(tag) 78 for node in es: 79 for node2 in node.childNodes: 80 if node2.nodeType == xml.dom.minidom.Node.TEXT_NODE: 81 hash[tag] = node2.data 82 # grab the submit time on all jobs, the last job's val stays 83 if 'tasks' in hash and hash['tasks'].find('-') > 0: 84 self.job_multiply(hash) 85 else: 86 self.jobs.append(hash) 87 return self.jobs
88
89 - def job_multiply(self, hash):
90 """ 91 this function deals with sge jobs with a task range, ie qsub -t 1-20:1 92 makes 20 jobs. self.jobs needs to represent that it is 20 jobs instead 93 of just 1. 94 """ 95 sz_range = hash['tasks'] 96 dashpos = sz_range.find('-') 97 colpos = sz_range.find(':') 98 start = int(sz_range[0:dashpos]) 99 fin = int(sz_range[dashpos + 1:colpos]) 100 gran = int(sz_range[colpos + 1:len(sz_range)]) 101 log.debug("start = %d, fin = %d, granularity = %d, sz_range = %s" % 102 (start, fin, gran, sz_range)) 103 num_jobs = (fin - start) / gran 104 log.debug("This job expands to %d tasks" % num_jobs) 105 self.jobs.extend([hash] * num_jobs)
106
107 - def qacct_to_datetime_tuple(self, qacct):
108 """ 109 Takes the SGE qacct formatted time and makes a datetime tuple 110 format is: 111 Tue Jul 13 16:24:03 2010 112 """ 113 return datetime.datetime.strptime(qacct, "%a %b %d %H:%M:%S %Y")
114
115 - def parse_qacct(self, string, dtnow):
116 """ 117 This method parses qacct -j output and makes a neat array and 118 calculates some statistics. 119 Takes the string to parse, and a datetime object of the remote 120 host's current time. 121 """ 122 job_id = None 123 qd = None 124 start = None 125 end = None 126 counter = 0 127 lines = string.split('\n') 128 for l in lines: 129 l = l.strip() 130 if l.find('jobnumber') != -1: 131 job_id = int(l[13:len(l)]) 132 if l.find('qsub_time') != -1: 133 qd = self.qacct_to_datetime_tuple(l[13:len(l)]) 134 if l.find('start_time') != -1: 135 if l.find('-/-') > 0: 136 start = dtnow 137 else: 138 start = self.qacct_to_datetime_tuple(l[13:len(l)]) 139 if l.find('end_time') != -1: 140 if l.find('-/-') > 0: 141 end = dtnow 142 else: 143 end = self.qacct_to_datetime_tuple(l[13:len(l)]) 144 if l.find('==========') != -1: 145 if qd != None: 146 self.max_job_id = job_id 147 hash = {'queued': qd, 'start': start, 'end': end} 148 self.jobstats[job_id % self.jobstat_cachesize] = hash 149 qd = None 150 start = None 151 end = None 152 counter = counter + 1 153 log.debug("added %d new jobs" % counter) 154 log.debug("There are %d items in the jobstats cache" % 155 len(self.jobstats)) 156 return self.jobstats
157
158 - def is_jobstats_empty(self):
159 """ 160 This function will return True if half of the queue is empty, False if 161 there are enough entries in it. 162 """ 163 return self.max_job_id < (self.jobstat_cachesize * 0.3)
164
165 - def get_running_jobs(self):
166 """ 167 returns an array of the running jobs, values stored in dictionary 168 """ 169 running = [] 170 for j in self.jobs: 171 if j['job_state'] == u'running': 172 running.append(j) 173 return running
174
175 - def get_queued_jobs(self):
176 """ 177 returns an array of the queued jobs, values stored in dictionary 178 """ 179 queued = [] 180 for j in self.jobs: 181 if j['job_state'] == u'pending': 182 queued.append(j) 183 return queued
184
185 - def count_hosts(self):
186 """ 187 returns a count of the hosts in the cluster 188 """ 189 #todo: throw an exception if hosts not initialized 190 return len(self.hosts)
191
192 - def count_total_slots(self):
193 """ 194 returns a count of total slots available in this cluser 195 """ 196 slots = 0 197 for h in self.hosts: 198 if h['num_proc'] == '-': 199 h['num_proc'] = 0 200 slots = slots + int(h['num_proc']) 201 return slots
202
203 - def slots_per_host(self):
204 """ 205 returns the number of slots per host. 206 If for some reason the cluster is inconsistent, this will return -1 207 for example, if you have m1.large and m1.small in the same cluster 208 """ 209 total = self.count_total_slots() 210 if self.hosts[0][u'num_proc'] == '-': 211 self.hosts[0][u'num_proc'] = 0 212 single = int(self.hosts[0][u'num_proc']) 213 if (total != (single * len(self.hosts))): 214 log.error("ERROR: Number of slots not consistent across cluster") 215 return -1 216 return single
217
218 - def oldest_queued_job_age(self):
219 """ 220 This returns the age of the oldest job in the queue 221 """ 222 for j in self.jobs: 223 if 'JB_submission_time' in j: 224 st = j['JB_submission_time'] 225 dt = utils.iso_to_datetime_tuple(st) 226 return dt
227 #todo: throw a "no queued jobs" exception 228
229 - def is_node_working(self, node):
230 """ 231 This function returns true if the node is currently working on a task, 232 or false if the node is currently idle. 233 """ 234 nodename = node.alias 235 for j in self.jobs: 236 if 'queue_name' in j: 237 qn = j['queue_name'] 238 if qn.find(nodename) > 0: 239 log.debug("Node %s is working" % node.id) 240 return True 241 log.debug("Node %s is IDLE" % node.id) 242 return False
243
244 - def num_slots_for_job(self, job_id):
245 """ 246 returns the number of slots requested for the given job id 247 returns -1 if job_id is invalid 248 """ 249 ujid = unicode(job_id) 250 for j in self.jobs: 251 if j['JB_job_number'] == ujid: 252 return int(j['slots']) 253 return -1
254
255 - def avg_job_duration(self):
256 count = 0 257 total_seconds = 0 258 for job in self.jobstats: 259 if job != None: 260 delta = job['end'] - job['start'] 261 total_seconds = total_seconds + delta.seconds 262 count = count + 1 263 if count == 0: 264 return 0 265 else: 266 return total_seconds / count
267
268 - def avg_wait_time(self):
269 count = 0 270 total_seconds = 0 271 for job in self.jobstats: 272 if job != None: 273 delta = job['start'] - job['queued'] 274 total_seconds = total_seconds + delta.seconds 275 count = count + 1 276 if count == 0: 277 return 0 278 else: 279 return total_seconds / count
280
281 - def on_first_job(self):
282 """ 283 returns true if the cluster is processing the first job, 284 False otherwise 285 """ 286 if len(self.jobs) > 0 and self.jobs[0]['JB_job_number'] != u'1': 287 log.info("ON THE FIRST JOB") 288 return True 289 return False
290
291 - def get_loads(self):
292 """ 293 returns an array containing the loads on each host in cluster 294 """ 295 loads = [] 296 for h in self.hosts: 297 if h['load_avg'] == '-': 298 h['load_avg'] = 0 299 loads.append(h['load_avg']) 300 return loads
301
302 - def _add(self, x, y):
303 return float(x) + float(y)
304
305 - def get_all_stats(self):
306 now = datetime.datetime.utcnow() 307 bits = [] 308 #first field is the time 309 bits.append(now) 310 #second field is the number of hosts 311 bits.append(self.count_hosts()) 312 #third field is # of running jobs 313 bits.append(len(self.get_running_jobs())) 314 #fourth field is # of queued jobs 315 bits.append(len(self.get_queued_jobs())) 316 #fifth field is total # slots 317 bits.append(self.count_total_slots()) 318 #sixth field is average job duration 319 bits.append(self.avg_job_duration()) 320 #seventh field is average job wait time 321 bits.append(self.avg_wait_time()) 322 #last field is array of loads for hosts 323 arr = self.get_loads() 324 load_sum = float(reduce(self._add, arr)) 325 avg_load = load_sum / len(arr) 326 bits.append(avg_load) 327 return bits
328
329 - def write_stats_to_csv(self, filename):
330 """ 331 Write important SGE stats to CSV file 332 Appends one line to the CSV 333 """ 334 bits = self.get_all_stats() 335 try: 336 f = open(filename, 'a') 337 flat = ','.join(str(n) for n in bits) + '\n' 338 f.write(flat) 339 f.close() 340 except IOError, e: 341 raise exception.BaseException(str(e))
342
343 344 -class SGELoadBalancer(LoadBalancer):
345 """ 346 This class is able to query each SGE host and return load & queue 347 statistics 348 349 *** All times are in SECONDS unless otherwise specified *** 350 351 The polling interval in seconds. recommended: 60-300. any more frequent is 352 very wasteful. the polling loop with visualizer takes about 15 seconds. 353 polling_interval = 60 354 355 VERY IMPORTANT: Set this to the max nodes you're willing to have in your 356 cluster. Try setting this to the default cluster size you'd ordinarily 357 use. 358 max_nodes = 5 359 360 IMPORTANT: Set this to the longest time a job can wait before another host 361 is added to the cluster to help. Recommended: 300-900 seconds (5-15 mins). 362 Do not use a value less than 300 seconds because that is how long an 363 instance will take to start up. 364 longest_allowed_queue_time = 900 365 366 Keep this at 1 - your master, for now. 367 min_nodes = 1 368 369 This would allow the master to be killed when the queue empties. UNTESTED. 370 allow_master_kill = False 371 372 How many nodes to add per iteration. Setting it > 1 opens up possibility 373 of spending too much $$ 374 add_nodes_per_iteration = 1 375 376 Kill an instance after it has been up for X minutes. Do not kill earlier, 377 since you've already paid for that hour. (in mins) 378 kill_after = 45 379 380 After adding a node, how long to wait for the instance to start new jobs 381 stabilization_time = 180 382 383 Visualizer off by default. Start it with "starcluster loadbalance -p tag" 384 plot_stats = False 385 386 How many hours qacct should look back to gather past job data. lower 387 values minimize data transfer 388 lookback_window = 3 389 """ 390
391 - def __init__(self, interval=60, max_nodes=None, wait_time=900, 392 add_pi=1, kill_after=45, stab=180, lookback_win=3, 393 min_nodes=1, allow_master_kill=False, plot_stats=False, 394 plot_output_dir=None, dump_stats=False, stats_file=None):
395 self._cluster = None 396 self._keep_polling = True 397 self._visualizer = None 398 self.__last_cluster_mod_time = datetime.datetime.utcnow() 399 self.stat = None 400 self.polling_interval = interval 401 self.max_nodes = max_nodes 402 self.longest_allowed_queue_time = wait_time 403 self.add_nodes_per_iteration = add_pi 404 self.kill_after = kill_after 405 self.stabilization_time = stab 406 self.lookback_window = lookback_win 407 self.min_nodes = min_nodes 408 self.allow_master_kill = allow_master_kill 409 if self.longest_allowed_queue_time < 300: 410 log.warn("The recommended wait_time should be >= 300 seconds " 411 "(it takes ~5 min to launch a new EC2 node)") 412 self.dump_stats = dump_stats 413 self.stats_file = stats_file 414 self.plot_stats = plot_stats 415 self.plot_output_dir = plot_output_dir 416 if plot_stats: 417 assert self.visualizer != None
418 419 @property
420 - def visualizer(self):
421 if not self._visualizer: 422 try: 423 from starcluster.balancers.sge import visualizer 424 except ImportError, e: 425 log.error("Error importing visualizer:") 426 log.error(str(e)) 427 log.error("check that matplotlib and numpy are installed and:") 428 log.error(" $ python -c 'import matplotlib'") 429 log.error(" $ python -c 'import numpy'") 430 log.error("completes without error") 431 raise exception.BaseException( 432 "Failed to load stats visualizer") 433 self._visualizer = visualizer.SGEVisualizer(self.stats_file, 434 self.plot_output_dir) 435 else: 436 self._visualizer.stats_file = self.stats_file 437 self._visualizer.pngpath = self.plot_output_dir 438 return self._visualizer
439
440 - def _validate_dir(self, dirname, msg_prefix=""):
441 if not os.path.isdir(dirname): 442 msg = "'%s' is not a directory" 443 if not os.path.exists(dirname): 444 msg = "'%s' does not exist" 445 if msg_prefix: 446 msg = ' '.join([msg_prefix, msg]) 447 msg = msg % dirname 448 raise exception.BaseException(msg)
449
450 - def _mkdir(self, directory, makedirs=False):
451 if not os.path.isdir(directory): 452 if os.path.isfile(directory): 453 raise exception.BaseException("'%s' is a file not a directory") 454 try: 455 if makedirs: 456 os.makedirs(directory) 457 log.info("Created directories %s" % directory) 458 else: 459 os.mkdir(directory) 460 log.info("Created single directory %s" % directory) 461 except IOError, e: 462 raise exception.BaseException(str(e))
463
464 - def get_remote_time(self):
465 """ 466 this function remotely executes 'date' on the master node 467 and returns a datetime object with the master's time 468 instead of fetching it from local machine, maybe inaccurate. 469 """ 470 cl = self._cluster 471 str = '\n'.join(cl.master_node.ssh.execute('date')) 472 return datetime.datetime.strptime(str, "%a %b %d %H:%M:%S UTC %Y")
473
474 - def get_qatime(self, now):
475 """ 476 this function takes the lookback window and creates a string 477 representation of the past few hours, to feed to qacct to 478 limit the data set qacct returns. 479 """ 480 if self.stat.is_jobstats_empty(): 481 log.info("Loading full job history") 482 temp_lookback_window = self.lookback_window * 60 * 60 483 else: 484 temp_lookback_window = self.polling_interval 485 log.debug("getting past %d seconds worth of job history" % 486 temp_lookback_window) 487 now = now - datetime.timedelta(seconds=temp_lookback_window + 1) 488 str = now.strftime("%Y%m%d%H%M") 489 return str
490 491 #@print_timing
492 - def get_stats(self):
493 """ 494 this function will ssh to the SGE master and get load & queue stats. 495 it will feed these stats to SGEStats, which parses the XML. 496 it will return two arrays: one of hosts, each host has a hash with its 497 host information inside. The job array contains a hash for every job, 498 containing statistics about the job name, priority, etc 499 """ 500 log.debug("starting get_stats") 501 master = self._cluster.master_node 502 self.stat = SGEStats() 503 504 qhostxml = "" 505 qstatxml = "" 506 qacct = "" 507 try: 508 now = self.get_remote_time() 509 qatime = self.get_qatime(now) 510 qacct_cmd = 'qacct -j -b ' + qatime 511 qstat_cmd = 'qstat -q all.q -u \"*\" -xml' 512 qhostxml = '\n'.join(master.ssh.execute('qhost -xml', 513 log_output=True, 514 source_profile=True)) 515 qstatxml = '\n'.join(master.ssh.execute(qstat_cmd, 516 log_output=True, 517 source_profile=True)) 518 qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=True, 519 ignore_exit_status=True, 520 source_profile=True)) 521 except Exception, e: 522 log.error("Error occured getting SGE stats via ssh. " 523 "Cluster terminated?") 524 log.error(e) 525 return -1 526 log.debug("sizes: qhost: %d, qstat: %d, qacct: %d" % 527 (len(qhostxml), len(qstatxml), len(qacct))) 528 self.stat.parse_qhost(qhostxml) 529 self.stat.parse_qstat(qstatxml) 530 self.stat.parse_qacct(qacct, now)
531
532 - def run(self, cluster):
533 """ 534 This function will loop indefinitely, using SGELoadBalancer.get_stats() 535 to get the clusters status. It looks at the job queue and tries to 536 decide whether to add or remove a node. It should later look at job 537 durations (currently doesn't) 538 """ 539 self._cluster = cluster 540 if self.max_nodes is None: 541 self.max_nodes = cluster.cluster_size 542 use_default_stats_file = self.dump_stats and not self.stats_file 543 use_default_plots_dir = self.plot_stats and not self.plot_output_dir 544 if use_default_stats_file or use_default_plots_dir: 545 self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True) 546 if not self.stats_file: 547 self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag 548 if not self.plot_output_dir: 549 self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag 550 if not cluster.is_cluster_up(): 551 raise exception.ClusterNotRunning(cluster.cluster_tag) 552 if self.dump_stats: 553 if os.path.isdir(self.stats_file): 554 raise exception.BaseException("stats file destination '%s' is" 555 " a directory" % self.stats_file) 556 sfdir = os.path.dirname(os.path.abspath(self.stats_file)) 557 self._validate_dir(sfdir, msg_prefix="stats file destination") 558 if self.plot_stats: 559 if os.path.isfile(self.plot_output_dir): 560 raise exception.BaseException("plot output destination '%s' " 561 "is a file" % 562 self.plot_output_dir) 563 self._validate_dir(self.plot_output_dir, 564 msg_prefix="plot output destination") 565 raw = dict(__raw__=True) 566 log.info("Starting load balancer...\n") 567 log.info("Maximum cluster size: %d" % self.max_nodes, 568 extra=raw) 569 log.info("Minimum cluster size: %d" % self.min_nodes, 570 extra=raw) 571 log.info("Cluster growth rate: %d nodes/iteration\n" % 572 self.add_nodes_per_iteration, extra=raw) 573 if self.dump_stats: 574 log.info("Writing stats to file: %s" % self.stats_file) 575 if self.plot_stats: 576 log.info("Plotting stats to directory: %s" % self.plot_output_dir) 577 while(self._keep_polling): 578 if not cluster.is_cluster_up(): 579 log.info("Waiting for all nodes to come up...") 580 time.sleep(self.polling_interval) 581 continue 582 if self.get_stats() == -1: 583 log.error("Failed to get stats. LoadBalancer is terminating") 584 return 585 log.info("Cluster size: %d" % len(self.stat.hosts), extra=raw) 586 log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()), 587 extra=raw) 588 oldest_queued_job_age = self.stat.oldest_queued_job_age() 589 if oldest_queued_job_age: 590 log.info("Oldest queued job: %s" % oldest_queued_job_age, 591 extra=raw) 592 log.info("Avg job duration: %d secs" % 593 self.stat.avg_job_duration(), extra=raw) 594 log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(), 595 extra=raw) 596 log.info("Last cluster modification time: %s" % 597 self.__last_cluster_mod_time.strftime("%Y-%m-%d %X"), 598 extra=dict(__raw__=True)) 599 #evaluate if nodes need to be added 600 self._eval_add_node() 601 #evaluate if nodes need to be removed 602 self._eval_remove_node() 603 if self.dump_stats or self.plot_stats: 604 self.stat.write_stats_to_csv(self.stats_file) 605 #call the visualizer 606 if self.plot_stats: 607 try: 608 self.visualizer.graph_all() 609 except IOError, e: 610 raise exception.BaseException(str(e)) 611 #sleep for the specified number of seconds 612 log.info("Sleeping...(looping again in %d secs)\n" % 613 self.polling_interval) 614 time.sleep(self.polling_interval)
615
616 - def has_cluster_stabilized(self):
617 now = datetime.datetime.utcnow() 618 elapsed = (now - self.__last_cluster_mod_time).seconds 619 is_stabilized = not (elapsed < self.stabilization_time) 620 if not is_stabilized: 621 log.info("Cluster was modified less than %d seconds ago" % 622 self.stabilization_time) 623 log.info("Waiting for cluster to stabilize...") 624 return is_stabilized
625
626 - def _eval_add_node(self):
627 """ 628 This function uses the metrics available to it to decide whether to 629 add a new node to the cluster or not. It isn't able to add a node yet. 630 TODO: See if the recent jobs have taken more than 5 minutes (how 631 long it takes to start an instance) 632 """ 633 need_to_add = 0 634 if len(self.stat.hosts) >= self.max_nodes: 635 log.info("Not adding nodes: already at or above maximum (%d)" % 636 self.max_nodes) 637 return 0 638 qlen = len(self.stat.get_queued_jobs()) 639 sph = self.stat.slots_per_host() 640 ts = self.stat.count_total_slots() 641 #calculate job duration 642 avg_duration = self.stat.avg_job_duration() 643 #calculate estimated time to completion 644 ettc = avg_duration * qlen / len(self.stat.hosts) 645 if qlen > ts: 646 if not self.has_cluster_stabilized(): 647 return 0 648 #there are more jobs queued than will be consumed with one 649 #cycle of job processing from all nodes 650 oldest_job_dt = self.stat.oldest_queued_job_age() 651 now = self.get_remote_time() 652 age_delta = now - oldest_job_dt 653 if age_delta.seconds > self.longest_allowed_queue_time: 654 log.info("A job has been waiting for %d sec, longer than " 655 "max %d" % (age_delta.seconds, 656 self.longest_allowed_queue_time)) 657 need_to_add = qlen / sph 658 if ettc < 600 and not self.stat.on_first_job(): 659 log.warn("There is a possibility that the job queue is" 660 " shorter than 10 minutes in duration") 661 #need_to_add = 0 662 max_add = self.max_nodes - len(self.stat.hosts) 663 need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add) 664 if need_to_add > 0: 665 log.info("*** ADDING %d NODES at %s" % 666 (need_to_add, str(datetime.datetime.utcnow()))) 667 try: 668 self._cluster.add_nodes(need_to_add) 669 except Exception: 670 log.error("Failed to add new host") 671 log.debug(traceback.format_exc()) 672 return -1 673 self.__last_cluster_mod_time = datetime.datetime.utcnow() 674 log.info("Done adding nodes at %s" % 675 str(datetime.datetime.utcnow())) 676 return need_to_add
677
678 - def _eval_remove_node(self):
679 """ 680 This function uses the sge stats to decide whether or not to 681 remove a node from the cluster. 682 """ 683 qlen = len(self.stat.get_queued_jobs()) 684 if qlen == 0: 685 if not self.has_cluster_stabilized(): 686 return 0 687 #if at 0, remove all nodes but master 688 if len(self.stat.hosts) > self.min_nodes: 689 log.info("Checking to remove a node...") 690 to_kill = self._find_node_for_removal() 691 if not to_kill: 692 log.info("No nodes can be killed at this time") 693 #kill the nodes returned 694 for n in to_kill: 695 if n.update() == "running": 696 log.info("***REMOVING NODE: %s (%s)" % (n.id, 697 n.dns_name)) 698 try: 699 self._cluster.remove_node(n) 700 except Exception: 701 log.error("Failed to remove node %s" % n.alias) 702 log.debug(traceback.format_exc()) 703 return -1 704 #successfully removed node 705 now = datetime.datetime.utcnow() 706 self.__last_cluster_mod_time = now 707 else: 708 log.error("Trying to kill dead node %s" % n.alias) 709 else: 710 log.info("Not removing nodes: already at or below minimum (%d)" 711 % self.min_nodes)
712
713 - def _find_node_for_removal(self):
714 """ 715 This function will find a suitable node to remove from the cluster. 716 The criteria for removal are: 717 1. The node must not be running any SGE job 718 2. The node must have been up for 50-60 minutes past its start time 719 3. The node must not be the master, or allow_master_kill=True 720 """ 721 nodes = self._cluster.running_nodes 722 to_rem = [] 723 for node in nodes: 724 if not self.allow_master_kill and node.is_master(): 725 log.debug("not removing master node") 726 continue 727 is_working = self.stat.is_node_working(node) 728 mins_up = self._minutes_uptime(node) % 60 729 if not is_working: 730 log.info("Idle Node %s (%s) has been up for %d minutes " 731 "past the hour" % (node.id, node.alias, mins_up)) 732 if self.polling_interval > 300: 733 self.kill_after = max(45, 734 60 - (2 * self.polling_interval / 60)) 735 if not is_working and mins_up >= self.kill_after: 736 to_rem.append(node) 737 return to_rem
738
739 - def _minutes_uptime(self, node):
740 """ 741 this function uses data available to boto to determine 742 how many total minutes this instance has been running. you can 743 mod (%) the return value with 60 to determine how many minutes 744 into a billable hour this node has been running. 745 """ 746 dt = utils.iso_to_datetime_tuple(node.launch_time) 747 now = self.get_remote_time() 748 timedelta = now - dt 749 return timedelta.seconds / 60
750