
Source Code for Package starcluster.balancers.sge

import os
import time
import datetime
import traceback
import xml.dom.minidom

from starcluster import utils
from starcluster import static
from starcluster import exception
from starcluster.balancers import LoadBalancer
from starcluster.logger import log


SGE_STATS_DIR = os.path.join(static.STARCLUSTER_CFG_DIR, 'sge')
DEFAULT_STATS_DIR = os.path.join(SGE_STATS_DIR, '%s')
DEFAULT_STATS_FILE = os.path.join(DEFAULT_STATS_DIR, 'sge-stats.csv')


class SGEStats(object):
    """
    SunGridEngine stats parser
    """
    jobstat_cachesize = 200
    hosts = []
    jobs = []
    jobstats = jobstat_cachesize * [None]
    max_job_id = 0
    _default_fields = ["JB_job_number", "state", "JB_submission_time",
                       "queue_name", "slots", "tasks"]

    @property
    def first_job_id(self):
        if not self.jobs:
            return
        return int(self.jobs[0]['JB_job_number'])

    @property
    def last_job_id(self):
        if not self.jobs:
            return
        return int(self.jobs[-1]['JB_job_number'])

    def parse_qhost(self, string):
        """
        Parses 'qhost -xml' output into a list of per-host dicts.
        Takes a string so that output piped in from
        ssh.execute('qhost -xml') can be used directly.
        """
        self.hosts = []  # clear the old hosts
        doc = xml.dom.minidom.parseString(string)
        for h in doc.getElementsByTagName("host"):
            name = h.getAttribute("name")
            hash = {"name": name}
            for stat in h.getElementsByTagName("hostvalue"):
                for hvalue in stat.childNodes:
                    attr = stat.attributes['name'].value
                    val = ""
                    if hvalue.nodeType == xml.dom.minidom.Node.TEXT_NODE:
                        val = hvalue.data
                    hash[attr] = val
            if hash['name'] != u'global':
                self.hosts.append(hash)
        return self.hosts

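    # Shape of a parsed host entry (hedged; attribute values are
    # illustrative, not taken from a real cluster). Every <hostvalue>
    # becomes a string-valued key, e.g.:
    #
    #   {'name': 'node001', 'num_proc': '2', 'load_avg': '0.01', ...}
    #
    # The pseudo-host named 'global' that qhost always reports is skipped.
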
    def parse_qstat(self, string, fields=None):
        """
        This method parses 'qstat -xml' output and makes a neat array
        """
        if fields is None:
            fields = self._default_fields
        self.jobs = []  # clear the old jobs
        doc = xml.dom.minidom.parseString(string)
        for job in doc.getElementsByTagName("job_list"):
            jstate = job.getAttribute("state")
            hash = {"job_state": jstate}
            for tag in fields:
                es = job.getElementsByTagName(tag)
                for node in es:
                    for node2 in node.childNodes:
                        if node2.nodeType == xml.dom.minidom.Node.TEXT_NODE:
                            hash[tag] = node2.data
            # grab the submit time on all jobs, the last job's val stays
            if 'tasks' in hash and hash['tasks'].find('-') > 0:
                self.job_multiply(hash)
            else:
                self.jobs.append(hash)
        return self.jobs

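    # Shape of a parsed job entry (hedged; values are illustrative). Each
    # requested field becomes a string-valued key alongside 'job_state':
    #
    #   {'job_state': 'running', 'JB_job_number': '42', 'state': 'r',
    #    'JB_submission_time': '2010-07-13T16:24:03',
    #    'queue_name': 'all.q@node001', 'slots': '1'}
    #
    # Pending jobs carry job_state 'pending' and usually no 'queue_name'.
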
    def job_multiply(self, hash):
        """
        This function deals with SGE jobs that have a task range. For
        example, 'qsub -t 1-20:1' makes 20 jobs. self.jobs needs to
        represent that it is 20 jobs instead of just 1.
        """
        sz_range = hash['tasks']
        dashpos = sz_range.find('-')
        colpos = sz_range.find(':')
        start = int(sz_range[0:dashpos])
        fin = int(sz_range[dashpos + 1:colpos])
        gran = int(sz_range[colpos + 1:len(sz_range)])
        log.debug("start = %d, fin = %d, granularity = %d, sz_range = %s" %
                  (start, fin, gran, sz_range))
        # the task range is inclusive, so add one to the quotient
        num_jobs = (fin - start) / gran + 1
        log.debug("This job expands to %d tasks" % num_jobs)
        self.jobs.extend([hash] * num_jobs)

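    # Worked example (hedged): a 'tasks' value of '1-20:2' parses to
    # start=1, fin=20, gran=2, and under Python 2 integer division
    # (20 - 1) / 2 + 1 == 10, matching the tasks SGE creates for
    # 'qsub -t 1-20:2' (ids 1, 3, 5, ..., 19).
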
    def qacct_to_datetime_tuple(self, qacct):
        """
        Takes the SGE qacct formatted time and makes a datetime tuple
        format is:
        Tue Jul 13 16:24:03 2010
        """
        return datetime.datetime.strptime(qacct, "%a %b %d %H:%M:%S %Y")

    def parse_qacct(self, string, dtnow):
        """
        This method parses 'qacct -j' output and makes a neat array and
        calculates some statistics.
        Takes the string to parse, and a datetime object of the remote
        host's current time.
        """
        job_id = None
        qd = None
        start = None
        end = None
        counter = 0
        lines = string.split('\n')
        for l in lines:
            l = l.strip()
            if l.find('jobnumber') != -1:
                job_id = int(l[13:len(l)])
            if l.find('qsub_time') != -1:
                qd = self.qacct_to_datetime_tuple(l[13:len(l)])
            if l.find('start_time') != -1:
                if l.find('-/-') > 0:
                    start = dtnow
                else:
                    start = self.qacct_to_datetime_tuple(l[13:len(l)])
            if l.find('end_time') != -1:
                if l.find('-/-') > 0:
                    end = dtnow
                else:
                    end = self.qacct_to_datetime_tuple(l[13:len(l)])
            if l.find('==========') != -1:
                if qd is not None:
                    self.max_job_id = job_id
                    hash = {'queued': qd, 'start': start, 'end': end}
                    self.jobstats[job_id % self.jobstat_cachesize] = hash
                qd = None
                start = None
                end = None
                counter = counter + 1
        log.debug("added %d new jobs" % counter)
        log.debug("There are %d items in the jobstats cache" %
                  len(self.jobstats))
        return self.jobstats

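    # A 'qacct -j' record, for reference, looks roughly like this (hedged;
    # abridged to the fields scanned above):
    #
    #   ==============================================================
    #   jobnumber    42
    #   qsub_time    Tue Jul 13 16:24:03 2010
    #   start_time   Tue Jul 13 16:25:01 2010
    #   end_time     Tue Jul 13 16:30:17 2010
    #
    # Jobs that have not started or finished report '-/-' for those
    # fields, which is why dtnow is substituted above.
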
    def is_jobstats_empty(self):
        """
        Returns True if too few jobs have been seen to fill the jobstats
        cache (the highest job id seen is less than 30% of the cache
        size), False if there are enough entries in it.
        """
        return self.max_job_id < (self.jobstat_cachesize * 0.3)

    def get_running_jobs(self):
        """
        returns an array of the running jobs, values stored in dictionary
        """
        running = []
        for j in self.jobs:
            if j['job_state'] == u'running':
                running.append(j)
        return running

    def get_queued_jobs(self):
        """
        returns an array of the queued jobs, values stored in dictionary
        """
        queued = []
        for j in self.jobs:
            if j['job_state'] == u'pending':
                queued.append(j)
        return queued

    def count_hosts(self):
        """
        returns a count of the hosts in the cluster
        """
        # TODO: throw an exception if hosts not initialized
        return len(self.hosts)

    def count_total_slots(self):
        """
        returns a count of total slots available in this cluster
        """
        slots = 0
        for h in self.hosts:
            if h['num_proc'] == '-':
                h['num_proc'] = 0
            slots = slots + int(h['num_proc'])
        return slots

    def slots_per_host(self):
        """
        Returns the number of slots per host. If the cluster is
        inconsistent for some reason (e.g. m1.large and m1.small nodes in
        the same cluster), this returns -1.
        """
        total = self.count_total_slots()
        if self.hosts[0][u'num_proc'] == '-':
            self.hosts[0][u'num_proc'] = 0
        single = int(self.hosts[0][u'num_proc'])
        if total != (single * len(self.hosts)):
            log.error("ERROR: Number of slots not consistent across cluster")
            return -1
        return single

    def oldest_queued_job_age(self):
        """
        Returns the submission time (a datetime) of the oldest job in the
        queue
        """
        for j in self.jobs:
            if 'JB_submission_time' in j:
                st = j['JB_submission_time']
                dt = utils.iso_to_datetime_tuple(st)
                return dt
        # TODO: throw a "no queued jobs" exception

    def is_node_working(self, node):
        """
        This function returns True if the node is currently working on a
        task, or False if the node is currently idle.
        """
        nodename = node.alias
        for j in self.jobs:
            if 'queue_name' in j:
                qn = j['queue_name']
                if qn.find(nodename) > 0:
                    log.debug("Node %s is working" % node.id)
                    return True
        log.debug("Node %s is IDLE" % node.id)
        return False

    def num_slots_for_job(self, job_id):
        """
        returns the number of slots requested for the given job id
        returns -1 if job_id is invalid
        """
        ujid = unicode(job_id)
        for j in self.jobs:
            if j['JB_job_number'] == ujid:
                return int(j['slots'])
        return -1

    def avg_job_duration(self):
        count = 0
        total_seconds = 0
        for job in self.jobstats:
            if job is not None:
                delta = job['end'] - job['start']
                total_seconds = total_seconds + delta.seconds
                count = count + 1
        if count == 0:
            return 0
        else:
            return total_seconds / count

    def avg_wait_time(self):
        count = 0
        total_seconds = 0
        for job in self.jobstats:
            if job is not None:
                delta = job['start'] - job['queued']
                total_seconds = total_seconds + delta.seconds
                count = count + 1
        if count == 0:
            return 0
        else:
            return total_seconds / count

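    # Note (hedged observation, behavior unchanged): both averages above
    # use timedelta.seconds, which only covers the time-of-day component
    # (0-86399); a duration or wait spanning whole days is undercounted.
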
    def on_first_job(self):
        """
        Returns True if the cluster is processing the first job,
        False otherwise
        """
        if len(self.jobs) > 0 and self.jobs[0]['JB_job_number'] == u'1':
            log.info("ON THE FIRST JOB")
            return True
        return False

    def get_loads(self):
        """
        returns an array containing the loads on each host in cluster
        """
        loads = []
        for h in self.hosts:
            if h['load_avg'] == '-':
                h['load_avg'] = 0
            loads.append(h['load_avg'])
        return loads

    def _add(self, x, y):
        return float(x) + float(y)

    def get_all_stats(self):
        now = datetime.datetime.utcnow()
        bits = []
        # first field is the time
        bits.append(now)
        # second field is the number of hosts
        bits.append(self.count_hosts())
        # third field is # of running jobs
        bits.append(len(self.get_running_jobs()))
        # fourth field is # of queued jobs
        bits.append(len(self.get_queued_jobs()))
        # fifth field is total # slots
        bits.append(self.count_total_slots())
        # sixth field is average job duration
        bits.append(self.avg_job_duration())
        # seventh field is average job wait time
        bits.append(self.avg_wait_time())
        # last field is the average load across hosts
        arr = self.get_loads()
        load_sum = float(reduce(self._add, arr))
        avg_load = load_sum / len(arr)
        bits.append(avg_load)
        return bits

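    # Resulting CSV column order, as written by write_stats_to_csv below:
    #
    #   timestamp, host count, running jobs, queued jobs, total slots,
    #   avg job duration (s), avg wait time (s), avg load
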
    def write_stats_to_csv(self, filename):
        """
        Write important SGE stats to CSV file
        Appends one line to the CSV
        """
        bits = self.get_all_stats()
        try:
            f = open(filename, 'a')
            flat = ','.join(str(n) for n in bits) + '\n'
            f.write(flat)
            f.close()
        except IOError, e:
            raise exception.BaseException(str(e))


class SGELoadBalancer(LoadBalancer):
    """
    This class is able to query each SGE host and return load & queue
    statistics.

    *** All times are in SECONDS unless otherwise specified ***

    The polling interval in seconds. Recommended: 60-300. Anything more
    frequent is very wasteful; the polling loop with the visualizer takes
    about 15 seconds.
    polling_interval = 60

    VERY IMPORTANT: Set this to the max nodes you're willing to have in
    your cluster. Try setting this to the default cluster size you'd
    ordinarily use.
    max_nodes = 5

    IMPORTANT: Set this to the longest time a job can wait before another
    host is added to the cluster to help. Recommended: 300-900 seconds
    (5-15 mins). Do not use a value less than 300 seconds because that is
    roughly how long an instance takes to start up.
    longest_allowed_queue_time = 900

    Keep this at 1 - your master, for now.
    min_nodes = 1

    This would allow the master to be killed when the queue empties.
    UNTESTED.
    allow_master_kill = False

    How many nodes to add per iteration. Setting it > 1 opens up the
    possibility of spending too much $$.
    add_nodes_per_iteration = 1

    Kill an instance after it has been up for X minutes. Do not kill
    earlier, since you've already paid for that hour. (in mins)
    kill_after = 45

    After adding a node, how long to wait for the instance to start new
    jobs.
    stabilization_time = 180

    Visualizer off by default. Start it with "starcluster loadbalance -p
    tag".
    plot_stats = False

    How many hours qacct should look back to gather past job data. Lower
    values minimize data transfer.
    lookback_window = 3
    """

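    # Hedged usage sketch (not part of the original module): the balancer
    # is normally driven by the "starcluster loadbalance <cluster_tag>"
    # command, which roughly amounts to:
    #
    #   lb = SGELoadBalancer(interval=60, max_nodes=10, wait_time=900)
    #   lb.run(cluster)   # 'cluster' is a running starcluster Cluster
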
    def __init__(self, interval=60, max_nodes=None, wait_time=900,
                 add_pi=1, kill_after=45, stab=180, lookback_win=3,
                 min_nodes=1, allow_master_kill=False, plot_stats=False,
                 plot_output_dir=None, dump_stats=False, stats_file=None):
        self._cluster = None
        self._keep_polling = True
        self._visualizer = None
        self.__last_cluster_mod_time = datetime.datetime.utcnow()
        self.stat = None
        self.polling_interval = interval
        self.max_nodes = max_nodes
        self.longest_allowed_queue_time = wait_time
        self.add_nodes_per_iteration = add_pi
        self.kill_after = kill_after
        self.stabilization_time = stab
        self.lookback_window = lookback_win
        self.min_nodes = min_nodes
        self.allow_master_kill = allow_master_kill
        if self.longest_allowed_queue_time < 300:
            log.warn("The recommended wait_time should be >= 300 seconds "
                     "(it takes ~5 min to launch a new EC2 node)")
        self.dump_stats = dump_stats
        self.stats_file = stats_file
        self.plot_stats = plot_stats
        self.plot_output_dir = plot_output_dir
        if plot_stats:
            assert self.visualizer is not None

    @property
    def visualizer(self):
        if not self._visualizer:
            try:
                from starcluster.balancers.sge import visualizer
            except ImportError, e:
                log.error("Error importing visualizer:")
                log.error(str(e))
                log.error("check that matplotlib and numpy are installed and:")
                log.error("   $ python -c 'import matplotlib'")
                log.error("   $ python -c 'import numpy'")
                log.error("completes without error")
                raise exception.BaseException(
                    "Failed to load stats visualizer")
            self._visualizer = visualizer.SGEVisualizer(self.stats_file,
                                                        self.plot_output_dir)
        else:
            self._visualizer.stats_file = self.stats_file
            self._visualizer.pngpath = self.plot_output_dir
        return self._visualizer

    def _validate_dir(self, dirname, msg_prefix=""):
        if not os.path.isdir(dirname):
            msg = "'%s' is not a directory"
            if not os.path.exists(dirname):
                msg = "'%s' does not exist"
            if msg_prefix:
                msg = ' '.join([msg_prefix, msg])
            msg = msg % dirname
            raise exception.BaseException(msg)

    def _mkdir(self, directory, makedirs=False):
        if not os.path.isdir(directory):
            if os.path.isfile(directory):
                raise exception.BaseException(
                    "'%s' is a file not a directory" % directory)
            try:
                if makedirs:
                    os.makedirs(directory)
                    log.info("Created directories %s" % directory)
                else:
                    os.mkdir(directory)
                    log.info("Created single directory %s" % directory)
            except IOError, e:
                raise exception.BaseException(str(e))

    def get_remote_time(self):
        """
        Remotely executes 'date' on the master node and returns a datetime
        object with the master's time, rather than using the local
        machine's clock, which may be inaccurate.
        """
        cl = self._cluster
        date_str = '\n'.join(cl.master_node.ssh.execute('date'))
        return datetime.datetime.strptime(date_str,
                                          "%a %b %d %H:%M:%S UTC %Y")

    def get_qatime(self, now):
        """
        this function takes the lookback window and creates a string
        representation of the past few hours, to feed to qacct to
        limit the data set qacct returns.
        """
        if self.stat.is_jobstats_empty():
            log.info("Loading full job history")
            temp_lookback_window = self.lookback_window * 60 * 60
        else:
            temp_lookback_window = self.polling_interval
        log.debug("getting past %d seconds worth of job history" %
                  temp_lookback_window)
        now = now - datetime.timedelta(seconds=temp_lookback_window + 1)
        return now.strftime("%Y%m%d%H%M")
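
    # The string handed to 'qacct -j -b' is a YYYYMMDDhhmm timestamp
    # (hedged example): with job history already loaded and a 60-second
    # polling interval, a remote time of Tue Jul 13 16:24:03 2010 yields
    # "201007131623" (one extra second of slack is subtracted above and
    # strftime drops the seconds).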

    #@print_timing
    def get_stats(self):
        """
        This function ssh's to the SGE master, collects load & queue
        stats, and feeds them to SGEStats, which parses the XML. The
        parsed data is stored in self.stat: a list of hosts (each host is
        a dict of its host information) and a list of jobs (each job is a
        dict of statistics such as name, priority, etc.). Returns -1 if
        the stats could not be retrieved.
        """
        log.debug("starting get_stats")
        master = self._cluster.master_node
        self.stat = SGEStats()

        qhostxml = ""
        qstatxml = ""
        qacct = ""
        try:
            now = self.get_remote_time()
            qatime = self.get_qatime(now)
            qacct_cmd = 'qacct -j -b ' + qatime
            qstat_cmd = 'qstat -q all.q -u \"*\" -xml'
            qhostxml = '\n'.join(master.ssh.execute('qhost -xml',
                                                    log_output=True,
                                                    source_profile=True))
            qstatxml = '\n'.join(master.ssh.execute(qstat_cmd,
                                                    log_output=True,
                                                    source_profile=True))
            qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=True,
                                                 ignore_exit_status=True,
                                                 source_profile=True))
        except Exception, e:
            log.error("Error occurred getting SGE stats via ssh. "
                      "Cluster terminated?")
            log.error(e)
            return -1
        log.debug("sizes: qhost: %d, qstat: %d, qacct: %d" %
                  (len(qhostxml), len(qstatxml), len(qacct)))
        self.stat.parse_qhost(qhostxml)
        self.stat.parse_qstat(qstatxml)
        self.stat.parse_qacct(qacct, now)

    def run(self, cluster):
        """
        This function will loop indefinitely, using
        SGELoadBalancer.get_stats() to get the cluster's status. It looks
        at the job queue and tries to decide whether to add or remove a
        node. It should later look at job durations (currently doesn't).
        """
        self._cluster = cluster
        if self.max_nodes is None:
            self.max_nodes = cluster.cluster_size
        use_default_stats_file = self.dump_stats and not self.stats_file
        use_default_plots_dir = self.plot_stats and not self.plot_output_dir
        if use_default_stats_file or use_default_plots_dir:
            self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True)
        if not self.stats_file:
            self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
        if not self.plot_output_dir:
            self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
        if not cluster.is_cluster_up():
            raise exception.ClusterNotRunning(cluster.cluster_tag)
        if self.dump_stats:
            if os.path.isdir(self.stats_file):
                raise exception.BaseException("stats file destination '%s' is"
                                              " a directory" % self.stats_file)
            sfdir = os.path.dirname(os.path.abspath(self.stats_file))
            self._validate_dir(sfdir, msg_prefix="stats file destination")
        if self.plot_stats:
            if os.path.isfile(self.plot_output_dir):
                raise exception.BaseException("plot output destination '%s' "
                                              "is a file" %
                                              self.plot_output_dir)
            self._validate_dir(self.plot_output_dir,
                               msg_prefix="plot output destination")
        raw = dict(__raw__=True)
        log.info("Starting load balancer...\n")
        log.info("Maximum cluster size: %d" % self.max_nodes,
                 extra=raw)
        log.info("Minimum cluster size: %d" % self.min_nodes,
                 extra=raw)
        log.info("Cluster growth rate: %d nodes/iteration\n" %
                 self.add_nodes_per_iteration, extra=raw)
        if self.dump_stats:
            log.info("Writing stats to file: %s" % self.stats_file)
        if self.plot_stats:
            log.info("Plotting stats to directory: %s" % self.plot_output_dir)
        while self._keep_polling:
            if not cluster.is_cluster_up():
                log.info("Waiting for all nodes to come up...")
                time.sleep(self.polling_interval)
                continue
            if self.get_stats() == -1:
                log.error("Failed to get stats. LoadBalancer is terminating")
                return
            log.info("Cluster size: %d" % len(self.stat.hosts), extra=raw)
            log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                     extra=raw)
            oldest_queued_job_age = self.stat.oldest_queued_job_age()
            if oldest_queued_job_age:
                log.info("Oldest queued job: %s" % oldest_queued_job_age,
                         extra=raw)
            log.info("Avg job duration: %d secs" %
                     self.stat.avg_job_duration(), extra=raw)
            log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                     extra=raw)
            log.info("Last cluster modification time: %s" %
                     self.__last_cluster_mod_time.strftime("%Y-%m-%d %X"),
                     extra=dict(__raw__=True))
            # evaluate if nodes need to be added
            self._eval_add_node()
            # evaluate if nodes need to be removed
            self._eval_remove_node()
            if self.dump_stats or self.plot_stats:
                self.stat.write_stats_to_csv(self.stats_file)
            # call the visualizer
            if self.plot_stats:
                try:
                    self.visualizer.graph_all()
                except IOError, e:
                    raise exception.BaseException(str(e))
            # sleep for the specified number of seconds
            log.info("Sleeping...(looping again in %d secs)\n" %
                     self.polling_interval)
            time.sleep(self.polling_interval)

    def has_cluster_stabilized(self):
        now = datetime.datetime.utcnow()
        elapsed = (now - self.__last_cluster_mod_time).seconds
        is_stabilized = not (elapsed < self.stabilization_time)
        if not is_stabilized:
            log.info("Cluster was modified less than %d seconds ago" %
                     self.stabilization_time)
            log.info("Waiting for cluster to stabilize...")
        return is_stabilized

    def _eval_add_node(self):
        """
        This function uses the metrics available to it to decide whether
        to add new nodes to the cluster, and adds them if needed.
        TODO: See if the recent jobs have taken more than 5 minutes (how
        long it takes to start an instance)
        """
        need_to_add = 0
        if len(self.stat.hosts) >= self.max_nodes:
            log.info("Not adding nodes: already at or above maximum (%d)" %
                     self.max_nodes)
            return 0
        qlen = len(self.stat.get_queued_jobs())
        sph = self.stat.slots_per_host()
        ts = self.stat.count_total_slots()
        # calculate average job duration
        avg_duration = self.stat.avg_job_duration()
        # calculate estimated time to completion
        ettc = avg_duration * qlen / len(self.stat.hosts)
        if qlen > ts:
            if not self.has_cluster_stabilized():
                return 0
            # there are more jobs queued than will be consumed with one
            # cycle of job processing from all nodes
            oldest_job_dt = self.stat.oldest_queued_job_age()
            now = self.get_remote_time()
            age_delta = now - oldest_job_dt
            if age_delta.seconds > self.longest_allowed_queue_time:
                log.info("A job has been waiting for %d sec, longer than "
                         "max %d" % (age_delta.seconds,
                                     self.longest_allowed_queue_time))
                need_to_add = qlen / sph
                if ettc < 600 and not self.stat.on_first_job():
                    log.warn("There is a possibility that the job queue is"
                             " shorter than 10 minutes in duration")
                    #need_to_add = 0
        max_add = self.max_nodes - len(self.stat.hosts)
        need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
        if need_to_add > 0:
            log.info("*** ADDING %d NODES at %s" %
                     (need_to_add, str(datetime.datetime.utcnow())))
            try:
                self._cluster.add_nodes(need_to_add)
            except Exception:
                log.error("Failed to add new host")
                log.debug(traceback.format_exc())
                return -1
            self.__last_cluster_mod_time = datetime.datetime.utcnow()
            log.info("Done adding nodes at %s" %
                     str(datetime.datetime.utcnow()))
        return need_to_add

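    # Worked example of the sizing above (hedged; numbers are made up):
    # with 3 hosts of 2 slots each (ts=6, sph=2), 10 queued jobs and an
    # oldest job waiting longer than longest_allowed_queue_time,
    # need_to_add = 10 / 2 = 5, which is then capped by both
    # add_nodes_per_iteration (default 1) and the remaining headroom
    # max_nodes - current hosts.
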
    def _eval_remove_node(self):
        """
        This function uses the sge stats to decide whether or not to
        remove a node from the cluster.
        """
        qlen = len(self.stat.get_queued_jobs())
        if qlen == 0:
            if not self.has_cluster_stabilized():
                return 0
            # if at 0, remove all nodes but master
            if len(self.stat.hosts) > self.min_nodes:
                log.info("Checking to remove a node...")
                to_kill = self._find_node_for_removal()
                if not to_kill:
                    log.info("No nodes can be killed at this time")
                # kill the nodes returned
                for n in to_kill:
                    if n.update() == "running":
                        log.info("***REMOVING NODE: %s (%s)" % (n.id,
                                                                n.dns_name))
                        try:
                            self._cluster.remove_node(n)
                        except Exception:
                            log.error("Failed to remove node %s" % n.alias)
                            log.debug(traceback.format_exc())
                            return -1
                        # successfully removed node
                        now = datetime.datetime.utcnow()
                        self.__last_cluster_mod_time = now
                    else:
                        log.error("Trying to kill dead node %s" % n.alias)
            else:
                log.info("Not removing nodes: already at or below minimum (%d)"
                         % self.min_nodes)

    def _find_node_for_removal(self):
        """
        This function will find a suitable node to remove from the
        cluster. The criteria for removal are:
        1. The node must not be running any SGE job
        2. The node must be at least kill_after minutes (default 45) into
           its current billable hour
        3. The node must not be the master, unless allow_master_kill is
           True
        """
        nodes = self._cluster.running_nodes
        to_rem = []
        for node in nodes:
            if not self.allow_master_kill and node.is_master():
                log.debug("not removing master node")
                continue
            is_working = self.stat.is_node_working(node)
            mins_up = self._minutes_uptime(node) % 60
            if not is_working:
                log.info("Idle Node %s (%s) has been up for %d minutes "
                         "past the hour" % (node.id, node.alias, mins_up))
            if self.polling_interval > 300:
                self.kill_after = max(45,
                                      60 - (2 * self.polling_interval / 60))
            if not is_working and mins_up >= self.kill_after:
                to_rem.append(node)
        return to_rem

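    # Billing-hour arithmetic (hedged illustration): a node launched 95
    # minutes ago has mins_up = 95 % 60 = 35, so it is kept; at 110
    # minutes of uptime, mins_up = 50 >= kill_after (45) and the idle
    # node becomes a removal candidate just before its next paid hour
    # begins.
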
    def _minutes_uptime(self, node):
        """
        This function uses data available to boto to determine how many
        total minutes this instance has been running. You can mod (%) the
        return value with 60 to determine how many minutes into a billable
        hour this node has been running.
        """
        dt = utils.iso_to_datetime_tuple(node.launch_time)
        now = self.get_remote_time()
        timedelta = now - dt
        return timedelta.seconds / 60
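

if __name__ == '__main__':
    # Hedged, self-contained sketch (not part of the original module): it
    # exercises the SGEStats qhost parser with a hand-written XML fragment
    # so the expected data shapes are easy to see. The host name and
    # attribute values below are made up for illustration only.
    sample_qhost = """<?xml version='1.0'?>
<qhost>
  <host name='global'>
    <hostvalue name='num_proc'>-</hostvalue>
  </host>
  <host name='node001'>
    <hostvalue name='num_proc'>2</hostvalue>
    <hostvalue name='load_avg'>0.01</hostvalue>
  </host>
</qhost>"""
    stats = SGEStats()
    stats.parse_qhost(sample_qhost)
    print stats.hosts                # [{'name': u'node001', ...}]
    print stats.count_hosts()        # 1 (the 'global' pseudo-host is skipped)
    print stats.count_total_slots()  # 2
    print stats.get_loads()          # [u'0.01']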