Source Code for Module starcluster.cluster

   1  #!/usr/bin/env python 
   2  import os 
   3  import re 
   4  import time 
   5  import zlib 
   6  import string 
   7  import pprint 
   8  import base64 
   9  import cPickle 
  10  import traceback 
  11   
  12  from starcluster import utils 
  13  from starcluster import static 
  14  from starcluster import spinner 
  15  from starcluster import iptools 
  16  from starcluster import managers 
  17  from starcluster import exception 
  18  from starcluster import progressbar 
  19  from starcluster import clustersetup 
  20  from starcluster.node import Node 
  21  from starcluster.utils import print_timing 
  22  from starcluster.templates import user_msgs 
  23  from starcluster.logger import log 
24 25 26 -class ClusterManager(managers.Manager):
27 """ 28 Manager class for Cluster objects 29 """
30 - def __repr__(self):
31 return "<ClusterManager: %s>" % self.ec2.region.name
32
33 - def get_cluster(self, cluster_name, group=None, load_receipt=True, 34 load_plugins=True):
35 """ 36 Returns a Cluster object representing an active cluster 37 """ 38 try: 39 clname = self._get_cluster_name(cluster_name) 40 cltag = self.get_tag_from_sg(clname) 41 if not group: 42 group = self.ec2.get_security_group(clname) 43 cl = Cluster(ec2_conn=self.ec2, cluster_tag=cltag, 44 cluster_group=group) 45 if load_receipt: 46 cl.load_receipt(load_plugins=load_plugins) 47 try: 48 key_location = self.cfg.get_key(cl.keyname).get('key_location') 49 cl.key_location = key_location 50 except (exception.KeyNotFound, Exception): 51 pass 52 return cl 53 except exception.SecurityGroupDoesNotExist: 54 raise exception.ClusterDoesNotExist(cluster_name)
55
56 - def get_clusters(self, load_receipt=True, load_plugins=True):
57 """ 58 Returns a list of all active clusters 59 """ 60 cluster_groups = self.get_cluster_security_groups() 61 clusters = [self.get_cluster(g.name, group=g, 62 load_receipt=load_receipt, 63 load_plugins=load_plugins) 64 for g in cluster_groups] 65 return clusters
66
67 - def get_default_cluster_template(self):
68 """ 69 Returns name of the default cluster template defined in the config 70 """ 71 return self.cfg.get_default_cluster_template()
72
73 - def get_cluster_template(self, template_name, tag_name=None):
74 """ 75 Returns a new Cluster object using the settings from the cluster 76 template template_name 77 78 If tag_name is passed, the Cluster object's cluster_tag setting will 79 be set to tag_name 80 """ 81 cl = self.cfg.get_cluster_template(template_name, tag_name=tag_name, 82 ec2_conn=self.ec2) 83 return cl
84
85 - def get_cluster_or_none(self, cluster_name):
86 """ 87 Same as get_cluster but returns None instead of throwing an exception 88 if the cluster does not exist 89 """ 90 try: 91 return self.get_cluster(cluster_name) 92 except exception.ClusterDoesNotExist: 93 pass
94
95 - def cluster_exists(self, tag_name):
96 """ 97 Returns True if cluster exists 98 """ 99 return self.get_cluster_or_none(tag_name) is not None
100
101 - def ssh_to_master(self, cluster_name, user='root'):
102 """ 103 ssh to master node of cluster_name 104 105 user keyword specifies an alternate user to login as 106 """ 107 cluster = self.get_cluster(cluster_name) 108 cluster.ssh_to_master(user=user)
109
110 - def ssh_to_cluster_node(self, cluster_name, node_id, user='root'):
111 """ 112 ssh to a node in cluster_name that has either an id, 113 dns name, or alias matching node_id 114 115 user keyword specifies an alternate user to login as 116 """ 117 cluster = self.get_cluster(cluster_name) 118 cluster.ssh_to_node(node_id, user=user)
119
120 - def _get_cluster_name(self, cluster_name):
121 """ 122 Returns human readable cluster name/tag prefixed with '@sc-' 123 """ 124 if not cluster_name.startswith(static.SECURITY_GROUP_PREFIX): 125 cluster_name = static.SECURITY_GROUP_TEMPLATE % cluster_name 126 return cluster_name
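# Illustrative sketch (not part of the original source): the prefixing above,
# assuming the '@sc-' template shown in get_tag_from_sg() below.
#
#     _get_cluster_name('mycluster')      ->  '@sc-mycluster'
#     _get_cluster_name('@sc-mycluster')  ->  '@sc-mycluster'  (already prefixed)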
127
128 - def add_node(self, cluster_name, alias=None, no_create=False):
129 cl = self.get_cluster(cluster_name) 130 cl.add_node(alias, no_create=no_create)
131
132 - def add_nodes(self, cluster_name, num_nodes, aliases=None, 133 no_create=False):
134 """ 135 Add one or more nodes to cluster 136 """ 137 cl = self.get_cluster(cluster_name) 138 cl.add_nodes(num_nodes, aliases=aliases, no_create=no_create)
139
140 - def remove_node(self, cluster_name, alias, terminate=True):
141 """ 142 Remove a single node from a cluster 143 """ 144 cl = self.get_cluster(cluster_name) 145 n = cl.get_node_by_alias(alias) 146 if not n: 147 raise exception.InstanceDoesNotExist(alias, label='node') 148 cl.remove_node(n, terminate=terminate)
149
150 - def restart_cluster(self, cluster_name):
151 """ 152 Reboots and reconfigures cluster_name 153 """ 154 cl = self.get_cluster(cluster_name) 155 cl.restart_cluster()
156
157 - def stop_cluster(self, cluster_name, terminate_unstoppable=False):
158 """ 159 Stop an EBS-backed cluster 160 """ 161 cl = self.get_cluster(cluster_name) 162 cl.stop_cluster(terminate_unstoppable)
163
164 - def terminate_cluster(self, cluster_name):
165 """ 166 Terminates cluster_name 167 """ 168 cl = self.get_cluster(cluster_name) 169 cl.terminate_cluster()
170
171 - def get_cluster_security_group(self, group_name):
172 """ 173 Return the security group for group_name, prefixed with '@sc-' if needed 174 """ 175 gname = self._get_cluster_name(group_name) 176 return self.ec2.get_security_group(gname)
177
178 - def get_cluster_security_groups(self):
179 """ 180 Return all security groups on EC2 that start with '@sc-' 181 """ 182 glob = static.SECURITY_GROUP_TEMPLATE % '*' 183 sgs = self.ec2.get_security_groups(filters={'group-name': glob}) 184 return sgs
185
186 - def get_tag_from_sg(self, sg):
187 """ 188 Returns the cluster tag name from a security group name that starts 189 with static.SECURITY_GROUP_PREFIX 190 191 Example: 192 sg = '@sc-mycluster' 193 print get_tag_from_sg(sg) 194 mycluster 195 """ 196 regex = re.compile(static.SECURITY_GROUP_PREFIX + '-(.*)') 197 match = regex.match(sg) 198 if match: 199 return match.groups()[0]
200
201 - def list_clusters(self, cluster_groups=None, show_ssh_status=False):
202 """ 203 Prints a summary for each active cluster on EC2 204 """ 205 if not cluster_groups: 206 cluster_groups = self.get_cluster_security_groups() 207 if not cluster_groups: 208 log.info("No clusters found...") 209 else: 210 try: 211 cluster_groups = [self.get_cluster_security_group(g) for g \ 212 in cluster_groups] 213 except exception.SecurityGroupDoesNotExist: 214 raise exception.ClusterDoesNotExist(g) 215 for scg in cluster_groups: 216 tag = self.get_tag_from_sg(scg.name) 217 try: 218 cl = self.get_cluster(tag, group=scg, load_plugins=False) 219 except exception.IncompatibleCluster, e: 220 sep = '*' * 60 221 log.error('\n'.join([sep, e.msg, sep]), 222 extra=dict(__textwrap__=True)) 223 continue 224 header = '%s (security group: %s)' % (tag, scg.name) 225 print '-' * len(header) 226 print header 227 print '-' * len(header) 228 nodes = cl.nodes 229 try: 230 n = nodes[0] 231 except IndexError: 232 n = None 233 state = getattr(n, 'state', None) 234 ltime = 'N/A' 235 uptime = 'N/A' 236 if state in ['pending', 'running']: 237 ltime = getattr(n, 'local_launch_time', 'N/A') 238 uptime = getattr(n, 'uptime', 'N/A') 239 print 'Launch time: %s' % ltime 240 print 'Uptime: %s' % uptime 241 print 'Zone: %s' % getattr(n, 'placement', 'N/A') 242 print 'Keypair: %s' % getattr(n, 'key_name', 'N/A') 243 ebs_nodes = [n for n in nodes if n.attached_vols] 244 if ebs_nodes: 245 print 'EBS volumes:' 246 for node in ebs_nodes: 247 devices = node.attached_vols 248 node_id = node.alias or node.id 249 for dev in devices: 250 d = devices.get(dev) 251 vol_id = d.volume_id 252 status = d.status 253 print ' %s on %s:%s (status: %s)' % \ 254 (vol_id, node_id, dev, status) 255 else: 256 print 'EBS volumes: N/A' 257 if nodes: 258 print 'Cluster nodes:' 259 for node in nodes: 260 nodeline = " %7s %s %s %s" % (node.alias, node.state, 261 node.id, node.dns_name) 262 if node.spot_id: 263 nodeline += ' (spot %s)' % node.spot_id 264 if show_ssh_status: 265 ssh_status = {True: 'Up', False: 'Down'} 266 nodeline += ' (SSH: %s)' % ssh_status[node.is_up()] 267 print nodeline 268 print 'Total nodes: %d' % len(nodes) 269 else: 270 print 'Cluster nodes: N/A' 271 print
272
273 - def run_plugin(self, plugin_name, cluster_tag):
274 """ 275 Run a plugin defined in the config. 276 277 plugin_name must match the plugin's section name in the config 278 cluster_tag specifies the cluster to run the plugin on 279 """ 280 cl = self.get_cluster(cluster_tag, load_plugins=False) 281 if not cl.is_cluster_up(): 282 raise exception.ClusterNotRunning(cluster_tag) 283 plugs = [self.cfg.get_plugin(plugin_name)] 284 name, plugin = cl.load_plugins(plugs)[0] 285 cl.run_plugin(plugin, name)
286
287 288 -class Cluster(object):
289 - def __init__(self, 290 ec2_conn=None, 291 spot_bid=None, 292 cluster_tag=None, 293 cluster_description=None, 294 cluster_size=None, 295 cluster_user=None, 296 cluster_shell=None, 297 master_image_id=None, 298 master_instance_type=None, 299 node_image_id=None, 300 node_instance_type=None, 301 node_instance_types=[], 302 availability_zone=None, 303 keyname=None, 304 key_location=None, 305 volumes=[], 306 plugins=[], 307 permissions=[], 308 refresh_interval=30, 309 disable_queue=False, 310 disable_threads=False, 311 cluster_group=None, 312 force_spot_master=False, 313 **kwargs):
314 315 now = time.strftime("%Y%m%d%H%M") 316 self.ec2 = ec2_conn 317 self.spot_bid = spot_bid 318 self.cluster_tag = cluster_tag 319 self.cluster_description = cluster_description 320 if self.cluster_tag is None: 321 self.cluster_tag = "cluster%s" % now 322 if cluster_description is None: 323 self.cluster_description = "Cluster created at %s" % now 324 self.cluster_size = cluster_size or 0 325 self.cluster_user = cluster_user 326 self.cluster_shell = cluster_shell 327 self.master_image_id = master_image_id 328 self.master_instance_type = master_instance_type 329 self.node_image_id = node_image_id 330 self.node_instance_type = node_instance_type 331 self.node_instance_types = node_instance_types 332 self.availability_zone = availability_zone 333 self.keyname = keyname 334 self.key_location = key_location 335 self.volumes = self.load_volumes(volumes) 336 self.plugins = self.load_plugins(plugins) 337 self.permissions = permissions 338 self.refresh_interval = refresh_interval 339 self.disable_queue = disable_queue 340 self.disable_threads = disable_threads 341 self.force_spot_master = force_spot_master 342 343 self.__instance_types = static.INSTANCE_TYPES 344 self.__cluster_settings = static.CLUSTER_SETTINGS 345 self.__available_shells = static.AVAILABLE_SHELLS 346 self.__protocols = static.PROTOCOLS 347 self._progress_bar = None 348 self._master_reservation = None 349 self._node_reservation = None 350 self._nodes = [] 351 self._master = None 352 self._zone = None 353 self._plugins = plugins 354 self._cluster_group = None 355 self._placement_group = None
356
357 - def __repr__(self):
358 return '<Cluster: %s (%s-node)>' % (self.cluster_tag, 359 self.cluster_size)
360 361 @property
362 - def zone(self):
363 """ 364 If volumes are specified, this method determines the common 365 availability zone between those volumes. If an availability zone 366 is explicitly specified in the config and does not match the common 367 availability zone of the volumes, an exception is raised. If the 368 volumes are not all in the same availability zone an exception is raised. 369 If no volumes are specified, returns the user-specified availability 370 zone if it exists. 371 """ 372 if not self._zone: 373 zone = None 374 if self.availability_zone: 375 zone = self.ec2.get_zone(self.availability_zone).name 376 common_zone = None 377 for volume in self.volumes: 378 volid = self.volumes.get(volume).get('volume_id') 379 vol = self.ec2.get_volume(volid) 380 if not common_zone: 381 common_zone = vol.zone 382 elif vol.zone != common_zone: 383 vols = [self.volumes.get(v).get('volume_id') 384 for v in self.volumes] 385 raise exception.VolumesZoneError(vols) 386 if common_zone and zone and zone != common_zone: 387 raise exception.InvalidZone(zone, common_zone) 388 if not zone and common_zone: 389 zone = common_zone 390 self._zone = zone 391 return self._zone
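# Illustrative sketch (not part of the original source): with two volumes that
# live in 'us-east-1a' and availability_zone = 'us-east-1b' in the config, the
# property above raises InvalidZone; if the volumes themselves span different
# zones it raises VolumesZoneError; with no zone configured it simply returns
# the volumes' common zone. The zone names here are examples only.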
392
393 - def load_volumes(self, vols):
394 """ 395 Iterate through vols and set device/partition settings automatically if 396 not specified. 397 398 This method assigns the first volume to /dev/sdz, second to /dev/sdy, 399 etc for all volumes that do not include a device/partition setting 400 """ 401 devices = ['/dev/sd%s' % s for s in string.lowercase] 402 devmap = {} 403 for volname in vols: 404 vol = vols.get(volname) 405 dev = vol.get('device') 406 if dev in devices: 407 #rm user-defined devices from the list of auto-assigned devices 408 devices.remove(dev) 409 volid = vol.get('volume_id') 410 if dev and not volid in devmap: 411 devmap[volid] = dev 412 volumes = {} 413 for volname in vols: 414 vol = vols.get(volname) 415 vol_id = vol.get('volume_id') 416 device = vol.get('device') 417 if not device: 418 if vol_id in devmap: 419 device = devmap.get(vol_id) 420 else: 421 device = devices.pop() 422 devmap[vol_id] = device 423 if not utils.is_valid_device(device): 424 raise exception.InvalidDevice(device) 425 v = volumes[volname] = utils.AttributeDict() 426 v.update(vol) 427 v['device'] = device 428 part = vol.get('partition') 429 if part: 430 partition = device + str(part) 431 if not utils.is_valid_partition(partition): 432 raise exception.InvalidPartition(part) 433 v['partition'] = partition 434 return volumes
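# Illustrative sketch (not part of the original source): what load_volumes()
# above does with a config-style vols dict. The volume ids, names and mount
# paths are hypothetical.
#
#     vols = {'data':    {'volume_id': 'vol-11111111', 'mount_path': '/data'},
#             'scratch': {'volume_id': 'vol-22222222', 'mount_path': '/scratch',
#                         'device': '/dev/sdd', 'partition': 1}}
#     loaded = cluster.load_volumes(vols)
#     loaded['scratch']['device']     ->  '/dev/sdd'   (user-defined, kept)
#     loaded['scratch']['partition']  ->  '/dev/sdd1'  (device + partition number)
#     loaded['data']['device']        ->  '/dev/sdz'   (auto-assigned, last letter first)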
435
436 - def load_plugins(self, plugins):
437 plugs = [] 438 for plugin in plugins: 439 setup_class = plugin.get('setup_class') 440 plugin_name = plugin.get('__name__').split()[-1] 441 mod_name = '.'.join(setup_class.split('.')[:-1]) 442 class_name = setup_class.split('.')[-1] 443 try: 444 mod = __import__(mod_name, globals(), locals(), [class_name]) 445 except SyntaxError, e: 446 raise exception.PluginSyntaxError( 447 "Plugin %s (%s) contains a syntax error at line %s" % \ 448 (plugin_name, e.filename, e.lineno)) 449 except ImportError, e: 450 raise exception.PluginLoadError( 451 "Failed to import plugin %s: %s" % \ 452 (plugin_name, e[0])) 453 klass = getattr(mod, class_name, None) 454 if not klass: 455 raise exception.PluginError( 456 'Plugin class %s does not exist' % setup_class) 457 if not issubclass(klass, clustersetup.ClusterSetup): 458 raise exception.PluginError( 459 ("Plugin %s must be a subclass of " + \ 460 "starcluster.clustersetup.ClusterSetup") % setup_class) 461 args, kwargs = utils.get_arg_spec(klass.__init__) 462 config_args = [] 463 missing_args = [] 464 for arg in args: 465 if arg in plugin: 466 config_args.append(plugin.get(arg)) 467 else: 468 missing_args.append(arg) 469 log.debug("config_args = %s" % config_args) 470 if missing_args: 471 raise exception.PluginError( 472 "Not enough settings provided for plugin %s (missing: %s)" 473 % (plugin_name, ', '.join(missing_args))) 474 config_kwargs = {} 475 for arg in kwargs: 476 if arg in plugin: 477 config_kwargs[arg] = plugin.get(arg) 478 log.debug("config_kwargs = %s" % config_kwargs) 479 plugs.append((plugin_name, klass(*config_args, **config_kwargs))) 480 return plugs
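# Illustrative sketch (not part of the original source): the shape of a config
# plugin entry and setup class that load_plugins() above can construct. The
# module path, plugin name and message are hypothetical; the run() signature
# mirrors the argument list built in run_plugin() below.
#
#     # in a hypothetical mypkg/plugins.py
#     from starcluster.clustersetup import ClusterSetup
#     from starcluster.logger import log
#
#     class MessagePlugin(ClusterSetup):
#         def __init__(self, message='hello'):
#             self.message = message
#
#         def run(self, nodes, master, user, user_shell, volumes):
#             log.info("%s (%d nodes)" % (self.message, len(nodes)))
#
#     # config-style dict fed to load_plugins()
#     plugin = {'__name__': 'plugin mymessage',
#               'setup_class': 'mypkg.plugins.MessagePlugin',
#               'message': 'cluster is up'}
#     cluster.load_plugins([plugin])  ->  [('mymessage', <MessagePlugin instance>)]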
481
482 - def update(self, kwargs):
483 for key in kwargs.keys(): 484 if hasattr(self, key): 485 self.__dict__[key] = kwargs[key]
486
487 - def _validate_running_instances(self):
488 """ 489 Validate existing instances against this cluster's settings 490 """ 491 self.wait_for_active_spots() 492 nodes = self.nodes 493 if not nodes: 494 raise exception.ClusterValidationError("No existing nodes found!") 495 log.info("Validating existing instances...") 496 mazone = self.master_node.placement 497 rlmap = self._get_launch_map(reverse=True) 498 for node in nodes: 499 itype, image = rlmap.get(node.alias) 500 alias = node.alias 501 ntype = node.instance_type 502 if ntype != itype: 503 raise exception.ClusterValidationError( 504 "%s's instance type (%s) != %s" % (alias, ntype, itype)) 505 nimage = node.image_id 506 if nimage != image: 507 raise exception.ClusterValidationError( 508 "%s's image id (%s) != %s" % (alias, nimage, image)) 509 if node.key_name != self.keyname: 510 raise exception.ClusterValidationError( 511 "%s's key_name (%s) != %s" % (alias, node.key_name, 512 self.keyname)) 513 nazone = node.placement 514 if mazone != nazone: 515 raise exception.ClusterValidationError( 516 "Node '%s' zone (%s) does not match master's zone (%s)" % 517 (alias, nazone, mazone)) 518 # reset zone cache 519 self._zone = None 520 if self.zone and self.zone != mazone: 521 raise exception.ClusterValidationError( 522 "Running cluster's availability_zone (%s) != %s" % 523 (mazone, self.zone))
524
525 - def get(self, name):
526 return self.__dict__.get(name)
527
528 - def __str__(self):
529 cfg = self.__getstate__() 530 return pprint.pformat(cfg)
531
532 - def load_receipt(self, load_plugins=True):
533 """ 534 Load the original settings used to launch this cluster into this 535 Cluster object. The settings are loaded from the cluster group's 536 description field. 537 """ 538 try: 539 desc = self.cluster_group.description 540 version, b64data = desc.split('-', 1) 541 if utils.program_version_greater(version, static.VERSION): 542 d = dict(cluster=self.cluster_tag, old_version=static.VERSION, 543 new_version=version) 544 msg = user_msgs.version_mismatch % d 545 sep = '*' * 60 546 log.warn('\n'.join([sep, msg, sep]), extra={'__textwrap__': 1}) 547 compressed_data = base64.b64decode(b64data) 548 pkl_data = zlib.decompress(compressed_data) 549 cluster_settings = cPickle.loads(str(pkl_data)).__dict__ 550 except (cPickle.PickleError, zlib.error, ValueError, TypeError, 551 EOFError, IndexError), e: 552 log.debug('load receipt exception: ', exc_info=True) 553 raise exception.IncompatibleCluster(self.cluster_group) 554 except Exception, e: 555 raise exception.ClusterReceiptError( 556 'failed to load cluster receipt: %s' % e) 557 for key in cluster_settings: 558 if hasattr(self, key): 559 setattr(self, key, cluster_settings.get(key)) 560 if load_plugins: 561 try: 562 self.plugins = self.load_plugins(self._plugins) 563 except exception.PluginError, e: 564 log.warn(e) 565 log.warn("An error occurred while loading plugins") 566 log.warn("Not running any plugins") 567 except Exception, e: 568 raise exception.ClusterReceiptError( 569 'failed to load cluster receipt: %s' % e) 570 return True
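# Illustrative sketch (not part of the original source): the receipt decoded
# above is the string "<version>-<base64(zlib(cPickle(cluster)))>" that the
# cluster_group property builds below. A standalone round trip with a dummy
# payload and a stand-in version string:
#
#     payload = {'cluster_size': 2}
#     desc = '-'.join(['0.93', base64.b64encode(zlib.compress(cPickle.dumps(payload)))])
#     version, b64data = desc.split('-', 1)
#     cPickle.loads(zlib.decompress(base64.b64decode(b64data)))  ->  {'cluster_size': 2}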
571
572 - def __getstate__(self):
573 cfg = {} 574 exclude = ['key_location', 'plugins'] 575 include = ['_zone', '_plugins'] 576 for key in self.__dict__.keys(): 577 private = key.startswith('_') 578 if (not private or key in include) and not key in exclude: 579 val = getattr(self, key) 580 if type(val) in [str, unicode, bool, int, float, list, dict]: 581 cfg[key] = val 582 elif type(val) is utils.AttributeDict: 583 cfg[key] = dict(val) 584 return cfg
585 586 @property
587 - def _security_group(self):
588 return static.SECURITY_GROUP_TEMPLATE % self.cluster_tag
589 590 @property
591 - def cluster_group(self):
592 if self._cluster_group is None: 593 ssh_port = static.DEFAULT_SSH_PORT 594 desc = base64.b64encode(zlib.compress(cPickle.dumps(self))) 595 desc = '-'.join([static.VERSION, desc]) 596 sg = self.ec2.get_or_create_group(self._security_group, 597 desc, 598 auth_ssh=True, 599 auth_group_traffic=True) 600 for p in self.permissions: 601 perm = self.permissions.get(p) 602 ip_protocol = perm.get('ip_protocol', 'tcp') 603 from_port = perm.get('from_port') 604 to_port = perm.get('to_port') 605 cidr_ip = perm.get('cidr_ip', static.WORLD_CIDRIP) 606 if not self.ec2.has_permission(sg, ip_protocol, from_port, 607 to_port, cidr_ip): 608 log.info("Opening %s port range %s-%s for CIDR %s" % 609 (ip_protocol, from_port, to_port, cidr_ip)) 610 sg.authorize(ip_protocol, from_port, to_port, cidr_ip) 611 if ip_protocol == 'tcp' and from_port <= ssh_port <= to_port: 612 sg.revoke(ip_protocol, ssh_port, ssh_port, 613 static.WORLD_CIDRIP) 614 self._cluster_group = sg 615 return self._cluster_group
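# Illustrative sketch (not part of the original source): a permissions entry as
# consumed above -- keys are arbitrary labels, values use the ip_protocol/
# from_port/to_port/cidr_ip scheme checked in _validate_permission_settings().
# The label and port below are hypothetical.
#
#     permissions = {'www': {'ip_protocol': 'tcp', 'from_port': 80,
#                            'to_port': 80, 'cidr_ip': '0.0.0.0/0'}}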
616 617 @property
618 - def placement_group(self):
619 if self._placement_group is None: 620 pg = self.ec2.get_or_create_placement_group(self._security_group) 621 self._placement_group = pg 622 return self._placement_group
623 624 @property
625 - def master_node(self):
626 if not self._master: 627 for node in self.nodes: 628 if node.is_master(): 629 self._master = node 630 if not self._master: 631 raise exception.MasterDoesNotExist() 632 return self._master
633 634 @property
635 - def nodes(self):
636 states = ['pending', 'running', 'stopping', 'stopped'] 637 filters = {'group-name': self._security_group, 638 'instance-state-name': states} 639 nodes = self.ec2.get_all_instances(filters=filters) 640 # remove any cached nodes not in the current node list from EC2 641 current_ids = [n.id for n in nodes] 642 remove_nodes = [n for n in self._nodes if n.id not in current_ids] 643 for node in remove_nodes: 644 self._nodes.remove(node) 645 # update node cache with latest instance data from EC2 646 existing_nodes = dict([(n.id, n) for n in self._nodes]) 647 log.debug('existing nodes: %s' % existing_nodes) 648 for node in nodes: 649 if node.id in existing_nodes: 650 log.debug('updating existing node %s in self._nodes' % node.id) 651 enode = existing_nodes.get(node.id) 652 enode.key_location = self.key_location 653 enode.instance = node 654 else: 655 log.debug('adding node %s to self._nodes list' % node.id) 656 n = Node(node, self.key_location) 657 if n.is_master(): 658 self._master = n 659 self._nodes.insert(0, n) 660 else: 661 self._nodes.append(n) 662 self._nodes.sort(key=lambda n: n.alias) 663 log.debug('returning self._nodes = %s' % self._nodes) 664 return self._nodes
665
666 - def get_nodes_or_raise(self):
667 nodes = self.nodes 668 if not nodes: 669 raise exception.NoClusterNodesFound 670 return nodes
671
672 - def get_node_by_dns_name(self, dns_name):
673 for node in self.nodes: 674 if node.dns_name == dns_name: 675 return node 676 raise exception.InstanceDoesNotExist(dns_name, label='node')
677
678 - def get_node_by_id(self, instance_id):
679 for node in self.nodes: 680 if node.id == instance_id: 681 return node 682 raise exception.InstanceDoesNotExist(instance_id, label='node')
683
684 - def get_node_by_alias(self, alias):
685 for node in self.nodes: 686 if node.alias == alias: 687 return node 688 raise exception.InstanceDoesNotExist(alias, label='node')
689
690 - def _nodes_in_states(self, states):
691 return filter(lambda x: x.state in states, self.nodes)
692 693 @property
694 - def running_nodes(self):
695 return self._nodes_in_states(['running'])
696 697 @property
698 - def stopped_nodes(self):
699 return self._nodes_in_states(['stopping', 'stopped'])
700 701 @property
702 - def spot_requests(self):
703 filters = {'launch.group-id': self.cluster_group.id, 704 'state': ['active', 'open']} 705 return self.ec2.get_all_spot_requests(filters=filters)
706
707 - def get_spot_requests_or_raise(self):
708 spots = self.spot_requests 709 if not spots: 710 raise exception.NoClusterSpotRequests 711 return spots
712
713 - def create_node(self, alias, image_id=None, instance_type=None, zone=None, 714 placement_group=None, spot_bid=None, force_flat=False):
715 return self.create_nodes([alias], image_id=image_id, 716 instance_type=instance_type, count=1, 717 zone=zone, placement_group=placement_group, 718 spot_bid=spot_bid, force_flat=force_flat)[0]
719
720 - def create_nodes(self, aliases, image_id=None, instance_type=None, count=1, 721 zone=None, placement_group=None, spot_bid=None, 722 force_flat=False):
723 """ 724 Convenience method for requesting instances with this cluster's 725 settings. All settings (kwargs) except force_flat default to cluster 726 settings if not provided. Passing force_flat=True ignores spot_bid 727 completely forcing a flat-rate instance to be requested. 728 """ 729 spot_bid = spot_bid or self.spot_bid 730 if force_flat: 731 spot_bid = None 732 cluster_sg = self.cluster_group.name 733 instance_type = instance_type or self.node_instance_type 734 if not placement_group and instance_type in static.CLUSTER_TYPES: 735 placement_group = self.placement_group.name 736 image_id = image_id or self.node_image_id 737 kwargs = dict(price=spot_bid, instance_type=instance_type, 738 min_count=count, max_count=count, count=count, 739 key_name=self.keyname, security_groups=[cluster_sg], 740 availability_zone_group=cluster_sg, 741 launch_group=cluster_sg, placement=zone or self.zone, 742 user_data='|'.join(aliases), 743 placement_group=placement_group) 744 resvs = [] 745 if spot_bid: 746 for alias in aliases: 747 kwargs['user_data'] = alias 748 resvs.extend(self.ec2.request_instances(image_id, **kwargs)) 749 else: 750 resvs.append(self.ec2.request_instances(image_id, **kwargs)) 751 for resv in resvs: 752 log.info(str(resv), extra=dict(__raw__=True)) 753 return resvs
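# Illustrative sketch (not part of the original source): typical calls, with
# hypothetical aliases. With a spot_bid (passed here or set on the cluster)
# each alias becomes its own spot request; force_flat=True ignores any bid and
# yields a single flat-rate reservation for all aliases.
#
#     cluster.create_nodes(['node001', 'node002'], count=2, spot_bid=0.25)
#     cluster.create_nodes(['node003'], count=1, instance_type='m1.large',
#                          force_flat=True)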
754
755 - def _get_next_node_num(self):
756 nodes = self._nodes_in_states(['pending', 'running']) 757 nodes = filter(lambda x: not x.is_master(), nodes) 758 highest = 0 759 for n in nodes: 760 try: 761 highest = max(highest, int(n.alias[4:8])) 762 except ValueError: 763 pass 764 next = highest + 1 765 log.debug("Highest node number is %d. choosing %d." % (highest, next)) 766 return next
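# Illustrative sketch (not part of the original source): aliases follow the
# 'node%.3d' pattern used throughout, so int(alias[4:8]) recovers the number.
# With 'node007' as the highest live worker the next alias is 'node008':
#
#     int('node007'[4:8])   ->  7
#     'node%.3d' % (7 + 1)  ->  'node008'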
767
768 - def add_node(self, alias=None, no_create=False):
769 """ 770 Add a single node to this cluster 771 """ 772 aliases = None 773 if alias: 774 aliases = [alias] 775 self.add_nodes(1, aliases=aliases, no_create=no_create)
776
777 - def add_nodes(self, num_nodes, aliases=None, no_create=False):
778 """ 779 Add new nodes to this cluster 780 781 aliases - list of aliases to assign to new nodes (len must equal 782 num_nodes) 783 """ 784 running_pending = self._nodes_in_states(['pending', 'running']) 785 aliases = aliases or [] 786 if not aliases: 787 next_node_id = self._get_next_node_num() 788 for i in range(next_node_id, next_node_id + num_nodes): 789 alias = 'node%.3d' % i 790 aliases.append(alias) 791 assert len(aliases) == num_nodes 792 if "master" in aliases: 793 raise exception.ClusterValidationError( 794 "worker nodes cannot have master as an alias") 795 if not no_create: 796 for node in running_pending: 797 if node.alias in aliases: 798 raise exception.ClusterValidationError( 799 "node with alias %s already exists" % node.alias) 800 log.info("Launching node(s): %s" % ', '.join(aliases)) 801 self.create_nodes(aliases, count=len(aliases)) 802 self.wait_for_cluster(msg="Waiting for node(s) to come up...") 803 log.debug("Adding node(s): %s" % aliases) 804 default_plugin = clustersetup.DefaultClusterSetup(self.disable_queue, 805 self.disable_threads) 806 for alias in aliases: 807 node = self.get_node_by_alias(alias) 808 default_plugin.on_add_node( 809 node, self.nodes, self.master_node, 810 self.cluster_user, self.cluster_shell, 811 self.volumes) 812 self.run_plugins(method_name="on_add_node", node=node)
813
814 - def remove_node(self, node, terminate=True):
815 """ 816 Remove a single node from this cluster 817 """ 818 return self.remove_nodes([node], terminate=terminate)
819
820 - def remove_nodes(self, nodes, terminate=True):
821 """ 822 Remove a list of nodes from this cluster 823 """ 824 default_plugin = clustersetup.DefaultClusterSetup(self.disable_queue, 825 self.disable_threads) 826 for node in nodes: 827 if node.is_master(): 828 raise exception.InvalidOperation("cannot remove master node") 829 self.run_plugins(method_name="on_remove_node", 830 node=node, reverse=True) 831 default_plugin.on_remove_node( 832 node, self.nodes, self.master_node, 833 self.cluster_user, self.cluster_shell, 834 self.volumes) 835 if not terminate: 836 continue 837 if node.spot_id: 838 log.info("Cancelling spot request %s" % node.spot_id) 839 node.get_spot_request().cancel() 840 node.terminate()
841
842 - def _get_launch_map(self, reverse=False):
843 """ 844 Groups all node-aliases that have similar instance types/image ids 845 Returns a dictionary that's used to launch all similar instance types 846 and image ids in the same request. Example return value: 847 848 {('c1.xlarge', 'ami-a5c02dcc'): ['node001', 'node002'], 849 ('m1.large', 'ami-a5c02dcc'): ['node003'], 850 ('m1.small', 'ami-17b15e7e'): ['master', 'node005', 'node006'], 851 ('m1.small', 'ami-19e17a2b'): ['node004']} 852 853 Passing reverse=True will return the same information only keyed by 854 node aliases: 855 856 {'master': ('m1.small', 'ami-17b15e7e'), 857 'node001': ('c1.xlarge', 'ami-a5c02dcc'), 858 'node002': ('c1.xlarge', 'ami-a5c02dcc'), 859 'node003': ('m1.large', 'ami-a5c02dcc'), 860 'node004': ('m1.small', 'ami-19e17a2b'), 861 'node005': ('m1.small', 'ami-17b15e7e'), 862 'node006': ('m1.small', 'ami-17b15e7e')} 863 """ 864 lmap = {} 865 mtype = self.master_instance_type or self.node_instance_type 866 mimage = self.master_image_id or self.node_image_id 867 lmap[(mtype, mimage)] = ['master'] 868 id_start = 1 869 for itype in self.node_instance_types: 870 count = itype['size'] 871 image_id = itype['image'] or self.node_image_id 872 type = itype['type'] or self.node_instance_type 873 if not (type, image_id) in lmap: 874 lmap[(type, image_id)] = [] 875 for id in range(id_start, id_start + count): 876 alias = 'node%.3d' % id 877 log.debug("Launch map: %s (ami: %s, type: %s)..." % \ 878 (alias, image_id, type)) 879 lmap[(type, image_id)].append(alias) 880 id_start += 1 881 ntype = self.node_instance_type 882 nimage = self.node_image_id 883 if not (ntype, nimage) in lmap: 884 lmap[(ntype, nimage)] = [] 885 for id in range(id_start, self.cluster_size): 886 alias = 'node%.3d' % id 887 log.debug("Launch map: %s (ami: %s, type: %s)..." % \ 888 (alias, nimage, ntype)) 889 lmap[(ntype, nimage)].append(alias) 890 if reverse: 891 rlmap = {} 892 for (itype, image_id) in lmap: 893 aliases = lmap.get((itype, image_id)) 894 for alias in aliases: 895 rlmap[alias] = (itype, image_id) 896 return rlmap 897 return lmap
898
899 - def _get_type_and_image_id(self, alias):
900 """ 901 Returns (instance_type,image_id) for a given alias based 902 on the map returned from self._get_launch_map 903 """ 904 lmap = self._get_launch_map() 905 for (type, image) in lmap: 906 key = (type, image) 907 if alias in lmap.get(key): 908 return key
909
910 - def create_cluster(self):
911 """ 912 Launches all EC2 instances based on this cluster's settings. 913 """ 914 log.info("Launching a %d-node cluster..." % self.cluster_size) 915 mtype = self.master_instance_type or self.node_instance_type 916 self.master_instance_type = mtype 917 if self.spot_bid: 918 self._create_spot_cluster() 919 else: 920 self._create_flat_rate_cluster()
921
922 - def _create_flat_rate_cluster(self):
923 """ 924 Launches cluster using flat-rate instances. This method attempts to 925 minimize the number of launch requests by grouping nodes of the same 926 type/ami and launching each group simultaneously within a single launch 927 request. This is especially important for Cluster Compute instances 928 given that Amazon *highly* recommends requesting all CCI in a single 929 launch request. 930 """ 931 lmap = self._get_launch_map() 932 zone = None 933 master_map = None 934 for (type, image) in lmap: 935 # launch all aliases that match master's itype/image_id 936 aliases = lmap.get((type, image)) 937 if 'master' in aliases: 938 master_map = (type, image) 939 for alias in aliases: 940 log.debug("Launching %s (ami: %s, type: %s)" % \ 941 (alias, image, type)) 942 master_response = self.create_nodes(aliases, image_id=image, 943 instance_type=type, 944 count=len(aliases), 945 force_flat=True)[0] 946 zone = master_response.instances[0].placement 947 lmap.pop(master_map) 948 if self.cluster_size <= 1: 949 return 950 for (type, image) in lmap: 951 aliases = lmap.get((type, image)) 952 for alias in aliases: 953 log.debug("Launching %s (ami: %s, type: %s)" % \ 954 (alias, image, type)) 955 self.create_nodes(aliases, image_id=image, instance_type=type, 956 count=len(aliases), zone=zone, force_flat=True)
957
958 - def _create_spot_cluster(self):
959 """ 960 Launches cluster using all spot instances. This method makes a single 961 spot request for each node in the cluster since spot instances 962 *always* have an ami_launch_index of 0. This is needed in order to 963 correctly assign aliases to nodes. 964 """ 965 (mtype, mimage) = self._get_type_and_image_id('master') 966 log.info("Launching master node (ami: %s, type: %s)..." % \ 967 (mimage, mtype)) 968 force_flat = not self.force_spot_master and self.cluster_size > 1 969 master_response = self.create_node('master', 970 image_id=mimage, 971 instance_type=mtype, 972 force_flat=force_flat) 973 zone = None 974 if not force_flat and self.spot_bid: 975 # Make sure nodes are in same zone as master 976 launch_spec = master_response.launch_specification 977 zone = launch_spec.placement 978 else: 979 # Make sure nodes are in same zone as master 980 zone = master_response.instances[0].placement 981 if self.cluster_size <= 1: 982 return 983 for id in range(1, self.cluster_size): 984 alias = 'node%.3d' % id 985 (ntype, nimage) = self._get_type_and_image_id(alias) 986 log.info("Launching %s (ami: %s, type: %s)" % 987 (alias, nimage, ntype)) 988 self.create_node(alias, image_id=nimage, instance_type=ntype, 989 zone=zone)
990
991 - def is_spot_cluster(self):
992 """ 993 Returns True if all nodes are spot instances 994 """ 995 nodes = self.nodes 996 if not nodes: 997 return False 998 for node in nodes: 999 if not node.is_spot(): 1000 return False 1001 return True
1002
1003 - def has_spot_nodes(self):
1004 """ 1005 Returns True if any nodes are spot instances 1006 """ 1007 for node in self.nodes: 1008 if node.is_spot(): 1009 return True 1010 return False
1011
1012 - def is_ebs_cluster(self):
1013 """ 1014 Returns True if all nodes are EBS-backed 1015 """ 1016 nodes = self.nodes 1017 if not nodes: 1018 return False 1019 for node in nodes: 1020 if not node.is_ebs_backed(): 1021 return False 1022 return True
1023
1024 - def has_ebs_nodes(self):
1025 """ 1026 Returns True if any nodes are EBS-backed 1027 """ 1028 for node in self.nodes: 1029 if node.is_ebs_backed(): 1030 return True 1031 return False
1032
1033 - def is_stoppable(self):
1034 """ 1035 Returns True if all nodes are stoppable (i.e. non-spot and EBS-backed) 1036 """ 1037 nodes = self.nodes 1038 if not nodes: 1039 return False 1040 for node in self.nodes: 1041 if not node.is_stoppable(): 1042 return False 1043 return True
1044
1045 - def has_stoppable_nodes(self):
1046 """ 1047 Returns True if any nodes are stoppable (i.e. non-spot and EBS-backed) 1048 """ 1049 nodes = self.nodes 1050 if not nodes: 1051 return False 1052 for node in nodes: 1053 if node.is_stoppable(): 1054 return True 1055 return False
1056
1057 - def is_cluster_compute(self):
1058 """ 1059 Returns true if all instances are Cluster/GPU Compute type 1060 """ 1061 nodes = self.nodes 1062 if not nodes: 1063 return False 1064 for node in nodes: 1065 if not node.is_cluster_compute(): 1066 return False 1067 return True
1068
1069 - def has_cluster_compute_nodes(self):
1070 for node in self.nodes: 1071 if node.is_cluster_compute(): 1072 return True 1073 return False
1074
1075 - def is_cluster_up(self):
1076 """ 1077 Check that all nodes are 'running' and that ssh is up on all nodes 1078 This method will return False if any spot requests are in an 'open' 1079 state. 1080 """ 1081 spots = self.spot_requests 1082 active_spots = filter(lambda x: x.state == 'active', spots) 1083 if len(spots) != len(active_spots): 1084 return False 1085 nodes = self.nodes 1086 if not nodes: 1087 return False 1088 for node in nodes: 1089 if not node.is_up(): 1090 return False 1091 return True
1092
1093 - def get_spinner(self, msg):
1094 """ 1095 Logs a status msg, starts a spinner, and returns the spinner object. 1096 This is useful for long running processes: 1097 1098 s = self.get_spinner("Long running process running...") 1099 (do something) 1100 s.stop() 1101 """ 1102 s = spinner.Spinner() 1103 log.info(msg, extra=dict(__nonewline__=True)) 1104 s.start() 1105 return s
1106 1107 @property
1108 - def progress_bar(self):
1109 if not self._progress_bar: 1110 widgets = ['', progressbar.Fraction(), ' ', 1111 progressbar.Bar(marker=progressbar.RotatingMarker()), 1112 ' ', progressbar.Percentage(), ' ', ' '] 1113 pbar = progressbar.ProgressBar(widgets=widgets, 1114 maxval=self.cluster_size, 1115 force_update=True) 1116 self._progress_bar = pbar 1117 return self._progress_bar
1118
1119 - def wait_for_active_spots(self, spots=None):
1120 """ 1121 Wait for all open spot requests for this cluster to transition to 1122 'active'. 1123 """ 1124 spots = spots or self.spot_requests 1125 open_spots = [spot for spot in spots if spot.state == "open"] 1126 if open_spots: 1127 pbar = self.progress_bar.reset() 1128 log.info('Waiting for open spot requests to become active...') 1129 pbar.maxval = len(spots) 1130 pbar.update(0) 1131 while not pbar.finished: 1132 active_spots = filter(lambda x: x.state == "active", spots) 1133 pbar.maxval = len(spots) 1134 pbar.update(len(active_spots)) 1135 if not pbar.finished: 1136 time.sleep(self.refresh_interval) 1137 spots = self.get_spot_requests_or_raise() 1138 pbar.reset()
1139
1140 - def wait_for_active_instances(self, nodes=None):
1141 """ 1142 Wait indefinitely for cluster nodes to show up. 1143 """ 1144 nodes = nodes or self.nodes 1145 if len(nodes) == 0: 1146 s = self.get_spinner("Waiting for instances to activate...") 1147 while len(nodes) == 0: 1148 time.sleep(self.refresh_interval) 1149 nodes = self.nodes 1150 s.stop()
1151
1152 - def wait_for_running_instances(self, nodes=None):
1153 """ 1154 Wait until all cluster nodes are in a 'running' state 1155 """ 1156 log.info("Waiting for all nodes to be in a 'running' state...") 1157 nodes = nodes or self.get_nodes_or_raise() 1158 pbar = self.progress_bar.reset() 1159 pbar.maxval = len(nodes) 1160 pbar.update(0) 1161 while not pbar.finished: 1162 running_nodes = filter(lambda x: x.state == "running", nodes) 1163 pbar.maxval = len(nodes) 1164 pbar.update(len(running_nodes)) 1165 if not pbar.finished: 1166 time.sleep(self.refresh_interval) 1167 nodes = self.get_nodes_or_raise() 1168 pbar.reset()
1169
1170 - def wait_for_ssh(self, nodes=None):
1171 """ 1172 Wait until SSH is up on all cluster nodes 1173 """ 1174 log.info("Waiting for SSH to come up on all nodes...") 1175 nodes = nodes or self.get_nodes_or_raise() 1176 pbar = self.progress_bar.reset() 1177 pbar.maxval = len(nodes) 1178 pbar.update(0) 1179 while not pbar.finished: 1180 active_nodes = filter(lambda n: n.is_up(), nodes) 1181 pbar.maxval = len(nodes) 1182 pbar.update(len(active_nodes)) 1183 if not pbar.finished: 1184 time.sleep(self.refresh_interval) 1185 nodes = self.get_nodes_or_raise() 1186 pbar.finish()
1187
1188 - def wait_for_cluster(self, msg="Waiting for cluster to come up..."):
1189 """ 1190 Wait for cluster to come up and display progress bar. Waits for all 1191 spot requests to become 'active', all instances to be in a 'running' 1192 state, and for all SSH daemons to come up. 1193 1194 msg - custom message to print out before waiting on the cluster 1195 """ 1196 interval = self.refresh_interval 1197 log.info("%s %s" % (msg, "(updating every %ds)" % interval)) 1198 self.wait_for_active_spots() 1199 self.wait_for_active_instances() 1200 self.wait_for_running_instances() 1201 self.wait_for_ssh()
1202
1203 - def is_cluster_stopped(self):
1204 """ 1205 Check whether all nodes are in the 'stopped' state 1206 """ 1207 nodes = self.nodes 1208 if not nodes: 1209 return False 1210 for node in nodes: 1211 if node.state != 'stopped': 1212 return False 1213 return True
1214
1215 - def is_cluster_terminated(self):
1216 """ 1217 Check whether all nodes are in a 'terminated' state 1218 """ 1219 states = filter(lambda x: x != 'terminated', static.INSTANCE_STATES) 1220 filters = {'group-name': self._security_group, 1221 'instance-state-name': states} 1222 insts = self.ec2.get_all_instances(filters=filters) 1223 return len(insts) == 0
1224
1225 - def attach_volumes_to_master(self):
1226 """ 1227 Attach each volume to the master node 1228 """ 1229 for vol in self.volumes: 1230 volume = self.volumes.get(vol) 1231 device = volume.get('device') 1232 vol_id = volume.get('volume_id') 1233 vol = self.ec2.get_volume(vol_id) 1234 if vol.attach_data.instance_id == self.master_node.id: 1235 log.info("Volume %s already attached to master...skipping" % \ 1236 vol.id) 1237 continue 1238 if vol.status != "available": 1239 log.error(('Volume %s not available...' + 1240 'please check and try again') % vol.id) 1241 continue 1242 log.info("Attaching volume %s to master node on %s ..." % (vol.id, 1243 device)) 1244 resp = vol.attach(self.master_node.id, device) 1245 log.debug("resp = %s" % resp) 1246 while True: 1247 vol.update() 1248 if vol.attachment_state() == 'attached': 1249 break 1250 time.sleep(5)
1251
1252 - def detach_volumes(self):
1253 """ 1254 Detach all volumes from all nodes 1255 """ 1256 for node in self.nodes: 1257 node.detach_external_volumes()
1258 1259 @print_timing('Restarting cluster')
1260 - def restart_cluster(self):
1261 """ 1262 Reboot all instances and reconfigure the cluster 1263 """ 1264 nodes = self.nodes 1265 if not nodes: 1266 raise exception.ClusterValidationError("No running nodes found") 1267 self.run_plugins(method_name="on_restart", reverse=True) 1268 log.info("Rebooting cluster...") 1269 for node in nodes: 1270 node.reboot() 1271 sleep = 20 1272 log.info("Sleeping for %d seconds..." % sleep) 1273 time.sleep(sleep) 1274 self._setup_cluster()
1275
1276 - def stop_cluster(self, terminate_unstoppable=False):
1277 """ 1278 Shutdown this cluster by detaching all volumes and 'stopping' all nodes 1279 1280 In general, all nodes in the cluster must be 'stoppable' meaning all 1281 nodes are backed by flat-rate EBS-backed instances. If any 1282 'unstoppable' nodes are found an exception is raised. A node is 1283 'unstoppable' if it is backed by either a spot or S3-backed instance. 1284 1285 If the cluster contains a mix of 'stoppable' and 'unstoppable' nodes 1286 you can stop all stoppable nodes and terminate any unstoppable nodes by 1287 setting terminate_unstoppable=True. 1288 1289 This will stop all nodes that can be stopped and terminate the rest. 1290 """ 1291 nodes = self.nodes 1292 if not nodes: 1293 raise exception.ClusterValidationError("No running nodes found") 1294 if not self.is_stoppable(): 1295 has_stoppable_nodes = self.has_stoppable_nodes() 1296 if not terminate_unstoppable and has_stoppable_nodes: 1297 raise exception.InvalidOperation( 1298 "Cluster contains nodes that are not stoppable") 1299 if not has_stoppable_nodes: 1300 raise exception.InvalidOperation( 1301 "Cluster does not contain any stoppable nodes") 1302 try: 1303 self.run_plugins(method_name="on_shutdown", reverse=True) 1304 except exception.MasterDoesNotExist, e: 1305 log.error("Cannot run plugins: %s" % e) 1306 self.detach_volumes() 1307 for node in nodes: 1308 node.shutdown()
1309
1310 - def terminate_cluster(self):
1311 """ 1312 Destroy this cluster by first detaching all volumes, shutting down all 1313 instances, cancelling all spot requests (if any), removing its 1314 placement group (if any), and removing its security group. 1315 """ 1316 try: 1317 self.run_plugins(method_name="on_shutdown", reverse=True) 1318 except exception.MasterDoesNotExist, e: 1319 log.error("Cannot run plugins: %s" % e) 1320 self.detach_volumes() 1321 nodes = self.nodes 1322 for node in nodes: 1323 node.terminate() 1324 for spot in self.spot_requests: 1325 if spot.state not in ['cancelled', 'closed']: 1326 log.info("Cancelling spot instance request: %s" % spot.id) 1327 spot.cancel() 1328 sg = self.ec2.get_group_or_none(self._security_group) 1329 pg = self.ec2.get_placement_group_or_none(self._security_group) 1330 if nodes and sg or pg: 1331 s = self.get_spinner("Waiting for cluster to terminate...") 1332 while not self.is_cluster_terminated(): 1333 time.sleep(5) 1334 s.stop() 1335 if pg: 1336 log.info("Removing %s placement group" % pg.name) 1337 pg.delete() 1338 if sg: 1339 log.info("Removing %s security group" % sg.name) 1340 sg.delete()
1341
1342 - def start(self, create=True, create_only=False, validate=True, 1343 validate_only=False, validate_running=False):
1344 """ 1345 Creates and configures a cluster from this cluster template's settings. 1346 1347 create - create new nodes when starting the cluster. set to False to 1348 use existing nodes 1349 create_only - only create the cluster node instances, don't configure 1350 the cluster 1351 validate - whether or not to validate the cluster settings used. 1352 False will ignore validate_only and validate_running 1353 keywords and is effectively the same as running _start 1354 validate_only - only validate cluster settings, do not create or 1355 configure cluster 1356 validate_running - whether or not to validate the existing instances 1357 being used against this cluster's settings 1358 """ 1359 if validate: 1360 if not create and validate_running: 1361 try: 1362 self._validate_running_instances() 1363 except exception.ClusterValidationError, e: 1364 msg = "Existing nodes are not compatible with cluster " 1365 msg += "settings:\n" 1366 e.msg = msg + e.msg 1367 raise 1368 elif create: 1369 self._validate() 1370 if validate_only: 1371 return 1372 else: 1373 log.warn("SKIPPING VALIDATION - USE AT YOUR OWN RISK") 1374 return self._start(create=create, create_only=create_only)
1375 1376 @print_timing("Starting cluster")
1377 - def _start(self, create=True, create_only=False):
1378 """ 1379 Create and configure a cluster from this cluster template's settings 1380 (Does not attempt to validate before running) 1381 1382 create - create new nodes when starting the cluster. set to False to 1383 use existing nodes 1384 create_only - only create the cluster node instances, don't configure 1385 the cluster 1386 """ 1387 log.info("Starting cluster...") 1388 if create: 1389 self.create_cluster() 1390 else: 1391 assert self.master_node is not None 1392 for node in self.stopped_nodes: 1393 log.info("Starting stopped node: %s" % node.alias) 1394 node.start() 1395 if create_only: 1396 return 1397 self._setup_cluster()
1398
1399 - def _setup_cluster(self):
1400 """ 1401 This method waits for all nodes to come up and then runs the default 1402 StarCluster setup routines followed by any additional plugin setup 1403 routines 1404 """ 1405 self.wait_for_cluster() 1406 log.info("The master node is %s" % self.master_node.dns_name) 1407 log.info("Setting up the cluster...") 1408 if self.volumes: 1409 self.attach_volumes_to_master() 1410 default_plugin = clustersetup.DefaultClusterSetup(self.disable_queue, 1411 self.disable_threads) 1412 default_plugin.run(self.nodes, self.master_node, self.cluster_user, 1413 self.cluster_shell, self.volumes) 1414 self.run_plugins()
1415
1416 - def run_plugins(self, plugins=None, method_name="run", node=None, 1417 reverse=False):
1418 """ 1419 Run all plugins specified in this Cluster object's self.plugins list. 1420 Uses the plugins argument instead of self.plugins if specified. 1421 1422 plugins must be a list of tuples: the first element is the plugin's name, the 1423 second element is the plugin object (a subclass of ClusterSetup) 1424 """ 1425 plugs = plugins or self.plugins 1426 if reverse: 1427 plugs = plugs[:] 1428 plugs.reverse() 1429 for plug in plugs: 1430 name, plugin = plug 1431 self.run_plugin(plugin, name, method_name=method_name, node=node)
1432
1433 - def run_plugin(self, plugin, name='', method_name='run', node=None):
1434 """ 1435 Run a StarCluster plugin. 1436 1437 plugin - an instance of the plugin's class 1438 name - a user-friendly label for the plugin 1439 method_name - the method to run within the plugin (default: "run") 1440 node - optional node to pass as first argument to plugin method (used 1441 for on_add_node/on_remove_node) 1442 """ 1443 plugin_name = name or str(plugin) 1444 try: 1445 func = getattr(plugin, method_name, None) 1446 if not func: 1447 log.warn("Plugin %s has no %s method...skipping" % \ 1448 (plugin_name, method_name)) 1449 return 1450 args = [self.nodes, self.master_node, self.cluster_user, 1451 self.cluster_shell, self.volumes] 1452 if node: 1453 args.insert(0, node) 1454 log.info("Running plugin %s" % plugin_name) 1455 func(*args) 1456 except NotImplementedError: 1457 log.debug("method %s not implemented by plugin %s" % (method_name, 1458 plugin_name)) 1459 except exception.MasterDoesNotExist: 1460 raise 1461 except Exception, e: 1462 log.error("Error occurred while running plugin '%s':" % plugin_name) 1463 if isinstance(e, exception.ThreadPoolException): 1464 e.print_excs() 1465 log.debug(e.format_excs()) 1466 else: 1467 traceback.print_exc() 1468 log.debug(traceback.format_exc())
1469
1470 - def is_running_valid(self):
1471 """ 1472 Checks whether the current running instances are compatible 1473 with this cluster template's settings 1474 """ 1475 try: 1476 self._validate_running_instances() 1477 return True 1478 except exception.ClusterValidationError, e: 1479 log.error(e.msg) 1480 return False
1481
1482 - def _validate(self):
1483 """ 1484 Checks that all cluster template settings are valid. Raises a 1485 ClusterValidationError exception if not. 1486 """ 1487 log.info("Validating cluster template settings...") 1488 try: 1489 self._has_all_required_settings() 1490 self._validate_spot_bid() 1491 self._validate_cluster_size() 1492 self._validate_shell_setting() 1493 self._validate_permission_settings() 1494 self._validate_credentials() 1495 self._validate_keypair() 1496 self._validate_zone() 1497 self._validate_ebs_settings() 1498 self._validate_ebs_aws_settings() 1499 self._validate_image_settings() 1500 self._validate_instance_types() 1501 self._validate_cluster_compute() 1502 log.info('Cluster template settings are valid') 1503 return True 1504 except exception.ClusterValidationError, e: 1505 e.msg = 'Cluster settings are not valid:\n%s' % e.msg 1506 raise
1507
1508 - def is_valid(self):
1509 """ 1510 Returns True if all cluster template settings are valid 1511 """ 1512 try: 1513 self._validate() 1514 return True 1515 except exception.ClusterValidationError, e: 1516 log.error(e.msg) 1517 return False
1518
1519 - def _validate_spot_bid(self):
1520 if self.spot_bid is not None: 1521 if type(self.spot_bid) not in [int, float]: 1522 raise exception.ClusterValidationError( 1523 'spot_bid must be integer or float') 1524 if self.spot_bid <= 0: 1525 raise exception.ClusterValidationError( 1526 'spot_bid must be an integer or float > 0') 1527 return True
1528
1529 - def _validate_cluster_size(self):
1530 try: 1531 int(self.cluster_size) 1532 if self.cluster_size < 1: 1533 raise ValueError 1534 except (ValueError, TypeError): 1535 raise exception.ClusterValidationError( 1536 'cluster_size must be an integer >= 1') 1537 num_itypes = sum([i.get('size') for i in self.node_instance_types]) 1538 num_nodes = self.cluster_size - 1 1539 if num_itypes > num_nodes: 1540 raise exception.ClusterValidationError( 1541 ("total number of nodes specified in node_instance_type (%s)" + 1542 " must be <= cluster_size-1 (%s)") % (num_itypes, num_nodes)) 1543 return True
1544
1545 - def _validate_shell_setting(self):
1546 cluster_shell = self.cluster_shell 1547 if not self.__available_shells.get(cluster_shell): 1548 raise exception.ClusterValidationError( 1549 'Invalid user shell specified. Options are %s' % \ 1550 ' '.join(self.__available_shells.keys())) 1551 return True
1552
1553 - def _validate_image_settings(self):
1554 master_image_id = self.master_image_id 1555 node_image_id = self.node_image_id 1556 conn = self.ec2 1557 image = conn.get_image_or_none(node_image_id) 1558 if not image or image.id != node_image_id: 1559 raise exception.ClusterValidationError( 1560 'node_image_id %s does not exist' % node_image_id) 1561 if master_image_id: 1562 master_image = conn.get_image_or_none(master_image_id) 1563 if not master_image or master_image.id != master_image_id: 1564 raise exception.ClusterValidationError( 1565 'master_image_id %s does not exist' % master_image_id) 1566 return True
1567
1568 - def _validate_zone(self):
1569 availability_zone = self.availability_zone 1570 if availability_zone: 1571 zone = self.ec2.get_zone(availability_zone) 1572 if not zone: 1573 azone = self.availability_zone 1574 raise exception.ClusterValidationError( 1575 'availability_zone = %s does not exist' % azone) 1576 if zone.state != 'available': 1577 log.warn('The availability_zone = %s ' % zone + 1578 'is not available at this time') 1579 return True
1580
1581 - def __check_platform(self, image_id, instance_type):
1582 """ 1583 Validates whether an image_id (AMI) is compatible with a given 1584 instance_type. image_id_setting and instance_type_setting are the 1585 setting labels in the config file. 1586 """ 1587 image = self.ec2.get_image_or_none(image_id) 1588 if not image: 1589 raise exception.ClusterValidationError('Image %s does not exist' % 1590 image_id) 1591 image_platform = image.architecture 1592 image_is_hvm = (image.virtualization_type == "hvm") 1593 if image_is_hvm and not instance_type in static.CLUSTER_TYPES: 1594 cctypes_list = ', '.join(static.CLUSTER_TYPES) 1595 raise exception.ClusterValidationError( 1596 "Image '%s' is a Cluster Compute/GPU image (HVM) and cannot " 1597 "be used with instance type '%s'\nThe instance type " 1598 "for a Cluster Compute/GPU image (HVM) must be one of: %s" % \ 1599 (image_id, instance_type, cctypes_list)) 1600 instance_platforms = self.__instance_types[instance_type] 1601 if image_platform not in instance_platforms: 1602 error_msg = "Instance type %(instance_type)s is for an " + \ 1603 "%(instance_platform)s platform while " + \ 1604 "%(image_id)s is an %(image_platform)s platform" 1605 error_dict = {'instance_type': instance_type, 1606 'instance_platform': ', '.join(instance_platforms), 1607 'image_id': image_id, 1608 'image_platform': image_platform} 1609 raise exception.ClusterValidationError(error_msg % error_dict) 1610 return True
1611
1612 - def _validate_instance_types(self):
1613 master_image_id = self.master_image_id 1614 node_image_id = self.node_image_id 1615 master_instance_type = self.master_instance_type 1616 node_instance_type = self.node_instance_type 1617 instance_types = self.__instance_types 1618 instance_type_list = ', '.join(instance_types.keys()) 1619 if not node_instance_type in instance_types: 1620 raise exception.ClusterValidationError( 1621 ("You specified an invalid node_instance_type %s \n" + 1622 "Possible options are:\n%s") % \ 1623 (node_instance_type, instance_type_list)) 1624 elif master_instance_type: 1625 if not master_instance_type in instance_types: 1626 raise exception.ClusterValidationError( 1627 ("You specified an invalid master_instance_type %s\n" + \ 1628 "Possible options are:\n%s") % \ 1629 (master_instance_type, instance_type_list)) 1630 try: 1631 self.__check_platform(node_image_id, node_instance_type) 1632 except exception.ClusterValidationError, e: 1633 raise exception.ClusterValidationError( 1634 'Incompatible node_image_id and node_instance_type:\n' + e.msg) 1635 if master_image_id and not master_instance_type: 1636 try: 1637 self.__check_platform(master_image_id, node_instance_type) 1638 except exception.ClusterValidationError, e: 1639 raise exception.ClusterValidationError( 1640 'Incompatible master_image_id and ' + 1641 'node_instance_type\n' + e.msg) 1642 elif master_image_id and master_instance_type: 1643 try: 1644 self.__check_platform(master_image_id, master_instance_type) 1645 except exception.ClusterValidationError, e: 1646 raise exception.ClusterValidationError( 1647 'Incompatible master_image_id and ' + 1648 'master_instance_type\n' + e.msg) 1649 elif master_instance_type and not master_image_id: 1650 try: 1651 self.__check_platform(node_image_id, master_instance_type) 1652 except exception.ClusterValidationError, e: 1653 raise exception.ClusterValidationError( 1654 'Incompatible node_image_id and ' + 1655 'master_instance_type\n' + e.msg) 1656 for itype in self.node_instance_types: 1657 type = itype.get('type') 1658 img = itype.get('image') or node_image_id 1659 if not type in instance_types: 1660 raise exception.ClusterValidationError( 1661 ("You specified an invalid instance type %s \n" + 1662 "Possible options are:\n%s") % (type, instance_type_list)) 1663 try: 1664 self.__check_platform(img, type) 1665 except exception.ClusterValidationError, e: 1666 raise exception.ClusterValidationError( 1667 "Invalid settings for node_instance_type %s: %s" % 1668 (type, e.msg)) 1669 return True
1670
1671 - def _validate_cluster_compute(self):
1672 lmap = self._get_launch_map() 1673 for (type, image) in lmap: 1674 if type in static.CLUSTER_TYPES: 1675 img = self.ec2.get_image(image) 1676 if img.virtualization_type != 'hvm': 1677 raise exception.ClusterValidationError( 1678 'Cluster Compute/GPU instance type %s ' 1679 'can only be used with HVM images.\n' 1680 'Image %s is NOT an HVM image.' % (type, image))
1681
1682 - def _validate_ebs_aws_settings(self):
1683 """ 1684 Verify EBS volumes exists and that each volume's zone matches this 1685 cluster's zone setting. 1686 """ 1687 for vol in self.volumes: 1688 v = self.volumes.get(vol) 1689 vol_id = v.get('volume_id') 1690 vol = self.ec2.get_volume(vol_id) 1691 if vol.status != 'available': 1692 if self.master_node: 1693 if vol.attach_data.instance_id == self.master_node.id: 1694 continue 1695 msg = "volume %s is not available (status: %s)" % (vol_id, 1696 vol.status) 1697 raise exception.ClusterValidationError(msg)
1698
1700 permissions = self.permissions 1701 for perm in permissions: 1702 permission = permissions.get(perm) 1703 protocol = permission.get('ip_protocol') 1704 if protocol not in self.__protocols: 1705 raise exception.InvalidProtocol(protocol) 1706 from_port = permission.get('from_port') 1707 to_port = permission.get('to_port') 1708 try: 1709 from_port = int(from_port) 1710 to_port = int(to_port) 1711 except ValueError: 1712 raise exception.InvalidPortRange( 1713 from_port, to_port, reason="integer range required") 1714 if from_port < 0 or to_port < 0: 1715 raise exception.InvalidPortRange( 1716 from_port, to_port, 1717 reason="from/to must be positive integers") 1718 if from_port > to_port: 1719 raise exception.InvalidPortRange( 1720 from_port, to_port, 1721 reason="'from_port' must be <= 'to_port'") 1722 cidr_ip = permission.get('cidr_ip') 1723 if not iptools.validate_cidr(cidr_ip): 1724 raise exception.InvalidCIDRSpecified(cidr_ip)
1725
1726 - def _validate_ebs_settings(self):
1727 """ 1728 Check EBS vols for missing/duplicate DEVICE/PARTITION/MOUNT_PATHs 1729 and validate these settings. Does not require AWS credentials. 1730 """ 1731 volmap = {} 1732 devmap = {} 1733 mount_paths = [] 1734 for vol in self.volumes: 1735 vol_name = vol 1736 vol = self.volumes.get(vol) 1737 vol_id = vol.get('volume_id') 1738 device = vol.get('device') 1739 partition = vol.get('partition') 1740 mount_path = vol.get("mount_path") 1741 vmap = volmap.get(vol_id, {}) 1742 devices = vmap.get('device', []) 1743 partitions = vmap.get('partition', []) 1744 if devices and device not in devices: 1745 raise exception.ClusterValidationError( 1746 "Can't attach volume %s to more than one device" % vol_id) 1747 elif partitions and partition in partitions: 1748 raise exception.ClusterValidationError( 1749 "Multiple configurations for %s\n" 1750 "Either pick one or specify a separate partition for " 1751 "each configuration" % vol_id) 1752 vmap['partition'] = partitions + [partition] 1753 vmap['device'] = devices + [device] 1754 volmap[vol_id] = vmap 1755 dmap = devmap.get(device, {}) 1756 vol_ids = dmap.get('volume_id', []) 1757 if vol_ids and vol_id not in vol_ids: 1758 raise exception.ClusterValidationError( 1759 "Can't attach more than one volume on device %s" % device) 1760 dmap['volume_id'] = vol_ids + [vol_id] 1761 devmap[device] = dmap 1762 mount_paths.append(mount_path) 1763 if not device: 1764 raise exception.ClusterValidationError( 1765 'Missing DEVICE setting for volume %s' % vol_name) 1766 if not utils.is_valid_device(device): 1767 raise exception.ClusterValidationError( 1768 "Invalid DEVICE value for volume %s" % vol_name) 1769 if partition: 1770 if not utils.is_valid_partition(partition): 1771 raise exception.ClusterValidationError( 1772 "Invalid PARTITION value for volume %s" % vol_name) 1773 if not partition.startswith(device): 1774 raise exception.ClusterValidationError( 1775 "Volume PARTITION must start with %s" % device) 1776 if not mount_path: 1777 raise exception.ClusterValidationError( 1778 'Missing MOUNT_PATH setting for volume %s' % vol_name) 1779 if not mount_path.startswith('/'): 1780 raise exception.ClusterValidationError( 1781 "MOUNT_PATH for volume %s should start with /" % vol_name) 1782 for path in mount_paths: 1783 if mount_paths.count(path) > 1: 1784 raise exception.ClusterValidationError( 1785 "Can't mount more than one volume on %s" % path) 1786 return True
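# Illustrative sketch (not part of the original source): a volume entry (after
# load_volumes() has normalized it) that passes the checks above -- one device
# per volume, a partition that extends the device name, and a unique absolute
# mount path. The id and paths are hypothetical.
#
#     volumes = {'data': {'volume_id': 'vol-11111111', 'device': '/dev/sdz',
#                         'partition': '/dev/sdz1', 'mount_path': '/data'}}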
1787
1788 - def _has_all_required_settings(self):
1789 has_all_required = True 1790 for opt in self.__cluster_settings: 1791 requirements = self.__cluster_settings[opt] 1792 name = opt 1793 required = requirements[1] 1794 if required and self.get(name.lower()) is None: 1795 log.warn('Missing required setting %s' % name) 1796 has_all_required = False 1797 return has_all_required
1798
1799 - def _validate_credentials(self):
1800 if not self.ec2.is_valid_conn(): 1801 raise exception.ClusterValidationError( 1802 'Invalid AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY combination.') 1803 return True
1804
1805 - def _validate_keypair(self):
1806 key_location = self.key_location 1807 if not key_location: 1808 raise exception.ClusterValidationError( 1809 "no key_location specified for key '%s'" % self.keyname) 1810 if not os.path.exists(key_location): 1811 raise exception.ClusterValidationError( 1812 'key_location=%s does not exist.' % \ 1813 key_location) 1814 elif not os.path.isfile(key_location): 1815 raise exception.ClusterValidationError( 1816 'key_location=%s is not a file.' % \ 1817 key_location) 1818 keyname = self.keyname 1819 keypair = self.ec2.get_keypair_or_none(keyname) 1820 if not keypair: 1821 raise exception.ClusterValidationError( 1822 'Account does not contain a key with keyname = %s. ' % keyname) 1823 if self.zone: 1824 z = self.ec2.get_zone(self.zone) 1825 if keypair.region != z.region: 1826 raise exception.ClusterValidationError( 1827 'Keypair %s not in availability zone region %s' % \ 1828 (keyname, z.region)) 1829 return True
1830
1831 - def ssh_to_master(self, user='root'):
1832 self.ssh_to_node('master', user=user)
1833
1834 - def ssh_to_node(self, alias, user='root'):
1835 node = self.get_node_by_alias(alias) 1836 node = node or self.get_node_by_dns_name(alias) 1837 node = node or self.get_node_by_id(alias) 1838 if not node: 1839 raise exception.InstanceDoesNotExist(alias, label='node') 1840 node.shell(user=user)
1841 1842 if __name__ == "__main__": 1843 from starcluster.config import StarClusterConfig 1844 cfg = StarClusterConfig().load() 1845 sc = cfg.get_cluster_template('smallcluster', 'mynewcluster') 1846 if sc.is_valid(): 1847 sc.start(create=True) 1848