Package starcluster :: Module cluster

Source Code for Module starcluster.cluster

#!/usr/bin/env python
import os
import re
import time
import zlib
import string
import pprint
import base64
import cPickle
import traceback

from starcluster import utils
from starcluster import static
from starcluster import spinner
from starcluster import iptools
from starcluster import managers
from starcluster import exception
from starcluster import progressbar
from starcluster import clustersetup
from starcluster.node import Node
from starcluster.utils import print_timing
from starcluster.templates import user_msgs
from starcluster.logger import log


class ClusterManager(managers.Manager):
    """
    Manager class for Cluster objects
    """
    def __repr__(self):
        return "<ClusterManager: %s>" % self.ec2.region.name

    def get_cluster(self, cluster_name, group=None, load_receipt=True,
                    load_plugins=True):
        """
        Returns a Cluster object representing an active cluster
        """
        try:
            clname = self._get_cluster_name(cluster_name)
            cltag = self.get_tag_from_sg(clname)
            if not group:
                group = self.ec2.get_security_group(clname)
            cl = Cluster(ec2_conn=self.ec2, cluster_tag=cltag,
                         cluster_group=group)
            if load_receipt:
                cl.load_receipt(load_plugins=load_plugins)
            try:
                key_location = self.cfg.get_key(cl.keyname).get('key_location')
                cl.key_location = key_location
            except (exception.KeyNotFound, Exception):
                # the key location is optional here; ignore lookup failures
                pass
            return cl
        except exception.SecurityGroupDoesNotExist:
            raise exception.ClusterDoesNotExist(cluster_name)

    def get_clusters(self, load_receipt=True, load_plugins=True):
        """
        Returns a list of all active clusters
        """
        cluster_groups = self.get_cluster_security_groups()
        clusters = [self.get_cluster(g.name, group=g,
                                     load_receipt=load_receipt,
                                     load_plugins=load_plugins)
                    for g in cluster_groups]
        return clusters

68 """ 69 Returns name of the default cluster template defined in the config 70 """ 71 return self.cfg.get_default_cluster_template()
72
    def get_cluster_template(self, template_name, tag_name=None):
        """
        Returns a new Cluster object using the settings from the cluster
        template template_name

        If tag_name is passed, the Cluster object's cluster_tag setting will
        be set to tag_name
        """
        cl = self.cfg.get_cluster_template(template_name, tag_name=tag_name,
                                           ec2_conn=self.ec2)
        return cl

    def get_cluster_or_none(self, cluster_name):
        """
        Same as get_cluster but returns None instead of throwing an exception
        if the cluster does not exist
        """
        try:
            return self.get_cluster(cluster_name)
        except exception.ClusterDoesNotExist:
            pass

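    # A minimal usage sketch (illustrative; assumes `cm` is a configured
    # ClusterManager and 'mycluster' is an active cluster tag):
    #
    #   cl = cm.get_cluster_or_none('mycluster')
    #   if cl is not None:
    #       cm.ssh_to_master('mycluster', user='root')
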
    def cluster_exists(self, tag_name):
        """
        Returns True if cluster exists
        """
        return self.get_cluster_or_none(tag_name) is not None

    def ssh_to_master(self, cluster_name, user='root', command=None):
        """
        ssh to master node of cluster_name

        user keyword specifies an alternate user to login as
        """
        cluster = self.get_cluster(cluster_name)
        return cluster.ssh_to_master(user=user, command=command)

    def ssh_to_cluster_node(self, cluster_name, node_id, user='root',
                            command=None):
        """
        ssh to a node in cluster_name that has either an id,
        dns name, or alias matching node_id

        user keyword specifies an alternate user to login as
        """
        cluster = self.get_cluster(cluster_name)
        return cluster.ssh_to_node(node_id, user=user, command=command)

    def _get_cluster_name(self, cluster_name):
        """
        Returns human readable cluster name/tag prefixed with '@sc-'
        """
        if not cluster_name.startswith(static.SECURITY_GROUP_PREFIX):
            cluster_name = static.SECURITY_GROUP_TEMPLATE % cluster_name
        return cluster_name

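    # For example, assuming static.SECURITY_GROUP_TEMPLATE is '@sc-%s':
    # _get_cluster_name('mycluster') returns '@sc-mycluster', while an
    # already-prefixed name is returned unchanged.
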
    def add_node(self, cluster_name, alias=None, no_create=False):
        cl = self.get_cluster(cluster_name)
        cl.add_node(alias, no_create=no_create)

    def add_nodes(self, cluster_name, num_nodes, aliases=None,
                  no_create=False):
        """
        Add one or more nodes to cluster
        """
        cl = self.get_cluster(cluster_name)
        cl.add_nodes(num_nodes, aliases=aliases, no_create=no_create)

    def remove_node(self, cluster_name, alias, terminate=True):
        """
        Remove a single node from a cluster
        """
        cl = self.get_cluster(cluster_name)
        n = cl.get_node_by_alias(alias)
        if not n:
            raise exception.InstanceDoesNotExist(alias, label='node')
        cl.remove_node(n, terminate=terminate)

    def restart_cluster(self, cluster_name):
        """
        Reboots and reconfigures cluster_name
        """
        cl = self.get_cluster(cluster_name)
        cl.restart_cluster()

    def stop_cluster(self, cluster_name, terminate_unstoppable=False):
        """
        Stop an EBS-backed cluster
        """
        cl = self.get_cluster(cluster_name)
        cl.stop_cluster(terminate_unstoppable)

    def terminate_cluster(self, cluster_name):
        """
        Terminates cluster_name
        """
        cl = self.get_cluster(cluster_name)
        cl.terminate_cluster()

    def get_cluster_security_group(self, group_name):
        """
        Return the cluster security group for group_name (the name is
        prefixed with '@sc-' if necessary)
        """
        gname = self._get_cluster_name(group_name)
        return self.ec2.get_security_group(gname)

180 """ 181 Return all security groups on EC2 that start with '@sc-' 182 """ 183 glob = static.SECURITY_GROUP_TEMPLATE % '*' 184 sgs = self.ec2.get_security_groups(filters={'group-name': glob}) 185 return sgs
186
    def get_tag_from_sg(self, sg):
        """
        Returns the cluster tag name from a security group name that starts
        with static.SECURITY_GROUP_PREFIX

        Example:
            sg = '@sc-mycluster'
            print get_tag_from_sg(sg)
            mycluster
        """
        regex = re.compile(static.SECURITY_GROUP_PREFIX + '-(.*)')
        match = regex.match(sg)
        if match:
            return match.groups()[0]

    def list_clusters(self, cluster_groups=None, show_ssh_status=False):
        """
        Prints a summary for each active cluster on EC2
        """
        if not cluster_groups:
            cluster_groups = self.get_cluster_security_groups()
            if not cluster_groups:
                log.info("No clusters found...")
        else:
            try:
                cluster_groups = [self.get_cluster_security_group(g) for g
                                  in cluster_groups]
            except exception.SecurityGroupDoesNotExist:
                raise exception.ClusterDoesNotExist(g)
        for scg in cluster_groups:
            tag = self.get_tag_from_sg(scg.name)
            try:
                cl = self.get_cluster(tag, group=scg, load_plugins=False)
            except exception.IncompatibleCluster, e:
                sep = '*' * 60
                log.error('\n'.join([sep, e.msg, sep]),
                          extra=dict(__textwrap__=True))
                continue
            header = '%s (security group: %s)' % (tag, scg.name)
            print '-' * len(header)
            print header
            print '-' * len(header)
            nodes = cl.nodes
            try:
                n = nodes[0]
            except IndexError:
                n = None
            state = getattr(n, 'state', None)
            ltime = 'N/A'
            uptime = 'N/A'
            if state in ['pending', 'running']:
                ltime = getattr(n, 'local_launch_time', 'N/A')
                uptime = getattr(n, 'uptime', 'N/A')
            print 'Launch time: %s' % ltime
            print 'Uptime: %s' % uptime
            print 'Zone: %s' % getattr(n, 'placement', 'N/A')
            print 'Keypair: %s' % getattr(n, 'key_name', 'N/A')
            ebs_nodes = [n for n in nodes if n.attached_vols]
            if ebs_nodes:
                print 'EBS volumes:'
                for node in ebs_nodes:
                    devices = node.attached_vols
                    node_id = node.alias or node.id
                    for dev in devices:
                        d = devices.get(dev)
                        vol_id = d.volume_id
                        status = d.status
                        print ' %s on %s:%s (status: %s)' % \
                            (vol_id, node_id, dev, status)
            else:
                print 'EBS volumes: N/A'
            spot_reqs = cl.spot_requests
            if spot_reqs:
                active = len([s for s in spot_reqs if s.state == 'active'])
                opn = len([s for s in spot_reqs if s.state == 'open'])
                msg = ''
                if active != 0:
                    msg += '%d active' % active
                if opn != 0:
                    if msg:
                        msg += ', '
                    msg += '%d open' % opn
                print 'Spot requests: %s' % msg
            if nodes:
                print 'Cluster nodes:'
                for node in nodes:
                    nodeline = " %7s %s %s %s" % (node.alias, node.state,
                                                  node.id, node.dns_name)
                    if node.spot_id:
                        nodeline += ' (spot %s)' % node.spot_id
                    if show_ssh_status:
                        ssh_status = {True: 'Up', False: 'Down'}
                        nodeline += ' (SSH: %s)' % ssh_status[node.is_up()]
                    print nodeline
                print 'Total nodes: %d' % len(nodes)
            else:
                print 'Cluster nodes: N/A'
            print

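    # Illustrative output sketch for a single-node cluster (all values are
    # made up):
    #
    #   -----------------------------------------
    #   mycluster (security group: @sc-mycluster)
    #   -----------------------------------------
    #   Launch time: 2011-10-11 11:11:11
    #   Uptime: 0:11:11
    #   Zone: us-east-1c
    #   Keypair: mykey
    #   EBS volumes: N/A
    #   Cluster nodes:
    #        master running i-99999999 ec2-1-2-3-4.compute-1.amazonaws.com
    #   Total nodes: 1
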
    def run_plugin(self, plugin_name, cluster_tag):
        """
        Run a plugin defined in the config.

        plugin_name must match the plugin's section name in the config
        cluster_tag specifies the cluster to run the plugin on
        """
        cl = self.get_cluster(cluster_tag, load_plugins=False)
        if not cl.is_cluster_up():
            raise exception.ClusterNotRunning(cluster_tag)
        plugs = [self.cfg.get_plugin(plugin_name)]
        name, plugin = cl.load_plugins(plugs)[0]
        cl.run_plugin(plugin, name)


class Cluster(object):
    def __init__(self,
                 ec2_conn=None,
                 spot_bid=None,
                 cluster_tag=None,
                 cluster_description=None,
                 cluster_size=None,
                 cluster_user=None,
                 cluster_shell=None,
                 master_image_id=None,
                 master_instance_type=None,
                 node_image_id=None,
                 node_instance_type=None,
                 node_instance_types=None,
                 availability_zone=None,
                 keyname=None,
                 key_location=None,
                 volumes=None,
                 plugins=None,
                 permissions=None,
                 refresh_interval=30,
                 disable_queue=False,
                 disable_threads=False,
                 cluster_group=None,
                 force_spot_master=False,
                 **kwargs):

        # use None defaults above and create fresh containers here to avoid
        # sharing mutable default arguments between Cluster instances
        node_instance_types = node_instance_types or []
        volumes = volumes or []
        plugins = plugins or []
        permissions = permissions or []
        now = time.strftime("%Y%m%d%H%M")
        self.ec2 = ec2_conn
        self.spot_bid = spot_bid
        self.cluster_tag = cluster_tag
        self.cluster_description = cluster_description
        if self.cluster_tag is None:
            self.cluster_tag = "cluster%s" % now
        if cluster_description is None:
            self.cluster_description = "Cluster created at %s" % now
        self.cluster_size = cluster_size or 0
        self.cluster_user = cluster_user
        self.cluster_shell = cluster_shell
        self.master_image_id = master_image_id
        self.master_instance_type = master_instance_type
        self.node_image_id = node_image_id
        self.node_instance_type = node_instance_type
        self.node_instance_types = node_instance_types
        self.availability_zone = availability_zone
        self.keyname = keyname
        self.key_location = key_location
        self.volumes = self.load_volumes(volumes)
        self.plugins = self.load_plugins(plugins)
        self.permissions = permissions
        self.refresh_interval = refresh_interval
        self.disable_queue = disable_queue
        self.disable_threads = disable_threads
        self.force_spot_master = force_spot_master

        self.__instance_types = static.INSTANCE_TYPES
        self.__cluster_settings = static.CLUSTER_SETTINGS
        self.__available_shells = static.AVAILABLE_SHELLS
        self.__protocols = static.PROTOCOLS
        self._progress_bar = None
        self._master_reservation = None
        self._node_reservation = None
        self._nodes = []
        self._master = None
        self._zone = None
        self._plugins = plugins
        self._cluster_group = None
        self._placement_group = None

    def __repr__(self):
        return '<Cluster: %s (%s-node)>' % (self.cluster_tag,
                                            self.cluster_size)

    @property
    def zone(self):
        """
        If volumes are specified, this method determines the common
        availability zone between those volumes. If an availability zone
        is explicitly specified in the config and does not match the common
        availability zone of the volumes, an exception is raised. If the
        volumes are not all in the same availability zone an exception is
        raised. If no volumes are specified, returns the user-specified
        availability zone if it exists.
        """
        if not self._zone:
            zone = None
            if self.availability_zone:
                zone = self.ec2.get_zone(self.availability_zone).name
            common_zone = None
            for volume in self.volumes:
                volid = self.volumes.get(volume).get('volume_id')
                vol = self.ec2.get_volume(volid)
                if not common_zone:
                    common_zone = vol.zone
                elif vol.zone != common_zone:
                    vols = [self.volumes.get(v).get('volume_id')
                            for v in self.volumes]
                    raise exception.VolumesZoneError(vols)
            if common_zone and zone and zone != common_zone:
                raise exception.InvalidZone(zone, common_zone)
            if not zone and common_zone:
                zone = common_zone
            self._zone = zone
        return self._zone

    def load_volumes(self, vols):
        """
        Iterate through vols and set device/partition settings automatically
        if not specified.

        This method assigns the first volume to /dev/sdz, the second to
        /dev/sdy, etc. for all volumes that do not include a device/partition
        setting
        """
        devices = ['/dev/sd%s' % s for s in string.lowercase]
        devmap = {}
        for volname in vols:
            vol = vols.get(volname)
            dev = vol.get('device')
            if dev in devices:
                # remove user-defined devices from the list of auto-assigned
                # devices
                devices.remove(dev)
            volid = vol.get('volume_id')
            if dev and not volid in devmap:
                devmap[volid] = dev
        volumes = {}
        for volname in vols:
            vol = vols.get(volname)
            vol_id = vol.get('volume_id')
            device = vol.get('device')
            if not device:
                if vol_id in devmap:
                    device = devmap.get(vol_id)
                else:
                    device = devices.pop()
                    devmap[vol_id] = device
            if not utils.is_valid_device(device):
                raise exception.InvalidDevice(device)
            v = volumes[volname] = utils.AttributeDict()
            v.update(vol)
            v['device'] = device
            part = vol.get('partition')
            if part:
                partition = device + str(part)
                if not utils.is_valid_partition(partition):
                    raise exception.InvalidPartition(part)
                v['partition'] = partition
        return volumes

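    # Illustrative input/output sketch for load_volumes (volume ids and
    # paths are made up):
    #
    #   vols = {'data': {'volume_id': 'vol-12345678', 'mount_path': '/data'}}
    #   self.load_volumes(vols)
    #   # -> {'data': {'volume_id': 'vol-12345678', 'mount_path': '/data',
    #   #              'device': '/dev/sdz'}}
    #
    # A second volume without a DEVICE setting would be assigned /dev/sdy,
    # and so on backwards through the alphabet.
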
    def load_plugins(self, plugins):
        plugs = []
        for plugin in plugins:
            setup_class = plugin.get('setup_class')
            plugin_name = plugin.get('__name__').split()[-1]
            mod_name = '.'.join(setup_class.split('.')[:-1])
            class_name = setup_class.split('.')[-1]
            try:
                mod = __import__(mod_name, globals(), locals(), [class_name])
            except SyntaxError, e:
                raise exception.PluginSyntaxError(
                    "Plugin %s (%s) contains a syntax error at line %s" %
                    (plugin_name, e.filename, e.lineno))
            except ImportError, e:
                raise exception.PluginLoadError(
                    "Failed to import plugin %s: %s" %
                    (plugin_name, e[0]))
            klass = getattr(mod, class_name, None)
            if not klass:
                raise exception.PluginError(
                    'Plugin class %s does not exist' % setup_class)
            if not issubclass(klass, clustersetup.ClusterSetup):
                raise exception.PluginError(
                    ("Plugin %s must be a subclass of "
                     "starcluster.clustersetup.ClusterSetup") % setup_class)
            args, kwargs = utils.get_arg_spec(klass.__init__)
            config_args = []
            missing_args = []
            for arg in args:
                if arg in plugin:
                    config_args.append(plugin.get(arg))
                else:
                    missing_args.append(arg)
            log.debug("config_args = %s" % config_args)
            if missing_args:
                raise exception.PluginError(
                    "Not enough settings provided for plugin %s (missing: %s)"
                    % (plugin_name, ', '.join(missing_args)))
            config_kwargs = {}
            for arg in kwargs:
                if arg in plugin:
                    config_kwargs[arg] = plugin.get(arg)
            log.debug("config_kwargs = %s" % config_kwargs)
            plugs.append((plugin_name, klass(*config_args, **config_kwargs)))
        return plugs

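    # Each entry passed to load_plugins is a dict built from a config
    # section. An illustrative sketch (section/class names are made up):
    #
    #   plugin = {'__name__': 'plugin myplugin',
    #             'setup_class': 'mypackage.MyPlugin',
    #             'my_arg': '42'}
    #
    # load_plugins would import mypackage, match 'my_arg' against the
    # arguments declared by MyPlugin.__init__, and return
    # [('myplugin', <MyPlugin instance>)].
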
    def update(self, kwargs):
        for key in kwargs.keys():
            if hasattr(self, key):
                self.__dict__[key] = kwargs[key]

501 """ 502 Validate existing instances against this cluster's settings 503 """ 504 self.wait_for_active_spots() 505 nodes = self.nodes 506 if not nodes: 507 raise exception.ClusterValidationError("No existing nodes found!") 508 log.info("Validating existing instances...") 509 mazone = self.master_node.placement 510 rlmap = self._get_launch_map(reverse=True) 511 for node in nodes: 512 itype, image = rlmap.get(node.alias) 513 alias = node.alias 514 ntype = node.instance_type 515 if ntype != itype: 516 raise exception.ClusterValidationError( 517 "%s's instance type (%s) != %s" % (alias, ntype, itype)) 518 nimage = node.image_id 519 if nimage != image: 520 raise exception.ClusterValidationError( 521 "%s's image id (%s) != %s" % (alias, nimage, image)) 522 if node.key_name != self.keyname: 523 raise exception.ClusterValidationError( 524 "%s's key_name (%s) != %s" % (alias, node.key_name, 525 self.keyname)) 526 nazone = node.placement 527 if mazone != nazone: 528 raise exception.ClusterValidationError( 529 "Node '%s' zone (%s) does not match master's zone (%s)" % 530 (alias, nazone, mazone)) 531 # reset zone cache 532 self._zone = None 533 if self.zone and self.zone != mazone: 534 raise exception.ClusterValidationError( 535 "Running cluster's availability_zone (%s) != %s" % 536 (mazone, self.zone))
537
    def get(self, name):
        return self.__dict__.get(name)

    def __str__(self):
        cfg = self.__getstate__()
        return pprint.pformat(cfg)

    def load_receipt(self, load_plugins=True):
        """
        Load the original settings used to launch this cluster into this
        Cluster object. The settings are loaded from the cluster group's
        description field.
        """
        try:
            desc = self.cluster_group.description
            version, b64data = desc.split('-', 1)
            if utils.program_version_greater(version, static.VERSION):
                d = dict(cluster=self.cluster_tag, old_version=static.VERSION,
                         new_version=version)
                msg = user_msgs.version_mismatch % d
                sep = '*' * 60
                log.warn('\n'.join([sep, msg, sep]), extra={'__textwrap__': 1})
            compressed_data = base64.b64decode(b64data)
            pkl_data = zlib.decompress(compressed_data)
            cluster_settings = cPickle.loads(str(pkl_data)).__dict__
        except (cPickle.PickleError, zlib.error, ValueError, TypeError,
                EOFError, IndexError), e:
            log.debug('load receipt exception: ', exc_info=True)
            raise exception.IncompatibleCluster(self.cluster_group)
        except Exception, e:
            raise exception.ClusterReceiptError(
                'failed to load cluster receipt: %s' % e)
        for key in cluster_settings:
            if hasattr(self, key):
                setattr(self, key, cluster_settings.get(key))
        if load_plugins:
            try:
                self.plugins = self.load_plugins(self._plugins)
            except exception.PluginError, e:
                log.warn(e)
                log.warn("An error occurred while loading plugins")
                log.warn("Not running any plugins")
            except Exception, e:
                raise exception.ClusterReceiptError(
                    'failed to load cluster receipt: %s' % e)
        return True

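    # The receipt format decoded above (and encoded in the cluster_group
    # property below) is simply: VERSION + '-' + base64(zlib(cPickle(self))).
    # A minimal round-trip sketch:
    #
    #   desc = '-'.join([static.VERSION,
    #                    base64.b64encode(zlib.compress(cPickle.dumps(cl)))])
    #   version, b64data = desc.split('-', 1)
    #   settings = cPickle.loads(zlib.decompress(base64.b64decode(b64data)))
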
    def __getstate__(self):
        cfg = {}
        exclude = ['key_location', 'plugins']
        include = ['_zone', '_plugins']
        for key in self.__dict__.keys():
            private = key.startswith('_')
            if (not private or key in include) and not key in exclude:
                val = getattr(self, key)
                if type(val) in [str, unicode, bool, int, float, list, dict]:
                    cfg[key] = val
                elif type(val) is utils.AttributeDict:
                    cfg[key] = dict(val)
        return cfg

    @property
    def _security_group(self):
        return static.SECURITY_GROUP_TEMPLATE % self.cluster_tag

    @property
    def cluster_group(self):
        if self._cluster_group is None:
            ssh_port = static.DEFAULT_SSH_PORT
            desc = base64.b64encode(zlib.compress(cPickle.dumps(self)))
            desc = '-'.join([static.VERSION, desc])
            sg = self.ec2.get_or_create_group(self._security_group,
                                              desc,
                                              auth_ssh=True,
                                              auth_group_traffic=True)
            for p in self.permissions:
                perm = self.permissions.get(p)
                ip_protocol = perm.get('ip_protocol', 'tcp')
                from_port = perm.get('from_port')
                to_port = perm.get('to_port')
                cidr_ip = perm.get('cidr_ip', static.WORLD_CIDRIP)
                if not self.ec2.has_permission(sg, ip_protocol, from_port,
                                               to_port, cidr_ip):
                    log.info("Opening %s port range %s-%s for CIDR %s" %
                             (ip_protocol, from_port, to_port, cidr_ip))
                    sg.authorize(ip_protocol, from_port, to_port, cidr_ip)
                if ip_protocol == 'tcp' and from_port <= ssh_port <= to_port:
                    sg.revoke(ip_protocol, ssh_port, ssh_port,
                              static.WORLD_CIDRIP)
            self._cluster_group = sg
        return self._cluster_group

    @property
    def placement_group(self):
        if self._placement_group is None:
            pg = self.ec2.get_or_create_placement_group(self._security_group)
            self._placement_group = pg
        return self._placement_group

    @property
    def master_node(self):
        if not self._master:
            for node in self.nodes:
                if node.is_master():
                    self._master = node
            if not self._master:
                raise exception.MasterDoesNotExist()
        return self._master

    @property
    def nodes(self):
        states = ['pending', 'running', 'stopping', 'stopped']
        filters = {'group-name': self._security_group,
                   'instance-state-name': states}
        nodes = self.ec2.get_all_instances(filters=filters)
        # remove any cached nodes not in the current node list from EC2
        current_ids = [n.id for n in nodes]
        remove_nodes = [n for n in self._nodes if n.id not in current_ids]
        for node in remove_nodes:
            self._nodes.remove(node)
        # update node cache with latest instance data from EC2
        existing_nodes = dict([(n.id, n) for n in self._nodes])
        log.debug('existing nodes: %s' % existing_nodes)
        for node in nodes:
            if node.id in existing_nodes:
                log.debug('updating existing node %s in self._nodes' % node.id)
                enode = existing_nodes.get(node.id)
                enode.key_location = self.key_location
                enode.instance = node
            else:
                log.debug('adding node %s to self._nodes list' % node.id)
                n = Node(node, self.key_location)
                if n.is_master():
                    self._master = n
                    self._nodes.insert(0, n)
                else:
                    self._nodes.append(n)
        self._nodes.sort(key=lambda n: n.alias)
        log.debug('returning self._nodes = %s' % self._nodes)
        return self._nodes

    def get_nodes_or_raise(self):
        nodes = self.nodes
        if not nodes:
            filters = {'group-name': self._security_group}
            terminated_nodes = self.ec2.get_all_instances(filters=filters)
            raise exception.NoClusterNodesFound(terminated_nodes)
        return nodes

    def get_node_by_dns_name(self, dns_name):
        for node in self.nodes:
            if node.dns_name == dns_name:
                return node
        raise exception.InstanceDoesNotExist(dns_name, label='node')

    def get_node_by_id(self, instance_id):
        for node in self.nodes:
            if node.id == instance_id:
                return node
        raise exception.InstanceDoesNotExist(instance_id, label='node')

    def get_node_by_alias(self, alias):
        for node in self.nodes:
            if node.alias == alias:
                return node
        raise exception.InstanceDoesNotExist(alias, label='node')

    def _nodes_in_states(self, states):
        return filter(lambda x: x.state in states, self.nodes)

    @property
    def running_nodes(self):
        return self._nodes_in_states(['running'])

    @property
    def stopped_nodes(self):
        return self._nodes_in_states(['stopping', 'stopped'])

    @property
    def spot_requests(self):
        filters = {'launch.group-id': self.cluster_group.id,
                   'state': ['active', 'open']}
        return self.ec2.get_all_spot_requests(filters=filters)

    def get_spot_requests_or_raise(self):
        spots = self.spot_requests
        if not spots:
            raise exception.NoClusterSpotRequests
        return spots

    def create_node(self, alias, image_id=None, instance_type=None, zone=None,
                    placement_group=None, spot_bid=None, force_flat=False):
        return self.create_nodes([alias], image_id=image_id,
                                 instance_type=instance_type, count=1,
                                 zone=zone, placement_group=placement_group,
                                 spot_bid=spot_bid, force_flat=force_flat)[0]

    def create_nodes(self, aliases, image_id=None, instance_type=None,
                     count=1, zone=None, placement_group=None, spot_bid=None,
                     force_flat=False):
        """
        Convenience method for requesting instances with this cluster's
        settings. All settings (kwargs) except force_flat default to cluster
        settings if not provided. Passing force_flat=True ignores spot_bid
        completely, forcing a flat-rate instance to be requested.
        """
        spot_bid = spot_bid or self.spot_bid
        if force_flat:
            spot_bid = None
        cluster_sg = self.cluster_group.name
        instance_type = instance_type or self.node_instance_type
        if not placement_group and instance_type in static.CLUSTER_TYPES:
            placement_group = self.placement_group.name
        image_id = image_id or self.node_image_id
        kwargs = dict(price=spot_bid, instance_type=instance_type,
                      min_count=count, max_count=count, count=count,
                      key_name=self.keyname, security_groups=[cluster_sg],
                      availability_zone_group=cluster_sg,
                      launch_group=cluster_sg, placement=zone or self.zone,
                      user_data='|'.join(aliases),
                      placement_group=placement_group)
        resvs = []
        if spot_bid:
            # spot instances are requested one at a time so that each
            # request's user_data carries a single alias
            for alias in aliases:
                kwargs['user_data'] = alias
                resvs.extend(self.ec2.request_instances(image_id, **kwargs))
        else:
            resvs.append(self.ec2.request_instances(image_id, **kwargs))
        for resv in resvs:
            log.info(str(resv), extra=dict(__raw__=True))
        return resvs

    def _get_next_node_num(self):
        nodes = self._nodes_in_states(['pending', 'running'])
        nodes = filter(lambda x: not x.is_master(), nodes)
        highest = 0
        for n in nodes:
            try:
                highest = max(highest, int(n.alias[4:8]))
            except ValueError:
                pass
        next = highest + 1
        log.debug("Highest node number is %d. choosing %d." % (highest, next))
        return next

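    # For example, with running workers aliased 'node001' and 'node003',
    # int(alias[4:8]) yields 1 and 3, so this returns 4 and add_nodes will
    # name the next node 'node004'.
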
    def add_node(self, alias=None, no_create=False):
        """
        Add a single node to this cluster
        """
        aliases = None
        if alias:
            aliases = [alias]
        self.add_nodes(1, aliases=aliases, no_create=no_create)

    def add_nodes(self, num_nodes, aliases=None, no_create=False):
        """
        Add new nodes to this cluster

        aliases - list of aliases to assign to new nodes (len must equal
        num_nodes)
        """
        running_pending = self._nodes_in_states(['pending', 'running'])
        aliases = aliases or []
        if not aliases:
            next_node_id = self._get_next_node_num()
            for i in range(next_node_id, next_node_id + num_nodes):
                alias = 'node%.3d' % i
                aliases.append(alias)
        assert len(aliases) == num_nodes
        if "master" in aliases:
            raise exception.ClusterValidationError(
                "worker nodes cannot have master as an alias")
        if not no_create:
            for node in running_pending:
                if node.alias in aliases:
                    raise exception.ClusterValidationError(
                        "node with alias %s already exists" % node.alias)
            log.info("Launching node(s): %s" % ', '.join(aliases))
            self.create_nodes(aliases, count=len(aliases))
        self.wait_for_cluster(msg="Waiting for node(s) to come up...")
        log.debug("Adding node(s): %s" % aliases)
        default_plugin = clustersetup.DefaultClusterSetup(self.disable_queue,
                                                          self.disable_threads)
        for alias in aliases:
            node = self.get_node_by_alias(alias)
            default_plugin.on_add_node(
                node, self.nodes, self.master_node,
                self.cluster_user, self.cluster_shell,
                self.volumes)
            self.run_plugins(method_name="on_add_node", node=node)

    def remove_node(self, node, terminate=True):
        """
        Remove a single node from this cluster
        """
        return self.remove_nodes([node], terminate=terminate)

    def remove_nodes(self, nodes, terminate=True):
        """
        Remove a list of nodes from this cluster
        """
        default_plugin = clustersetup.DefaultClusterSetup(self.disable_queue,
                                                          self.disable_threads)
        for node in nodes:
            if node.is_master():
                raise exception.InvalidOperation("cannot remove master node")
            self.run_plugins(method_name="on_remove_node",
                             node=node, reverse=True)
            default_plugin.on_remove_node(
                node, self.nodes, self.master_node,
                self.cluster_user, self.cluster_shell,
                self.volumes)
            if not terminate:
                continue
            if node.spot_id:
                log.info("Cancelling spot request %s" % node.spot_id)
                node.get_spot_request().cancel()
            node.terminate()

    def _get_launch_map(self, reverse=False):
        """
        Groups all node-aliases that have similar instance types/image ids.
        Returns a dictionary that's used to launch all similar instance types
        and image ids in the same request. Example return value:

        {('c1.xlarge', 'ami-a5c02dcc'): ['node001', 'node002'],
         ('m1.large', 'ami-a5c02dcc'): ['node003'],
         ('m1.small', 'ami-17b15e7e'): ['master', 'node005', 'node006'],
         ('m1.small', 'ami-19e17a2b'): ['node004']}

        Passing reverse=True will return the same information only keyed by
        node aliases:

        {'master': ('m1.small', 'ami-17b15e7e'),
         'node001': ('c1.xlarge', 'ami-a5c02dcc'),
         'node002': ('c1.xlarge', 'ami-a5c02dcc'),
         'node003': ('m1.large', 'ami-a5c02dcc'),
         'node004': ('m1.small', 'ami-19e17a2b'),
         'node005': ('m1.small', 'ami-17b15e7e'),
         'node006': ('m1.small', 'ami-17b15e7e')}
        """
        lmap = {}
        mtype = self.master_instance_type or self.node_instance_type
        mimage = self.master_image_id or self.node_image_id
        lmap[(mtype, mimage)] = ['master']
        id_start = 1
        for itype in self.node_instance_types:
            count = itype['size']
            image_id = itype['image'] or self.node_image_id
            type = itype['type'] or self.node_instance_type
            if not (type, image_id) in lmap:
                lmap[(type, image_id)] = []
            for id in range(id_start, id_start + count):
                alias = 'node%.3d' % id
                log.debug("Launch map: %s (ami: %s, type: %s)..." %
                          (alias, image_id, type))
                lmap[(type, image_id)].append(alias)
                id_start += 1
        ntype = self.node_instance_type
        nimage = self.node_image_id
        if not (ntype, nimage) in lmap:
            lmap[(ntype, nimage)] = []
        for id in range(id_start, self.cluster_size):
            alias = 'node%.3d' % id
            log.debug("Launch map: %s (ami: %s, type: %s)..." %
                      (alias, nimage, ntype))
            lmap[(ntype, nimage)].append(alias)
        if reverse:
            rlmap = {}
            for (itype, image_id) in lmap:
                aliases = lmap.get((itype, image_id))
                for alias in aliases:
                    rlmap[alias] = (itype, image_id)
            return rlmap
        return lmap

    def _get_type_and_image_id(self, alias):
        """
        Returns (instance_type, image_id) for a given alias based
        on the map returned from self._get_launch_map
        """
        lmap = self._get_launch_map()
        for (type, image) in lmap:
            key = (type, image)
            if alias in lmap.get(key):
                return key

    def create_cluster(self):
        """
        Launches all EC2 instances based on this cluster's settings.
        """
        log.info("Launching a %d-node cluster..." % self.cluster_size)
        mtype = self.master_instance_type or self.node_instance_type
        self.master_instance_type = mtype
        if self.spot_bid:
            self._create_spot_cluster()
        else:
            self._create_flat_rate_cluster()

    def _create_flat_rate_cluster(self):
        """
        Launches the cluster using flat-rate instances. This method attempts
        to minimize the number of launch requests by grouping nodes of the
        same type/ami and launching each group simultaneously within a single
        launch request. This is especially important for Cluster Compute
        instances given that Amazon *highly* recommends requesting all CCI in
        a single launch request.
        """
        lmap = self._get_launch_map()
        zone = None
        master_map = None
        for (type, image) in lmap:
            # launch all aliases that match master's itype/image_id
            aliases = lmap.get((type, image))
            if 'master' in aliases:
                master_map = (type, image)
                for alias in aliases:
                    log.debug("Launching %s (ami: %s, type: %s)" %
                              (alias, image, type))
                master_response = self.create_nodes(aliases, image_id=image,
                                                    instance_type=type,
                                                    count=len(aliases),
                                                    force_flat=True)[0]
                zone = master_response.instances[0].placement
        lmap.pop(master_map)
        if self.cluster_size <= 1:
            return
        for (type, image) in lmap:
            aliases = lmap.get((type, image))
            for alias in aliases:
                log.debug("Launching %s (ami: %s, type: %s)" %
                          (alias, image, type))
            self.create_nodes(aliases, image_id=image, instance_type=type,
                              count=len(aliases), zone=zone, force_flat=True)

    def _create_spot_cluster(self):
        """
        Launches the cluster using all spot instances. This method makes a
        single spot request for each node in the cluster since spot instances
        *always* have an ami_launch_index of 0. This is needed in order to
        correctly assign aliases to nodes.
        """
        (mtype, mimage) = self._get_type_and_image_id('master')
        log.info("Launching master node (ami: %s, type: %s)..." %
                 (mimage, mtype))
        force_flat = not self.force_spot_master and self.cluster_size > 1
        master_response = self.create_node('master',
                                           image_id=mimage,
                                           instance_type=mtype,
                                           force_flat=force_flat)
        zone = None
        if not force_flat and self.spot_bid:
            # master was requested as a spot instance - make sure nodes are
            # in the same zone as master
            launch_spec = master_response.launch_specification
            zone = launch_spec.placement
        else:
            # master is a flat-rate instance - make sure nodes are in the
            # same zone as master
            zone = master_response.instances[0].placement
        if self.cluster_size <= 1:
            return
        for id in range(1, self.cluster_size):
            alias = 'node%.3d' % id
            (ntype, nimage) = self._get_type_and_image_id(alias)
            log.info("Launching %s (ami: %s, type: %s)" %
                     (alias, nimage, ntype))
            self.create_node(alias, image_id=nimage, instance_type=ntype,
                             zone=zone)

    def is_spot_cluster(self):
        """
        Returns True if all nodes are spot instances
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if not node.is_spot():
                return False
        return True

    def has_spot_nodes(self):
        """
        Returns True if any nodes are spot instances
        """
        for node in self.nodes:
            if node.is_spot():
                return True
        return False

    def is_ebs_cluster(self):
        """
        Returns True if all nodes are EBS-backed
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if not node.is_ebs_backed():
                return False
        return True

    def has_ebs_nodes(self):
        """
        Returns True if any nodes are EBS-backed
        """
        for node in self.nodes:
            if node.is_ebs_backed():
                return True
        return False

    def is_stoppable(self):
        """
        Returns True if all nodes are stoppable (i.e. non-spot and EBS-backed)
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in self.nodes:
            if not node.is_stoppable():
                return False
        return True

    def has_stoppable_nodes(self):
        """
        Returns True if any nodes are stoppable (i.e. non-spot and EBS-backed)
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if node.is_stoppable():
                return True
        return False

    def is_cluster_compute(self):
        """
        Returns True if all instances are Cluster/GPU Compute type
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if not node.is_cluster_compute():
                return False
        return True

    def has_cluster_compute_nodes(self):
        for node in self.nodes:
            if node.is_cluster_compute():
                return True
        return False

    def is_cluster_up(self):
        """
        Check that all nodes are 'running' and that ssh is up on all nodes.
        This method will return False if any spot requests are in an 'open'
        state.
        """
        spots = self.spot_requests
        active_spots = filter(lambda x: x.state == 'active', spots)
        if len(spots) != len(active_spots):
            return False
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if not node.is_up():
                return False
        return True

    def get_spinner(self, msg):
        """
        Logs a status msg, starts a spinner, and returns the spinner object.
        This is useful for long running processes:

            s = self.get_spinner("Long running process running...")
            (do something)
            s.stop()
        """
        s = spinner.Spinner()
        log.info(msg, extra=dict(__nonewline__=True))
        s.start()
        return s

    @property
    def progress_bar(self):
        if not self._progress_bar:
            widgets = ['', progressbar.Fraction(), ' ',
                       progressbar.Bar(marker=progressbar.RotatingMarker()),
                       ' ', progressbar.Percentage(), ' ', ' ']
            pbar = progressbar.ProgressBar(widgets=widgets,
                                           maxval=self.cluster_size,
                                           force_update=True)
            self._progress_bar = pbar
        return self._progress_bar

    def wait_for_active_spots(self, spots=None):
        """
        Wait for all open spot requests for this cluster to transition to
        'active'.
        """
        spots = spots or self.spot_requests
        open_spots = [spot for spot in spots if spot.state == "open"]
        if open_spots:
            pbar = self.progress_bar.reset()
            log.info('Waiting for open spot requests to become active...')
            pbar.maxval = len(spots)
            pbar.update(0)
            while not pbar.finished:
                active_spots = filter(lambda x: x.state == "active", spots)
                pbar.maxval = len(spots)
                pbar.update(len(active_spots))
                if not pbar.finished:
                    time.sleep(self.refresh_interval)
                    spots = self.get_spot_requests_or_raise()
            pbar.reset()

    def wait_for_active_instances(self, nodes=None):
        """
        Wait indefinitely for cluster nodes to show up.
        """
        nodes = nodes or self.nodes
        if len(nodes) == 0:
            s = self.get_spinner("Waiting for instances to activate...")
            while len(nodes) == 0:
                time.sleep(self.refresh_interval)
                nodes = self.nodes
            s.stop()

    def wait_for_running_instances(self, nodes=None):
        """
        Wait until all cluster nodes are in a 'running' state
        """
        log.info("Waiting for all nodes to be in a 'running' state...")
        nodes = nodes or self.get_nodes_or_raise()
        pbar = self.progress_bar.reset()
        pbar.maxval = len(nodes)
        pbar.update(0)
        while not pbar.finished:
            running_nodes = filter(lambda x: x.state == "running", nodes)
            pbar.maxval = len(nodes)
            pbar.update(len(running_nodes))
            if not pbar.finished:
                time.sleep(self.refresh_interval)
                nodes = self.get_nodes_or_raise()
        pbar.reset()

    def wait_for_ssh(self, nodes=None):
        """
        Wait until SSH is up on all cluster nodes
        """
        log.info("Waiting for SSH to come up on all nodes...")
        nodes = nodes or self.get_nodes_or_raise()
        pbar = self.progress_bar.reset()
        pbar.maxval = len(nodes)
        pbar.update(0)
        while not pbar.finished:
            active_nodes = filter(lambda n: n.is_up(), nodes)
            pbar.maxval = len(nodes)
            pbar.update(len(active_nodes))
            if not pbar.finished:
                time.sleep(self.refresh_interval)
                nodes = self.get_nodes_or_raise()
        pbar.finish()

    @print_timing("Waiting for cluster to come up")
    def wait_for_cluster(self, msg="Waiting for cluster to come up..."):
        """
        Wait for cluster to come up and display progress bar. Waits for all
        spot requests to become 'active', all instances to be in a 'running'
        state, and for all SSH daemons to come up.

        msg - custom message to print out before waiting on the cluster
        """
        interval = self.refresh_interval
        log.info("%s %s" % (msg, "(updating every %ds)" % interval))
        self.wait_for_active_spots()
        self.wait_for_active_instances()
        self.wait_for_running_instances()
        self.wait_for_ssh()

    def is_cluster_stopped(self):
        """
        Check whether all nodes are in the 'stopped' state
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if node.state != 'stopped':
                return False
        return True

    def is_cluster_terminated(self):
        """
        Check whether all nodes are in a 'terminated' state
        """
        states = filter(lambda x: x != 'terminated', static.INSTANCE_STATES)
        filters = {'group-name': self._security_group,
                   'instance-state-name': states}
        insts = self.ec2.get_all_instances(filters=filters)
        return len(insts) == 0

    def attach_volumes_to_master(self):
        """
        Attach each volume to the master node
        """
        for vol in self.volumes:
            volume = self.volumes.get(vol)
            device = volume.get('device')
            vol_id = volume.get('volume_id')
            vol = self.ec2.get_volume(vol_id)
            if vol.attach_data.instance_id == self.master_node.id:
                log.info("Volume %s already attached to master...skipping" %
                         vol.id)
                continue
            if vol.status != "available":
                log.error(('Volume %s not available...' +
                           'please check and try again') % vol.id)
                continue
            log.info("Attaching volume %s to master node on %s ..." %
                     (vol.id, device))
            resp = vol.attach(self.master_node.id, device)
            log.debug("resp = %s" % resp)
            while True:
                vol.update()
                if vol.attachment_state() == 'attached':
                    break
                time.sleep(5)

    def detach_volumes(self):
        """
        Detach all volumes from all nodes
        """
        for node in self.nodes:
            node.detach_external_volumes()

    @print_timing('Restarting cluster')
    def restart_cluster(self):
        """
        Reboot all instances and reconfigure the cluster
        """
        nodes = self.nodes
        if not nodes:
            raise exception.ClusterValidationError("No running nodes found")
        self.run_plugins(method_name="on_restart", reverse=True)
        log.info("Rebooting cluster...")
        for node in nodes:
            node.reboot()
        sleep = 20
        log.info("Sleeping for %d seconds..." % sleep)
        time.sleep(sleep)
        self.setup_cluster()

    def stop_cluster(self, terminate_unstoppable=False):
        """
        Shutdown this cluster by detaching all volumes and 'stopping' all
        nodes

        In general, all nodes in the cluster must be 'stoppable', meaning all
        nodes are backed by flat-rate EBS-backed instances. If any
        'unstoppable' nodes are found an exception is raised. A node is
        'unstoppable' if it is backed by either a spot or S3-backed instance.

        If the cluster contains a mix of 'stoppable' and 'unstoppable' nodes
        you can stop all stoppable nodes and terminate any unstoppable nodes
        by setting terminate_unstoppable=True.

        This will stop all nodes that can be stopped and terminate the rest.
        """
        nodes = self.nodes
        if not nodes:
            raise exception.ClusterValidationError("No running nodes found")
        if not self.is_stoppable():
            has_stoppable_nodes = self.has_stoppable_nodes()
            if not terminate_unstoppable and has_stoppable_nodes:
                raise exception.InvalidOperation(
                    "Cluster contains nodes that are not stoppable")
            if not has_stoppable_nodes:
                raise exception.InvalidOperation(
                    "Cluster does not contain any stoppable nodes")
        try:
            self.run_plugins(method_name="on_shutdown", reverse=True)
        except exception.MasterDoesNotExist, e:
            log.warn("Cannot run plugins: %s" % e)
        self.detach_volumes()
        for node in nodes:
            node.shutdown()

    def terminate_cluster(self):
        """
        Destroy this cluster by first detaching all volumes, shutting down
        all instances, cancelling all spot requests (if any), removing its
        placement group (if any), and removing its security group.
        """
        try:
            self.run_plugins(method_name="on_shutdown", reverse=True)
        except exception.MasterDoesNotExist, e:
            log.warn("Cannot run plugins: %s" % e)
        self.detach_volumes()
        nodes = self.nodes
        for node in nodes:
            node.terminate()
        for spot in self.spot_requests:
            if spot.state not in ['cancelled', 'closed']:
                log.info("Cancelling spot instance request: %s" % spot.id)
                spot.cancel()
        sg = self.ec2.get_group_or_none(self._security_group)
        pg = self.ec2.get_placement_group_or_none(self._security_group)
        if nodes and (sg or pg):
            s = self.get_spinner("Waiting for cluster to terminate...")
            while not self.is_cluster_terminated():
                time.sleep(5)
            s.stop()
        if pg:
            log.info("Removing %s placement group" % pg.name)
            pg.delete()
        if sg:
            log.info("Removing %s security group" % sg.name)
            sg.delete()

    def start(self, create=True, create_only=False, validate=True,
              validate_only=False, validate_running=False):
        """
        Creates and configures a cluster from this cluster template's
        settings.

        create - create new nodes when starting the cluster. Set to False to
                 use existing nodes
        create_only - only create the cluster node instances, don't configure
                      the cluster
        validate - whether or not to validate the cluster settings used.
                   False will ignore validate_only and validate_running
                   keywords and is effectively the same as running _start
        validate_only - only validate cluster settings, do not create or
                        configure the cluster
        validate_running - whether or not to validate the existing instances
                           being used against this cluster's settings
        """
        if validate:
            if not create and validate_running:
                try:
                    self._validate_running_instances()
                except exception.ClusterValidationError, e:
                    msg = "Existing nodes are not compatible with cluster "
                    msg += "settings:\n"
                    e.msg = msg + e.msg
                    raise
            elif create:
                self._validate()
            if validate_only:
                return
        else:
            log.warn("SKIPPING VALIDATION - USE AT YOUR OWN RISK")
        return self._start(create=create, create_only=create_only)

    @print_timing("Starting cluster")
    def _start(self, create=True, create_only=False):
        """
        Create and configure a cluster from this cluster template's settings
        (does not attempt to validate before running)

        create - create new nodes when starting the cluster. Set to False to
                 use existing nodes
        create_only - only create the cluster node instances, don't configure
                      the cluster
        """
        log.info("Starting cluster...")
        if create:
            self.create_cluster()
        else:
            assert self.master_node is not None
            for node in self.stopped_nodes:
                log.info("Starting stopped node: %s" % node.alias)
                node.start()
        if create_only:
            return
        self.setup_cluster()

    def setup_cluster(self):
        """
        Waits for all nodes to come up and then runs the default
        StarCluster setup routines followed by any additional plugin setup
        routines
        """
        self.wait_for_cluster()
        self._setup_cluster()

    @print_timing("Configuring cluster")
    def _setup_cluster(self):
        """
        Runs the default StarCluster setup routines followed by any
        additional plugin setup routines. Does not wait for nodes to come up.
        """
        log.info("The master node is %s" % self.master_node.dns_name)
        log.info("Setting up the cluster...")
        if self.volumes:
            self.attach_volumes_to_master()
        default_plugin = clustersetup.DefaultClusterSetup(self.disable_queue,
                                                          self.disable_threads)
        default_plugin.run(self.nodes, self.master_node, self.cluster_user,
                           self.cluster_shell, self.volumes)
        self.run_plugins()

    def run_plugins(self, plugins=None, method_name="run", node=None,
                    reverse=False):
        """
        Run all plugins specified in this Cluster object's self.plugins list.
        Uses the plugins list instead of self.plugins if specified.

        Each plugin must be a tuple: the first element is the plugin's name,
        the second element is the plugin object (a subclass of ClusterSetup)
        """
        plugs = plugins or self.plugins
        if reverse:
            plugs = plugs[:]
            plugs.reverse()
        for plug in plugs:
            name, plugin = plug
            self.run_plugin(plugin, name, method_name=method_name, node=node)

    def run_plugin(self, plugin, name='', method_name='run', node=None):
        """
        Run a StarCluster plugin.

        plugin - an instance of the plugin's class
        name - a user-friendly label for the plugin
        method_name - the method to run within the plugin (default: "run")
        node - optional node to pass as first argument to plugin method (used
               for on_add_node/on_remove_node)
        """
        plugin_name = name or str(plugin)
        try:
            func = getattr(plugin, method_name, None)
            if not func:
                log.warn("Plugin %s has no %s method...skipping" %
                         (plugin_name, method_name))
                return
            args = [self.nodes, self.master_node, self.cluster_user,
                    self.cluster_shell, self.volumes]
            if node:
                args.insert(0, node)
            log.info("Running plugin %s" % plugin_name)
            func(*args)
        except NotImplementedError:
            log.debug("method %s not implemented by plugin %s" %
                      (method_name, plugin_name))
        except exception.MasterDoesNotExist:
            raise
        except Exception, e:
            log.error("Error occurred while running plugin '%s':" %
                      plugin_name)
            if isinstance(e, exception.ThreadPoolException):
                e.print_excs()
                log.debug(e.format_excs())
            else:
                traceback.print_exc()
                log.debug(traceback.format_exc())

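    # A minimal plugin sketch compatible with run_plugin (module/class names
    # are illustrative; ClusterSetup is defined in starcluster.clustersetup):
    #
    #   from starcluster.clustersetup import ClusterSetup
    #
    #   class HelloPlugin(ClusterSetup):
    #       def run(self, nodes, master, user, user_shell, volumes):
    #           for node in nodes:
    #               node.ssh.execute('hostname')
    #
    # With a [plugin hello] config section pointing at this class,
    # ClusterManager.run_plugin('hello', 'mycluster') loads it and invokes
    # run() with the argument list shown above.
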
    def is_running_valid(self):
        """
        Checks whether the current running instances are compatible
        with this cluster template's settings
        """
        try:
            self._validate_running_instances()
            return True
        except exception.ClusterValidationError, e:
            log.error(e.msg)
            return False

    def _validate(self):
        """
        Checks that all cluster template settings are valid. Raises a
        ClusterValidationError exception if not.
        """
        log.info("Validating cluster template settings...")
        try:
            self._has_all_required_settings()
            self._validate_spot_bid()
            self._validate_cluster_size()
            self._validate_shell_setting()
            self._validate_permission_settings()
            self._validate_credentials()
            self._validate_keypair()
            self._validate_zone()
            self._validate_ebs_settings()
            self._validate_ebs_aws_settings()
            self._validate_image_settings()
            self._validate_instance_types()
            self._validate_cluster_compute()
            log.info('Cluster template settings are valid')
            return True
        except exception.ClusterValidationError, e:
            e.msg = 'Cluster settings are not valid:\n%s' % e.msg
            raise

    def is_valid(self):
        """
        Returns True if all cluster template settings are valid
        """
        try:
            self._validate()
            return True
        except exception.ClusterValidationError, e:
            log.error(e.msg)
            return False

    def _validate_spot_bid(self):
        if self.spot_bid is not None:
            if type(self.spot_bid) not in [int, float]:
                raise exception.ClusterValidationError(
                    'spot_bid must be integer or float')
            if self.spot_bid <= 0:
                raise exception.ClusterValidationError(
                    'spot_bid must be an integer or float > 0')
        return True

    def _validate_cluster_size(self):
        try:
            # coerce to int before comparing (a string cluster_size would
            # otherwise pass the comparison silently in Python 2)
            if int(self.cluster_size) < 1:
                raise ValueError
        except (ValueError, TypeError):
            raise exception.ClusterValidationError(
                'cluster_size must be an integer >= 1')
        num_itypes = sum([i.get('size') for i in self.node_instance_types])
        num_nodes = self.cluster_size - 1
        if num_itypes > num_nodes:
            raise exception.ClusterValidationError(
                ("total number of nodes specified in node_instance_type (%s)"
                 " must be <= cluster_size-1 (%s)") % (num_itypes, num_nodes))
        return True

    def _validate_shell_setting(self):
        cluster_shell = self.cluster_shell
        if not self.__available_shells.get(cluster_shell):
            raise exception.ClusterValidationError(
                'Invalid user shell specified. Options are %s' %
                ' '.join(self.__available_shells.keys()))
        return True

    def _validate_image_settings(self):
        master_image_id = self.master_image_id
        node_image_id = self.node_image_id
        conn = self.ec2
        image = conn.get_image_or_none(node_image_id)
        if not image or image.id != node_image_id:
            raise exception.ClusterValidationError(
                'node_image_id %s does not exist' % node_image_id)
        if master_image_id:
            master_image = conn.get_image_or_none(master_image_id)
            if not master_image or master_image.id != master_image_id:
                raise exception.ClusterValidationError(
                    'master_image_id %s does not exist' % master_image_id)
        return True

    def _validate_zone(self):
        availability_zone = self.availability_zone
        if availability_zone:
            zone = self.ec2.get_zone(availability_zone)
            if not zone:
                azone = self.availability_zone
                raise exception.ClusterValidationError(
                    'availability_zone = %s does not exist' % azone)
            if zone.state != 'available':
                log.warn('The availability_zone = %s ' % zone +
                         'is not available at this time')
        return True

    def __check_platform(self, image_id, instance_type):
        """
        Validates whether an image_id (AMI) is compatible with a given
        instance_type.
        """
        image = self.ec2.get_image_or_none(image_id)
        if not image:
            raise exception.ClusterValidationError('Image %s does not exist' %
                                                   image_id)
        image_platform = image.architecture
        image_is_hvm = (image.virtualization_type == "hvm")
        if image_is_hvm and not instance_type in static.CLUSTER_TYPES:
            cctypes_list = ', '.join(static.CLUSTER_TYPES)
            raise exception.ClusterValidationError(
                "Image '%s' is a Cluster Compute/GPU image (HVM) and cannot "
                "be used with instance type '%s'\nThe instance type "
                "for a Cluster Compute/GPU image (HVM) must be one of: %s" %
                (image_id, instance_type, cctypes_list))
        instance_platforms = self.__instance_types[instance_type]
        if image_platform not in instance_platforms:
            error_msg = "Instance type %(instance_type)s is for an " \
                        "%(instance_platform)s platform while " \
                        "%(image_id)s is an %(image_platform)s platform"
            error_dict = {'instance_type': instance_type,
                          'instance_platform': ', '.join(instance_platforms),
                          'image_id': image_id,
                          'image_platform': image_platform}
            raise exception.ClusterValidationError(error_msg % error_dict)
        return True

1636 - def _validate_instance_types(self):
1637 master_image_id = self.master_image_id 1638 node_image_id = self.node_image_id 1639 master_instance_type = self.master_instance_type 1640 node_instance_type = self.node_instance_type 1641 instance_types = self.__instance_types 1642 instance_type_list = ', '.join(instance_types.keys()) 1643 if not node_instance_type in instance_types: 1644 raise exception.ClusterValidationError( 1645 ("You specified an invalid node_instance_type %s \n" + 1646 "Possible options are:\n%s") % \ 1647 (node_instance_type, instance_type_list)) 1648 elif master_instance_type: 1649 if not master_instance_type in instance_types: 1650 raise exception.ClusterValidationError( 1651 ("You specified an invalid master_instance_type %s\n" + \ 1652 "Possible options are:\n%s") % \ 1653 (master_instance_type, instance_type_list)) 1654 try: 1655 self.__check_platform(node_image_id, node_instance_type) 1656 except exception.ClusterValidationError, e: 1657 raise exception.ClusterValidationError( 1658 'Incompatible node_image_id and node_instance_type:\n' + e.msg) 1659 if master_image_id and not master_instance_type: 1660 try: 1661 self.__check_platform(master_image_id, node_instance_type) 1662 except exception.ClusterValidationError, e: 1663 raise exception.ClusterValidationError( 1664 'Incompatible master_image_id and ' + 1665 'node_instance_type\n' + e.msg) 1666 elif master_image_id and master_instance_type: 1667 try: 1668 self.__check_platform(master_image_id, master_instance_type) 1669 except exception.ClusterValidationError, e: 1670 raise exception.ClusterValidationError( 1671 'Incompatible master_image_id and ' + 1672 'master_instance_type\n' + e.msg) 1673 elif master_instance_type and not master_image_id: 1674 try: 1675 self.__check_platform(node_image_id, master_instance_type) 1676 except exception.ClusterValidationError, e: 1677 raise exception.ClusterValidationError( 1678 'Incompatible node_image_id and ' + 1679 'master_instance_type\n' + e.msg) 1680 for itype in self.node_instance_types: 1681 type = itype.get('type') 1682 img = itype.get('image') or node_image_id 1683 if not type in instance_types: 1684 raise exception.ClusterValidationError( 1685 ("You specified an invalid instance type %s \n" + 1686 "Possible options are:\n%s") % (type, instance_type_list)) 1687 try: 1688 self.__check_platform(img, type) 1689 except exception.ClusterValidationError, e: 1690 raise exception.ClusterValidationError( 1691 "Invalid settings for node_instance_type %s: %s" % 1692 (type, e.msg)) 1693 return True
1694
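For orientation, the validator above also walks `self.node_instance_types` for per-node overrides; judging from the loop, each entry is a dict with a 'type' key and an optional 'image' key. A hypothetical value (the types and AMI id are placeholders):

    node_instance_types = [
        {'type': 'm1.small', 'image': None},             # falls back to node_image_id
        {'type': 'c1.xlarge', 'image': 'ami-99999999'},  # per-type image override
    ]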
1695 - def _validate_cluster_compute(self):
1696          lmap = self._get_launch_map()
1697          for (type, image) in lmap:
1698              if type in static.CLUSTER_TYPES:
1699                  img = self.ec2.get_image(image)
1700                  if img.virtualization_type != 'hvm':
1701                      raise exception.ClusterValidationError(
1702                          'Cluster Compute/GPU instance type %s '
1703                          'can only be used with HVM images.\n'
1704                          'Image %s is NOT an HVM image.' % (type, image))
1705
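The launch map consumed above pairs each instance type with the image it will boot, so the HVM rule can be applied per pair; a hypothetical value (the AMI ids are placeholders):

    lmap = [('cc1.4xlarge', 'ami-11111111'),   # cluster type: HVM image required
            ('m1.small', 'ami-22222222')]      # non-cluster type: any image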
1706 - def _validate_ebs_aws_settings(self):
1707 """ 1708 Verify EBS volumes exists and that each volume's zone matches this 1709 cluster's zone setting. 1710 """ 1711 for vol in self.volumes: 1712 v = self.volumes.get(vol) 1713 vol_id = v.get('volume_id') 1714 vol = self.ec2.get_volume(vol_id) 1715 if vol.status != 'available': 1716 if self.master_node: 1717 if vol.attach_data.instance_id == self.master_node.id: 1718 continue 1719 msg = "volume %s is not available (status: %s)" % (vol_id, 1720 vol.status) 1721 raise exception.ClusterValidationError(msg)
1722
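The same status check can be reproduced with boto directly, which is roughly what `self.ec2.get_volume` wraps here; a sketch assuming a boto EC2 connection `conn` (the volume id is a placeholder):

    vol = conn.get_all_volumes(['vol-c9999999'])[0]
    if vol.status != 'available':
        # a volume already attached to the master node is tolerated above
        print 'volume %s is %s (attached to %s)' % (
            vol.id, vol.status, vol.attach_data.instance_id)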
1723 - def _validate_permission_settings(self):
1724          permissions = self.permissions
1725          for perm in permissions:
1726              permission = permissions.get(perm)
1727              protocol = permission.get('ip_protocol')
1728              if protocol not in self.__protocols:
1729                  raise exception.InvalidProtocol(protocol)
1730              from_port = permission.get('from_port')
1731              to_port = permission.get('to_port')
1732              try:
1733                  from_port = int(from_port)
1734                  to_port = int(to_port)
1735              except ValueError:
1736                  raise exception.InvalidPortRange(
1737                      from_port, to_port, reason="integer range required")
1738              if from_port < 0 or to_port < 0:
1739                  raise exception.InvalidPortRange(
1740                      from_port, to_port,
1741                      reason="from/to must be positive integers")
1742              if from_port > to_port:
1743                  raise exception.InvalidPortRange(
1744                      from_port, to_port,
1745                      reason="'from_port' must be <= 'to_port'")
1746              cidr_ip = permission.get('cidr_ip')
1747              if not iptools.validate_cidr(cidr_ip):
1748                  raise exception.InvalidCIDRSpecified(cidr_ip)
1749
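From the lookups in the loop above, each entry in `self.permissions` is a dict with 'ip_protocol', 'from_port', 'to_port', and 'cidr_ip' keys. A hypothetical example of a valid entry opening HTTP to the world:

    permissions = {
        'http': {'ip_protocol': 'tcp',      # must be a known protocol
                 'from_port': 80,           # coerced to int, must be >= 0
                 'to_port': 80,             # must be >= from_port
                 'cidr_ip': '0.0.0.0/0'},   # must be a valid CIDR block
    }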
1750 - def _validate_ebs_settings(self):
1751 """ 1752 Check EBS vols for missing/duplicate DEVICE/PARTITION/MOUNT_PATHs 1753 and validate these settings. Does not require AWS credentials. 1754 """ 1755 volmap = {} 1756 devmap = {} 1757 mount_paths = [] 1758 for vol in self.volumes: 1759 vol_name = vol 1760 vol = self.volumes.get(vol) 1761 vol_id = vol.get('volume_id') 1762 device = vol.get('device') 1763 partition = vol.get('partition') 1764 mount_path = vol.get("mount_path") 1765 vmap = volmap.get(vol_id, {}) 1766 devices = vmap.get('device', []) 1767 partitions = vmap.get('partition', []) 1768 if devices and device not in devices: 1769 raise exception.ClusterValidationError( 1770 "Can't attach volume %s to more than one device" % vol_id) 1771 elif partitions and partition in partitions: 1772 raise exception.ClusterValidationError( 1773 "Multiple configurations for %s\n" 1774 "Either pick one or specify a separate partition for " 1775 "each configuration" % vol_id) 1776 vmap['partition'] = partitions + [partition] 1777 vmap['device'] = devices + [device] 1778 volmap[vol_id] = vmap 1779 dmap = devmap.get(device, {}) 1780 vol_ids = dmap.get('volume_id', []) 1781 if vol_ids and vol_id not in vol_ids: 1782 raise exception.ClusterValidationError( 1783 "Can't attach more than one volume on device %s" % device) 1784 dmap['volume_id'] = vol_ids + [vol_id] 1785 devmap[device] = dmap 1786 mount_paths.append(mount_path) 1787 if not device: 1788 raise exception.ClusterValidationError( 1789 'Missing DEVICE setting for volume %s' % vol_name) 1790 if not utils.is_valid_device(device): 1791 raise exception.ClusterValidationError( 1792 "Invalid DEVICE value for volume %s" % vol_name) 1793 if partition: 1794 if not utils.is_valid_partition(partition): 1795 raise exception.ClusterValidationError( 1796 "Invalid PARTITION value for volume %s" % vol_name) 1797 if not partition.startswith(device): 1798 raise exception.ClusterValidationError( 1799 "Volume PARTITION must start with %s" % device) 1800 if not mount_path: 1801 raise exception.ClusterValidationError( 1802 'Missing MOUNT_PATH setting for volume %s' % vol_name) 1803 if not mount_path.startswith('/'): 1804 raise exception.ClusterValidationError( 1805 "MOUNT_PATH for volume %s should start with /" % vol_name) 1806 for path in mount_paths: 1807 if mount_paths.count(path) > 1: 1808 raise exception.ClusterValidationError( 1809 "Can't mount more than one volume on %s" % path) 1810 return True
1811
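Likewise, the volume settings validated above take the following per-volume shape (the id and paths are placeholders); note that PARTITION must extend DEVICE, and MOUNT_PATH must be absolute and unique across volumes:

    volumes = {
        'myvol': {'volume_id': 'vol-c9999999',
                  'device': '/dev/sdz',
                  'partition': '/dev/sdz1',   # must start with the device
                  'mount_path': '/data'},     # absolute, unique per cluster
    }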
1812 - def _has_all_required_settings(self):
1813          has_all_required = True
1814          for opt in self.__cluster_settings:
1815              requirements = self.__cluster_settings[opt]
1816              name = opt
1817              required = requirements[1]
1818              if required and self.get(name.lower()) is None:
1819                  log.warn('Missing required setting %s' % name)
1820                  has_all_required = False
1821          return has_all_required
1822
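The settings table consulted above maps each option name to a spec tuple whose second element is the required flag; a sketch of the shape inferred from the loop (the entries and types here are illustrative):

    cluster_settings = {
        'cluster_size': (int, True, None),         # required
        'cluster_user': (str, False, 'sgeadmin'),  # optional, has a default
    }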
1823 - def _validate_credentials(self):
1824          if not self.ec2.is_valid_conn():
1825              raise exception.ClusterValidationError(
1826                  'Invalid AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY combination.')
1827          return True
1828
1829 - def _validate_keypair(self):
1830          key_location = self.key_location
1831          if not key_location:
1832              raise exception.ClusterValidationError(
1833                  "no key_location specified for key '%s'" % self.keyname)
1834          if not os.path.exists(key_location):
1835              raise exception.ClusterValidationError(
1836                  'key_location=%s does not exist.' %
1837                  key_location)
1838          elif not os.path.isfile(key_location):
1839              raise exception.ClusterValidationError(
1840                  'key_location=%s is not a file.' %
1841                  key_location)
1842          keyname = self.keyname
1843          keypair = self.ec2.get_keypair_or_none(keyname)
1844          if not keypair:
1845              raise exception.ClusterValidationError(
1846                  'Account does not contain a key with keyname = %s.' % keyname)
1847          if self.zone:
1848              z = self.ec2.get_zone(self.zone)
1849              if keypair.region != z.region:
1850                  raise exception.ClusterValidationError(
1851                      'Keypair %s not in availability zone region %s' %
1852                      (keyname, z.region))
1853          return True
1854
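A standalone sketch of the same key checks, assuming a boto EC2 connection `conn` (the key name and path are placeholders):

    import os

    key_location = os.path.expanduser('~/.ssh/mykey.rsa')
    if not os.path.isfile(key_location):
        raise ValueError('%s is missing or not a file' % key_location)
    if not [k for k in conn.get_all_key_pairs() if k.name == 'mykey']:
        raise ValueError("account has no keypair named 'mykey'")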
1855 - def ssh_to_master(self, user='root', command=None):
1856          return self.ssh_to_node('master', user=user, command=command)
1857
1858 - def ssh_to_node(self, alias, user='root', command=None):
1859          node = self.get_node_by_alias(alias)
1860          node = node or self.get_node_by_dns_name(alias)
1861          node = node or self.get_node_by_id(alias)
1862          if not node:
1863              raise exception.InstanceDoesNotExist(alias, label='node')
1864          if command:
1865              orig_user = node.ssh.get_current_user()
1866              node.ssh.switch_user(user)
1867              node.ssh.execute(command, silent=False)
1868              node.ssh.switch_user(orig_user)
1869              return node.ssh.get_last_status()
1870          else:
1871              node.shell(user=user)
1872 
1873  if __name__ == "__main__":
1874      from starcluster.config import StarClusterConfig
1875      cfg = StarClusterConfig().load()
1876      sc = cfg.get_cluster_template('smallcluster', 'mynewcluster')
1877      if sc.is_valid():
1878          sc.start(create=True)
1879 
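Finally, a usage sketch of the ssh helpers defined above, given a Cluster object `cl` such as the one built in the __main__ block (the node alias is a placeholder):

    cl.ssh_to_master(user='root')                         # interactive shell
    status = cl.ssh_to_node('node001', command='uptime')  # run one command
    print 'exit status:', status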