
Source Code for Module starcluster.cluster

import os
import re
import time
import zlib
import string
import pprint
import base64
import cPickle
import traceback

from starcluster import ssh
from starcluster import utils
from starcluster import static
from starcluster import spinner
from starcluster import iptools
from starcluster import managers
from starcluster import exception
from starcluster import progressbar
from starcluster import clustersetup
from starcluster.node import Node
from starcluster.utils import print_timing
from starcluster.templates import user_msgs
from starcluster.logger import log

class ClusterManager(managers.Manager):
    """
    Manager class for Cluster objects
    """
    def __repr__(self):
        return "<ClusterManager: %s>" % self.ec2.region.name

    def get_cluster(self, cluster_name, group=None, load_receipt=True,
                    load_plugins=True, require_keys=True):
        """
        Returns a Cluster object representing an active cluster
        """
        try:
            clname = self._get_cluster_name(cluster_name)
            cltag = self.get_tag_from_sg(clname)
            if not group:
                group = self.ec2.get_security_group(clname)
            cl = Cluster(ec2_conn=self.ec2, cluster_tag=cltag,
                         cluster_group=group)
            if load_receipt:
                cl.load_receipt(load_plugins=load_plugins)
            try:
                key_location = self.cfg.get_key(cl.keyname).get('key_location')
                cl.key_location = key_location
            except exception.KeyNotFound:
                if require_keys:
                    raise
                cl.key_location = ''
            if require_keys:
                cl._validate_keypair()
            return cl
        except exception.SecurityGroupDoesNotExist:
            raise exception.ClusterDoesNotExist(cluster_name)

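A minimal usage sketch for get_cluster. The cluster name and the wiring below
are illustrative: StarClusterConfig().load() and cfg.get_easy_ec2() exist in
StarCluster, but the exact ClusterManager constructor signature is an
assumption here rather than something shown in this module:

    from starcluster.config import StarClusterConfig
    cfg = StarClusterConfig().load()              # reads ~/.starcluster/config
    cm = ClusterManager(cfg, cfg.get_easy_ec2())  # assumed (cfg, ec2) signature
    cl = cm.get_cluster('mycluster')   # raises ClusterDoesNotExist if absent
    print cl.master_node.dns_name
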
    def get_clusters(self, load_receipt=True, load_plugins=True):
        """
        Returns a list of all active clusters
        """
        cluster_groups = self.get_cluster_security_groups()
        clusters = [self.get_cluster(g.name, group=g,
                                     load_receipt=load_receipt,
                                     load_plugins=load_plugins)
                    for g in cluster_groups]
        return clusters

    def get_default_cluster_template(self):
        """
        Returns name of the default cluster template defined in the config
        """
        return self.cfg.get_default_cluster_template()

    def get_cluster_template(self, template_name, tag_name=None):
        """
        Returns a new Cluster object using the settings from the cluster
        template template_name

        If tag_name is passed, the Cluster object's cluster_tag setting will
        be set to tag_name
        """
        cl = self.cfg.get_cluster_template(template_name, tag_name=tag_name,
                                           ec2_conn=self.ec2)
        return cl

    def get_cluster_or_none(self, cluster_name, **kwargs):
        """
        Same as get_cluster but returns None instead of throwing an exception
        if the cluster does not exist
        """
        try:
            return self.get_cluster(cluster_name, **kwargs)
        except exception.ClusterDoesNotExist:
            pass

    def cluster_exists(self, tag_name):
        """
        Returns True if cluster exists
        """
        return self.get_cluster_or_none(tag_name) is not None

    def ssh_to_master(self, cluster_name, user='root', command=None):
        """
        ssh to master node of cluster_name

        user keyword specifies an alternate user to login as
        """
        cluster = self.get_cluster(cluster_name)
        return cluster.ssh_to_master(user=user, command=command)

    def ssh_to_cluster_node(self, cluster_name, node_id, user='root',
                            command=None):
        """
        ssh to a node in cluster_name that has either an id,
        dns name, or alias matching node_id

        user keyword specifies an alternate user to login as
        """
        cluster = self.get_cluster(cluster_name)
        return cluster.ssh_to_node(node_id, user=user, command=command)

    def _get_cluster_name(self, cluster_name):
        """
        Returns human readable cluster name/tag prefixed with '@sc-'
        """
        if not cluster_name.startswith(static.SECURITY_GROUP_PREFIX):
            cluster_name = static.SECURITY_GROUP_TEMPLATE % cluster_name
        return cluster_name

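A quick sketch of the prefixing rule above. It assumes
static.SECURITY_GROUP_PREFIX == '@sc' and
static.SECURITY_GROUP_TEMPLATE == '@sc-%s', which matches the
'@sc-mycluster' example in get_tag_from_sg below:

    from starcluster import static
    for name in ('mycluster', '@sc-mycluster'):
        if not name.startswith(static.SECURITY_GROUP_PREFIX):
            name = static.SECURITY_GROUP_TEMPLATE % name
        print name    # both iterations print '@sc-mycluster'
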
    def add_node(self, cluster_name, alias=None, no_create=False):
        cl = self.get_cluster(cluster_name)
        cl.add_node(alias, no_create=no_create)

    def add_nodes(self, cluster_name, num_nodes, aliases=None,
                  no_create=False):
        """
        Add one or more nodes to cluster
        """
        cl = self.get_cluster(cluster_name)
        cl.add_nodes(num_nodes, aliases=aliases, no_create=no_create)

    def remove_node(self, cluster_name, alias, terminate=True):
        """
        Remove a single node from a cluster
        """
        cl = self.get_cluster(cluster_name)
        n = cl.get_node_by_alias(alias)
        if not n:
            raise exception.InstanceDoesNotExist(alias, label='node')
        cl.remove_node(n, terminate=terminate)

    def restart_cluster(self, cluster_name):
        """
        Reboots and reconfigures cluster_name
        """
        cl = self.get_cluster(cluster_name)
        cl.restart_cluster()

    def stop_cluster(self, cluster_name, terminate_unstoppable=False):
        """
        Stop an EBS-backed cluster
        """
        cl = self.get_cluster(cluster_name, require_keys=False)
        cl.stop_cluster(terminate_unstoppable)

    def terminate_cluster(self, cluster_name):
        """
        Terminates cluster_name
        """
        cl = self.get_cluster(cluster_name, require_keys=False)
        cl.terminate_cluster()

    def get_cluster_security_group(self, group_name):
        """
        Return the EC2 security group for group_name, prefixing '@sc-' if
        necessary
        """
        gname = self._get_cluster_name(group_name)
        return self.ec2.get_security_group(gname)

    def get_cluster_security_groups(self):
        """
        Return all security groups on EC2 that start with '@sc-'
        """
        glob = static.SECURITY_GROUP_TEMPLATE % '*'
        sgs = self.ec2.get_security_groups(filters={'group-name': glob})
        return sgs

    def get_tag_from_sg(self, sg):
        """
        Returns the cluster tag name from a security group name that starts
        with static.SECURITY_GROUP_PREFIX

        Example:
            sg = '@sc-mycluster'
            print get_tag_from_sg(sg)
            mycluster
        """
        regex = re.compile(static.SECURITY_GROUP_PREFIX + '-(.*)')
        match = regex.match(sg)
        if match:
            return match.groups()[0]

    def list_clusters(self, cluster_groups=None, show_ssh_status=False):
        """
        Prints a summary for each active cluster on EC2
        """
        if not cluster_groups:
            cluster_groups = self.get_cluster_security_groups()
            if not cluster_groups:
                log.info("No clusters found...")
        else:
            try:
                cluster_groups = [self.get_cluster_security_group(g) for g
                                  in cluster_groups]
            except exception.SecurityGroupDoesNotExist:
                raise exception.ClusterDoesNotExist(g)
        for scg in cluster_groups:
            tag = self.get_tag_from_sg(scg.name)
            try:
                cl = self.get_cluster(tag, group=scg, load_plugins=False,
                                      require_keys=False)
            except exception.IncompatibleCluster, e:
                sep = '*' * 60
                log.error('\n'.join([sep, e.msg, sep]),
                          extra=dict(__textwrap__=True))
                continue
            header = '%s (security group: %s)' % (tag, scg.name)
            print '-' * len(header)
            print header
            print '-' * len(header)
            nodes = cl.nodes
            try:
                n = nodes[0]
            except IndexError:
                n = None
            state = getattr(n, 'state', None)
            ltime = 'N/A'
            uptime = 'N/A'
            if state in ['pending', 'running']:
                ltime = getattr(n, 'local_launch_time', 'N/A')
                uptime = getattr(n, 'uptime', 'N/A')
            print 'Launch time: %s' % ltime
            print 'Uptime: %s' % uptime
            print 'Zone: %s' % getattr(n, 'placement', 'N/A')
            print 'Keypair: %s' % getattr(n, 'key_name', 'N/A')
            ebs_nodes = [n for n in nodes if n.attached_vols]
            if ebs_nodes:
                print 'EBS volumes:'
                for node in ebs_nodes:
                    devices = node.attached_vols
                    node_id = node.alias or node.id
                    for dev in devices:
                        d = devices.get(dev)
                        vol_id = d.volume_id
                        status = d.status
                        print '    %s on %s:%s (status: %s)' % \
                            (vol_id, node_id, dev, status)
            else:
                print 'EBS volumes: N/A'
            spot_reqs = cl.spot_requests
            if spot_reqs:
                active = len([s for s in spot_reqs if s.state == 'active'])
                opn = len([s for s in spot_reqs if s.state == 'open'])
                msg = ''
                if active != 0:
                    msg += '%d active' % active
                if opn != 0:
                    if msg:
                        msg += ', '
                    msg += '%d open' % opn
                print 'Spot requests: %s' % msg
            if nodes:
                print 'Cluster nodes:'
                for node in nodes:
                    nodeline = "    %7s %s %s %s" % (node.alias, node.state,
                                                     node.id, node.dns_name)
                    if node.spot_id:
                        nodeline += ' (spot %s)' % node.spot_id
                    if show_ssh_status:
                        ssh_status = {True: 'Up', False: 'Down'}
                        nodeline += ' (SSH: %s)' % ssh_status[node.is_up()]
                    print nodeline
                print 'Total nodes: %d' % len(nodes)
            else:
                print 'Cluster nodes: N/A'
            print

    def run_plugin(self, plugin_name, cluster_tag):
        """
        Run a plugin defined in the config.

        plugin_name must match the plugin's section name in the config
        cluster_tag specifies the cluster to run the plugin on
        """
        cl = self.get_cluster(cluster_tag, load_plugins=False)
        if not cl.is_cluster_up():
            raise exception.ClusterNotRunning(cluster_tag)
        plugs = [self.cfg.get_plugin(plugin_name)]
        name, plugin = cl.load_plugins(plugs)[0]
        cl.run_plugin(plugin, name)


class Cluster(object):
    def __init__(self,
                 ec2_conn=None,
                 spot_bid=None,
                 cluster_tag=None,
                 cluster_description=None,
                 cluster_size=None,
                 cluster_user=None,
                 cluster_shell=None,
                 master_image_id=None,
                 master_instance_type=None,
                 node_image_id=None,
                 node_instance_type=None,
                 node_instance_types=[],
                 availability_zone=None,
                 keyname=None,
                 key_location=None,
                 volumes=[],
                 plugins=[],
                 permissions=[],
                 refresh_interval=30,
                 disable_queue=False,
                 disable_threads=False,
                 cluster_group=None,
                 force_spot_master=False,
                 **kwargs):

        now = time.strftime("%Y%m%d%H%M")
        self.ec2 = ec2_conn
        self.spot_bid = spot_bid
        self.cluster_tag = cluster_tag
        self.cluster_description = cluster_description
        if self.cluster_tag is None:
            self.cluster_tag = "cluster%s" % now
        if cluster_description is None:
            self.cluster_description = "Cluster created at %s" % now
        self.cluster_size = cluster_size or 0
        self.cluster_user = cluster_user
        self.cluster_shell = cluster_shell
        self.master_image_id = master_image_id
        self.master_instance_type = master_instance_type
        self.node_image_id = node_image_id
        self.node_instance_type = node_instance_type
        self.node_instance_types = node_instance_types
        self.availability_zone = availability_zone
        self.keyname = keyname
        self.key_location = key_location
        self.volumes = self.load_volumes(volumes)
        self.plugins = self.load_plugins(plugins)
        self.permissions = permissions
        self.refresh_interval = refresh_interval
        self.disable_queue = disable_queue
        self.disable_threads = disable_threads
        self.force_spot_master = force_spot_master

        self.__instance_types = static.INSTANCE_TYPES
        self.__cluster_settings = static.CLUSTER_SETTINGS
        self.__available_shells = static.AVAILABLE_SHELLS
        self.__protocols = static.PROTOCOLS
        self._progress_bar = None
        self._master_reservation = None
        self._node_reservation = None
        self._nodes = []
        self._master = None
        self._zone = None
        self._plugins = plugins
        self._cluster_group = None
        self._placement_group = None

    def __repr__(self):
        return '<Cluster: %s (%s-node)>' % (self.cluster_tag,
                                            self.cluster_size)

    @property
    def zone(self):
        """
        If volumes are specified, this method determines the common
        availability zone between those volumes. If an availability zone
        is explicitly specified in the config and does not match the common
        availability zone of the volumes, an exception is raised. If the
        volumes are not all in the same availability zone an exception is
        raised. If no volumes are specified, returns the user-specified
        availability zone if it exists.
        """
        if not self._zone:
            zone = None
            if self.availability_zone:
                zone = self.ec2.get_zone(self.availability_zone).name
            common_zone = None
            for volume in self.volumes:
                volid = self.volumes.get(volume).get('volume_id')
                vol = self.ec2.get_volume(volid)
                if not common_zone:
                    common_zone = vol.zone
                elif vol.zone != common_zone:
                    vols = [self.volumes.get(v).get('volume_id')
                            for v in self.volumes]
                    raise exception.VolumesZoneError(vols)
            if common_zone and zone and zone != common_zone:
                raise exception.InvalidZone(zone, common_zone)
            if not zone and common_zone:
                zone = common_zone
            self._zone = zone
        return self._zone

    def load_volumes(self, vols):
        """
        Iterate through vols and set device/partition settings automatically
        if not specified.

        This method assigns the first volume to /dev/sdz, second to /dev/sdy,
        etc. for all volumes that do not include a device/partition setting
        """
        devices = ['/dev/sd%s' % s for s in string.lowercase]
        devmap = {}
        for volname in vols:
            vol = vols.get(volname)
            dev = vol.get('device')
            if dev in devices:
                # rm user-defined devices from the list of auto-assigned
                # devices
                devices.remove(dev)
            volid = vol.get('volume_id')
            if dev and not volid in devmap:
                devmap[volid] = dev
        volumes = {}
        for volname in vols:
            vol = vols.get(volname)
            vol_id = vol.get('volume_id')
            device = vol.get('device')
            if not device:
                if vol_id in devmap:
                    device = devmap.get(vol_id)
                else:
                    device = devices.pop()
                    devmap[vol_id] = device
            if not utils.is_valid_device(device):
                raise exception.InvalidDevice(device)
            v = volumes[volname] = utils.AttributeDict()
            v.update(vol)
            v['device'] = device
            part = vol.get('partition')
            if part:
                partition = device + str(part)
                if not utils.is_valid_partition(partition):
                    raise exception.InvalidPartition(part)
                v['partition'] = partition
        return volumes

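A local sketch of the auto-assignment rule in load_volumes. The volume ids
are fake and no EC2 calls are made (Cluster.__init__ runs load_volumes
directly on the mapping it is given):

    vols = {'data': {'volume_id': 'vol-aaaaaaaa', 'mount_path': '/data'},
            'scratch': {'volume_id': 'vol-bbbbbbbb', 'mount_path': '/scratch',
                        'device': '/dev/sdx', 'partition': 1}}
    cl = Cluster(volumes=vols)
    print cl.volumes['data']['device']        # /dev/sdz (first auto-assigned)
    print cl.volumes['scratch']['partition']  # /dev/sdx1 (device + partition)
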
    def load_plugins(self, plugins):
        plugs = []
        for plugin in plugins:
            setup_class = plugin.get('setup_class')
            plugin_name = plugin.get('__name__').split()[-1]
            mod_name = '.'.join(setup_class.split('.')[:-1])
            class_name = setup_class.split('.')[-1]
            try:
                mod = __import__(mod_name, globals(), locals(), [class_name])
            except SyntaxError, e:
                raise exception.PluginSyntaxError(
                    "Plugin %s (%s) contains a syntax error at line %s" %
                    (plugin_name, e.filename, e.lineno))
            except ImportError, e:
                raise exception.PluginLoadError(
                    "Failed to import plugin %s: %s" %
                    (plugin_name, e[0]))
            klass = getattr(mod, class_name, None)
            if not klass:
                raise exception.PluginError(
                    'Plugin class %s does not exist' % setup_class)
            if not issubclass(klass, clustersetup.ClusterSetup):
                raise exception.PluginError(
                    "Plugin %s must be a subclass of "
                    "starcluster.clustersetup.ClusterSetup" % setup_class)
            args, kwargs = utils.get_arg_spec(klass.__init__)
            config_args = []
            missing_args = []
            for arg in args:
                if arg in plugin:
                    config_args.append(plugin.get(arg))
                else:
                    missing_args.append(arg)
            log.debug("config_args = %s" % config_args)
            if missing_args:
                raise exception.PluginError(
                    "Not enough settings provided for plugin %s "
                    "(missing: %s)" % (plugin_name, ', '.join(missing_args)))
            config_kwargs = {}
            for arg in kwargs:
                if arg in plugin:
                    config_kwargs[arg] = plugin.get(arg)
            log.debug("config_kwargs = %s" % config_kwargs)
            plugs.append((plugin_name, klass(*config_args, **config_kwargs)))
        return plugs

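A sketch of the mapping load_plugins expects; in practice StarCluster's
config parser builds it from a [plugin ...] section. The plugin path and
argument below are hypothetical, but the keys mirror what the method reads
above ('__name__', 'setup_class', plus one entry per constructor argument):

    plugin_cfg = {'__name__': 'plugin myplugin',
                  'setup_class': 'mypackage.myplugin.MyPlugin',
                  'my_arg': '42'}
    plugs = cl.load_plugins([plugin_cfg])
    # -> [('myplugin', <MyPlugin instance constructed with my_arg='42'>)]
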
    def update(self, kwargs):
        for key in kwargs.keys():
            if hasattr(self, key):
                self.__dict__[key] = kwargs[key]

    def _validate_running_instances(self):
        """
        Validate existing instances against this cluster's settings
        """
        self.wait_for_active_spots()
        nodes = self.nodes
        if not nodes:
            raise exception.ClusterValidationError("No existing nodes found!")
        log.info("Validating existing instances...")
        mazone = self.master_node.placement
        rlmap = self._get_launch_map(reverse=True)
        for node in nodes:
            itype, image = rlmap.get(node.alias)
            alias = node.alias
            ntype = node.instance_type
            if ntype != itype:
                raise exception.ClusterValidationError(
                    "%s's instance type (%s) != %s" % (alias, ntype, itype))
            nimage = node.image_id
            if nimage != image:
                raise exception.ClusterValidationError(
                    "%s's image id (%s) != %s" % (alias, nimage, image))
            if node.key_name != self.keyname:
                raise exception.ClusterValidationError(
                    "%s's key_name (%s) != %s" % (alias, node.key_name,
                                                  self.keyname))
            nazone = node.placement
            if mazone != nazone:
                raise exception.ClusterValidationError(
                    "Node '%s' zone (%s) does not match master's zone (%s)" %
                    (alias, nazone, mazone))
        # reset zone cache
        self._zone = None
        if self.zone and self.zone != mazone:
            raise exception.ClusterValidationError(
                "Running cluster's availability_zone (%s) != %s" %
                (mazone, self.zone))

    def get(self, name):
        return self.__dict__.get(name)

    def __str__(self):
        cfg = self.__getstate__()
        return pprint.pformat(cfg)

    def load_receipt(self, load_plugins=True):
        """
        Load the original settings used to launch this cluster into this
        Cluster object. The settings are loaded from the cluster group's
        description field.
        """
        try:
            desc = self.cluster_group.description
            version, b64data = desc.split('-', 1)
            if utils.program_version_greater(version, static.VERSION):
                d = dict(cluster=self.cluster_tag, old_version=static.VERSION,
                         new_version=version)
                msg = user_msgs.version_mismatch % d
                sep = '*' * 60
                log.warn('\n'.join([sep, msg, sep]),
                         extra={'__textwrap__': 1})
            compressed_data = base64.b64decode(b64data)
            pkl_data = zlib.decompress(compressed_data)
            cluster_settings = cPickle.loads(str(pkl_data)).__dict__
        except (cPickle.PickleError, zlib.error, ValueError, TypeError,
                EOFError, IndexError), e:
            log.debug('load receipt exception: ', exc_info=True)
            raise exception.IncompatibleCluster(self.cluster_group)
        except Exception, e:
            raise exception.ClusterReceiptError(
                'failed to load cluster receipt: %s' % e)
        for key in cluster_settings:
            if hasattr(self, key):
                setattr(self, key, cluster_settings.get(key))
        if load_plugins:
            try:
                self.plugins = self.load_plugins(self._plugins)
            except exception.PluginError, e:
                log.warn(e)
                log.warn("An error occurred while loading plugins")
                log.warn("Not running any plugins")
            except Exception, e:
                raise exception.ClusterReceiptError(
                    'failed to load cluster receipt: %s' % e)
        return True

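A standalone sketch of the receipt encoding that load_receipt reverses. The
cluster_group property below produces exactly this 'VERSION-<base64(zlib(
pickle))>' string; here the pickled Cluster is replaced by a plain dict and
the version string is chosen arbitrarily:

    import base64, cPickle, zlib
    settings = {'cluster_size': 2}                 # stand-in payload
    desc = '-'.join(['0.93', base64.b64encode(zlib.compress(
        cPickle.dumps(settings)))])
    version, b64data = desc.split('-', 1)          # mirrors load_receipt
    print version                                  # 0.93
    print cPickle.loads(zlib.decompress(base64.b64decode(b64data)))
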
    def __getstate__(self):
        cfg = {}
        exclude = ['key_location', 'plugins']
        include = ['_zone', '_plugins']
        for key in self.__dict__.keys():
            private = key.startswith('_')
            if (not private or key in include) and not key in exclude:
                val = getattr(self, key)
                if type(val) in [str, unicode, bool, int, float, list, dict]:
                    cfg[key] = val
                elif type(val) is utils.AttributeDict:
                    cfg[key] = dict(val)
        return cfg

    @property
    def _security_group(self):
        return static.SECURITY_GROUP_TEMPLATE % self.cluster_tag

    @property
    def cluster_group(self):
        if self._cluster_group is None:
            ssh_port = static.DEFAULT_SSH_PORT
            desc = base64.b64encode(zlib.compress(cPickle.dumps(self)))
            desc = '-'.join([static.VERSION, desc])
            sg = self.ec2.get_or_create_group(self._security_group,
                                              desc,
                                              auth_ssh=True,
                                              auth_group_traffic=True)
            for p in self.permissions:
                perm = self.permissions.get(p)
                ip_protocol = perm.get('ip_protocol', 'tcp')
                from_port = perm.get('from_port')
                to_port = perm.get('to_port')
                cidr_ip = perm.get('cidr_ip', static.WORLD_CIDRIP)
                if not self.ec2.has_permission(sg, ip_protocol, from_port,
                                               to_port, cidr_ip):
                    log.info("Opening %s port range %s-%s for CIDR %s" %
                             (ip_protocol, from_port, to_port, cidr_ip))
                    sg.authorize(ip_protocol, from_port, to_port, cidr_ip)
                if ip_protocol == 'tcp' and from_port <= ssh_port <= to_port:
                    sg.revoke(ip_protocol, ssh_port, ssh_port,
                              static.WORLD_CIDRIP)
            self._cluster_group = sg
        return self._cluster_group

    @property
    def placement_group(self):
        if self._placement_group is None:
            pg = self.ec2.get_or_create_placement_group(self._security_group)
            self._placement_group = pg
        return self._placement_group

    @property
    def master_node(self):
        if not self._master:
            for node in self.nodes:
                if node.is_master():
                    self._master = node
        if not self._master:
            raise exception.MasterDoesNotExist()
        return self._master

    @property
    def nodes(self):
        states = ['pending', 'running', 'stopping', 'stopped']
        filters = {'group-name': self._security_group,
                   'instance-state-name': states}
        nodes = self.ec2.get_all_instances(filters=filters)
        # remove any cached nodes not in the current node list from EC2
        current_ids = [n.id for n in nodes]
        remove_nodes = [n for n in self._nodes if n.id not in current_ids]
        for node in remove_nodes:
            self._nodes.remove(node)
        # update node cache with latest instance data from EC2
        existing_nodes = dict([(n.id, n) for n in self._nodes])
        log.debug('existing nodes: %s' % existing_nodes)
        for node in nodes:
            if node.id in existing_nodes:
                log.debug('updating existing node %s in self._nodes' %
                          node.id)
                enode = existing_nodes.get(node.id)
                enode.key_location = self.key_location
                enode.instance = node
            else:
                log.debug('adding node %s to self._nodes list' % node.id)
                n = Node(node, self.key_location)
                if n.is_master():
                    self._master = n
                    self._nodes.insert(0, n)
                else:
                    self._nodes.append(n)
        self._nodes.sort(key=lambda n: n.alias)
        log.debug('returning self._nodes = %s' % self._nodes)
        return self._nodes

    def get_nodes_or_raise(self):
        nodes = self.nodes
        if not nodes:
            filters = {'group-name': self._security_group}
            terminated_nodes = self.ec2.get_all_instances(filters=filters)
            raise exception.NoClusterNodesFound(terminated_nodes)
        return nodes

    def get_node_by_dns_name(self, dns_name):
        for node in self.nodes:
            if node.dns_name == dns_name:
                return node
        raise exception.InstanceDoesNotExist(dns_name, label='node')

    def get_node_by_id(self, instance_id):
        for node in self.nodes:
            if node.id == instance_id:
                return node
        raise exception.InstanceDoesNotExist(instance_id, label='node')

    def get_node_by_alias(self, alias):
        for node in self.nodes:
            if node.alias == alias:
                return node
        raise exception.InstanceDoesNotExist(alias, label='node')

    def _nodes_in_states(self, states):
        return filter(lambda x: x.state in states, self.nodes)

    @property
    def running_nodes(self):
        return self._nodes_in_states(['running'])

    @property
    def stopped_nodes(self):
        return self._nodes_in_states(['stopping', 'stopped'])

    @property
    def spot_requests(self):
        filters = {'launch.group-id': self.cluster_group.id,
                   'state': ['active', 'open']}
        return self.ec2.get_all_spot_requests(filters=filters)

    def get_spot_requests_or_raise(self):
        spots = self.spot_requests
        if not spots:
            raise exception.NoClusterSpotRequests
        return spots

    def create_node(self, alias, image_id=None, instance_type=None, zone=None,
                    placement_group=None, spot_bid=None, force_flat=False):
        return self.create_nodes([alias], image_id=image_id,
                                 instance_type=instance_type, zone=zone,
                                 placement_group=placement_group,
                                 spot_bid=spot_bid,
                                 force_flat=force_flat)[0]

    def create_nodes(self, aliases, image_id=None, instance_type=None,
                     zone=None, placement_group=None, spot_bid=None,
                     force_flat=False):
        """
        Convenience method for requesting instances with this cluster's
        settings. All settings (kwargs) except force_flat default to cluster
        settings if not provided. Passing force_flat=True ignores spot_bid
        completely, forcing a flat-rate instance to be requested.
        """
        spot_bid = spot_bid or self.spot_bid
        if force_flat:
            spot_bid = None
        cluster_sg = self.cluster_group.name
        instance_type = instance_type or self.node_instance_type
        if not placement_group and instance_type in static.CLUSTER_TYPES:
            placement_group = self.placement_group.name
        image_id = image_id or self.node_image_id
        count = len(aliases) if not spot_bid else 1
        kwargs = dict(price=spot_bid, instance_type=instance_type,
                      min_count=count, max_count=count, count=count,
                      key_name=self.keyname, security_groups=[cluster_sg],
                      availability_zone_group=cluster_sg,
                      launch_group=cluster_sg, placement=zone or self.zone,
                      user_data='|'.join(aliases),
                      placement_group=placement_group)
        resvs = []
        if spot_bid:
            for alias in aliases:
                kwargs['user_data'] = alias
                resvs.extend(self.ec2.request_instances(image_id, **kwargs))
        else:
            resvs.append(self.ec2.request_instances(image_id, **kwargs))
        for resv in resvs:
            log.info(str(resv), extra=dict(__raw__=True))
        return resvs

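A live-call sketch of the fan-out rule above (these perform real EC2
requests; the aliases and bid are hypothetical):

    # flat-rate: a single reservation; all aliases ride in user_data
    cl.create_nodes(['node001', 'node002'], force_flat=True)
    # spot: one request per alias, since spot instances always report
    # ami_launch_index 0 and each instance must recover its own alias
    cl.create_nodes(['node003', 'node004'], spot_bid=0.50)
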
    def _get_next_node_num(self):
        nodes = self._nodes_in_states(['pending', 'running'])
        nodes = filter(lambda x: not x.is_master(), nodes)
        highest = 0
        for n in nodes:
            try:
                highest = max(highest, int(n.alias[4:8]))
            except ValueError:
                pass
        next = highest + 1
        log.debug("Highest node number is %d. choosing %d." % (highest, next))
        return next

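The alias parser above reads characters 4:8 of 'nodeNNN' names and never
reuses gaps below the maximum; a standalone sketch:

    aliases = ['node001', 'node003']       # hypothetical running workers
    highest = max(int(a[4:8]) for a in aliases)
    print highest + 1                      # 4, not 2: gaps are not reused
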
    def add_node(self, alias=None, no_create=False):
        """
        Add a single node to this cluster
        """
        aliases = None
        if alias:
            aliases = [alias]
        self.add_nodes(1, aliases=aliases, no_create=no_create)

    def add_nodes(self, num_nodes, aliases=None, no_create=False):
        """
        Add new nodes to this cluster

        aliases - list of aliases to assign to new nodes (len must equal
        num_nodes)
        """
        running_pending = self._nodes_in_states(['pending', 'running'])
        aliases = aliases or []
        if not aliases:
            next_node_id = self._get_next_node_num()
            for i in range(next_node_id, next_node_id + num_nodes):
                alias = 'node%.3d' % i
                aliases.append(alias)
        assert len(aliases) == num_nodes
        if "master" in aliases:
            raise exception.ClusterValidationError(
                "worker nodes cannot have master as an alias")
        if not no_create:
            for node in running_pending:
                if node.alias in aliases:
                    raise exception.ClusterValidationError(
                        "node with alias %s already exists" % node.alias)
            log.info("Launching node(s): %s" % ', '.join(aliases))
            self.create_nodes(aliases)
        self.wait_for_cluster(msg="Waiting for node(s) to come up...")
        log.debug("Adding node(s): %s" % aliases)
        default_plugin = clustersetup.DefaultClusterSetup(
            self.disable_queue, self.disable_threads)
        for alias in aliases:
            node = self.get_node_by_alias(alias)
            default_plugin.on_add_node(
                node, self.nodes, self.master_node,
                self.cluster_user, self.cluster_shell,
                self.volumes)
            self.run_plugins(method_name="on_add_node", node=node)

    def remove_node(self, node, terminate=True):
        """
        Remove a single node from this cluster
        """
        return self.remove_nodes([node], terminate=terminate)

    def remove_nodes(self, nodes, terminate=True):
        """
        Remove a list of nodes from this cluster
        """
        default_plugin = clustersetup.DefaultClusterSetup(
            self.disable_queue, self.disable_threads)
        for node in nodes:
            if node.is_master():
                raise exception.InvalidOperation("cannot remove master node")
            self.run_plugins(method_name="on_remove_node",
                             node=node, reverse=True)
            default_plugin.on_remove_node(
                node, self.nodes, self.master_node,
                self.cluster_user, self.cluster_shell,
                self.volumes)
            if not terminate:
                continue
            if node.spot_id:
                log.info("Canceling spot request %s" % node.spot_id)
                node.get_spot_request().cancel()
            node.terminate()

    def _get_launch_map(self, reverse=False):
        """
        Groups all node-aliases that have similar instance types/image ids
        Returns a dictionary that's used to launch all similar instance types
        and image ids in the same request. Example return value:

        {('c1.xlarge', 'ami-a5c02dcc'): ['node001', 'node002'],
         ('m1.large', 'ami-a5c02dcc'): ['node003'],
         ('m1.small', 'ami-17b15e7e'): ['master', 'node005', 'node006'],
         ('m1.small', 'ami-19e17a2b'): ['node004']}

        Passing reverse=True will return the same information only keyed by
        node aliases:

        {'master': ('m1.small', 'ami-17b15e7e'),
         'node001': ('c1.xlarge', 'ami-a5c02dcc'),
         'node002': ('c1.xlarge', 'ami-a5c02dcc'),
         'node003': ('m1.large', 'ami-a5c02dcc'),
         'node004': ('m1.small', 'ami-19e17a2b'),
         'node005': ('m1.small', 'ami-17b15e7e'),
         'node006': ('m1.small', 'ami-17b15e7e')}
        """
        lmap = {}
        mtype = self.master_instance_type or self.node_instance_type
        mimage = self.master_image_id or self.node_image_id
        lmap[(mtype, mimage)] = ['master']
        id_start = 1
        for itype in self.node_instance_types:
            count = itype['size']
            image_id = itype['image'] or self.node_image_id
            type = itype['type'] or self.node_instance_type
            if not (type, image_id) in lmap:
                lmap[(type, image_id)] = []
            for id in range(id_start, id_start + count):
                alias = 'node%.3d' % id
                log.debug("Launch map: %s (ami: %s, type: %s)..." %
                          (alias, image_id, type))
                lmap[(type, image_id)].append(alias)
                id_start += 1
        ntype = self.node_instance_type
        nimage = self.node_image_id
        if not (ntype, nimage) in lmap:
            lmap[(ntype, nimage)] = []
        for id in range(id_start, self.cluster_size):
            alias = 'node%.3d' % id
            log.debug("Launch map: %s (ami: %s, type: %s)..." %
                      (alias, nimage, ntype))
            lmap[(ntype, nimage)].append(alias)
        if reverse:
            rlmap = {}
            for (itype, image_id) in lmap:
                aliases = lmap.get((itype, image_id))
                for alias in aliases:
                    rlmap[alias] = (itype, image_id)
            return rlmap
        return lmap

    def _get_type_and_image_id(self, alias):
        """
        Returns (instance_type, image_id) for a given alias based
        on the map returned from self._get_launch_map
        """
        lmap = self._get_launch_map()
        for (type, image) in lmap:
            key = (type, image)
            if alias in lmap.get(key):
                return key

    def create_cluster(self):
        """
        Launches all EC2 instances based on this cluster's settings.
        """
        log.info("Launching a %d-node cluster..." % self.cluster_size)
        mtype = self.master_instance_type or self.node_instance_type
        self.master_instance_type = mtype
        if self.spot_bid:
            self._create_spot_cluster()
        else:
            self._create_flat_rate_cluster()

    def _create_flat_rate_cluster(self):
        """
        Launches cluster using flat-rate instances. This method attempts to
        minimize the number of launch requests by grouping nodes of the same
        type/ami and launching each group simultaneously within a single
        launch request. This is especially important for Cluster Compute
        instances given that Amazon *highly* recommends requesting all CCI in
        a single launch request.
        """
        lmap = self._get_launch_map()
        zone = None
        master_map = None
        for (type, image) in lmap:
            # launch all aliases that match master's itype/image_id
            aliases = lmap.get((type, image))
            if 'master' in aliases:
                master_map = (type, image)
                for alias in aliases:
                    log.debug("Launching %s (ami: %s, type: %s)" %
                              (alias, image, type))
                master_response = self.create_nodes(aliases, image_id=image,
                                                    instance_type=type,
                                                    force_flat=True)[0]
                zone = master_response.instances[0].placement
        lmap.pop(master_map)
        if self.cluster_size <= 1:
            return
        for (type, image) in lmap:
            aliases = lmap.get((type, image))
            for alias in aliases:
                log.debug("Launching %s (ami: %s, type: %s)" %
                          (alias, image, type))
            self.create_nodes(aliases, image_id=image, instance_type=type,
                              zone=zone, force_flat=True)

    def _create_spot_cluster(self):
        """
        Launches cluster using all spot instances. This method makes a single
        spot request for each node in the cluster since spot instances
        *always* have an ami_launch_index of 0. This is needed in order to
        correctly assign aliases to nodes.
        """
        (mtype, mimage) = self._get_type_and_image_id('master')
        log.info("Launching master node (ami: %s, type: %s)..." %
                 (mimage, mtype))
        force_flat = not self.force_spot_master and self.cluster_size > 1
        master_response = self.create_node('master',
                                           image_id=mimage,
                                           instance_type=mtype,
                                           force_flat=force_flat)
        zone = None
        if not force_flat and self.spot_bid:
            # Make sure nodes are in same zone as master
            launch_spec = master_response.launch_specification
            zone = launch_spec.placement
        else:
            # Make sure nodes are in same zone as master
            zone = master_response.instances[0].placement
        if self.cluster_size <= 1:
            return
        for id in range(1, self.cluster_size):
            alias = 'node%.3d' % id
            (ntype, nimage) = self._get_type_and_image_id(alias)
            log.info("Launching %s (ami: %s, type: %s)" %
                     (alias, nimage, ntype))
            self.create_node(alias, image_id=nimage, instance_type=ntype,
                             zone=zone)

    def is_spot_cluster(self):
        """
        Returns True if all nodes are spot instances
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if not node.is_spot():
                return False
        return True

    def has_spot_nodes(self):
        """
        Returns True if any nodes are spot instances
        """
        for node in self.nodes:
            if node.is_spot():
                return True
        return False

    def is_ebs_cluster(self):
        """
        Returns True if all nodes are EBS-backed
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if not node.is_ebs_backed():
                return False
        return True

    def has_ebs_nodes(self):
        """
        Returns True if any nodes are EBS-backed
        """
        for node in self.nodes:
            if node.is_ebs_backed():
                return True
        return False

    def is_stoppable(self):
        """
        Returns True if all nodes are stoppable (i.e. non-spot and
        EBS-backed)
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in self.nodes:
            if not node.is_stoppable():
                return False
        return True

    def has_stoppable_nodes(self):
        """
        Returns True if any nodes are stoppable (i.e. non-spot and
        EBS-backed)
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if node.is_stoppable():
                return True
        return False

    def is_cluster_compute(self):
        """
        Returns True if all instances are Cluster/GPU Compute type
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if not node.is_cluster_compute():
                return False
        return True

    def has_cluster_compute_nodes(self):
        for node in self.nodes:
            if node.is_cluster_compute():
                return True
        return False

    def is_cluster_up(self):
        """
        Check that all nodes are 'running' and that ssh is up on all nodes.
        This method will return False if any spot requests are in an 'open'
        state.
        """
        spots = self.spot_requests
        active_spots = filter(lambda x: x.state == 'active', spots)
        if len(spots) != len(active_spots):
            return False
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if not node.is_up():
                return False
        return True

    def get_spinner(self, msg):
        """
        Logs a status msg, starts a spinner, and returns the spinner object.
        This is useful for long running processes:

            s = self.get_spinner("Long running process running...")
            (do something)
            s.stop()
        """
        s = spinner.Spinner()
        log.info(msg, extra=dict(__nonewline__=True))
        s.start()
        return s

    @property
    def progress_bar(self):
        if not self._progress_bar:
            widgets = ['', progressbar.Fraction(), ' ',
                       progressbar.Bar(marker=progressbar.RotatingMarker()),
                       ' ', progressbar.Percentage(), ' ', ' ']
            pbar = progressbar.ProgressBar(widgets=widgets,
                                           maxval=self.cluster_size,
                                           force_update=True)
            self._progress_bar = pbar
        return self._progress_bar

    def wait_for_active_spots(self, spots=None):
        """
        Wait for all open spot requests for this cluster to transition to
        'active'.
        """
        spots = spots or self.spot_requests
        open_spots = [spot for spot in spots if spot.state == "open"]
        if open_spots:
            pbar = self.progress_bar.reset()
            log.info('Waiting for open spot requests to become active...')
            pbar.maxval = len(spots)
            pbar.update(0)
            while not pbar.finished:
                active_spots = filter(lambda x: x.state == "active", spots)
                pbar.maxval = len(spots)
                pbar.update(len(active_spots))
                if not pbar.finished:
                    time.sleep(self.refresh_interval)
                    spots = self.get_spot_requests_or_raise()
            pbar.reset()

    def wait_for_active_instances(self, nodes=None):
        """
        Wait indefinitely for cluster nodes to show up.
        """
        nodes = nodes or self.nodes
        if len(nodes) == 0:
            s = self.get_spinner("Waiting for instances to activate...")
            while len(nodes) == 0:
                time.sleep(self.refresh_interval)
                nodes = self.nodes
            s.stop()

    def wait_for_running_instances(self, nodes=None):
        """
        Wait until all cluster nodes are in a 'running' state
        """
        log.info("Waiting for all nodes to be in a 'running' state...")
        nodes = nodes or self.get_nodes_or_raise()
        pbar = self.progress_bar.reset()
        pbar.maxval = len(nodes)
        pbar.update(0)
        while not pbar.finished:
            running_nodes = filter(lambda x: x.state == "running", nodes)
            pbar.maxval = len(nodes)
            pbar.update(len(running_nodes))
            if not pbar.finished:
                time.sleep(self.refresh_interval)
                nodes = self.get_nodes_or_raise()
        pbar.reset()

    def wait_for_ssh(self, nodes=None):
        """
        Wait until SSH is up on all cluster nodes
        """
        log.info("Waiting for SSH to come up on all nodes...")
        nodes = nodes or self.get_nodes_or_raise()
        pbar = self.progress_bar.reset()
        pbar.maxval = len(nodes)
        pbar.update(0)
        while not pbar.finished:
            active_nodes = filter(lambda n: n.is_up(), nodes)
            pbar.maxval = len(nodes)
            pbar.update(len(active_nodes))
            if not pbar.finished:
                time.sleep(self.refresh_interval)
                nodes = self.get_nodes_or_raise()
        pbar.finish()

    @print_timing("Waiting for cluster to come up")
    def wait_for_cluster(self, msg="Waiting for cluster to come up..."):
        """
        Wait for cluster to come up and display progress bar. Waits for all
        spot requests to become 'active', all instances to be in a 'running'
        state, and for all SSH daemons to come up.

        msg - custom message to print out before waiting on the cluster
        """
        interval = self.refresh_interval
        log.info("%s %s" % (msg, "(updating every %ds)" % interval))
        self.wait_for_active_spots()
        self.wait_for_active_instances()
        self.wait_for_running_instances()
        self.wait_for_ssh()

    def is_cluster_stopped(self):
        """
        Check whether all nodes are in the 'stopped' state
        """
        nodes = self.nodes
        if not nodes:
            return False
        for node in nodes:
            if node.state != 'stopped':
                return False
        return True

    def is_cluster_terminated(self):
        """
        Check whether all nodes are in a 'terminated' state
        """
        states = filter(lambda x: x != 'terminated', static.INSTANCE_STATES)
        filters = {'group-name': self._security_group,
                   'instance-state-name': states}
        insts = self.ec2.get_all_instances(filters=filters)
        return len(insts) == 0

    def attach_volumes_to_master(self):
        """
        Attach each volume to the master node
        """
        for vol in self.volumes:
            volume = self.volumes.get(vol)
            device = volume.get('device')
            vol_id = volume.get('volume_id')
            vol = self.ec2.get_volume(vol_id)
            if vol.attach_data.instance_id == self.master_node.id:
                log.info("Volume %s already attached to master...skipping" %
                         vol.id)
                continue
            if vol.status != "available":
                log.error('Volume %s not available...'
                          'please check and try again' % vol.id)
                continue
            log.info("Attaching volume %s to master node on %s ..." %
                     (vol.id, device))
            resp = vol.attach(self.master_node.id, device)
            log.debug("resp = %s" % resp)
            while True:
                vol.update()
                if vol.attachment_state() == 'attached':
                    break
                time.sleep(5)

    def detach_volumes(self):
        """
        Detach all volumes from all nodes
        """
        for node in self.nodes:
            node.detach_external_volumes()

    @print_timing('Restarting cluster')
    def restart_cluster(self):
        """
        Reboot all instances and reconfigure the cluster
        """
        nodes = self.nodes
        if not nodes:
            raise exception.ClusterValidationError("No running nodes found")
        self.run_plugins(method_name="on_restart", reverse=True)
        log.info("Rebooting cluster...")
        for node in nodes:
            node.reboot()
        sleep = 20
        log.info("Sleeping for %d seconds..." % sleep)
        time.sleep(sleep)
        self.setup_cluster()

    def stop_cluster(self, terminate_unstoppable=False):
        """
        Shutdown this cluster by detaching all volumes and 'stopping' all
        nodes

        In general, all nodes in the cluster must be 'stoppable' meaning all
        nodes are backed by flat-rate EBS-backed instances. If any
        'unstoppable' nodes are found an exception is raised. A node is
        'unstoppable' if it is backed by either a spot or S3-backed instance.

        If the cluster contains a mix of 'stoppable' and 'unstoppable' nodes
        you can stop all stoppable nodes and terminate any unstoppable nodes
        by setting terminate_unstoppable=True.

        This will stop all nodes that can be stopped and terminate the rest.
        """
        nodes = self.nodes
        if not nodes:
            raise exception.ClusterValidationError("No running nodes found")
        if not self.is_stoppable():
            has_stoppable_nodes = self.has_stoppable_nodes()
            if not terminate_unstoppable and has_stoppable_nodes:
                raise exception.InvalidOperation(
                    "Cluster contains nodes that are not stoppable")
            if not has_stoppable_nodes:
                raise exception.InvalidOperation(
                    "Cluster does not contain any stoppable nodes")
        try:
            self.run_plugins(method_name="on_shutdown", reverse=True)
        except exception.MasterDoesNotExist, e:
            log.warn("Cannot run plugins: %s" % e)
        self.detach_volumes()
        for node in nodes:
            node.shutdown()

    def terminate_cluster(self):
        """
        Destroy this cluster by first detaching all volumes, shutting down
        all instances, canceling all spot requests (if any), removing its
        placement group (if any), and removing its security group.
        """
        try:
            self.run_plugins(method_name="on_shutdown", reverse=True)
        except exception.MasterDoesNotExist, e:
            log.warn("Cannot run plugins: %s" % e)
        self.detach_volumes()
        nodes = self.nodes
        for node in nodes:
            node.terminate()
        for spot in self.spot_requests:
            if spot.state not in ['cancelled', 'closed']:
                log.info("Canceling spot instance request: %s" % spot.id)
                spot.cancel()
        sg = self.ec2.get_group_or_none(self._security_group)
        pg = self.ec2.get_placement_group_or_none(self._security_group)
        s = self.get_spinner("Waiting for cluster to terminate...")
        while not self.is_cluster_terminated():
            time.sleep(5)
        s.stop()
        if pg:
            log.info("Removing %s placement group" % pg.name)
            pg.delete()
        if sg:
            log.info("Removing %s security group" % sg.name)
            sg.delete()

    def start(self, create=True, create_only=False, validate=True,
              validate_only=False, validate_running=False):
        """
        Creates and configures a cluster from this cluster template's
        settings.

        create - create new nodes when starting the cluster. set to False to
                 use existing nodes
        create_only - only create the cluster node instances, don't configure
                      the cluster
        validate - whether or not to validate the cluster settings used.
                   False will ignore validate_only and validate_running
                   keywords and is effectively the same as running _start
        validate_only - only validate cluster settings, do not create or
                        configure cluster
        validate_running - whether or not to validate the existing instances
                           being used against this cluster's settings
        """
        if validate:
            if not create and validate_running:
                try:
                    self._validate_running_instances()
                except exception.ClusterValidationError, e:
                    msg = "Existing nodes are not compatible with cluster "
                    msg += "settings:\n"
                    e.msg = msg + e.msg
                    raise
            self._validate()
            if validate_only:
                return
        else:
            log.warn("SKIPPING VALIDATION - USE AT YOUR OWN RISK")
        return self._start(create=create, create_only=create_only)

    @print_timing("Starting cluster")
    def _start(self, create=True, create_only=False):
        """
        Create and configure a cluster from this cluster template's settings
        (Does not attempt to validate before running)

        create - create new nodes when starting the cluster. set to False to
                 use existing nodes
        create_only - only create the cluster node instances, don't configure
                      the cluster
        """
        log.info("Starting cluster...")
        if create:
            self.create_cluster()
        else:
            assert self.master_node is not None
            for node in self.stopped_nodes:
                log.info("Starting stopped node: %s" % node.alias)
                node.start()
        if create_only:
            return
        self.setup_cluster()

    def setup_cluster(self):
        """
        Waits for all nodes to come up and then runs the default
        StarCluster setup routines followed by any additional plugin setup
        routines
        """
        self.wait_for_cluster()
        self._setup_cluster()

    @print_timing("Configuring cluster")
    def _setup_cluster(self):
        """
        Runs the default StarCluster setup routines followed by any
        additional plugin setup routines. Does not wait for nodes to come
        up.
        """
        log.info("The master node is %s" % self.master_node.dns_name)
        log.info("Setting up the cluster...")
        if self.volumes:
            self.attach_volumes_to_master()
        default_plugin = clustersetup.DefaultClusterSetup(
            self.disable_queue, self.disable_threads)
        default_plugin.run(self.nodes, self.master_node, self.cluster_user,
                           self.cluster_shell, self.volumes)
        self.run_plugins()

    def run_plugins(self, plugins=None, method_name="run", node=None,
                    reverse=False):
        """
        Run all plugins specified in this Cluster object's self.plugins
        list. Uses the plugins list instead of self.plugins if specified.

        plugins must be a list of tuples: the first element is the plugin's
        name, the second element is the plugin object (a subclass of
        ClusterSetup)
        """
        plugs = plugins or self.plugins
        if reverse:
            plugs = plugs[:]
            plugs.reverse()
        for plug in plugs:
            name, plugin = plug
            self.run_plugin(plugin, name, method_name=method_name, node=node)

    def run_plugin(self, plugin, name='', method_name='run', node=None):
        """
        Run a StarCluster plugin.

        plugin - an instance of the plugin's class
        name - a user-friendly label for the plugin
        method_name - the method to run within the plugin (default: "run")
        node - optional node to pass as first argument to plugin method
               (used for on_add_node/on_remove_node)
        """
        plugin_name = name or str(plugin)
        try:
            func = getattr(plugin, method_name, None)
            if not func:
                log.warn("Plugin %s has no %s method...skipping" %
                         (plugin_name, method_name))
                return
            args = [self.nodes, self.master_node, self.cluster_user,
                    self.cluster_shell, self.volumes]
            if node:
                args.insert(0, node)
            log.info("Running plugin %s" % plugin_name)
            func(*args)
        except NotImplementedError:
            log.debug("method %s not implemented by plugin %s" %
                      (method_name, plugin_name))
        except exception.MasterDoesNotExist:
            raise
        except Exception, e:
            log.error("Error occurred while running plugin '%s':" %
                      plugin_name)
            if isinstance(e, exception.ThreadPoolException):
                e.print_excs()
                log.debug(e.format_excs())
            else:
                traceback.print_exc()
                log.debug(traceback.format_exc())

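A minimal plugin sketch matching the calling convention above: run receives
(nodes, master, user, user_shell, volumes) and the node-lifecycle hooks
receive the affected node first. The class itself is hypothetical:

    from starcluster.clustersetup import ClusterSetup
    from starcluster.logger import log

    class MessagePlugin(ClusterSetup):
        def __init__(self, message='hello'):  # args filled from the config
            self.message = message

        def run(self, nodes, master, user, user_shell, volumes):
            log.info('%s from %s' % (self.message, master.alias))

        def on_add_node(self, node, nodes, master, user, user_shell,
                        volumes):
            log.info('added %s' % node.alias)
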
    def is_running_valid(self):
        """
        Checks whether the current running instances are compatible
        with this cluster template's settings
        """
        try:
            self._validate_running_instances()
            return True
        except exception.ClusterValidationError, e:
            log.error(e.msg)
            return False

    def _validate(self):
        """
        Checks that all cluster template settings are valid. Raises a
        ClusterValidationError exception if not.
        """
        log.info("Validating cluster template settings...")
        try:
            self._has_all_required_settings()
            self._validate_spot_bid()
            self._validate_cluster_size()
            self._validate_shell_setting()
            self._validate_permission_settings()
            self._validate_credentials()
            self._validate_keypair()
            self._validate_zone()
            self._validate_ebs_settings()
            self._validate_ebs_aws_settings()
            self._validate_image_settings()
            self._validate_instance_types()
            self._validate_cluster_compute()
            log.info('Cluster template settings are valid')
            return True
        except exception.ClusterValidationError, e:
            e.msg = 'Cluster settings are not valid:\n%s' % e.msg
            raise

    def is_valid(self):
        """
        Returns True if all cluster template settings are valid
        """
        try:
            self._validate()
            return True
        except exception.ClusterValidationError, e:
            log.error(e.msg)
            return False

    def _validate_spot_bid(self):
        if self.spot_bid is not None:
            if type(self.spot_bid) not in [int, float]:
                raise exception.ClusterValidationError(
                    'spot_bid must be integer or float')
            if self.spot_bid <= 0:
                raise exception.ClusterValidationError(
                    'spot_bid must be an integer or float > 0')
        return True

    def _validate_cluster_size(self):
        try:
            int(self.cluster_size)
            if self.cluster_size < 1:
                raise ValueError
        except (ValueError, TypeError):
            raise exception.ClusterValidationError(
                'cluster_size must be an integer >= 1')
        num_itypes = sum([i.get('size') for i in self.node_instance_types])
        num_nodes = self.cluster_size - 1
        if num_itypes > num_nodes:
            raise exception.ClusterValidationError(
                "total number of nodes specified in node_instance_type (%s) "
                "must be <= cluster_size-1 (%s)" % (num_itypes, num_nodes))
        return True

    def _validate_shell_setting(self):
        cluster_shell = self.cluster_shell
        if not self.__available_shells.get(cluster_shell):
            raise exception.ClusterValidationError(
                'Invalid user shell specified. Options are %s' %
                ' '.join(self.__available_shells.keys()))
        return True

    def _validate_image_settings(self):
        master_image_id = self.master_image_id
        node_image_id = self.node_image_id
        conn = self.ec2
        image = conn.get_image_or_none(node_image_id)
        if not image or image.id != node_image_id:
            raise exception.ClusterValidationError(
                'node_image_id %s does not exist' % node_image_id)
        if master_image_id:
            master_image = conn.get_image_or_none(master_image_id)
            if not master_image or master_image.id != master_image_id:
                raise exception.ClusterValidationError(
                    'master_image_id %s does not exist' % master_image_id)
        return True

    def _validate_zone(self):
        availability_zone = self.availability_zone
        if availability_zone:
            zone = self.ec2.get_zone(availability_zone)
            if not zone:
                azone = self.availability_zone
                raise exception.ClusterValidationError(
                    'availability_zone = %s does not exist' % azone)
            if zone.state != 'available':
                log.warn('The availability_zone = %s '
                         'is not available at this time' % zone)
        return True

    def __check_platform(self, image_id, instance_type):
        """
        Validates whether an image_id (AMI) is compatible with a given
        instance_type.
        """
        image = self.ec2.get_image_or_none(image_id)
        if not image:
            raise exception.ClusterValidationError(
                'Image %s does not exist' % image_id)
        image_platform = image.architecture
        image_is_hvm = (image.virtualization_type == "hvm")
        if image_is_hvm and not instance_type in static.CLUSTER_TYPES:
            cctypes_list = ', '.join(static.CLUSTER_TYPES)
            raise exception.ClusterValidationError(
                "Image '%s' is a Cluster Compute/GPU image (HVM) and cannot "
                "be used with instance type '%s'\nThe instance type "
                "for a Cluster Compute/GPU image (HVM) must be one of: %s" %
                (image_id, instance_type, cctypes_list))
        instance_platforms = self.__instance_types[instance_type]
        if image_platform not in instance_platforms:
            error_msg = "Instance type %(instance_type)s is for an " \
                        "%(instance_platform)s platform while " \
                        "%(image_id)s is an %(image_platform)s platform"
            error_dict = {'instance_type': instance_type,
                          'instance_platform': ', '.join(instance_platforms),
                          'image_id': image_id,
                          'image_platform': image_platform}
            raise exception.ClusterValidationError(error_msg % error_dict)
        return True

1640 - def _validate_instance_types(self):
1641 master_image_id = self.master_image_id 1642 node_image_id = self.node_image_id 1643 master_instance_type = self.master_instance_type 1644 node_instance_type = self.node_instance_type 1645 instance_types = self.__instance_types 1646 instance_type_list = ', '.join(instance_types.keys()) 1647 if not node_instance_type in instance_types: 1648 raise exception.ClusterValidationError( 1649 "You specified an invalid node_instance_type %s\n" 1650 "Possible options are:\n%s" % 1651 (node_instance_type, instance_type_list)) 1652 elif master_instance_type: 1653 if not master_instance_type in instance_types: 1654 raise exception.ClusterValidationError( 1655 "You specified an invalid master_instance_type %s\n" 1656 "Possible options are:\n%s" % 1657 (master_instance_type, instance_type_list)) 1658 try: 1659 self.__check_platform(node_image_id, node_instance_type) 1660 except exception.ClusterValidationError, e: 1661 raise exception.ClusterValidationError( 1662 'Incompatible node_image_id and node_instance_type:\n' + e.msg) 1663 if master_image_id and not master_instance_type: 1664 try: 1665 self.__check_platform(master_image_id, node_instance_type) 1666 except exception.ClusterValidationError, e: 1667 raise exception.ClusterValidationError( 1668 'Incompatible master_image_id and node_instance_type\n' + 1669 e.msg) 1670 elif master_image_id and master_instance_type: 1671 try: 1672 self.__check_platform(master_image_id, master_instance_type) 1673 except exception.ClusterValidationError, e: 1674 raise exception.ClusterValidationError( 1675 'Incompatible master_image_id and master_instance_type\n' + 1676 e.msg) 1677 elif master_instance_type and not master_image_id: 1678 try: 1679 self.__check_platform(node_image_id, master_instance_type) 1680 except exception.ClusterValidationError, e: 1681 raise exception.ClusterValidationError( 1682 'Incompatible node_image_id and master_instance_type\n' + 1683 e.msg) 1684 for itype in self.node_instance_types: 1685 type = itype.get('type') 1686 img = itype.get('image') or node_image_id 1687 if not type in instance_types: 1688 raise exception.ClusterValidationError( 1689 "You specified an invalid instance type %s\n" 1690 "Possible options are:\n%s" % (type, instance_type_list)) 1691 try: 1692 self.__check_platform(img, type) 1693 except exception.ClusterValidationError, e: 1694 raise exception.ClusterValidationError( 1695 "Invalid settings for node_instance_type %s: %s" % 1696 (type, e.msg)) 1697 return True

    def _validate_cluster_compute(self):
        lmap = self._get_launch_map()
        for (itype, image) in lmap:
            if itype in static.CLUSTER_TYPES:
                img = self.ec2.get_image(image)
                if img.virtualization_type != 'hvm':
                    raise exception.ClusterValidationError(
                        'Cluster Compute/GPU instance type %s '
                        'can only be used with HVM images.\n'
                        'Image %s is NOT an HVM image.' % (itype, image))
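
    # ``_get_launch_map()`` (defined elsewhere in this class) appears to
    # pair each instance type to launch with the image it will boot from,
    # so the loop above iterates over something like (hypothetical IDs):
    #
    #     [('cc1.4xlarge', 'ami-33333333'), ('m1.small', 'ami-44444444')]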

    def _validate_ebs_aws_settings(self):
        """
        Verify that each EBS volume exists and is available, unless it is
        already attached to this cluster's master node.
        """
        for vol_name in self.volumes:
            vol_settings = self.volumes.get(vol_name)
            vol_id = vol_settings.get('volume_id')
            vol = self.ec2.get_volume(vol_id)
            if vol.status != 'available':
                try:
                    # A volume already attached to the master is acceptable
                    if vol.attach_data.instance_id == self.master_node.id:
                        continue
                except exception.MasterDoesNotExist:
                    pass
                msg = "volume %s is not available (status: %s)" % (vol_id,
                                                                   vol.status)
                raise exception.ClusterValidationError(msg)

    def _validate_permission_settings(self):
        permissions = self.permissions
        for perm in permissions:
            permission = permissions.get(perm)
            protocol = permission.get('ip_protocol')
            if protocol not in self.__protocols:
                raise exception.InvalidProtocol(protocol)
            from_port = permission.get('from_port')
            to_port = permission.get('to_port')
            try:
                from_port = int(from_port)
                to_port = int(to_port)
            except ValueError:
                raise exception.InvalidPortRange(
                    from_port, to_port, reason="integer range required")
            if from_port < 0 or to_port < 0:
                raise exception.InvalidPortRange(
                    from_port, to_port,
                    reason="from/to must be non-negative integers")
            if from_port > to_port:
                raise exception.InvalidPortRange(
                    from_port, to_port,
                    reason="'from_port' must be <= 'to_port'")
            cidr_ip = permission.get('cidr_ip')
            if not iptools.validate_cidr(cidr_ip):
                raise exception.InvalidCIDRSpecified(cidr_ip)
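
    # A permissions mapping that passes the checks above might look like
    # (hypothetical entry for opening SSH to the world):
    #
    #     {'ssh': {'ip_protocol': 'tcp', 'from_port': 22,
    #              'to_port': 22, 'cidr_ip': '0.0.0.0/0'}}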

    def _validate_ebs_settings(self):
        """
        Check EBS vols for missing/duplicate DEVICE/PARTITION/MOUNT_PATHs
        and validate these settings. Does not require AWS credentials.
        """
        volmap = {}
        devmap = {}
        mount_paths = []
        for vol_name in self.volumes:
            vol = self.volumes.get(vol_name)
            vol_id = vol.get('volume_id')
            device = vol.get('device')
            partition = vol.get('partition')
            mount_path = vol.get("mount_path")
            vmap = volmap.get(vol_id, {})
            devices = vmap.get('device', [])
            partitions = vmap.get('partition', [])
            if devices and device not in devices:
                raise exception.ClusterValidationError(
                    "Can't attach volume %s to more than one device" %
                    vol_id)
            elif partitions and partition in partitions:
                raise exception.ClusterValidationError(
                    "Multiple configurations for %s\n"
                    "Either pick one or specify a separate partition for "
                    "each configuration" % vol_id)
            vmap['partition'] = partitions + [partition]
            vmap['device'] = devices + [device]
            volmap[vol_id] = vmap
            dmap = devmap.get(device, {})
            vol_ids = dmap.get('volume_id', [])
            if vol_ids and vol_id not in vol_ids:
                raise exception.ClusterValidationError(
                    "Can't attach more than one volume on device %s" %
                    device)
            dmap['volume_id'] = vol_ids + [vol_id]
            devmap[device] = dmap
            mount_paths.append(mount_path)
            if not device:
                raise exception.ClusterValidationError(
                    'Missing DEVICE setting for volume %s' % vol_name)
            if not utils.is_valid_device(device):
                raise exception.ClusterValidationError(
                    "Invalid DEVICE value for volume %s" % vol_name)
            if partition:
                if not utils.is_valid_partition(partition):
                    raise exception.ClusterValidationError(
                        "Invalid PARTITION value for volume %s" % vol_name)
                if not partition.startswith(device):
                    raise exception.ClusterValidationError(
                        "Volume PARTITION must start with %s" % device)
            if not mount_path:
                raise exception.ClusterValidationError(
                    'Missing MOUNT_PATH setting for volume %s' % vol_name)
            if not mount_path.startswith('/'):
                raise exception.ClusterValidationError(
                    "MOUNT_PATH for volume %s should start with /" %
                    vol_name)
        for path in mount_paths:
            if mount_paths.count(path) > 1:
                raise exception.ClusterValidationError(
                    "Can't mount more than one volume on %s" % path)
        return True

    def _has_all_required_settings(self):
        has_all_required = True
        for opt in self.__cluster_settings:
            requirements = self.__cluster_settings[opt]
            # Index 1 of the requirements tuple holds the "required" flag
            required = requirements[1]
            if required and self.get(opt.lower()) is None:
                log.warn('Missing required setting %s' % opt)
                has_all_required = False
        return has_all_required

    def _validate_credentials(self):
        if not self.ec2.is_valid_conn():
            raise exception.ClusterValidationError(
                'Invalid AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY '
                'combination.')
        return True

    def _validate_keypair(self):
        key_location = self.key_location
        if not key_location:
            raise exception.ClusterValidationError(
                "no key_location specified for key '%s'" % self.keyname)
        if not os.path.exists(key_location):
            raise exception.ClusterValidationError(
                "key_location '%s' does not exist" % key_location)
        elif not os.path.isfile(key_location):
            raise exception.ClusterValidationError(
                "key_location '%s' is not a file" % key_location)
        keyname = self.keyname
        keypair = self.ec2.get_keypair_or_none(keyname)
        if not keypair:
            raise exception.ClusterValidationError(
                "Account does not contain a key with keyname: %s" % keyname)
        fingerprint = keypair.fingerprint
        if len(fingerprint) == 59:
            localfingerprint = ssh.get_private_rsa_fingerprint(key_location)
            if localfingerprint != fingerprint:
                raise exception.ClusterValidationError(
                    "Incorrect fingerprint for key_location '%s'\n\n"
                    "local fingerprint: %s\n\nkeypair fingerprint: %s"
                    % (key_location, localfingerprint, fingerprint))
        else:
            # Skip fingerprint validation for keys created via EC2's import
            # feature until the mystery behind their fingerprints is solved.
            # ssh-keygen's public key fingerprint can be reproduced locally,
            # but Amazon's reported value doesn't match it for some reason.
            log.warn("Skipping keypair fingerprint validation...")
        if self.zone:
            z = self.ec2.get_zone(self.zone)
            if keypair.region != z.region:
                raise exception.ClusterValidationError(
                    'Keypair %s not in availability zone region %s' %
                    (keyname, z.region))
        return True
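
    # The 59-character test above matches the colon-separated hex form of
    # a 20-byte digest (20 * 2 hex chars + 19 colons = 59), the fingerprint
    # format EC2 reports for keypairs it generated itself; imported keys
    # are reported differently, hence the skipped validation.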

    def ssh_to_master(self, user='root', command=None):
        return self.ssh_to_node('master', user=user, command=command)

    def ssh_to_node(self, alias, user='root', command=None):
        node = self.get_node_by_alias(alias)
        node = node or self.get_node_by_dns_name(alias)
        node = node or self.get_node_by_id(alias)
        if not node:
            raise exception.InstanceDoesNotExist(alias, label='node')
        if command:
            orig_user = node.ssh.get_current_user()
            node.ssh.switch_user(user)
            node.ssh.execute(command, silent=False)
            node.ssh.switch_user(orig_user)
            return node.ssh.get_last_status()
        else:
            node.shell(user=user)
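
    # Usage sketch (assuming an active Cluster object ``cl``; the alias
    # and command below are hypothetical):
    #
    #     cl.ssh_to_node('node001', user='sgeadmin', command='uptime')
    #     cl.ssh_to_master()  # interactive shell as root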

if __name__ == "__main__":
    from starcluster.config import StarClusterConfig
    cfg = StarClusterConfig().load()
    sc = cfg.get_cluster_template('smallcluster', 'mynewcluster')
    if sc.is_valid():
        sc.start(create=True)