Package starcluster :: Module cluster

Source Code for Module starcluster.cluster

   1  #!/usr/bin/env python 
   2  import os 
   3  import re 
   4  import time 
   5  import zlib 
   6  import string 
   7  import pprint 
   8  import base64 
   9  import inspect 
  10  import cPickle 
  11  import traceback 
  12   
  13  from starcluster import utils 
  14  from starcluster import static 
  15  from starcluster import spinner 
  16  from starcluster import iptools 
  17  from starcluster import managers 
  18  from starcluster import exception 
  19  from starcluster import progressbar 
  20  from starcluster import clustersetup 
  21  from starcluster.node import Node 
  22  from starcluster.utils import print_timing 
  23  from starcluster.templates import user_msgs 
  24  from starcluster.logger import log 

class ClusterManager(managers.Manager):
    """
    Manager class for Cluster objects
    """
    def __repr__(self):
        return "<ClusterManager: %s>" % self.ec2.region.name

    def get_cluster(self, cluster_name, group=None, load_receipt=True,
                    load_plugins=True):
        """
        Returns a Cluster object representing an active cluster
        """
        try:
            clname = self._get_cluster_name(cluster_name)
            if not group:
                group = self.ec2.get_security_group(clname)
            cl = Cluster(ec2_conn=self.ec2, cluster_tag=cluster_name,
                         cluster_group=group)
            if load_receipt:
                cl.load_receipt(load_plugins=load_plugins)
            try:
                key_location = self.cfg.get_key(cl.keyname).get('key_location')
                cl.key_location = key_location
            except (exception.KeyNotFound, Exception):
                pass
            return cl
        except exception.SecurityGroupDoesNotExist:
            raise exception.ClusterDoesNotExist(cluster_name)

    def get_default_cluster_template(self):
        """
        Returns the name of the default cluster template defined in the config
        """
        return self.cfg.get_default_cluster_template()

    def get_cluster_template(self, template_name, tag_name=None):
        """
        Returns a new Cluster object using the settings from the cluster
        template template_name

        If tag_name is passed, the Cluster object's cluster_tag setting will
        be set to tag_name
        """
        cl = self.cfg.get_cluster_template(template_name, tag_name=tag_name,
                                           ec2_conn=self.ec2)
        return cl

    def get_cluster_or_none(self, cluster_name):
        """
        Same as get_cluster but returns None instead of raising an exception
        if the cluster does not exist
        """
        try:
            return self.get_cluster(cluster_name)
        except exception.ClusterDoesNotExist:
            pass

    def cluster_exists(self, tag_name):
        """
        Returns True if the cluster exists
        """
        return self.get_cluster_or_none(tag_name) is not None

    def ssh_to_master(self, cluster_name, user='root'):
        """
        ssh to the master node of cluster_name

        user keyword specifies an alternate user to login as
        """
        cluster = self.get_cluster(cluster_name)
        cluster.ssh_to_master(user=user)

    def ssh_to_cluster_node(self, cluster_name, node_id, user='root'):
        """
        ssh to a node in cluster_name that has either an id, dns name, or
        alias matching node_id

        user keyword specifies an alternate user to login as
        """
        cluster = self.get_cluster(cluster_name)
        cluster.ssh_to_node(node_id, user=user)

    def _get_cluster_name(self, cluster_name):
        """
        Returns the human readable cluster name/tag prefixed with '@sc-'
        """
        if not cluster_name.startswith(static.SECURITY_GROUP_PREFIX):
            cluster_name = static.SECURITY_GROUP_TEMPLATE % cluster_name
        return cluster_name

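    # Illustrative sketch (not part of the module): assuming
    # static.SECURITY_GROUP_PREFIX == '@sc' and
    # static.SECURITY_GROUP_TEMPLATE == '@sc-%s', and cm is a ClusterManager
    # instance, this helper maps a user-facing tag to its security group name:
    #
    #   cm._get_cluster_name('mycluster')      # -> '@sc-mycluster'
    #   cm._get_cluster_name('@sc-mycluster')  # -> '@sc-mycluster' (unchanged)
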
    def add_node(self, cluster_name, alias=None):
        cl = self.get_cluster(cluster_name)
        cl.add_node(alias)

    def add_nodes(self, cluster_name, num_nodes, aliases=None):
        """
        Add one or more nodes to cluster
        """
        cl = self.get_cluster(cluster_name)
        cl.add_nodes(num_nodes, aliases=aliases)

    def remove_node(self, cluster_name, alias):
        """
        Remove a single node from a cluster
        """
        cl = self.get_cluster(cluster_name)
        n = cl.get_node_by_alias(alias)
        if not n:
            raise exception.InstanceDoesNotExist(alias, label='node')
        cl.remove_node(n)

    def restart_cluster(self, cluster_name):
        """
        Reboots and reconfigures cluster_name
        """
        cl = self.get_cluster(cluster_name)
        cl.restart_cluster()

    def stop_cluster(self, cluster_name):
        """
        Stops cluster_name if it's an EBS cluster, otherwise terminates the
        cluster
        """
        cl = self.get_cluster(cluster_name)
        cl.stop_cluster()

    def terminate_cluster(self, cluster_name):
        """
        Terminates cluster_name
        """
        cl = self.get_cluster(cluster_name)
        cl.terminate_cluster()

    def get_cluster_security_group(self, group_name):
        """
        Return the security group for the cluster named group_name
        (prefixed with '@sc-' if necessary)
        """
        gname = self._get_cluster_name(group_name)
        return self.ec2.get_security_group(gname)

    def get_cluster_security_groups(self):
        """
        Return all security groups on EC2 that start with '@sc-'
        """
        glob = static.SECURITY_GROUP_TEMPLATE % '*'
        sgs = self.ec2.get_security_groups(filters={'group-name': glob})
        return sgs

    def get_tag_from_sg(self, sg):
        """
        Returns the cluster tag name from a security group name that starts
        with static.SECURITY_GROUP_PREFIX

        Example:
            sg = '@sc-mycluster'
            print get_tag_from_sg(sg)
            mycluster
        """
        regex = re.compile(static.SECURITY_GROUP_PREFIX + '-(.*)')
        match = regex.match(sg)
        if match:
            return match.groups()[0]

190 - def list_clusters(self, cluster_groups=None, show_ssh_status=False):
191 """ 192 Prints a summary for each active cluster on EC2 193 """ 194 if not cluster_groups: 195 cluster_groups = self.get_cluster_security_groups() 196 if not cluster_groups: 197 log.info("No clusters found...") 198 else: 199 try: 200 cluster_groups = [self.get_cluster_security_group(g) for g \ 201 in cluster_groups] 202 except exception.SecurityGroupDoesNotExist: 203 raise exception.ClusterDoesNotExist(g) 204 for scg in cluster_groups: 205 tag = self.get_tag_from_sg(scg.name) 206 try: 207 cl = self.get_cluster(tag, group=scg, load_plugins=False) 208 except exception.IncompatibleCluster, e: 209 sep = '*' * 60 210 log.error('\n'.join([sep, e.msg, sep]), 211 extra=dict(__textwrap__=True)) 212 continue 213 header = '%s (security group: %s)' % (tag, scg.name) 214 print '-' * len(header) 215 print header 216 print '-' * len(header) 217 nodes = cl.nodes 218 try: 219 n = nodes[0] 220 except IndexError: 221 n = None 222 state = getattr(n, 'state', None) 223 ltime = 'N/A' 224 uptime = 'N/A' 225 if state in ['pending', 'running']: 226 ltime = getattr(n, 'local_launch_time', 'N/A') 227 uptime = getattr(n, 'uptime', 'N/A') 228 print 'Launch time: %s' % ltime 229 print 'Uptime: %s' % uptime 230 print 'Zone: %s' % getattr(n, 'placement', 'N/A') 231 print 'Keypair: %s' % getattr(n, 'key_name', 'N/A') 232 ebs_nodes = [n for n in nodes if n.attached_vols] 233 if ebs_nodes: 234 print 'EBS volumes:' 235 for node in ebs_nodes: 236 devices = node.attached_vols 237 node_id = node.alias or node.id 238 for dev in devices: 239 d = devices.get(dev) 240 vol_id = d.volume_id 241 status = d.status 242 print ' %s on %s:%s (status: %s)' % \ 243 (vol_id, node_id, dev, status) 244 else: 245 print 'EBS volumes: N/A' 246 if nodes: 247 print 'Cluster nodes:' 248 for node in nodes: 249 nodeline = " %7s %s %s %s" % (node.alias, node.state, 250 node.id, node.dns_name) 251 if node.spot_id: 252 nodeline += ' (spot %s)' % node.spot_id 253 if show_ssh_status: 254 ssh_status = {True: 'Up', False: 'Down'} 255 nodeline += ' (SSH: %s)' % ssh_status[node.is_up()] 256 print nodeline 257 print 'Total nodes: %d' % len(nodes) 258 else: 259 print 'Cluster nodes: N/A' 260 print
261
    def run_plugin(self, plugin_name, cluster_tag):
        """
        Run a plugin defined in the config.

        plugin_name must match the plugin's section name in the config
        cluster_tag specifies the cluster to run the plugin on
        """
        cl = self.get_cluster(cluster_tag, load_plugins=False)
        if not cl.is_cluster_up():
            raise exception.ClusterNotRunning(cluster_tag)
        plugs = [self.cfg.get_plugin(plugin_name)]
        name, plugin = cl.load_plugins(plugs)[0]
        cl.run_plugin(plugin, name)

276 277 -class Cluster(object):
278 - def __init__(self, 279 ec2_conn=None, 280 spot_bid=None, 281 cluster_tag=None, 282 cluster_description=None, 283 cluster_size=None, 284 cluster_user=None, 285 cluster_shell=None, 286 master_image_id=None, 287 master_instance_type=None, 288 node_image_id=None, 289 node_instance_type=None, 290 node_instance_types=[], 291 availability_zone=None, 292 keyname=None, 293 key_location=None, 294 volumes=[], 295 plugins=[], 296 permissions=[], 297 refresh_interval=30, 298 disable_queue=False, 299 disable_threads=False, 300 cluster_group=None, 301 **kwargs):
302 303 now = time.strftime("%Y%m%d%H%M") 304 305 self.ec2 = ec2_conn 306 self.spot_bid = spot_bid 307 self.cluster_tag = cluster_tag 308 self.cluster_description = cluster_description 309 if self.cluster_tag is None: 310 self.cluster_tag = "cluster%s" % now 311 if cluster_description is None: 312 self.cluster_description = "Cluster created at %s" % now 313 self.cluster_size = cluster_size or 0 314 self.cluster_user = cluster_user 315 self.cluster_shell = cluster_shell 316 self.master_image_id = master_image_id 317 self.master_instance_type = master_instance_type 318 self.node_image_id = node_image_id 319 self.node_instance_type = node_instance_type 320 self.node_instance_types = node_instance_types 321 self.availability_zone = availability_zone 322 self.keyname = keyname 323 self.key_location = key_location 324 self.volumes = self.load_volumes(volumes) 325 self.plugins = self.load_plugins(plugins) 326 self.permissions = permissions 327 self.refresh_interval = refresh_interval 328 self.disable_queue = disable_queue 329 self.disable_threads = disable_threads 330 331 self.__instance_types = static.INSTANCE_TYPES 332 self.__cluster_settings = static.CLUSTER_SETTINGS 333 self.__available_shells = static.AVAILABLE_SHELLS 334 self.__protocols = static.PROTOCOLS 335 self._progress_bar = None 336 self._master_reservation = None 337 self._node_reservation = None 338 self._nodes = [] 339 self._master = None 340 self._zone = None 341 self._plugins = plugins 342 self._cluster_group = None 343 self._placement_group = None
344
    def __repr__(self):
        return '<Cluster: %s (%s-node)>' % (self.cluster_tag,
                                            self.cluster_size)

    @property
    def zone(self):
        """
        If volumes are specified, this property returns their common
        availability zone. If an availability zone is explicitly specified in
        the config and does not match the common availability zone of the
        volumes, an exception is raised. If the volumes are not all in the
        same availability zone, an exception is raised. If no volumes are
        specified, this returns the user-specified availability zone, if any.
        """
        if not self._zone:
            zone = None
            if self.availability_zone:
                zone = self.ec2.get_zone(self.availability_zone).name
            common_zone = None
            for volume in self.volumes:
                volid = self.volumes.get(volume).get('volume_id')
                vol = self.ec2.get_volume(volid)
                if not common_zone:
                    common_zone = vol.zone
                elif vol.zone != common_zone:
                    vols = [self.volumes.get(v).get('volume_id')
                            for v in self.volumes]
                    raise exception.VolumesZoneError(vols)
            if common_zone and zone and zone != common_zone:
                raise exception.InvalidZone(zone, common_zone)
            if not zone and common_zone:
                zone = common_zone
            self._zone = zone
        return self._zone

    def load_volumes(self, vols):
        """
        Iterate through vols and set device/partition settings automatically
        if not specified.

        This method assigns the first volume to /dev/sdz, the second to
        /dev/sdy, etc. for all volumes that do not include a device/partition
        setting
        """
        devices = ['/dev/sd%s' % s for s in string.lowercase]
        for volname in vols:
            vol = vols.get(volname)
            dev = vol.get('device')
            if dev in devices:
                # remove user-defined devices from the list of auto-assigned
                # devices
                devices.remove(dev)
        volumes = {}
        for volname in vols:
            vol = vols.get(volname)
            device = vol.get('device')
            if not device:
                device = devices.pop()
            if not utils.is_valid_device(device):
                raise exception.InvalidDevice(device)
            v = volumes[volname] = utils.AttributeDict()
            v.update(vol)
            v['device'] = device
            part = vol.get('partition')
            if part:
                partition = device + str(part)
                if not utils.is_valid_partition(partition):
                    raise exception.InvalidPartition(part)
                v['partition'] = partition
        return volumes

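    # Illustrative sketch (hypothetical volume name/id, not part of the
    # module): given a config-style dict without a DEVICE setting, and cl a
    # Cluster instance, devices are auto-assigned starting from /dev/sdz:
    #
    #   vols = {'biodata': {'volume_id': 'vol-12345678',
    #                       'mount_path': '/biodata'}}
    #   cl.load_volumes(vols)
    #   # -> {'biodata': {'volume_id': 'vol-12345678',
    #   #                 'mount_path': '/biodata', 'device': '/dev/sdz'}}
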
    def load_plugins(self, plugins):
        plugs = []
        for plugin in plugins:
            setup_class = plugin.get('setup_class')
            plugin_name = plugin.get('__name__').split()[-1]
            mod_name = '.'.join(setup_class.split('.')[:-1])
            class_name = setup_class.split('.')[-1]
            try:
                mod = __import__(mod_name, globals(), locals(), [class_name])
            except SyntaxError, e:
                raise exception.PluginSyntaxError(
                    "Plugin %s (%s) contains a syntax error at line %s" %
                    (plugin_name, e.filename, e.lineno))
            except ImportError, e:
                raise exception.PluginLoadError(
                    "Failed to import plugin %s: %s" %
                    (plugin_name, e.message))
            klass = getattr(mod, class_name, None)
            if not klass:
                raise exception.PluginError(
                    'Plugin class %s does not exist' % setup_class)
            if not issubclass(klass, clustersetup.ClusterSetup):
                raise exception.PluginError(
                    ("Plugin %s must be a subclass of " +
                     "starcluster.clustersetup.ClusterSetup") % setup_class)
            (args, varargs,
             keywords, defaults) = inspect.getargspec(klass.__init__)
            log.debug('plugin args = %s' % args)
            log.debug('plugin varargs = %s' % varargs)
            log.debug('plugin keywords = %s' % keywords)
            log.debug('plugin defaults = %s' % str(defaults))
            args = args[1:]  # ignore self
            nargs = len(args)
            ndefaults = 0
            if defaults:
                ndefaults = len(defaults)
            nrequired = nargs - ndefaults
            # split off the optional (keyword) args *before* truncating args
            # to the required portion
            kwargs = args[nrequired:]
            args = args[:nrequired]
            config_args = []
            for arg in args:
                if arg in plugin:
                    config_args.append(plugin.get(arg))
            config_kwargs = {}
            for arg in kwargs:
                if arg in plugin:
                    config_kwargs[arg] = plugin.get(arg)
            log.debug("config_args = %s" % config_args)
            log.debug("config_kwargs = %s" % config_kwargs)
            if nrequired > len(config_args):
                raise exception.PluginError(
                    "Not enough settings provided for plugin %s" %
                    plugin_name)
            plugs.append((plugin_name, klass(*config_args, **config_kwargs)))
        return plugs

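    # Illustrative sketch (hypothetical plugin, not part of the module): each
    # entry in `plugins` is a dict built from a [plugin ...] config section;
    # it carries '__name__' and 'setup_class' plus any constructor arguments
    # of the plugin class:
    #
    #   plugins = [{'__name__': 'plugin myplugin',
    #               'setup_class': 'mypackage.myplugins.MySetup',
    #               'my_arg': '/usr/local/share'}]
    #   cl.plugins = cl.load_plugins(plugins)
    #   # -> [('myplugin', <mypackage.myplugins.MySetup instance>)]
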
    def update(self, kwargs):
        for key in kwargs.keys():
            if hasattr(self, key):
                self.__dict__[key] = kwargs[key]

    def _validate_running_instances(self):
        """
        Validate existing instances against this template's settings
        """
        self._validate_instance_types()
        num_running = len(self.nodes)
        if num_running != self.cluster_size:
            raise exception.ClusterValidationError(
                "Number of existing instances (%s) != cluster_size (%s)" %
                (num_running, self.cluster_size))
        mtype = self.master_node.instance_type
        mastertype = self.master_instance_type or self.node_instance_type
        if mtype != mastertype:
            raise exception.ClusterValidationError(
                "The existing master node's instance type (%s) != %s" %
                (mtype, mastertype))
        masterimage = self.master_image_id or self.node_image_id
        mimage = self.master_node.image_id
        if mimage != masterimage:
            raise exception.ClusterValidationError(
                "The existing master node's image id (%s) != %s" %
                (mimage, masterimage))
        mkey = self.master_node.key_name
        if mkey != self.keyname:
            raise exception.ClusterValidationError(
                "The existing master's keypair (%s) != %s" %
                (mkey, self.keyname))
        try:
            nodes = self.nodes[1:]
        except IndexError:
            raise exception.ClusterValidationError(
                "Cluster has no running instances")
        mazone = self.master_node.placement
        id_start = 0
        for itype in self.node_instance_types:
            size = itype['size']
            image = itype['image'] or self.node_image_id
            type = itype['type'] or self.node_instance_type
            for i in range(id_start, id_start + size):
                n = nodes[i]
                ntype = n.instance_type
                if ntype != type:
                    raise exception.ClusterValidationError(
                        "Running node's instance type (%s) != %s" %
                        (ntype, type))
                nimage = n.image_id
                if nimage != image:
                    raise exception.ClusterValidationError(
                        "Running node's image id (%s) != %s" %
                        (nimage, image))
                id_start += 1
        for n in nodes[id_start:]:
            ntype = n.instance_type
            if n.instance_type != self.node_instance_type:
                raise exception.ClusterValidationError(
                    "Running node's instance type (%s) != %s" %
                    (ntype, self.node_instance_type))
            nimage = n.image_id
            if nimage != self.node_image_id:
                raise exception.ClusterValidationError(
                    "Running node's image id (%s) != %s" %
                    (nimage, self.node_image_id))
        for n in nodes:
            if n.key_name != self.keyname:
                raise exception.ClusterValidationError(
                    "Running node's key_name (%s) != %s" %
                    (n.key_name, self.keyname))
            nazone = n.placement
            if mazone != nazone:
                raise exception.ClusterValidationError(
                    ("Running master's zone (%s) " +
                     "does not match node zone (%s)") %
                    (mazone, nazone))
        # reset zone
        self._zone = None
        if self.zone and self.zone != mazone:
            raise exception.ClusterValidationError(
                "Running cluster's availability_zone (%s) != %s" %
                (mazone, self.zone))

    def get(self, name):
        return self.__dict__.get(name)

    def __str__(self):
        cfg = self.__getstate__()
        return pprint.pformat(cfg)

    def load_receipt(self, load_plugins=True):
        """
        Load the original settings used to launch this cluster into this
        Cluster object. The settings are loaded from the cluster group's
        description field.
        """
        try:
            desc = self.cluster_group.description
            version, b64data = desc.split('-', 1)
            if utils.program_version_greater(version, static.VERSION):
                d = dict(cluster=self.cluster_tag, old_version=static.VERSION,
                         new_version=version)
                msg = user_msgs.version_mismatch % d
                sep = '*' * 60
                log.warn('\n'.join([sep, msg, sep]),
                         extra={'__textwrap__': 1})
            compressed_data = base64.b64decode(b64data)
            pkl_data = zlib.decompress(compressed_data)
            cluster_settings = cPickle.loads(str(pkl_data)).__dict__
        except (cPickle.PickleError, zlib.error, ValueError, TypeError,
                EOFError, IndexError), e:
            log.debug('load receipt exception: ', exc_info=True)
            raise exception.IncompatibleCluster(self.cluster_group)
        except Exception, e:
            raise exception.ClusterReceiptError(
                'failed to load cluster receipt: %s' % e)
        for key in cluster_settings:
            if hasattr(self, key):
                setattr(self, key, cluster_settings.get(key))
        if load_plugins:
            try:
                self.plugins = self.load_plugins(self._plugins)
            except exception.PluginError, e:
                log.warn(e)
                log.warn("An error occurred while loading plugins")
                log.warn("Not running any plugins")
            except Exception, e:
                raise exception.ClusterReceiptError(
                    'failed to load cluster receipt: %s' % e)
        return True

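    # Note (sketch, not part of the module): the "receipt" stored in the
    # security group description has the form "<version>-<payload>", where
    # the payload is a base64-encoded, zlib-compressed cPickle of this
    # Cluster object. Decoding it by hand (assuming desc holds the
    # description string):
    #
    #   version, b64data = desc.split('-', 1)
    #   settings = cPickle.loads(zlib.decompress(base64.b64decode(b64data)))
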
602 - def __getstate__(self):
603 cfg = {} 604 exclude = ['key_location', 'plugins'] 605 include = ['_zone', '_plugins'] 606 for key in self.__dict__.keys(): 607 private = key.startswith('_') 608 if (not private or key in include) and not key in exclude: 609 val = getattr(self, key) 610 if type(val) in [str, unicode, bool, int, float, list, dict]: 611 cfg[key] = val 612 elif type(val) is utils.AttributeDict: 613 cfg[key] = dict(val) 614 return cfg
615 616 @property
617 - def _security_group(self):
618 return static.SECURITY_GROUP_TEMPLATE % self.cluster_tag
619 620 @property
621 - def cluster_group(self):
622 if self._cluster_group is None: 623 desc = base64.b64encode(zlib.compress(cPickle.dumps(self))) 624 desc = '-'.join([static.VERSION, desc]) 625 sg = self.ec2.get_or_create_group(self._security_group, 626 desc, 627 auth_ssh=True, 628 auth_group_traffic=True) 629 for p in self.permissions: 630 perm = self.permissions.get(p) 631 ip_protocol = perm.get('ip_protocol', 'tcp') 632 from_port = perm.get('from_port') 633 to_port = perm.get('to_port') 634 cidr_ip = perm.get('cidr_ip', '0.0.0.0/0') 635 if not self.ec2.has_permission(sg, ip_protocol, from_port, 636 to_port, cidr_ip): 637 log.info("Opening %s port range %s-%s for CIDR %s" % 638 (ip_protocol, from_port, to_port, cidr_ip)) 639 sg.authorize(ip_protocol, from_port, to_port, cidr_ip) 640 self._cluster_group = sg 641 return self._cluster_group
642 643 @property
644 - def placement_group(self):
645 if self._placement_group is None: 646 pg = self.ec2.get_or_create_placement_group(self._security_group) 647 self._placement_group = pg 648 return self._placement_group
649 650 @property
651 - def master_node(self):
652 if not self._master: 653 for node in self.nodes: 654 if node.is_master(): 655 self._master = node 656 return self._master
657 658 @property
659 - def nodes(self):
660 states = ['pending', 'running', 'stopping', 'stopped'] 661 filters = {'group-id': self._security_group, 662 'instance-state-name': states} 663 nodes = self.ec2.get_all_instances(filters=filters) 664 # remove any cached nodes not in the current node list from EC2 665 current_ids = map(lambda n: n.id, nodes) 666 remove_nodes = filter(lambda n: n.id not in current_ids, self._nodes) 667 map(lambda n: self._nodes.remove(n), remove_nodes) 668 # update node cache with latest instance data from EC2 669 existing_nodes = dict(map(lambda x: (x.id, x), self._nodes)) 670 log.debug('existing nodes: %s' % existing_nodes) 671 for node in nodes: 672 if node.id in existing_nodes: 673 log.debug('updating existing node %s in self._nodes' % node.id) 674 enode = existing_nodes.get(node.id) 675 enode.key_location = self.key_location 676 enode.instance = node 677 else: 678 log.debug('adding node %s to self._nodes list' % node.id) 679 n = Node(node, self.key_location) 680 if n.is_master(): 681 self._master = n 682 self._nodes.insert(0, n) 683 else: 684 self._nodes.append(n) 685 self._nodes.sort(key=lambda n: n.alias) 686 log.debug('returning self._nodes = %s' % self._nodes) 687 return self._nodes
688
689 - def get_node_by_dns_name(self, dns_name):
690 for node in self.nodes: 691 if node.dns_name == dns_name: 692 return node 693 raise exception.InstanceDoesNotExist(dns_name, label='node')
694
695 - def get_node_by_id(self, instance_id):
696 for node in self.nodes: 697 if node.id == instance_id: 698 return node 699 raise exception.InstanceDoesNotExist(instance_id, label='node')
700
701 - def get_node_by_alias(self, alias):
702 for node in self.nodes: 703 if node.alias == alias: 704 return node 705 raise exception.InstanceDoesNotExist(alias, label='node')
706
707 - def _nodes_in_states(self, states):
708 return filter(lambda x: x.state in states, self.nodes)
709 710 @property
711 - def running_nodes(self):
712 return self._nodes_in_states(['running'])
713 714 @property
715 - def stopped_nodes(self):
716 return self._nodes_in_states(['stopping', 'stopped'])
717 718 @property
719 - def spot_requests(self):
720 filters = {'launch.group-id': self._security_group, 721 'state': ['active', 'open']} 722 return self.ec2.get_all_spot_requests(filters=filters)
723
724 - def create_node(self, alias, image_id=None, instance_type=None, zone=None, 725 placement_group=None):
729
730 - def create_nodes(self, aliases, image_id=None, instance_type=None, count=1, 731 zone=None, placement_group=None):
732 """ 733 Convenience method for requesting instances with this cluster's 734 settings 735 """ 736 cluster_sg = self.cluster_group.name 737 if instance_type in static.CLUSTER_TYPES: 738 placement_group = self.placement_group.name 739 response = self.ec2.request_instances( 740 image_id or self.node_image_id, 741 price=self.spot_bid, 742 instance_type=instance_type or self.node_instance_type, 743 min_count=count, max_count=count, count=count, 744 key_name=self.keyname, 745 security_groups=[cluster_sg], 746 availability_zone_group=cluster_sg, 747 launch_group=cluster_sg, 748 placement=zone or self.zone, 749 user_data='|'.join(aliases), 750 placement_group=placement_group) 751 return response
752
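    # Note (sketch, not part of the module): create_nodes() passes the node
    # aliases to the instances via the EC2 user_data field by joining them
    # with '|'. Presumably each instance's alias is later recovered from that
    # string using its ami_launch_index (spot instances always report index
    # 0, which is why _create_spot_cluster below requests one node per spot
    # request):
    #
    #   aliases = ['master', 'node001', 'node002']
    #   user_data = '|'.join(aliases)       # 'master|node001|node002'
    #   user_data.split('|')[launch_index]  # alias for a given launch index
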
753 - def _get_next_node_num(self):
754 nodes = self._nodes_in_states(['pending', 'running']) 755 nodes = filter(lambda x: not x.is_master(), nodes) 756 highest = 0 757 for n in nodes: 758 try: 759 highest = max(highest, int(n.alias[4:8])) 760 except ValueError: 761 pass 762 next = highest + 1 763 log.debug("Highest node number is %d. choosing %d." % (highest, next)) 764 return next
765
766 - def add_node(self, alias=None):
767 """ 768 Add a single node to this cluster 769 """ 770 aliases = None 771 if alias: 772 aliases = [alias] 773 self.add_nodes(1, aliases=aliases)
774
775 - def add_nodes(self, num_nodes, aliases=None):
776 """ 777 Add new nodes to this cluster 778 779 aliases - list of aliases to assign to new nodes (len must equal 780 num_nodes) 781 """ 782 running_pending = self._nodes_in_states(['pending', 'running']) 783 aliases = aliases or [] 784 if not aliases: 785 next_node_id = self._get_next_node_num() 786 for i in range(next_node_id, next_node_id + num_nodes): 787 alias = 'node%.3d' % i 788 aliases.append(alias) 789 assert len(aliases) == num_nodes 790 if "master" in aliases: 791 raise exception.ClusterValidationError( 792 "worker nodes cannot have master as an alias") 793 for node in running_pending: 794 if node.alias in aliases: 795 raise exception.ClusterValidationError( 796 "node with alias %s already exists" % node.alias) 797 log.debug("Adding node(s): %s" % aliases) 798 log.info("Launching node(s): %s" % ', '.join(aliases)) 799 print self.create_nodes(aliases, count=len(aliases)) 800 self.wait_for_cluster(msg="Waiting for node(s) to come up...") 801 default_plugin = clustersetup.DefaultClusterSetup(self.disable_queue, 802 self.disable_threads) 803 for alias in aliases: 804 node = self.get_node_by_alias(alias) 805 default_plugin.on_add_node( 806 node, self.nodes, self.master_node, 807 self.cluster_user, self.cluster_shell, 808 self.volumes) 809 self.run_plugins(method_name="on_add_node", node=node)
810
811 - def remove_node(self, node):
812 """ 813 Remove a single node from this cluster 814 """ 815 return self.remove_nodes([node])
816
817 - def remove_nodes(self, nodes):
818 """ 819 Remove a list of nodes from this cluster 820 """ 821 default_plugin = clustersetup.DefaultClusterSetup(self.disable_queue, 822 self.disable_threads) 823 for node in nodes: 824 if node.is_master(): 825 raise exception.InvalidOperation("cannot remove master node") 826 self.run_plugins(method_name="on_remove_node", 827 node=node, reverse=True) 828 default_plugin.on_remove_node( 829 node, self.nodes, self.master_node, 830 self.cluster_user, self.cluster_shell, 831 self.volumes) 832 if node.spot_id: 833 log.info("Cancelling spot request %s" % node.spot_id) 834 node.get_spot_request().cancel() 835 node.terminate()
836
837 - def _get_launch_map(self):
838 """ 839 Groups all node-aliases that have similar instance types/image ids 840 Returns a dictionary that's used to launch all similar instance types 841 and image ids in the same request. Example return value: 842 843 {('c1.xlarge', 'ami-a5c02dcc'): ['node001', 'node002'], 844 ('m1.large', 'ami-a5c02dcc'): ['node003'], 845 ('m1.small', 'ami-17b15e7e'): ['master', 'node005', 'node006'], 846 ('m1.small', 'ami-19e17a2b'): ['node004']} 847 """ 848 lmap = {} 849 mtype = self.master_instance_type or self.node_instance_type 850 mimage = self.master_image_id or self.node_image_id 851 lmap[(mtype, mimage)] = ['master'] 852 id_start = 1 853 for itype in self.node_instance_types: 854 count = itype['size'] 855 image_id = itype['image'] or self.node_image_id 856 type = itype['type'] or self.node_instance_type 857 if not (type, image_id) in lmap: 858 lmap[(type, image_id)] = [] 859 for id in range(id_start, id_start + count): 860 alias = 'node%.3d' % id 861 log.debug("Launch map: %s (ami: %s, type: %s)..." % \ 862 (alias, image_id, type)) 863 lmap[(type, image_id)].append(alias) 864 id_start += 1 865 ntype = self.node_instance_type 866 nimage = self.node_image_id 867 if not (ntype, nimage) in lmap: 868 lmap[(ntype, nimage)] = [] 869 for id in range(id_start, self.cluster_size): 870 alias = 'node%.3d' % id 871 log.debug("Launch map: %s (ami: %s, type: %s)..." % \ 872 (alias, nimage, ntype)) 873 lmap[(ntype, nimage)].append(alias) 874 return lmap
875
876 - def _get_type_and_image_id(self, alias):
877 """ 878 Returns (instance_type,image_id) for a given alias based 879 on the map returned from self._get_launch_map 880 """ 881 lmap = self._get_launch_map() 882 for (type, image) in lmap: 883 key = (type, image) 884 if alias in lmap.get(key): 885 return key
886
887 - def create_cluster(self):
888 """ 889 Launches all EC2 instances based on this cluster's settings. 890 """ 891 log.info("Launching a %d-node cluster..." % self.cluster_size) 892 mtype = self.master_instance_type or self.node_instance_type 893 self.master_instance_type = mtype 894 if self.spot_bid: 895 self._create_spot_cluster() 896 else: 897 self._create_flat_rate_cluster()
898
899 - def _create_flat_rate_cluster(self):
900 """ 901 Launches cluster using flat-rate instances. This method attempts to 902 minimize the number of launch requests by grouping nodes of the same 903 type/ami and launching each group simultaneously within a single launch 904 request. This is especially important for Cluster Compute instances 905 given that Amazon *highly* recommends requesting all CCI in a single 906 launch request. 907 """ 908 log.info("Launching a %d-node cluster..." % self.cluster_size) 909 lmap = self._get_launch_map() 910 zone = None 911 master_map = None 912 for (type, image) in lmap: 913 # launch all aliases that match master's itype/image_id 914 aliases = lmap.get((type, image)) 915 if 'master' in aliases: 916 master_map = (type, image) 917 for alias in aliases: 918 log.debug("Launching %s (ami: %s, type: %s)" % \ 919 (alias, image, type)) 920 master_response = self.create_nodes(aliases, image_id=image, 921 instance_type=type, 922 count=len(aliases)) 923 zone = master_response.instances[0].placement 924 print master_response 925 lmap.pop(master_map) 926 if self.cluster_size <= 1: 927 return 928 for (type, image) in lmap: 929 aliases = lmap.get((type, image)) 930 for alias in aliases: 931 log.debug("Launching %s (ami: %s, type: %s)" % \ 932 (alias, image, type)) 933 node_response = self.create_nodes(aliases, image_id=image, 934 instance_type=type, 935 count=len(aliases), zone=zone) 936 print node_response
937
938 - def _create_spot_cluster(self):
939 """ 940 Launches cluster using all spot instances. This method makes a single 941 spot request for each node in the cluster since spot instances 942 *always* have an ami_launch_index of 0. This is needed in order to 943 correctly assign aliases to nodes. 944 """ 945 (mtype, mimage) = self._get_type_and_image_id('master') 946 log.info("Launching master node (ami: %s, type: %s)..." % \ 947 (mimage, mtype)) 948 master_response = self.create_node('master', 949 image_id=mimage, 950 instance_type=mtype) 951 print master_response[0] 952 if self.cluster_size <= 1: 953 return 954 # Make sure nodes are in same zone as master 955 launch_spec = master_response[0].launch_specification 956 zone = launch_spec.placement 957 for id in range(1, self.cluster_size): 958 alias = 'node%.3d' % id 959 (ntype, nimage) = self._get_type_and_image_id(alias) 960 log.info("Launching %s (ami: %s, type: %s)" % \ 961 (alias, nimage, ntype)) 962 node_response = self.create_node(alias, 963 image_id=nimage, 964 instance_type=ntype, 965 zone=zone) 966 print node_response[0]
967
968 - def is_ebs_cluster(self):
969 """ 970 Returns true if any instances in the cluster are EBS-backed 971 """ 972 for node in self.nodes: 973 if node.is_ebs_backed(): 974 return True 975 return False
976
977 - def is_cluster_compute(self):
978 """ 979 Returns true if any instances are a Cluster Compute type 980 981 If no instances are currently running, this method checks the 982 original settings used to launch this cluster and returns true 983 if any of the instance type settings specified Cluster Compute 984 instance types 985 """ 986 for node in self.nodes: 987 if node.is_cluster_compute(): 988 return True 989 lmap = self._get_launch_map() 990 for (type, image) in lmap: 991 if type in static.CLUSTER_COMPUTE_TYPES: 992 return True 993 return False
994
995 - def is_cluster_up(self):
996 """ 997 Check that all nodes are 'running' and that ssh is up on all nodes 998 This method will return False if any spot requests are in an 'open' 999 state. 1000 """ 1001 spots = self.spot_requests 1002 active_spots = filter(lambda x: x.state == 'active', spots) 1003 if len(spots) != len(active_spots): 1004 return False 1005 nodes = self.nodes 1006 if not nodes: 1007 return False 1008 for node in nodes: 1009 if not node.is_up(): 1010 return False 1011 return True
1012
1013 - def get_spinner(self, msg):
1014 """ 1015 Logs a status msg, starts a spinner, and returns the spinner object. 1016 This is useful for long running processes: 1017 1018 s = self.get_spinner("Long running process running...") 1019 (do something) 1020 s.stop() 1021 """ 1022 s = spinner.Spinner() 1023 log.info(msg, extra=dict(__nonewline__=True)) 1024 s.start() 1025 return s
1026 1027 @property
1028 - def progress_bar(self):
1029 if not self._progress_bar: 1030 widgets = ['', progressbar.Fraction(), ' ', 1031 progressbar.Bar(marker=progressbar.RotatingMarker()), 1032 ' ', progressbar.Percentage(), ' ', ' '] 1033 pbar = progressbar.ProgressBar(widgets=widgets, 1034 maxval=self.cluster_size, 1035 force_update=True) 1036 self._progress_bar = pbar 1037 return self._progress_bar
1038
1039 - def wait_for_cluster(self, msg="Waiting for cluster to come up..."):
1040 """ 1041 Wait for cluster to come up and display progress bar. Waits for all 1042 spot requests to become 'active', all instances to be in a 'running' 1043 state, and for all SSH daemons to come up. 1044 1045 msg - custom message to print out before waiting on the cluster 1046 """ 1047 interval = self.refresh_interval 1048 log.info("%s %s" % (msg, "(updating every %ds)" % interval)) 1049 pbar = self.progress_bar.reset() 1050 spots = self.spot_requests 1051 if spots: 1052 log.info('Waiting for open spot requests to become active...') 1053 pbar.maxval = len(spots) 1054 pbar.update(0) 1055 while not pbar.finished: 1056 active_spots = filter(lambda x: x.state == "active", spots) 1057 pbar.maxval = len(spots) 1058 pbar.update(len(active_spots)) 1059 if not pbar.finished: 1060 time.sleep(interval) 1061 spots = self.spot_requests 1062 pbar.reset() 1063 nodes = self.nodes 1064 if len(nodes) == 0: 1065 s = self.get_spinner("Waiting for instances to activate...") 1066 while len(nodes) == 0: 1067 time.sleep(interval) 1068 nodes = self.nodes 1069 s.stop() 1070 log.info("Waiting for all nodes to be in a 'running' state...") 1071 pbar.maxval = len(nodes) 1072 pbar.update(0) 1073 while not pbar.finished: 1074 running_nodes = filter(lambda x: x.state == "running", nodes) 1075 pbar.maxval = len(nodes) 1076 pbar.update(len(running_nodes)) 1077 if not pbar.finished: 1078 time.sleep(interval) 1079 nodes = self.nodes 1080 pbar.reset() 1081 log.info("Waiting for SSH to come up on all nodes...") 1082 pbar.maxval = len(nodes) 1083 pbar.update(0) 1084 while not pbar.finished: 1085 active_nodes = filter(lambda n: n.is_up(), nodes) 1086 pbar.maxval = len(nodes) 1087 pbar.update(len(active_nodes)) 1088 if not pbar.finished: 1089 time.sleep(interval) 1090 nodes = self.nodes 1091 pbar.finish()
1092
1093 - def is_cluster_stopped(self):
1094 """ 1095 Check whether all nodes are in the 'stopped' state 1096 """ 1097 return len(self.stopped_nodes) == self.cluster_size
1098
1099 - def is_cluster_terminated(self):
1100 """ 1101 Check whether all nodes are in a 'terminated' state 1102 """ 1103 states = filter(lambda x: x != 'terminated', static.INSTANCE_STATES) 1104 filters = {'group-id': self._security_group, 1105 'instance-state-name': states} 1106 insts = self.ec2.get_all_instances(filters=filters) 1107 return len(insts) == 0
1108
1109 - def attach_volumes_to_master(self):
1110 """ 1111 Attach each volume to the master node 1112 """ 1113 for vol in self.volumes: 1114 volume = self.volumes.get(vol) 1115 device = volume.get('device') 1116 vol_id = volume.get('volume_id') 1117 vol = self.ec2.get_volume(vol_id) 1118 if vol.attach_data.instance_id == self.master_node.id: 1119 log.info("Volume %s already attached to master...skipping" % \ 1120 vol.id) 1121 continue 1122 if vol.status != "available": 1123 log.error(('Volume %s not available...' + 1124 'please check and try again') % vol.id) 1125 continue 1126 log.info("Attaching volume %s to master node on %s ..." % (vol.id, 1127 device)) 1128 resp = vol.attach(self.master_node.id, device) 1129 log.debug("resp = %s" % resp) 1130 while True: 1131 vol.update() 1132 if vol.attachment_state() == 'attached': 1133 break 1134 time.sleep(5)
1135
1136 - def detach_volumes(self):
1137 """ 1138 Detach all volumes from all nodes 1139 """ 1140 for node in self.nodes: 1141 node.detach_external_volumes()
1142 1143 @print_timing('Restarting cluster')
1144 - def restart_cluster(self):
1145 """ 1146 Reboot all instances and reconfigure the cluster 1147 """ 1148 nodes = self.nodes 1149 if not nodes: 1150 raise exception.ClusterValidationError("No running nodes found") 1151 self.run_plugins(method_name="on_restart", reverse=True) 1152 log.info("Rebooting cluster...") 1153 for node in nodes: 1154 node.reboot() 1155 sleep = 20 1156 log.info("Sleeping for %d seconds..." % sleep) 1157 time.sleep(sleep) 1158 self._setup_cluster()
1159
1160 - def stop_cluster(self):
1161 """ 1162 Stop this cluster by detaching all volumes, stopping/terminating 1163 all instances, cancelling all spot requests (if any), and removing this 1164 cluster's security group. 1165 1166 If a node is a spot instance, it will be terminated. Spot 1167 instances can not be 'stopped', they must be terminated. 1168 """ 1169 self.run_plugins(method_name="on_shutdown", reverse=True) 1170 self.detach_volumes() 1171 for node in self.nodes: 1172 node.shutdown() 1173 for spot in self.spot_requests: 1174 if spot.state not in ['cancelled', 'closed']: 1175 log.info("Cancelling spot instance request: %s" % spot.id) 1176 spot.cancel() 1177 if self.spot_bid or not self.is_ebs_cluster(): 1178 log.info("Removing %s security group" % self._security_group) 1179 self.cluster_group.delete()
1180
1181 - def terminate_cluster(self):
1182 """ 1183 Stop this cluster by first detaching all volumes, shutting down all 1184 instances, cancelling all spot requests (if any), removing this 1185 cluster's placement group (if any), and removing this cluster's 1186 security group. 1187 """ 1188 self.run_plugins(method_name="on_shutdown", reverse=True) 1189 self.detach_volumes() 1190 for node in self.nodes: 1191 node.terminate() 1192 for spot in self.spot_requests: 1193 if spot.state not in ['cancelled', 'closed']: 1194 log.info("Cancelling spot instance request: %s" % spot.id) 1195 spot.cancel() 1196 log.info("Removing %s security group" % self._security_group) 1197 self.cluster_group.delete() 1198 pg = self.ec2.get_placement_group_or_none(self._security_group) 1199 if pg: 1200 s = self.get_spinner("Waiting for cluster to terminate...") 1201 while not self.is_cluster_terminated(): 1202 time.sleep(5) 1203 s.stop() 1204 log.info("Removing %s placement group" % pg.name) 1205 pg.delete()
1206
1207 - def start(self, create=True, create_only=False, validate=True, 1208 validate_only=False, validate_running=False):
1209 """ 1210 Handles creating and configuring a cluster. 1211 Validates, creates, and configures a cluster. 1212 Passing validate=False will ignore validate_only and validate_running 1213 keywords and is effectively the same as running _start 1214 """ 1215 if validate: 1216 retval = self._validate(validate_running=validate_running) 1217 if validate_only: 1218 return retval 1219 return self._start(create, create_only)
1220 1221 @print_timing("Starting cluster")
1222 - def _start(self, create=True, create_only=False):
1223 """ 1224 Start cluster from this cluster template's settings 1225 Handles creating and configuring a cluster 1226 Does not attempt to validate before running 1227 """ 1228 log.info("Starting cluster...") 1229 if create: 1230 self.create_cluster() 1231 else: 1232 for node in self.stopped_nodes: 1233 log.info("Starting stopped node: %s" % node.alias) 1234 node.start() 1235 if create_only: 1236 return 1237 self._setup_cluster() 1238 log.info(user_msgs.cluster_started_msg % { 1239 'master': self.master_node.dns_name, 1240 'user': self.cluster_user, 1241 'key': self.key_location, 1242 'tag': self.cluster_tag, 1243 }, extra=dict(__textwrap__=True, __raw__=True))
1244
1245 - def _setup_cluster(self):
1246 """ 1247 This method waits for all nodes to come up and then runs the default 1248 StarCluster setup routines followed by any additional plugin setup 1249 routines 1250 """ 1251 self.wait_for_cluster() 1252 log.info("The master node is %s" % self.master_node.dns_name) 1253 log.info("Setting up the cluster...") 1254 if self.volumes: 1255 self.attach_volumes_to_master() 1256 default_plugin = clustersetup.DefaultClusterSetup(self.disable_queue, 1257 self.disable_threads) 1258 default_plugin.run(self.nodes, self.master_node, self.cluster_user, 1259 self.cluster_shell, self.volumes) 1260 self.run_plugins()
1261
1262 - def run_plugins(self, plugins=None, method_name="run", node=None, 1263 reverse=False):
1264 """ 1265 Run all plugins specified in this Cluster object's self.plugins list 1266 Uses plugins list instead of self.plugins if specified. 1267 1268 plugins must be a tuple: the first element is the plugin's name, the 1269 second element is the plugin object (a subclass of ClusterSetup) 1270 """ 1271 plugs = plugins or self.plugins 1272 if reverse: 1273 plugs = plugs[:] 1274 plugs.reverse() 1275 for plug in plugs: 1276 name, plugin = plug 1277 self.run_plugin(plugin, name, method_name=method_name, node=node)
1278
    def run_plugin(self, plugin, name='', method_name='run', node=None):
        """
        Run a StarCluster plugin.

        plugin - an instance of the plugin's class
        name - a user-friendly label for the plugin
        method_name - the method to run within the plugin (default: "run")
        node - optional node to pass as first argument to plugin method (used
        for on_add_node/on_remove_node)
        """
        plugin_name = name or str(plugin)
        try:
            func = getattr(plugin, method_name, None)
            if not func:
                log.warn("Plugin %s has no %s method...skipping" %
                         (plugin_name, method_name))
                return
            args = [self.nodes, self.master_node, self.cluster_user,
                    self.cluster_shell, self.volumes]
            if node:
                args.insert(0, node)
            log.info("Running plugin %s" % plugin_name)
            func(*args)
        except NotImplementedError:
            log.debug("method %s not implemented by plugin %s" %
                      (method_name, plugin_name))
        except Exception, e:
            log.error("Error occurred while running plugin '%s':" %
                      plugin_name)
            if isinstance(e, exception.ThreadPoolException):
                e.print_excs()
                log.debug(e.format_excs())
            else:
                traceback.print_exc()
                log.debug(traceback.format_exc())

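    # Illustrative sketch (hypothetical plugin, not part of the module): a
    # plugin is a clustersetup.ClusterSetup subclass whose run() method
    # receives the node list, master node, cluster user, user shell, and
    # volumes dict (the parameter names below are illustrative):
    #
    #   from starcluster.clustersetup import ClusterSetup
    #
    #   class HostnameLogger(ClusterSetup):
    #       def run(self, nodes, master, user, user_shell, volumes):
    #           for node in nodes:
    #               print node.alias, node.dns_name
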
1314 - def is_running_valid(self):
1315 """ 1316 Checks whether the current running instances are compatible 1317 with this cluster template's settings 1318 """ 1319 try: 1320 self._validate_running_instances() 1321 return True 1322 except exception.ClusterValidationError, e: 1323 log.error(e.msg) 1324 return False
1325
1326 - def _validate(self, validate_running=False):
1327 """ 1328 Checks that all cluster template settings are valid. Raises 1329 a ClusterValidationError exception if not. Passing 1330 validate_running=True will also check that the existing instances 1331 properties match the configuration of this cluster template. 1332 """ 1333 log.info("Validating cluster template settings...") 1334 self._has_all_required_settings() 1335 self._validate_spot_bid() 1336 self._validate_cluster_size() 1337 self._validate_shell_setting() 1338 self._validate_permission_settings() 1339 self._validate_credentials() 1340 self._validate_keypair() 1341 self._validate_zone() 1342 self._validate_ebs_settings() 1343 self._validate_ebs_aws_settings() 1344 self._validate_image_settings() 1345 self._validate_instance_types() 1346 self._validate_cluster_compute() 1347 if validate_running: 1348 log.info("Validating existing instances...") 1349 try: 1350 self._validate_running_instances() 1351 except exception.ClusterValidationError: 1352 log.error('existing instances are not compatible with ' 1353 'cluster template settings:') 1354 raise 1355 log.info('Cluster template settings are valid') 1356 return True
1357
1358 - def is_valid(self):
1359 """ 1360 Returns True if all cluster template settings are valid 1361 """ 1362 try: 1363 self._validate() 1364 return True 1365 except exception.ClusterValidationError, e: 1366 log.error(e.msg) 1367 return False
1368
1369 - def _validate_spot_bid(self):
1370 if self.spot_bid is not None: 1371 if type(self.spot_bid) not in [int, float]: 1372 raise exception.ClusterValidationError( 1373 'spot_bid must be integer or float') 1374 if self.spot_bid <= 0: 1375 raise exception.ClusterValidationError( 1376 'spot_bid must be an integer or float > 0') 1377 return True
1378
1379 - def _validate_cluster_size(self):
1380 try: 1381 int(self.cluster_size) 1382 if self.cluster_size < 1: 1383 raise ValueError 1384 except (ValueError, TypeError): 1385 raise exception.ClusterValidationError( 1386 'cluster_size must be an integer >= 1') 1387 num_itypes = sum([i.get('size') for i in self.node_instance_types]) 1388 num_nodes = self.cluster_size - 1 1389 if num_itypes > num_nodes: 1390 raise exception.ClusterValidationError( 1391 ("total number of nodes specified in node_instance_type (%s)" + 1392 " must be <= cluster_size-1 (%s)") % (num_itypes, num_nodes)) 1393 return True
1394
1395 - def _validate_shell_setting(self):
1396 cluster_shell = self.cluster_shell 1397 if not self.__available_shells.get(cluster_shell): 1398 raise exception.ClusterValidationError( 1399 'Invalid user shell specified. Options are %s' % \ 1400 ' '.join(self.__available_shells.keys())) 1401 return True
1402
1403 - def _validate_image_settings(self):
1404 master_image_id = self.master_image_id 1405 node_image_id = self.node_image_id 1406 conn = self.ec2 1407 image = conn.get_image_or_none(node_image_id) 1408 if not image or image.id != node_image_id: 1409 raise exception.ClusterValidationError( 1410 'node_image_id %s does not exist' % node_image_id) 1411 if master_image_id: 1412 master_image = conn.get_image_or_none(master_image_id) 1413 if not master_image or master_image.id != master_image_id: 1414 raise exception.ClusterValidationError( 1415 'master_image_id %s does not exist' % master_image_id) 1416 return True
1417
1418 - def _validate_zone(self):
1419 availability_zone = self.availability_zone 1420 if availability_zone: 1421 zone = self.ec2.get_zone(availability_zone) 1422 if not zone: 1423 azone = self.availability_zone 1424 raise exception.ClusterValidationError( 1425 'availability_zone = %s does not exist' % azone) 1426 if zone.state != 'available': 1427 log.warn('The availability_zone = %s ' % zone + 1428 'is not available at this time') 1429 return True
1430
1431 - def __check_platform(self, image_id, instance_type):
1432 """ 1433 Validates whether an image_id (AMI) is compatible with a given 1434 instance_type. image_id_setting and instance_type_setting are the 1435 setting labels in the config file. 1436 """ 1437 image = self.ec2.get_image_or_none(image_id) 1438 if not image: 1439 raise exception.ClusterValidationError('Image %s does not exist' % 1440 image_id) 1441 image_platform = image.architecture 1442 image_is_hvm = (image.virtualization_type == "hvm") 1443 if image_is_hvm and not instance_type in static.CLUSTER_TYPES: 1444 cctypes_list = ', '.join(static.CLUSTER_TYPES) 1445 raise exception.ClusterValidationError( 1446 "Image '%s' is a Cluster Compute/GPU image (HVM) and cannot " 1447 "be used with instance type '%s'\nThe instance type " 1448 "for a Cluster Compute/GPU image (HVM) must be one of: %s" % \ 1449 (image_id, instance_type, cctypes_list)) 1450 instance_platforms = self.__instance_types[instance_type] 1451 if image_platform not in instance_platforms: 1452 error_msg = "Instance type %(instance_type)s is for an " + \ 1453 "%(instance_platform)s platform while " + \ 1454 "%(image_id)s is an %(image_platform)s platform" 1455 error_dict = {'instance_type': instance_type, 1456 'instance_platform': ', '.join(instance_platforms), 1457 'image_id': image_id, 1458 'image_platform': image_platform} 1459 raise exception.ClusterValidationError(error_msg % error_dict) 1460 return True
1461
1462 - def _validate_instance_types(self):
1463 master_image_id = self.master_image_id 1464 node_image_id = self.node_image_id 1465 master_instance_type = self.master_instance_type 1466 node_instance_type = self.node_instance_type 1467 instance_types = self.__instance_types 1468 instance_type_list = ', '.join(instance_types.keys()) 1469 if not node_instance_type in instance_types: 1470 raise exception.ClusterValidationError( 1471 ("You specified an invalid node_instance_type %s \n" + 1472 "Possible options are:\n%s") % \ 1473 (node_instance_type, instance_type_list)) 1474 elif master_instance_type: 1475 if not master_instance_type in instance_types: 1476 raise exception.ClusterValidationError( 1477 ("You specified an invalid master_instance_type %s\n" + \ 1478 "Possible options are:\n%s") % \ 1479 (master_instance_type, instance_type_list)) 1480 try: 1481 self.__check_platform(node_image_id, node_instance_type) 1482 except exception.ClusterValidationError, e: 1483 raise exception.ClusterValidationError( 1484 'Incompatible node_image_id and node_instance_type:\n' + e.msg) 1485 if master_image_id and not master_instance_type: 1486 try: 1487 self.__check_platform(master_image_id, node_instance_type) 1488 except exception.ClusterValidationError, e: 1489 raise exception.ClusterValidationError( 1490 'Incompatible master_image_id and ' + 1491 'node_instance_type\n' + e.msg) 1492 elif master_image_id and master_instance_type: 1493 try: 1494 self.__check_platform(master_image_id, master_instance_type) 1495 except exception.ClusterValidationError, e: 1496 raise exception.ClusterValidationError( 1497 'Incompatible master_image_id and ' + 1498 'master_instance_type\n' + e.msg) 1499 elif master_instance_type and not master_image_id: 1500 try: 1501 self.__check_platform(node_image_id, master_instance_type) 1502 except exception.ClusterValidationError, e: 1503 raise exception.ClusterValidationError( 1504 'Incompatible node_image_id and ' + 1505 'master_instance_type\n' + e.msg) 1506 for itype in self.node_instance_types: 1507 type = itype.get('type') 1508 img = itype.get('image') or node_image_id 1509 if not type in instance_types: 1510 raise exception.ClusterValidationError( 1511 ("You specified an invalid instance type %s \n" + 1512 "Possible options are:\n%s") % (type, instance_type_list)) 1513 try: 1514 self.__check_platform(img, type) 1515 except exception.ClusterValidationError, e: 1516 raise exception.ClusterValidationError( 1517 "Invalid settings for node_instance_type %s: %s" % 1518 (type, e.msg)) 1519 return True
1520
1521 - def _validate_cluster_compute(self):
1522 lmap = self._get_launch_map() 1523 for (type, image) in lmap: 1524 if type in static.CLUSTER_TYPES: 1525 img = self.ec2.get_image(image) 1526 if img.virtualization_type != 'hvm': 1527 raise exception.ClusterValidationError(( 1528 'Cluster Compute/GPU instance type %s ' + 1529 'can only be used with HVM images.\n' + 1530 'Image %s is NOT an HVM image.') % (type, image))
1531
1532 - def _validate_ebs_aws_settings(self):
1533 """ 1534 Verify EBS volumes exists on Amazon and that each volume's zone matches 1535 this cluster's zone setting. Requires AWS credentials. 1536 """ 1537 for vol in self.volumes: 1538 v = self.volumes.get(vol) 1539 vol_id = v.get('volume_id') 1540 vol = self.ec2.get_volume(vol_id) 1541 if vol.status != 'available': 1542 if self.master_node: 1543 if vol.attach_data.instance_id == self.master_node.id: 1544 continue 1545 msg = "volume %s is not available (status: %s)" % (vol_id, 1546 vol.status) 1547 raise exception.ClusterValidationError(msg)
1548
    def _validate_permission_settings(self):
        permissions = self.permissions
        for perm in permissions:
            permission = permissions.get(perm)
            protocol = permission.get('ip_protocol')
            if protocol not in self.__protocols:
                raise exception.InvalidProtocol(protocol)
            from_port = permission.get('from_port')
            to_port = permission.get('to_port')
            try:
                from_port = int(from_port)
                to_port = int(to_port)
            except ValueError:
                raise exception.InvalidPortRange(
                    from_port, to_port, reason="integer range required")
            if from_port < 0 or to_port < 0:
                raise exception.InvalidPortRange(
                    from_port, to_port,
                    reason="from/to must be positive integers")
            if from_port > to_port:
                raise exception.InvalidPortRange(
                    from_port, to_port,
                    reason="'from_port' must be <= 'to_port'")
            cidr_ip = permission.get('cidr_ip')
            if not iptools.validate_cidr(cidr_ip):
                raise exception.InvalidCIDRSpecified(cidr_ip)

1576 - def _validate_ebs_settings(self):
1577 """ 1578 Check EBS vols for missing/duplicate DEVICE/PARTITION/MOUNT_PATHs 1579 and validate these settings. Does not require AWS credentials. 1580 """ 1581 vol_ids = [] 1582 devices = [] 1583 mount_paths = [] 1584 for vol in self.volumes: 1585 vol_name = vol 1586 vol = self.volumes.get(vol) 1587 vol_id = vol.get('volume_id') 1588 device = vol.get('device') 1589 partition = vol.get('partition') 1590 mount_path = vol.get("mount_path") 1591 mount_paths.append(mount_path) 1592 devices.append(device) 1593 vol_ids.append(vol_id) 1594 if not device: 1595 raise exception.ClusterValidationError( 1596 'Missing DEVICE setting for volume %s' % vol_name) 1597 if not utils.is_valid_device(device): 1598 raise exception.ClusterValidationError( 1599 "Invalid DEVICE value for volume %s" % vol_name) 1600 if partition: 1601 if not utils.is_valid_partition(partition): 1602 raise exception.ClusterValidationError( 1603 "Invalid PARTITION value for volume %s" % vol_name) 1604 if not partition.startswith(device): 1605 raise exception.ClusterValidationError( 1606 "Volume PARTITION must start with %s" % device) 1607 if not mount_path: 1608 raise exception.ClusterValidationError( 1609 'Missing MOUNT_PATH setting for volume %s' % vol_name) 1610 if not mount_path.startswith('/'): 1611 raise exception.ClusterValidationError( 1612 "MOUNT_PATH for volume %s should start with /" % vol_name) 1613 for vol_id in vol_ids: 1614 if vol_ids.count(vol_id) > 1: 1615 raise exception.ClusterValidationError( 1616 ("Multiple configurations for volume %s specified. " + \ 1617 "Please choose one") % vol_id) 1618 for dev in devices: 1619 if devices.count(dev) > 1: 1620 raise exception.ClusterValidationError( 1621 "Can't attach more than one volume on device %s" % dev) 1622 for path in mount_paths: 1623 if mount_paths.count(path) > 1: 1624 raise exception.ClusterValidationError( 1625 "Can't mount more than one volume on %s" % path) 1626 return True
1627
1628 - def _has_all_required_settings(self):
1629 has_all_required = True 1630 for opt in self.__cluster_settings: 1631 requirements = self.__cluster_settings[opt] 1632 name = opt 1633 required = requirements[1] 1634 if required and self.get(name.lower()) is None: 1635 log.warn('Missing required setting %s' % name) 1636 has_all_required = False 1637 return has_all_required
1638
1639 - def _validate_credentials(self):
1640 if not self.ec2.is_valid_conn(): 1641 raise exception.ClusterValidationError( 1642 'Invalid AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY combination.') 1643 return True
1644
1645 - def _validate_keypair(self):
1646 key_location = self.key_location 1647 if not key_location: 1648 raise exception.ClusterValidationError( 1649 "no key_location specified for key '%s'" % self.keyname) 1650 if not os.path.exists(key_location): 1651 raise exception.ClusterValidationError( 1652 'key_location=%s does not exist.' % \ 1653 key_location) 1654 elif not os.path.isfile(key_location): 1655 raise exception.ClusterValidationError( 1656 'key_location=%s is not a file.' % \ 1657 key_location) 1658 keyname = self.keyname 1659 keypair = self.ec2.get_keypair_or_none(keyname) 1660 if not keypair: 1661 raise exception.ClusterValidationError( 1662 'Account does not contain a key with keyname = %s. ' % keyname) 1663 if self.zone: 1664 z = self.ec2.get_zone(self.zone) 1665 if keypair.region != z.region: 1666 raise exception.ClusterValidationError( 1667 'Keypair %s not in availability zone region %s' % \ 1668 (keyname, z.region)) 1669 return True
1670
1671 - def ssh_to_master(self, user='root'):
1672 self.ssh_to_node('master', user=user)
1673
1674 - def ssh_to_node(self, alias, user='root'):
1675 node = self.get_node_by_alias(alias) 1676 node = node or self.get_node_by_dns_name(alias) 1677 node = node or self.get_node_by_id(alias) 1678 if not node: 1679 raise exception.InstanceDoesNotExist(alias, label='node') 1680 node.shell(user=user)
1681 1682 if __name__ == "__main__": 1683 from starcluster.config import StarClusterConfig 1684 cfg = StarClusterConfig().load() 1685 sc = cfg.get_cluster_template('smallcluster', 'mynewcluster') 1686 if sc.is_valid(): 1687 sc.start(create=True) 1688