
Source Code for Module starcluster.cluster

#!/usr/bin/env python
import os
import re
import time
import string
import platform
import pprint
import inspect
import cPickle

from starcluster import ssh
from starcluster import awsutils
from starcluster import clustersetup
from starcluster import static
from starcluster import exception
from starcluster import utils
from starcluster.utils import print_timing
from starcluster.spinner import Spinner
from starcluster.logger import log, INFO_NO_NEWLINE
from starcluster.node import Node

def get_cluster(cluster_name, cfg):
    """Factory for Cluster class"""
    try:
        ec2 = cfg.get_easy_ec2()
        cluster = ec2.get_security_group(_get_cluster_name(cluster_name))
        kwargs = {}
        kwargs.update(cfg.aws)
        try:
            cluster_key = cluster.instances()[0].key_name
            key = cfg.get_key(cluster_key)
        except IndexError:
            key = dict(keyname=None, key_location=None)
        kwargs.update(key)
        kwargs.update({'cluster_tag': cluster_name})
        return Cluster(**kwargs)
    except exception.SecurityGroupDoesNotExist,e:
        raise exception.ClusterDoesNotExist(cluster_name)

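The factory above rebuilds a Cluster object from a running cluster's security group and the local config. A minimal usage sketch, mirroring the __main__ block at the bottom of this module (the tag 'mycluster' is hypothetical):

    from starcluster.config import StarClusterConfig
    from starcluster import cluster

    cfg = StarClusterConfig()
    cfg.load()
    # raises ClusterDoesNotExist if no @sc-mycluster security group is found
    cl = cluster.get_cluster('mycluster', cfg)
    print cl.master_node.dns_name
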
def get_cluster_or_none(cluster_name, cfg):
    """
    Same as get_cluster only returns None instead of throwing an exception
    if the cluster is not found
    """
    try:
        return get_cluster(cluster_name, cfg)
    except Exception,e:
        pass

def cluster_exists(tag_name, cfg):
    return get_cluster_or_none(tag_name, cfg) is not None

def ssh_to_master(cluster_name, cfg, user='root'):
    cluster = get_cluster(cluster_name, cfg)
    master = cluster.master_node
    key = cfg.get_key(master.key_name)
    os.system('ssh -i %s %s@%s' % (key.key_location, user,
                                   master.dns_name))

def _get_node_number(alias):
    """
    Maps aliases master, node001, etc to 0,1,etc

    Returns an integer (>=0) representing the node "number" if successful,
    and returns None otherwise
    """
    if alias == "master":
        return 0
    else:
        pattern = re.compile(r"node([0-9][0-9][0-9])")
        if pattern.match(alias) and len(alias) == 7:
            return int(pattern.match(alias).groups()[0])

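_get_node_number only recognizes the exact aliases StarCluster assigns ('master' and zero-padded 'nodeNNN'); anything else falls through and returns None, which ssh_to_cluster_node below relies on. For example:

    _get_node_number('master')      # -> 0
    _get_node_number('node001')     # -> 1
    _get_node_number('node12')      # -> None (not zero-padded to 3 digits)
    _get_node_number('i-1234abcd')  # -> None
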
def ssh_to_cluster_node(cluster_name, node_id, cfg, user='root'):
    cluster = get_cluster(cluster_name, cfg)
    node_num = _get_node_number(node_id)
    if node_num is None:
        node_num = node_id
    node = None
    try:
        node = cluster.nodes[int(node_num)]
    except:
        if node_id.startswith('i-') and len(node_id) == 10:
            node = cluster.get_node_by_id(node_id)
        else:
            node = cluster.get_node_by_dns_name(node_id)
    if node:
        key = cfg.get_key(node.key_name)
        os.system('ssh -i %s %s@%s' % (key.key_location, user,
                                       node.dns_name))
    else:
        log.error("node '%s' does not exist" % node_id)

def _get_cluster_name(cluster_name):
    if not cluster_name.startswith(static.SECURITY_GROUP_PREFIX):
        cluster_name = static.SECURITY_GROUP_TEMPLATE % cluster_name
    return cluster_name

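Cluster tags map to EC2 security group names via static.SECURITY_GROUP_TEMPLATE, so _get_cluster_name and get_tag_from_sg (defined below) are effectively inverses. Assuming the defaults implied by the get_tag_from_sg docstring (prefix '@sc', template '@sc-%s'):

    _get_cluster_name('mycluster')      # -> '@sc-mycluster'
    _get_cluster_name('@sc-mycluster')  # -> '@sc-mycluster' (already prefixed)
    get_tag_from_sg('@sc-mycluster')    # -> 'mycluster'
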
def stop_cluster(cluster_name, cfg):
    ec2 = cfg.get_easy_ec2()
    cname = _get_cluster_name(cluster_name)
    try:
        cluster = ec2.get_security_group(cname)
        for node in cluster.instances():
            log.info('Shutting down %s' % node.id)
            node.stop()
        log.info('Removing cluster security group %s' % cluster.name)
        cluster.delete()
    except exception.SecurityGroupDoesNotExist,e:
        raise exception.ClusterDoesNotExist(cluster_name)

def get_cluster_security_groups(cfg):
    ec2 = cfg.get_easy_ec2()
    sgs = ec2.get_security_groups()
    starcluster_groups = []
    for sg in sgs:
        is_starcluster = sg.name.startswith(static.SECURITY_GROUP_PREFIX)
        if is_starcluster and sg.name != static.MASTER_GROUP:
            starcluster_groups.append(sg)
    return starcluster_groups

def get_tag_from_sg(sg):
    """
    Returns the cluster tag name from a security group name that starts with
    static.SECURITY_GROUP_PREFIX

    Example:
        sg = '@sc-mycluster'
        print get_tag_from_sg(sg)
        mycluster
    """
    regex = re.compile(static.SECURITY_GROUP_PREFIX + '-(.*)')
    match = regex.match(sg)
    if match:
        return match.groups()[0]

def list_clusters(cfg):
    starcluster_groups = get_cluster_security_groups(cfg)
    if starcluster_groups:
        for scg in starcluster_groups:
            print
            tag = get_tag_from_sg(scg.name)
            header = '%s (security group: %s)' % (tag, scg.name)
            print '-'*len(header)
            print header
            print '-'*len(header)
            cl = get_cluster(tag, cfg)
            master = cl.master_node
            print 'Launch time: %s' % master.launch_time
            print 'Zone: %s' % master.placement
            print 'Keypair: %s' % master.key_name
            if master.block_device_mapping:
                print 'EBS volumes:'
                devices = master.block_device_mapping
                for dev in devices:
                    d = devices.get(dev)
                    vol_id = d.volume_id
                    status = d.status
                    print '    %s on master:%s (status: %s)' % (vol_id, dev,
                                                                status)
            if cl.nodes:
                print 'Cluster nodes:'
                for node in cl.nodes:
                    spot = node.spot_id or ''
                    if spot:
                        spot = '(spot %s)' % spot
                    print "    %7s %s %s %s %s" % (node.alias, node.state,
                                                   node.id, node.dns_name,
                                                   spot)
    else:
        log.info("No clusters found...")

def run_plugin(plugin_name, cluster_tag, cfg):
    cl = get_cluster(cluster_tag, cfg)
    cl.load_receipt()
    plug = cfg.get_plugin(plugin_name)
    # load_plugins expects a list of plugin config dicts and returns a list
    # of (name, instance) tuples
    plugins = cl.load_plugins([plug])
    master = cl.master_node
    for name, p in plugins:
        p.run(cl.nodes, master, cl.cluster_user, cl.cluster_shell,
              cl.volumes)

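run_plugin re-applies a single configured plugin to a cluster that is already running, using the receipt stored on the master node to restore the cluster's settings first. A sketch of invoking it from Python (the plugin and cluster names are hypothetical entries from the config file):

    from starcluster.config import StarClusterConfig
    from starcluster import cluster

    cfg = StarClusterConfig()
    cfg.load()
    cluster.run_plugin('myplugin', 'mycluster', cfg)
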
class Cluster(object):
    def __init__(self,
                 aws_access_key_id=None,
                 aws_secret_access_key=None,
                 aws_port=None,
                 aws_is_secure=True,
                 aws_ec2_path='/',
                 aws_s3_path='/',
                 aws_region_name=None,
                 aws_region_host=None,
                 spot_bid=None,
                 cluster_tag=None,
                 cluster_description=None,
                 cluster_size=None,
                 cluster_user=None,
                 cluster_shell=None,
                 master_image_id=None,
                 master_instance_type=None,
                 node_image_id=None,
                 node_instance_type=None,
                 availability_zone=None,
                 keyname=None,
                 key_location=None,
                 volumes=[],
                 plugins=[],
                 **kwargs):

        now = time.strftime("%Y%m%d%H%M")

        self.ec2 = awsutils.EasyEC2(
            aws_access_key_id, aws_secret_access_key,
            aws_port=aws_port, aws_is_secure=aws_is_secure,
            aws_ec2_path=aws_ec2_path, aws_s3_path=aws_s3_path,
            aws_region_name=aws_region_name,
            aws_region_host=aws_region_host,
        )
        self.spot_bid = spot_bid
        self.cluster_tag = cluster_tag
        self.cluster_description = cluster_description
        if self.cluster_tag is None:
            self.cluster_tag = now
        if cluster_description is None:
            self.cluster_description = "Cluster created at %s" % now
        self.cluster_size = cluster_size
        self.cluster_user = cluster_user
        self.cluster_shell = cluster_shell
        self.master_image_id = master_image_id
        self.master_instance_type = master_instance_type
        self.node_image_id = node_image_id
        self.node_instance_type = node_instance_type
        self.availability_zone = availability_zone
        self.keyname = keyname
        self.key_location = key_location
        self.volumes = self.load_volumes(volumes)
        self.plugins = plugins

        self.__instance_types = static.INSTANCE_TYPES
        self.__cluster_settings = static.CLUSTER_SETTINGS
        self.__available_shells = static.AVAILABLE_SHELLS
        self._master_reservation = None
        self._node_reservation = None
        self._nodes = None
        self._master = None
        self._plugins = self.load_plugins(plugins)
        self._zone = None

    @property
    def zone(self):
        """
        If volumes are specified, this property determines the common
        availability zone of those volumes. If an availability zone is
        explicitly specified in the config and does not match the volumes'
        common zone, an exception is raised. If the volumes are not all in
        the same availability zone, an exception is raised. If no volumes
        are specified, the user-specified availability zone is returned if
        it exists.
        """
        if not self._zone:
            zone = None
            if self.availability_zone:
                zone = self.ec2.get_zone(self.availability_zone).name
            common_zone = None
            for volume in self.volumes:
                volid = self.volumes.get(volume).get('volume_id')
                vol = self.ec2.get_volume(volid)
                if not common_zone:
                    common_zone = vol.zone
                elif vol.zone != common_zone:
                    vols = [self.volumes.get(v).get('volume_id')
                            for v in self.volumes]
                    raise exception.VolumesZoneError(vols)
            if common_zone and zone and zone != common_zone:
                raise exception.InvalidZone(zone, common_zone)
            if not zone and common_zone:
                zone = common_zone
            self._zone = zone
        return self._zone

    def load_volumes(self, vols):
        """
        Iterate through vols and set device/partition settings automatically
        if not specified.

        This method assigns the first volume to /dev/sdz, second to /dev/sdy,
        etc for all volumes that do not include a device/partition setting
        """
        devices = ['/dev/sd%s' % s for s in string.lowercase]
        for volname in vols:
            vol = vols.get(volname)
            dev = vol.get('device')
            if dev in devices:
                # rm user-defined devices from the list of auto-assigned devices
                devices.remove(dev)
        volumes = {}
        for volname in vols:
            vol = vols.get(volname)
            device = vol.get('device')
            if not device:
                device = devices.pop()
            if not utils.is_valid_device(device):
                raise exception.InvalidDevice(device)
            v = volumes[volname] = utils.AttributeDict()
            v.update(vol)
            v['device'] = device
            part = vol.get('partition', 1)
            partition = device + str(part)
            if not utils.is_valid_partition(partition):
                raise exception.InvalidPartition(part)
            v['partition'] = partition
        return volumes

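Volumes that omit a DEVICE are auto-assigned devices from the end of the alphabet, and PARTITION defaults to 1. Given a hypothetical volumes dict from the config:

    vols = {
        'data': {'volume_id': 'vol-11111111', 'mount_path': '/data'},
        'home': {'volume_id': 'vol-22222222', 'mount_path': '/home',
                 'device': '/dev/sdj'},
    }
    # load_volumes() would leave 'home' on /dev/sdj (user-specified devices
    # are removed from the auto-assignment pool) and give 'data' the first
    # free auto-assigned device, /dev/sdz, with partition /dev/sdz1.
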
    def load_plugins(self, plugins):
        plugs = []
        for plugin in plugins:
            setup_class = plugin.get('setup_class')
            plugin_name = plugin.get('__name__')
            mod_name = '.'.join(setup_class.split('.')[:-1])
            class_name = setup_class.split('.')[-1]
            try:
                mod = __import__(mod_name, globals(), locals(), [class_name])
            except SyntaxError,e:
                raise exception.PluginSyntaxError(
                    "Plugin %s (%s) contains a syntax error at line %s" % \
                    (plugin_name, e.filename, e.lineno)
                )
            except ImportError,e:
                raise exception.PluginLoadError(
                    "Failed to import plugin %s: %s" % (plugin_name, e.message)
                )
            klass = getattr(mod, class_name, None)
            if klass:
                if issubclass(klass, clustersetup.ClusterSetup):
                    argspec = inspect.getargspec(klass.__init__)
                    args = argspec.args[1:]
                    nargs = len(args)
                    ndefaults = 0
                    if argspec.defaults:
                        ndefaults = len(argspec.defaults)
                    nrequired = nargs - ndefaults
                    config_args = []
                    for arg in argspec.args:
                        if arg in plugin:
                            config_args.append(plugin.get(arg))
                    log.debug("config_args = %s" % config_args)
                    log.debug("args = %s" % argspec.args)
                    if nrequired != len(config_args):
                        raise exception.PluginError(
                            "Not enough settings provided for plugin %s" % \
                            plugin_name
                        )
                    plugs.append((plugin_name, klass(*config_args)))
                else:
                    raise exception.PluginError(
                        "Plugin %s must be a subclass of "
                        "starcluster.clustersetup.ClusterSetup" % setup_class)
            else:
                raise exception.PluginError(
                    'Plugin class %s does not exist' % setup_class
                )
        return plugs

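load_plugins receives the plugin dictionaries parsed from the config file; each one carries a setup_class dotted path plus any constructor arguments that the plugin's __init__ requires. A hypothetical plugin dict and what load_plugins does with it:

    plugin = {
        '__name__': 'myplugin',                          # hypothetical name
        'setup_class': 'mypackage.myplugin.MySetup',     # hypothetical class
        'my_arg': 'some-value',   # matched against MySetup.__init__ args
    }
    # load_plugins([plugin]) imports mypackage.myplugin, checks that MySetup
    # subclasses clustersetup.ClusterSetup, pulls 'my_arg' out of the dict to
    # satisfy MySetup.__init__(self, my_arg), and returns
    # [('myplugin', MySetup('some-value'))].
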
    def update(self, kwargs):
        for key in kwargs.keys():
            if hasattr(self, key):
                self.__dict__[key] = kwargs[key]

    def _validate_running_instances(self):
        """
        Validate existing instances against this template's settings
        """
        self._validate_instance_types()
        num_running = len(self.nodes)
        if num_running != self.cluster_size:
            raise exception.ClusterValidationError(
                "Number of pending/running instances (%s) != %s" % \
                (num_running, self.cluster_size))
        mtype = self.master_node.instance_type
        mastertype = self.master_instance_type or self.node_instance_type
        if mtype != mastertype:
            raise exception.ClusterValidationError(
                "The running master node's instance type (%s) != %s" % \
                (mtype, mastertype))
        masterimage = self.master_image_id or self.node_image_id
        mimage = self.master_node.image_id
        if mimage != masterimage:
            raise exception.ClusterValidationError(
                "The running master node's image id (%s) != %s" % \
                (mimage, masterimage))
        mkey = self.master_node.key_name
        if mkey != self.keyname:
            raise exception.ClusterValidationError(
                "The running master's keypair (%s) != %s" % \
                (mkey, self.keyname))
        try:
            nodes = self.nodes[1:self.cluster_size]
        except IndexError,e:
            raise exception.ClusterValidationError(
                "Cluster has no running instances")
        mazone = self.master_node.placement
        for n in nodes:
            ntype = n.instance_type
            if ntype != self.node_instance_type:
                raise exception.ClusterValidationError(
                    "Running node's instance type (%s) != %s" % \
                    (ntype, self.node_instance_type))
            nimage = n.image_id
            if nimage != self.node_image_id:
                raise exception.ClusterValidationError(
                    "Running node's image id (%s) != %s" % \
                    (nimage, self.node_image_id))
            if n.key_name != self.keyname:
                raise exception.ClusterValidationError(
                    "Running node's key_name (%s) != %s" % \
                    (n.key_name, self.keyname))
            nazone = n.placement
            if mazone != nazone:
                raise exception.ClusterValidationError(
                    "Running master zone (%s) does not match node zone (%s)" % \
                    (mazone, nazone))
        # reset zone
        self._zone = None
        if self.zone and self.zone != mazone:
            raise exception.ClusterValidationError(
                "Running cluster's availability_zone (%s) != %s" % \
                (mazone, self.zone))

    def get(self, name):
        return self.__dict__.get(name)

    def __str__(self):
        cfg = {}
        for key in self.__dict__.keys():
            if not key.startswith('_'):
                cfg[key] = getattr(self, key)
        return pprint.pformat(cfg)

    def load_receipt(self):
        """
        Fetch the StarCluster receipt file from the master node and use it to
        populate this object's attributes. This is used to restore the state
        of this object's settings as they were at the time of creating the
        cluster.
        """
        try:
            f = self.master_node.ssh.remote_file(
                static.STARCLUSTER_RECEIPT_FILE, 'r')
            cfg = cPickle.load(f)
            f.close()
            for key in cfg:
                setattr(self, key, cfg.get(key))
            #self._plugins = self.load_plugins(self.plugins)
            return True
        except IOError,e:
            raise exception.ClusterReceiptError(
                'cluster receipt does not exist')
        except Exception,e:
            raise exception.ClusterReceiptError(
                'failed to load cluster receipt')

    def create_receipt(self):
        """
        Create a 'receipt' file on the master node that contains this Cluster
        object's attributes. This receipt is useful for loading the settings
        used to create the cluster at a later time using load_receipt().
        """
        try:
            cfg = {}
            for key in self.__dict__.keys():
                if not key.startswith('_'):
                    val = getattr(self, key)
                    if type(val) in [str, bool, int, float, list, dict]:
                        cfg[key] = val
                    elif type(val) is utils.AttributeDict:
                        cfg[key] = dict(val)
            self.master_node.ssh.execute('mkdir -p %s' % \
                                         static.STARCLUSTER_RECEIPT_DIR)
            f = self.master_node.ssh.remote_file(
                static.STARCLUSTER_RECEIPT_FILE)
            cPickle.dump(cfg, f)
            f.close()
        except Exception,e:
            print e
            raise exception.ClusterReceiptError(
                'failed to create cluster receipt')
        return True
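Together, create_receipt and load_receipt form a simple persistence mechanism: the non-private, picklable attributes are dumped to static.STARCLUSTER_RECEIPT_FILE on the master node, and a later Cluster object (e.g. one built by get_cluster) can restore them. A sketch with a hypothetical tag, with cfg loaded as in the __main__ block below:

    cl = get_cluster('mycluster', cfg)   # hypothetical tag
    cl.load_receipt()                    # restores cluster_user, volumes, etc.
    print cl.cluster_user, cl.cluster_shell
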

    @property
    def _security_group(self):
        return static.SECURITY_GROUP_TEMPLATE % self.cluster_tag

    @property
    def master_group(self):
        # NOTE: the body of this property was elided in this listing; the
        # call below is a reconstruction by analogy with cluster_group and
        # may not match the original exactly
        sg = self.ec2.get_or_create_group(static.MASTER_GROUP,
                                          static.MASTER_GROUP_DESCRIPTION)
        return sg

    @property
    def cluster_group(self):
        sg = self.ec2.get_or_create_group(self._security_group,
                                          self.cluster_description,
                                          auth_group_traffic=True)
        return sg

    @property
    def master_node(self):
        if not self._master:
            # TODO: do this with reservation group info instead
            mgroup_instances = self.master_group.instances()
            cgroup_instances = [node.id for node in
                                self.cluster_group.instances()]
            for node in mgroup_instances:
                if node.id in cgroup_instances and \
                   node.state in ['pending', 'running']:
                    self._master = Node(node, self.key_location, 'master')
                    break
        return self._master

    @property
    def nodes(self):
        if not self._nodes:
            nodes = self.cluster_group.instances()
            self._nodes = []
            master = self.master_node
            nodeid = 1
            for node in nodes:
                if node.state not in ['pending', 'running']:
                    continue
                if node.id == master.id:
                    self._nodes.insert(0, master)
                    continue
                self._nodes.append(Node(node, self.key_location,
                                        'node%.3d' % nodeid))
                nodeid += 1
        else:
            for node in self._nodes:
                log.debug('refreshing instance %s' % node.id)
                node.update()
        return self._nodes

    def get_node_by_dns_name(self, dns_name):
        nodes = self.nodes
        for node in nodes:
            if node.dns_name == dns_name:
                return node

    def get_node_by_id(self, instance_id):
        nodes = self.nodes
        for node in nodes:
            if node.id == instance_id:
                return node

    @property
    def running_nodes(self):
        nodes = []
        for node in self.nodes:
            if node.state == 'running':
                nodes.append(node)
        return nodes

    def run_instances(self, price=None, image_id=None,
                      instance_type='m1.small',
                      min_count=1, max_count=1, count=1, key_name=None,
                      security_groups=None, launch_group=None,
                      availability_zone_group=None, placement=None):
        conn = self.ec2
        if price:
            return conn.request_spot_instances(
                price, image_id,
                instance_type=instance_type,
                count=count,
                launch_group=launch_group,
                key_name=key_name,
                security_groups=security_groups,
                availability_zone_group=availability_zone_group,
                placement=placement)
        else:
            return conn.run_instances(image_id, instance_type=instance_type,
                                      min_count=min_count,
                                      max_count=max_count,
                                      key_name=key_name,
                                      security_groups=security_groups,
                                      placement=placement)

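run_instances is the single entry point for both purchase models: when a price is given it issues a spot request, otherwise it does a flat-rate run; create_cluster below relies on this to honor spot_bid. Illustrative calls on a Cluster instance cl (the AMI id, key name, and group names are hypothetical):

    # spot request: a price is given, so request_spot_instances is used
    cl.run_instances(0.10, image_id='ami-12345678',
                     instance_type='m1.small', count=2,
                     key_name='mykey', security_groups=['@sc-mycluster'],
                     launch_group='@sc-mycluster',
                     availability_zone_group='@sc-mycluster')

    # on-demand: no price, so conn.run_instances is used
    cl.run_instances(None, image_id='ami-12345678',
                     instance_type='m1.small', min_count=2, max_count=2,
                     key_name='mykey', security_groups=['@sc-mycluster'])
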
    def create_cluster(self):
        log.info("Launching a %d-node cluster..." % self.cluster_size)
        if self.master_image_id is None:
            self.master_image_id = self.node_image_id
        if self.master_instance_type is None:
            self.master_instance_type = self.node_instance_type
        log.info("Launching master node...")
        log.info("Master AMI: %s" % self.master_image_id)
        master_sg = self.master_group.name
        cluster_sg = self.cluster_group.name
        zone = self.zone
        master_response = self.run_instances(
            self.spot_bid,
            image_id=self.master_image_id,
            instance_type=self.master_instance_type,
            min_count=1, max_count=1, count=1,
            key_name=self.keyname,
            security_groups=[master_sg, cluster_sg],
            availability_zone_group=cluster_sg,
            launch_group=cluster_sg,
            placement=zone)
        print master_response
        # Make sure nodes are in same zone as master
        if self.spot_bid:
            launch_spec = master_response[0].launch_specification
            zone = launch_spec.placement
        else:
            zone = master_response.instances[0].placement
        if self.cluster_size > 1:
            log.info("Launching worker nodes...")
            log.info("Node AMI: %s" % self.node_image_id)
            instances_response = self.run_instances(
                self.spot_bid,
                image_id=self.node_image_id,
                instance_type=self.node_instance_type,
                min_count=max((self.cluster_size - 1) / 2, 1),
                max_count=max(self.cluster_size - 1, 1),
                count=max(self.cluster_size - 1, 1),
                key_name=self.keyname,
                security_groups=[cluster_sg],
                availability_zone_group=cluster_sg,
                launch_group=cluster_sg,
                placement=zone)
            print instances_response

    def is_cluster_up(self):
        """
        Check whether there are cluster_size nodes running,
        that ssh (port 22) is up on all nodes, and that each node
        has an internal ip address associated with it
        """
        nodes = self.running_nodes
        if len(nodes) == self.cluster_size:
            for node in nodes:
                if not node.is_up():
                    return False
            return True
        else:
            return False

    def attach_volumes_to_master(self):
        for vol in self.volumes:
            volume = self.volumes.get(vol)
            device = volume.get('device')
            vol_id = volume.get('volume_id')
            vol = self.ec2.get_volume(vol_id)
            log.info("Attaching volume %s to master node on %s ..." %
                     (vol.id, device))
            if vol.status != "available":
                log.error('Volume %s not available...please check and try again'
                          % vol.id)
                continue
            resp = vol.attach(self.master_node.id, device)
            log.debug("resp = %s" % resp)
            while True:
                vol.update()
                if vol.attachment_state() == 'attached':
                    break
                time.sleep(5)

    def detach_volumes(self):
        for vol in self.volumes:
            vol_id = self.volumes.get(vol).get('volume_id')
            vol = self.ec2.get_volume(vol_id)
            log.info("Detaching volume %s from master" % vol.id)
            vol.detach()

    def stop_cluster(self):
        resp = raw_input(">>> Shutdown cluster ? (yes/no) ")
        if resp == 'yes':
            if self.volumes:
                self.detach_volumes()
            for node in self.running_nodes:
                log.info("Shutting down instance: %s " % node.id)
                node.stop()
            log.info("Removing %s security group" % self._security_group)
            self.cluster_group.delete()
        else:
            log.info("Exiting without shutting down instances....")

    @print_timing
    def start(self, create=True):
        log.info("Starting cluster...")
        if create:
            self.create_cluster()
        s = Spinner()
        log.log(INFO_NO_NEWLINE, "Waiting for cluster to start...")
        s.start()
        while not self.is_cluster_up():
            time.sleep(60)
        s.stop()

        log.info("The master node is %s" % self.master_node.dns_name)

        if self.volumes:
            self.attach_volumes_to_master()

        log.info("Setting up the cluster...")
        default_setup = clustersetup.DefaultClusterSetup().run(
            self.nodes, self.master_node,
            self.cluster_user, self.cluster_shell,
            self.volumes
        )
        self.create_receipt()
        for plugin in self._plugins:
            try:
                plugin_name = plugin[0]
                plug = plugin[1]
                log.info("Running plugin %s" % plugin_name)
                plug.run(self.nodes, self.master_node, self.cluster_user,
                         self.cluster_shell, self.volumes)
            except Exception, e:
                log.error("Error occurred while running plugin '%s':" % \
                          plugin_name)
                print e

        log.info("""

The cluster has been started and configured.

Login to the master node as root by running:

    $ starcluster sshmaster %(tag)s

or manually as %(user)s:

    $ ssh -i %(key)s %(user)s@%(master)s

When you are finished using the cluster, run:

    $ starcluster stop %(tag)s

to shutdown the cluster and stop paying for service

""" % {
            'master': self.master_node.dns_name,
            'user': self.cluster_user,
            'key': self.key_location,
            'tag': self.cluster_tag,
        })

    def is_running_valid(self):
        """
        Checks whether the current running instances are compatible
        with this cluster template's settings
        """
        try:
            self._validate_running_instances()
            return True
        except exception.ClusterValidationError,e:
            log.error(e.msg)
            return False

    def is_valid(self):
        """
        Checks that all cluster template settings are valid
        """
        try:
            self._has_all_required_settings()
            self._validate_spot_bid()
            self._validate_cluster_size()
            self._validate_shell_setting()
            self._validate_credentials()
            self._validate_keypair()
            self._validate_zone()
            self._validate_ebs_settings()
            self._validate_ebs_aws_settings()
            self._validate_image_settings()
            self._validate_instance_types()
            return True
        except exception.ClusterValidationError,e:
            log.error(e.msg)
            return False

    def _validate_spot_bid(self):
        if self.spot_bid is not None:
            if type(self.spot_bid) not in [int, float]:
                raise exception.ClusterValidationError(
                    'spot_bid must be integer or float')
            if self.spot_bid <= 0:
                raise exception.ClusterValidationError(
                    'spot_bid must be an integer or float > 0')
        return True

    def _validate_cluster_size(self):
        if self.cluster_size <= 0 or not isinstance(self.cluster_size, int):
            raise exception.ClusterValidationError(
                'cluster_size must be a positive integer.')
        return True

    def _validate_shell_setting(self):
        cluster_shell = self.cluster_shell
        if not self.__available_shells.get(cluster_shell):
            raise exception.ClusterValidationError(
                'Invalid user shell specified. Options are %s' % \
                ' '.join(self.__available_shells.keys()))
        return True

    def _validate_image_settings(self):
        master_image_id = self.master_image_id
        node_image_id = self.node_image_id
        conn = self.ec2
        image = conn.get_image_or_none(node_image_id)
        if not image or image.id != node_image_id:
            raise exception.ClusterValidationError(
                'node_image_id %s does not exist' % node_image_id
            )
        if master_image_id:
            master_image = conn.get_image_or_none(master_image_id)
            if not master_image or master_image.id != master_image_id:
                raise exception.ClusterValidationError(
                    'master_image_id %s does not exist' % master_image_id)
        return True

    def _validate_zone(self):
        availability_zone = self.availability_zone
        if availability_zone:
            zone = self.ec2.get_zone(availability_zone)
            if not zone:
                raise exception.ClusterValidationError(
                    'availability_zone = %s does not exist' % availability_zone
                )
            if zone.state != 'available':
                log.warn('The availability_zone = %s ' % zone +
                         'is not available at this time')
        return True

    def __check_platform(self, image_id, instance_type):
        """
        Validates whether an image_id (AMI) is compatible with a given
        instance_type by comparing the image's architecture against the
        architecture required by the instance type.
        """
        image = self.ec2.get_image_or_none(image_id)
        if not image:
            raise exception.ClusterValidationError('Image %s does not exist' %
                                                   image_id)
        image_platform = image.architecture
        instance_platform = self.__instance_types[instance_type]
        if instance_platform != image_platform:
            error_msg = "Instance type %(instance_type)s is for an " + \
                        "%(instance_platform)s platform while " + \
                        "%(image_id)s is an %(image_platform)s platform"
            error_dict = {'instance_type': instance_type,
                          'instance_platform': instance_platform,
                          'image_id': image_id,
                          'image_platform': image_platform}
            raise exception.ClusterValidationError(error_msg % error_dict)
        return True

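The platform check simply compares the AMI's architecture string against the static table of instance-type architectures, so pairing a 32-bit image with a 64-bit-only instance type fails validation. An illustration with a hypothetical AMI id and assumed static.INSTANCE_TYPES entries:

    # Suppose static.INSTANCE_TYPES maps 'm1.small' -> 'i386' and
    # 'm1.large' -> 'x86_64', and 'ami-aaaaaaaa' is an i386 image:
    #   self.__check_platform('ami-aaaaaaaa', 'm1.small')  -> True
    #   self.__check_platform('ami-aaaaaaaa', 'm1.large')  -> raises
    #       ClusterValidationError (i386 image on an x86_64 instance type)
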
    def _validate_instance_types(self):
        master_image_id = self.master_image_id
        node_image_id = self.node_image_id
        master_instance_type = self.master_instance_type
        node_instance_type = self.node_instance_type
        instance_types = self.__instance_types
        instance_type_list = ' '.join(instance_types.keys())
        conn = self.ec2
        if not instance_types.has_key(node_instance_type):
            raise exception.ClusterValidationError(
                ("You specified an invalid node_instance_type %s \n" +
                 "Possible options are:\n%s") % \
                (node_instance_type, instance_type_list))
        elif master_instance_type:
            if not instance_types.has_key(master_instance_type):
                raise exception.ClusterValidationError(
                    ("You specified an invalid master_instance_type %s\n" + \
                     "Possible options are:\n%s") % \
                    (master_instance_type, instance_type_list))

        try:
            self.__check_platform(node_image_id, node_instance_type)
        except exception.ClusterValidationError,e:
            raise exception.ClusterValidationError(
                'Incompatible node_image_id and node_instance_type\n' + e.msg
            )
        if master_image_id and not master_instance_type:
            try:
                self.__check_platform(master_image_id, node_instance_type)
            except exception.ClusterValidationError,e:
                raise exception.ClusterValidationError(
                    'Incompatible master_image_id and node_instance_type\n' +
                    e.msg
                )
        elif master_image_id and master_instance_type:
            try:
                self.__check_platform(master_image_id, master_instance_type)
            except exception.ClusterValidationError,e:
                raise exception.ClusterValidationError(
                    'Incompatible master_image_id and master_instance_type\n' +
                    e.msg
                )
        elif master_instance_type and not master_image_id:
            try:
                self.__check_platform(node_image_id, master_instance_type)
            except exception.ClusterValidationError,e:
                raise exception.ClusterValidationError(
                    'Incompatible node_image_id and master_instance_type\n' +
                    e.msg
                )
        return True

    def _validate_ebs_aws_settings(self):
        """
        Verify that the EBS volumes exist on Amazon and that each volume's
        zone matches this cluster's zone setting. Requires AWS credentials.
        """
        zone = self.zone
        for vol in self.volumes:
            v = self.volumes.get(vol)
            vol_id = v.get('volume_id')
            vol = self.ec2.get_volume(vol_id)
            if vol.status != 'available':
                msg = "volume %s is not available (status: %s)" % (vol_id,
                                                                   vol.status)
                raise exception.ClusterValidationError(msg)

    def _validate_ebs_settings(self):
        """
        Check EBS vols for missing/duplicate DEVICE/PARTITION/MOUNT_PATHs
        and validate these settings. Does not require AWS credentials.
        """
        vol_ids = []
        devices = []
        mount_paths = []
        for vol in self.volumes:
            vol_name = vol
            vol = self.volumes.get(vol)
            vol_id = vol.get('volume_id')
            device = vol.get('device')
            partition = vol.get('partition')
            mount_path = vol.get("mount_path")
            mount_paths.append(mount_path)
            devices.append(device)
            vol_ids.append(vol_id)
            if not device:
                raise exception.ClusterValidationError(
                    'Missing DEVICE setting for volume %s' % vol_name)
            if not utils.is_valid_device(device):
                raise exception.ClusterValidationError(
                    "Invalid DEVICE value for volume %s" % vol_name)
            if not partition:
                raise exception.ClusterValidationError(
                    'Missing PARTITION setting for volume %s' % vol_name)
            if not utils.is_valid_partition(partition):
                raise exception.ClusterValidationError(
                    "Invalid PARTITION value for volume %s" % vol_name)
            if not partition.startswith(device):
                raise exception.ClusterValidationError(
                    "Volume PARTITION must start with %s" % device)
            if not mount_path:
                raise exception.ClusterValidationError(
                    'Missing MOUNT_PATH setting for volume %s' % vol_name)
            if not mount_path.startswith('/'):
                raise exception.ClusterValidationError(
                    "MOUNT_PATH for volume %s should start with /" % vol_name)
        for vol_id in vol_ids:
            if vol_ids.count(vol_id) > 1:
                raise exception.ClusterValidationError(
                    ("Multiple configurations for volume %s specified. " + \
                     "Please choose one") % vol_id)
        for dev in devices:
            if devices.count(dev) > 1:
                raise exception.ClusterValidationError(
                    "Can't attach more than one volume on device %s" % dev)
        for path in mount_paths:
            if mount_paths.count(path) > 1:
                raise exception.ClusterValidationError(
                    "Can't mount more than one volume on %s" % path)
        return True

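_validate_ebs_settings works purely on the volumes dictionary built from the config, so it can run without AWS credentials. A hypothetical volumes dict that passes all of the checks above:

    volumes = {
        'data': {
            'volume_id': 'vol-11111111',   # hypothetical id
            'device': '/dev/sdz',
            'partition': '/dev/sdz1',      # must start with the device
            'mount_path': '/data',         # must be an absolute path
        },
    }
    # Duplicate volume_ids, devices, or mount_paths across entries would
    # each raise a ClusterValidationError.
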
    def _has_all_required_settings(self):
        has_all_required = True
        for opt in self.__cluster_settings:
            requirements = self.__cluster_settings[opt]
            name = opt
            required = requirements[1]
            if required and self.get(name.lower()) is None:
                log.warn('Missing required setting %s' % name)
                has_all_required = False
        return has_all_required

    def _validate_credentials(self):
        if not self.ec2.is_valid_conn():
            raise exception.ClusterValidationError(
                'Invalid AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY combination.')
        return True

    def _validate_keypair(self):
        key_location = self.key_location
        if not os.path.exists(key_location):
            raise exception.ClusterValidationError(
                'key_location=%s does not exist.' % key_location)
        elif not os.path.isfile(key_location):
            raise exception.ClusterValidationError(
                'key_location=%s is not a file.' % key_location)
        keyname = self.keyname
        conn = self.ec2
        keypair = self.ec2.get_keypair_or_none(keyname)
        if not keypair:
            raise exception.ClusterValidationError(
                'Account does not contain a key with keyname = %s. ' % keyname)
        if self.zone:
            z = self.ec2.get_zone(self.zone)
            if keypair.region != z.region:
                raise exception.ClusterValidationError(
                    'Keypair %s not in availability zone region %s' %
                    (keyname, z.region))
        return True


if __name__ == "__main__":
    from starcluster.config import StarClusterConfig
    cfg = StarClusterConfig()
    cfg.load()
    sc = cfg.get_cluster_template('smallcluster', 'mynewcluster')
    if sc.is_valid():
        sc.start(create=True)