import os
import re
import time
import zlib
import string
import pprint
import base64
import cPickle
import traceback

from starcluster import utils
from starcluster import static
from starcluster import spinner
from starcluster import iptools
from starcluster import managers
from starcluster import exception
from starcluster import progressbar
from starcluster import clustersetup
from starcluster.node import Node
from starcluster.utils import print_timing
from starcluster.templates import user_msgs
from starcluster.logger import log
27 """
28 Manager class for Cluster objects
29 """
31 return "<ClusterManager: %s>" % self.ec2.region.name
32
33 - def get_cluster(self, cluster_name, group=None, load_receipt=True,
34 load_plugins=True):
55
56 - def get_clusters(self, load_receipt=True, load_plugins=True):
66
72
74 """
75 Returns a new Cluster object using the settings from the cluster
76 template template_name
77
78 If tag_name is passed, the Cluster object's cluster_tag setting will
79 be set to tag_name
80 """
81 cl = self.cfg.get_cluster_template(template_name, tag_name=tag_name,
82 ec2_conn=self.ec2)
83 return cl
84
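    # Hypothetical usage sketch (ClusterManager constructor arguments are
    # assumed here; the flow mirrors the __main__ block at the bottom of
    # this module):
    #
    #     cm = ClusterManager(cfg, ec2_conn)
    #     cl = cm.get_cluster_template('smallcluster', 'mynewcluster')
    #     cl.start(create=True)
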
86 """
87 Same as get_cluster but returns None instead of throwing an exception
88 if the cluster does not exist
89 """
90 try:
91 return self.get_cluster(cluster_name)
92 except exception.ClusterDoesNotExist:
93 pass
94
96 """
97 Returns True if cluster exists
98 """
99 return self.get_cluster_or_none(tag_name) is not None
100
102 """
103 ssh to master node of cluster_name
104
105 user keyword specifies an alternate user to login as
106 """
107 cluster = self.get_cluster(cluster_name)
108 cluster.ssh_to_master(user=user)
109
111 """
112 ssh to a node in cluster_name that has either an id,
113 dns name, or alias matching node_id
114
115 user keyword specifies an alternate user to login as
116 """
117 cluster = self.get_cluster(cluster_name)
118 cluster.ssh_to_node(node_id, user=user)
119

    def add_node(self, cluster_name, alias=None, no_create=False):
        cl = self.get_cluster(cluster_name)
        cl.add_node(alias=alias, no_create=no_create)

    def add_nodes(self, cluster_name, num_nodes, aliases=None,
                  no_create=False):
        """
        Add one or more nodes to cluster
        """
        cl = self.get_cluster(cluster_name)
        cl.add_nodes(num_nodes, aliases=aliases, no_create=no_create)

    def remove_node(self, cluster_name, alias, terminate=True):


    def stop_cluster(self, cluster_name, terminate_unstoppable=False):
        """
        Stop an EBS-backed cluster
        """
        cl = self.get_cluster(cluster_name)
        cl.stop_cluster(terminate_unstoppable)




187 """
188 Returns the cluster tag name from a security group name that starts
189 with static.SECURITY_GROUP_PREFIX
190
191 Example:
192 sg = '@sc-mycluster'
193 print get_tag_from_sg(sg)
194 mycluster
195 """
196 regex = re.compile(static.SECURITY_GROUP_PREFIX + '-(.*)')
197 match = regex.match(sg)
198 if match:
199 return match.groups()[0]
200
    def list_clusters(self, cluster_groups=None, show_ssh_status=False):
        """
        Prints a summary for each active cluster on EC2
        """
        if not cluster_groups:
            cluster_groups = self.get_cluster_security_groups()
            if not cluster_groups:
                log.info("No clusters found...")
        else:
            try:
                cluster_groups = [self.get_cluster_security_group(g)
                                  for g in cluster_groups]
            except exception.SecurityGroupDoesNotExist:
                # python 2 leaks the comprehension variable, so g is the
                # group that failed to resolve
                raise exception.ClusterDoesNotExist(g)
        for scg in cluster_groups:
            tag = self.get_tag_from_sg(scg.name)
            try:
                cl = self.get_cluster(tag, group=scg, load_plugins=False)
            except exception.IncompatibleCluster, e:
                sep = '*' * 60
                log.error('\n'.join([sep, e.msg, sep]),
                          extra=dict(__textwrap__=True))
                continue
            header = '%s (security group: %s)' % (tag, scg.name)
            print '-' * len(header)
            print header
            print '-' * len(header)
            nodes = cl.nodes
            try:
                n = nodes[0]
            except IndexError:
                n = None
            state = getattr(n, 'state', None)
            ltime = 'N/A'
            uptime = 'N/A'
            if state in ['pending', 'running']:
                ltime = getattr(n, 'local_launch_time', 'N/A')
                uptime = getattr(n, 'uptime', 'N/A')
            print 'Launch time: %s' % ltime
            print 'Uptime: %s' % uptime
            print 'Zone: %s' % getattr(n, 'placement', 'N/A')
            print 'Keypair: %s' % getattr(n, 'key_name', 'N/A')
            ebs_nodes = [n for n in nodes if n.attached_vols]
            if ebs_nodes:
                print 'EBS volumes:'
                for node in ebs_nodes:
                    devices = node.attached_vols
                    node_id = node.alias or node.id
                    for dev in devices:
                        d = devices.get(dev)
                        vol_id = d.volume_id
                        status = d.status
                        print '    %s on %s:%s (status: %s)' % \
                            (vol_id, node_id, dev, status)
            else:
                print 'EBS volumes: N/A'
            if nodes:
                print 'Cluster nodes:'
                for node in nodes:
                    nodeline = "    %7s %s %s %s" % (node.alias, node.state,
                                                     node.id, node.dns_name)
                    if node.spot_id:
                        nodeline += ' (spot %s)' % node.spot_id
                    if show_ssh_status:
                        ssh_status = {True: 'Up', False: 'Down'}
                        nodeline += ' (SSH: %s)' % ssh_status[node.is_up()]
                    print nodeline
                print 'Total nodes: %d' % len(nodes)
            else:
                print 'Cluster nodes: N/A'
            print


class Cluster(object):
    def __init__(self,
                 ec2_conn=None,
                 spot_bid=None,
                 cluster_tag=None,
                 cluster_description=None,
                 cluster_size=None,
                 cluster_user=None,
                 cluster_shell=None,
                 master_image_id=None,
                 master_instance_type=None,
                 node_image_id=None,
                 node_instance_type=None,
                 node_instance_types=[],
                 availability_zone=None,
                 keyname=None,
                 key_location=None,
                 volumes=[],
                 plugins=[],
                 permissions=[],
                 refresh_interval=30,
                 disable_queue=False,
                 disable_threads=False,
                 cluster_group=None,
                 force_spot_master=False,
                 **kwargs):

        now = time.strftime("%Y%m%d%H%M")
        self.ec2 = ec2_conn
        self.spot_bid = spot_bid
        self.cluster_tag = cluster_tag
        self.cluster_description = cluster_description
        if self.cluster_tag is None:
            self.cluster_tag = "cluster%s" % now
        if cluster_description is None:
            self.cluster_description = "Cluster created at %s" % now
        self.cluster_size = cluster_size or 0
        self.cluster_user = cluster_user
        self.cluster_shell = cluster_shell
        self.master_image_id = master_image_id
        self.master_instance_type = master_instance_type
        self.node_image_id = node_image_id
        self.node_instance_type = node_instance_type
        self.node_instance_types = node_instance_types
        self.availability_zone = availability_zone
        self.keyname = keyname
        self.key_location = key_location
        self.volumes = self.load_volumes(volumes)
        self.plugins = self.load_plugins(plugins)
        self.permissions = permissions
        self.refresh_interval = refresh_interval
        self.disable_queue = disable_queue
        self.disable_threads = disable_threads
        self.force_spot_master = force_spot_master

        self.__instance_types = static.INSTANCE_TYPES
        self.__cluster_settings = static.CLUSTER_SETTINGS
        self.__available_shells = static.AVAILABLE_SHELLS
        self.__protocols = static.PROTOCOLS
        self._progress_bar = None
        self._master_reservation = None
        self._node_reservation = None
        self._nodes = []
        self._master = None
        self._zone = None
        self._plugins = plugins
        self._cluster_group = None
        self._placement_group = None

    def __repr__(self):
        return '<Cluster: %s (%s-node)>' % (self.cluster_tag,
                                            self.cluster_size)

    @property
    def zone(self):
        """
        If volumes are specified, this method determines the common
        availability zone between those volumes. If an availability zone
        is explicitly specified in the config and does not match the common
        availability zone of the volumes, an exception is raised. If all
        volumes are not in the same availability zone an exception is raised.
        If no volumes are specified, returns the user-specified availability
        zone if it exists.
        """
        if not self._zone:
            zone = None
            if self.availability_zone:
                zone = self.ec2.get_zone(self.availability_zone).name
            common_zone = None
            for volume in self.volumes:
                volid = self.volumes.get(volume).get('volume_id')
                vol = self.ec2.get_volume(volid)
                if not common_zone:
                    common_zone = vol.zone
                elif vol.zone != common_zone:
                    vols = [self.volumes.get(v).get('volume_id')
                            for v in self.volumes]
                    raise exception.VolumesZoneError(vols)
            if common_zone and zone and zone != common_zone:
                raise exception.InvalidZone(zone, common_zone)
            if not zone and common_zone:
                zone = common_zone
            self._zone = zone
        return self._zone

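    # Illustrative example: with a single volume living in us-east-1c and no
    # availability_zone in the config, self.zone resolves to 'us-east-1c';
    # configuring availability_zone = 'us-east-1a' alongside that same volume
    # raises InvalidZone (zone names here are made up).
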
394 """
395 Iterate through vols and set device/partition settings automatically if
396 not specified.
397
398 This method assigns the first volume to /dev/sdz, second to /dev/sdy,
399 etc for all volumes that do not include a device/partition setting
400 """
401 devices = ['/dev/sd%s' % s for s in string.lowercase]
402 devmap = {}
403 for volname in vols:
404 vol = vols.get(volname)
405 dev = vol.get('device')
406 if dev in devices:
407
408 devices.remove(dev)
409 volid = vol.get('volume_id')
410 if dev and not volid in devmap:
411 devmap[volid] = dev
412 volumes = {}
413 for volname in vols:
414 vol = vols.get(volname)
415 vol_id = vol.get('volume_id')
416 device = vol.get('device')
417 if not device:
418 if vol_id in devmap:
419 device = devmap.get(vol_id)
420 else:
421 device = devices.pop()
422 devmap[vol_id] = device
423 if not utils.is_valid_device(device):
424 raise exception.InvalidDevice(device)
425 v = volumes[volname] = utils.AttributeDict()
426 v.update(vol)
427 v['device'] = device
428 part = vol.get('partition')
429 if part:
430 partition = device + str(part)
431 if not utils.is_valid_partition(partition):
432 raise exception.InvalidPartition(part)
433 v['partition'] = partition
434 return volumes
435
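    # A minimal sketch of the auto-assignment above (volume id is made up):
    # a volume without a DEVICE setting gets the next free letter counting
    # back from /dev/sdz, and a PARTITION of 1 becomes e.g. /dev/sdz1:
    #
    #     vols = {'data': {'volume_id': 'vol-12345678', 'partition': 1}}
    #     self.load_volumes(vols)
    #     # -> {'data': {'volume_id': 'vol-12345678', 'device': '/dev/sdz',
    #     #              'partition': '/dev/sdz1'}}
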
    def load_plugins(self, plugins):
        plugs = []
        for plugin in plugins:
            setup_class = plugin.get('setup_class')
            plugin_name = plugin.get('__name__').split()[-1]
            mod_name = '.'.join(setup_class.split('.')[:-1])
            class_name = setup_class.split('.')[-1]
            try:
                mod = __import__(mod_name, globals(), locals(), [class_name])
            except SyntaxError, e:
                raise exception.PluginSyntaxError(
                    "Plugin %s (%s) contains a syntax error at line %s" %
                    (plugin_name, e.filename, e.lineno))
            except ImportError, e:
                raise exception.PluginLoadError(
                    "Failed to import plugin %s: %s" %
                    (plugin_name, e[0]))
            klass = getattr(mod, class_name, None)
            if not klass:
                raise exception.PluginError(
                    'Plugin class %s does not exist' % setup_class)
            if not issubclass(klass, clustersetup.ClusterSetup):
                raise exception.PluginError(
                    "Plugin %s must be a subclass of "
                    "starcluster.clustersetup.ClusterSetup" % setup_class)
            args, kwargs = utils.get_arg_spec(klass.__init__)
            config_args = []
            missing_args = []
            for arg in args:
                if arg in plugin:
                    config_args.append(plugin.get(arg))
                else:
                    missing_args.append(arg)
            log.debug("config_args = %s" % config_args)
            if missing_args:
                raise exception.PluginError(
                    "Not enough settings provided for plugin %s (missing: %s)"
                    % (plugin_name, ', '.join(missing_args)))
            config_kwargs = {}
            for arg in kwargs:
                if arg in plugin:
                    config_kwargs[arg] = plugin.get(arg)
            log.debug("config_kwargs = %s" % config_kwargs)
            plugs.append((plugin_name, klass(*config_args, **config_kwargs)))
        return plugs

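    # Sketch of the plugin dicts this method expects (values illustrative):
    # each dict comes from a [plugin] config section and must carry __name__
    # and setup_class plus any args declared by the plugin's __init__:
    #
    #     plugins = [{'__name__': 'plugin myplugin',
    #                 'setup_class': 'mypackage.MyPlugin',
    #                 'my_arg': '/opt/data'}]
    #     self.load_plugins(plugins)
    #     # -> [('myplugin', <mypackage.MyPlugin instance>)]
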
    def update(self, kwargs):
        for key in kwargs.keys():
            if hasattr(self, key):
                self.__dict__[key] = kwargs[key]

488 """
489 Validate existing instances against this cluster's settings
490 """
491 self.wait_for_active_spots()
492 nodes = self.nodes
493 if not nodes:
494 raise exception.ClusterValidationError("No existing nodes found!")
495 log.info("Validating existing instances...")
496 mazone = self.master_node.placement
497 rlmap = self._get_launch_map(reverse=True)
498 for node in nodes:
499 itype, image = rlmap.get(node.alias)
500 alias = node.alias
501 ntype = node.instance_type
502 if ntype != itype:
503 raise exception.ClusterValidationError(
504 "%s's instance type (%s) != %s" % (alias, ntype, itype))
505 nimage = node.image_id
506 if nimage != image:
507 raise exception.ClusterValidationError(
508 "%s's image id (%s) != %s" % (alias, nimage, image))
509 if node.key_name != self.keyname:
510 raise exception.ClusterValidationError(
511 "%s's key_name (%s) != %s" % (alias, node.key_name,
512 self.keyname))
513 nazone = node.placement
514 if mazone != nazone:
515 raise exception.ClusterValidationError(
516 "Node '%s' zone (%s) does not match master's zone (%s)" %
517 (alias, nazone, mazone))
518
519 self._zone = None
520 if self.zone and self.zone != mazone:
521 raise exception.ClusterValidationError(
522 "Running cluster's availability_zone (%s) != %s" %
523 (mazone, self.zone))
524
    def get(self, name):
        return self.__dict__.get(name)


533 """
534 Load the original settings used to launch this cluster into this
535 Cluster object. The settings are loaded from the cluster group's
536 description field.
537 """
538 try:
539 desc = self.cluster_group.description
540 version, b64data = desc.split('-', 1)
541 if utils.program_version_greater(version, static.VERSION):
542 d = dict(cluster=self.cluster_tag, old_version=static.VERSION,
543 new_version=version)
544 msg = user_msgs.version_mismatch % d
545 sep = '*' * 60
546 log.warn('\n'.join([sep, msg, sep]), extra={'__textwrap__': 1})
547 compressed_data = base64.b64decode(b64data)
548 pkl_data = zlib.decompress(compressed_data)
549 cluster_settings = cPickle.loads(str(pkl_data)).__dict__
550 except (cPickle.PickleError, zlib.error, ValueError, TypeError,
551 EOFError, IndexError), e:
552 log.debug('load receipt exception: ', exc_info=True)
553 raise exception.IncompatibleCluster(self.cluster_group)
554 except Exception, e:
555 raise exception.ClusterReceiptError(
556 'failed to load cluster receipt: %s' % e)
557 for key in cluster_settings:
558 if hasattr(self, key):
559 setattr(self, key, cluster_settings.get(key))
560 if load_plugins:
561 try:
562 self.plugins = self.load_plugins(self._plugins)
563 except exception.PluginError, e:
564 log.warn(e)
565 log.warn("An error occured while loading plugins")
566 log.warn("Not running any plugins")
567 except Exception, e:
568 raise exception.ClusterReceiptError(
569 'failed to load cluster receipt: %s' % e)
570 return True

    def __getstate__(self):
        cfg = {}
        exclude = ['key_location', 'plugins']
        include = ['_zone', '_plugins']
        for key in self.__dict__.keys():
            private = key.startswith('_')
            if (not private or key in include) and key not in exclude:
                val = getattr(self, key)
                if type(val) in [str, unicode, bool, int, float, list, dict]:
                    cfg[key] = val
                elif type(val) is utils.AttributeDict:
                    cfg[key] = dict(val)
        return cfg

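    # Note (inferred from cluster_group below and load_receipt above): the
    # dict returned here is what gets pickled into the cluster "receipt"
    # stored in the security group description, formatted as
    # VERSION + '-' + base64(zlib(cPickle.dumps(self))).
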
    @property
    def _security_group(self):
        # cluster security group name: SECURITY_GROUP_PREFIX + '-' + tag
        # (format implied by get_tag_from_sg above)
        return '%s-%s' % (static.SECURITY_GROUP_PREFIX, self.cluster_tag)

    @property
    def cluster_group(self):
        if self._cluster_group is None:
            ssh_port = static.DEFAULT_SSH_PORT
            desc = base64.b64encode(zlib.compress(cPickle.dumps(self)))
            desc = '-'.join([static.VERSION, desc])
            sg = self.ec2.get_or_create_group(self._security_group,
                                              desc,
                                              auth_ssh=True,
                                              auth_group_traffic=True)
            for p in self.permissions:
                perm = self.permissions.get(p)
                ip_protocol = perm.get('ip_protocol', 'tcp')
                from_port = perm.get('from_port')
                to_port = perm.get('to_port')
                cidr_ip = perm.get('cidr_ip', static.WORLD_CIDRIP)
                if not self.ec2.has_permission(sg, ip_protocol, from_port,
                                               to_port, cidr_ip):
                    log.info("Opening %s port range %s-%s for CIDR %s" %
                             (ip_protocol, from_port, to_port, cidr_ip))
                    sg.authorize(ip_protocol, from_port, to_port, cidr_ip)
                # a user rule covering the ssh port replaces the default
                # world-open ssh rule
                if ip_protocol == 'tcp' and from_port <= ssh_port <= to_port:
                    sg.revoke(ip_protocol, ssh_port, ssh_port,
                              static.WORLD_CIDRIP)
            self._cluster_group = sg
        return self._cluster_group

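    # Sketch of a permissions entry handled above (values illustrative):
    # this would open TCP port 80 to the world on the cluster's group:
    #
    #     self.permissions = {'www': {'ip_protocol': 'tcp', 'from_port': 80,
    #                                 'to_port': 80,
    #                                 'cidr_ip': '0.0.0.0/0'}}
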
    @property

    @property

    @property
    def nodes(self):
        states = ['pending', 'running', 'stopping', 'stopped']
        filters = {'group-name': self._security_group,
                   'instance-state-name': states}
        nodes = self.ec2.get_all_instances(filters=filters)
        # remove cached nodes that are no longer in the security group
        current_ids = [n.id for n in nodes]
        remove_nodes = [n for n in self._nodes if n.id not in current_ids]
        for node in remove_nodes:
            self._nodes.remove(node)
        # update existing nodes and add new ones (master is kept first)
        existing_nodes = dict([(n.id, n) for n in self._nodes])
        log.debug('existing nodes: %s' % existing_nodes)
        for node in nodes:
            if node.id in existing_nodes:
                log.debug('updating existing node %s in self._nodes' % node.id)
                enode = existing_nodes.get(node.id)
                enode.key_location = self.key_location
                enode.instance = node
            else:
                log.debug('adding node %s to self._nodes list' % node.id)
                n = Node(node, self.key_location)
                if n.is_master():
                    self._master = n
                    self._nodes.insert(0, n)
                else:
                    self._nodes.append(n)
        self._nodes.sort(key=lambda n: n.alias)
        log.debug('returning self._nodes = %s' % self._nodes)
        return self._nodes





    def _nodes_in_states(self, states):
        return filter(lambda x: x.state in states, self.nodes)

    @property

    @property

    @property


    def create_node(self, alias, image_id=None, instance_type=None, zone=None,
                    placement_group=None, spot_bid=None, force_flat=False):
        return self.create_nodes([alias], image_id=image_id,
                                 instance_type=instance_type, zone=zone,
                                 placement_group=placement_group,
                                 spot_bid=spot_bid, force_flat=force_flat)[0]

    def create_nodes(self, aliases, image_id=None, instance_type=None, count=1,
                     zone=None, placement_group=None, spot_bid=None,
                     force_flat=False):
        """
        Convenience method for requesting instances with this cluster's
        settings. All settings (kwargs) except force_flat default to cluster
        settings if not provided. Passing force_flat=True ignores spot_bid
        completely, forcing a flat-rate instance to be requested.
        """
        spot_bid = spot_bid or self.spot_bid
        if force_flat:
            spot_bid = None
        cluster_sg = self.cluster_group.name
        instance_type = instance_type or self.node_instance_type
        if not placement_group and instance_type in static.CLUSTER_TYPES:
            placement_group = self.placement_group.name
        image_id = image_id or self.node_image_id
        kwargs = dict(price=spot_bid, instance_type=instance_type,
                      min_count=count, max_count=count, count=count,
                      key_name=self.keyname, security_groups=[cluster_sg],
                      availability_zone_group=cluster_sg,
                      launch_group=cluster_sg, placement=zone or self.zone,
                      user_data='|'.join(aliases),
                      placement_group=placement_group)
        resvs = []
        if spot_bid:
            # spot instances are requested one at a time so that each request
            # carries a single alias in its user_data
            for alias in aliases:
                kwargs['user_data'] = alias
                resvs.extend(self.ec2.request_instances(image_id, **kwargs))
        else:
            resvs.append(self.ec2.request_instances(image_id, **kwargs))
        for resv in resvs:
            log.info(str(resv), extra=dict(__raw__=True))
        return resvs

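    # Hypothetical call: launch two flat-rate workers in the master's zone
    # using this cluster's default image and instance type:
    #
    #     self.create_nodes(['node001', 'node002'], count=2, force_flat=True)
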
    def _get_next_node_num(self):
        nodes = self._nodes_in_states(['pending', 'running'])
        nodes = filter(lambda x: not x.is_master(), nodes)
        highest = 0
        for n in nodes:
            try:
                highest = max(highest, int(n.alias[4:8]))
            except ValueError:
                pass
        next = highest + 1
        log.debug("Highest node number is %d. choosing %d." % (highest, next))
        return next

    def add_node(self, alias=None, no_create=False):
        """
        Add a single node to this cluster
        """
        aliases = None
        if alias:
            aliases = [alias]
        self.add_nodes(1, aliases=aliases, no_create=no_create)

    def add_nodes(self, num_nodes, aliases=None, no_create=False):
        """
        Add new nodes to this cluster

        aliases - list of aliases to assign to new nodes (len must equal
        num_nodes)
        """
        running_pending = self._nodes_in_states(['pending', 'running'])
        aliases = aliases or []
        if not aliases:
            next_node_id = self._get_next_node_num()
            for i in range(next_node_id, next_node_id + num_nodes):
                alias = 'node%.3d' % i
                aliases.append(alias)
        assert len(aliases) == num_nodes
        if "master" in aliases:
            raise exception.ClusterValidationError(
                "worker nodes cannot have master as an alias")
        if not no_create:
            for node in running_pending:
                if node.alias in aliases:
                    raise exception.ClusterValidationError(
                        "node with alias %s already exists" % node.alias)
            log.info("Launching node(s): %s" % ', '.join(aliases))
            self.create_nodes(aliases, count=len(aliases))
            self.wait_for_cluster(msg="Waiting for node(s) to come up...")
        log.debug("Adding node(s): %s" % aliases)
        default_plugin = clustersetup.DefaultClusterSetup(self.disable_queue,
                                                          self.disable_threads)
        for alias in aliases:
            node = self.get_node_by_alias(alias)
            default_plugin.on_add_node(
                node, self.nodes, self.master_node,
                self.cluster_user, self.cluster_shell,
                self.volumes)
            self.run_plugins(method_name="on_add_node", node=node)

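    # Hypothetical usage: grow a running cluster by two nodes with
    # auto-generated node### aliases:
    #
    #     cl = cm.get_cluster('mycluster')
    #     cl.add_nodes(2)
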


843 """
844 Groups all node-aliases that have similar instance types/image ids
845 Returns a dictionary that's used to launch all similar instance types
846 and image ids in the same request. Example return value:
847
848 {('c1.xlarge', 'ami-a5c02dcc'): ['node001', 'node002'],
849 ('m1.large', 'ami-a5c02dcc'): ['node003'],
850 ('m1.small', 'ami-17b15e7e'): ['master', 'node005', 'node006'],
851 ('m1.small', 'ami-19e17a2b'): ['node004']}
852
853 Passing reverse=True will return the same information only keyed by
854 node aliases:
855
856 {'master': ('m1.small', 'ami-17b15e7e'),
857 'node001': ('c1.xlarge', 'ami-a5c02dcc'),
858 'node002': ('c1.xlarge', 'ami-a5c02dcc'),
859 'node003': ('m1.large', 'ami-a5c02dcc'),
860 'node004': ('m1.small', 'ami-19e17a2b'),
861 'node005': ('m1.small', 'ami-17b15e7e'),
862 'node006': ('m1.small', 'ami-17b15e7e')}
863 """
864 lmap = {}
865 mtype = self.master_instance_type or self.node_instance_type
866 mimage = self.master_image_id or self.node_image_id
867 lmap[(mtype, mimage)] = ['master']
868 id_start = 1
869 for itype in self.node_instance_types:
870 count = itype['size']
871 image_id = itype['image'] or self.node_image_id
872 type = itype['type'] or self.node_instance_type
873 if not (type, image_id) in lmap:
874 lmap[(type, image_id)] = []
875 for id in range(id_start, id_start + count):
876 alias = 'node%.3d' % id
877 log.debug("Launch map: %s (ami: %s, type: %s)..." % \
878 (alias, image_id, type))
879 lmap[(type, image_id)].append(alias)
880 id_start += 1
881 ntype = self.node_instance_type
882 nimage = self.node_image_id
883 if not (ntype, nimage) in lmap:
884 lmap[(ntype, nimage)] = []
885 for id in range(id_start, self.cluster_size):
886 alias = 'node%.3d' % id
887 log.debug("Launch map: %s (ami: %s, type: %s)..." % \
888 (alias, nimage, ntype))
889 lmap[(ntype, nimage)].append(alias)
890 if reverse:
891 rlmap = {}
892 for (itype, image_id) in lmap:
893 aliases = lmap.get((itype, image_id))
894 for alias in aliases:
895 rlmap[alias] = (itype, image_id)
896 return rlmap
897 return lmap
898
900 """
901 Returns (instance_type,image_id) for a given alias based
902 on the map returned from self._get_launch_map
903 """
904 lmap = self._get_launch_map()
905 for (type, image) in lmap:
906 key = (type, image)
907 if alias in lmap.get(key):
908 return key
909
911 """
912 Launches all EC2 instances based on this cluster's settings.
913 """
914 log.info("Launching a %d-node cluster..." % self.cluster_size)
915 mtype = self.master_instance_type or self.node_instance_type
916 self.master_instance_type = mtype
917 if self.spot_bid:
918 self._create_spot_cluster()
919 else:
920 self._create_flat_rate_cluster()
921
923 """
924 Launches cluster using flat-rate instances. This method attempts to
925 minimize the number of launch requests by grouping nodes of the same
926 type/ami and launching each group simultaneously within a single launch
927 request. This is especially important for Cluster Compute instances
928 given that Amazon *highly* recommends requesting all CCI in a single
929 launch request.
930 """
931 lmap = self._get_launch_map()
932 zone = None
933 master_map = None
934 for (type, image) in lmap:
935
936 aliases = lmap.get((type, image))
937 if 'master' in aliases:
938 master_map = (type, image)
939 for alias in aliases:
940 log.debug("Launching %s (ami: %s, type: %s)" % \
941 (alias, image, type))
942 master_response = self.create_nodes(aliases, image_id=image,
943 instance_type=type,
944 count=len(aliases),
945 force_flat=True)[0]
946 zone = master_response.instances[0].placement
947 lmap.pop(master_map)
948 if self.cluster_size <= 1:
949 return
950 for (type, image) in lmap:
951 aliases = lmap.get((type, image))
952 for alias in aliases:
953 log.debug("Launching %s (ami: %s, type: %s)" % \
954 (alias, image, type))
955 self.create_nodes(aliases, image_id=image, instance_type=type,
956 count=len(aliases), zone=zone, force_flat=True)
957
959 """
960 Launches cluster using all spot instances. This method makes a single
961 spot request for each node in the cluster since spot instances
962 *always* have an ami_launch_index of 0. This is needed in order to
963 correctly assign aliases to nodes.
964 """
965 (mtype, mimage) = self._get_type_and_image_id('master')
966 log.info("Launching master node (ami: %s, type: %s)..." % \
967 (mimage, mtype))
968 force_flat = not self.force_spot_master and self.cluster_size > 1
969 master_response = self.create_node('master',
970 image_id=mimage,
971 instance_type=mtype,
972 force_flat=force_flat)
973 zone = None
974 if not force_flat and self.spot_bid:
975
976 launch_spec = master_response.launch_specification
977 zone = launch_spec.placement
978 else:
979
980 zone = master_response.instances[0].placement
981 if self.cluster_size <= 1:
982 return
983 for id in range(1, self.cluster_size):
984 alias = 'node%.3d' % id
985 (ntype, nimage) = self._get_type_and_image_id(alias)
986 log.info("Launching %s (ami: %s, type: %s)" %
987 (alias, nimage, ntype))
988 self.create_node(alias, image_id=nimage, instance_type=ntype,
989 zone=zone)
990
992 """
993 Returns True if all nodes are spot instances
994 """
995 nodes = self.nodes
996 if not nodes:
997 return False
998 for node in nodes:
999 if not node.is_spot():
1000 return False
1001 return True
1002
1004 """
1005 Returns True if any nodes are spot instances
1006 """
1007 for node in self.nodes:
1008 if node.is_spot():
1009 return True
1010 return False
1011
1013 """
1014 Returns True if all nodes are EBS-backed
1015 """
1016 nodes = self.nodes
1017 if not nodes:
1018 return False
1019 for node in nodes:
1020 if not node.is_ebs_backed():
1021 return False
1022 return True
1023
1025 """
1026 Returns True if any nodes are EBS-backed
1027 """
1028 for node in self.nodes:
1029 if node.is_ebs_backed():
1030 return True
1031 return False
1032
1034 """
1035 Returns True if all nodes are stoppable (i.e. non-spot and EBS-backed)
1036 """
1037 nodes = self.nodes
1038 if not nodes:
1039 return False
1040 for node in self.nodes:
1041 if not node.is_stoppable():
1042 return False
1043 return True
1044
1046 """
1047 Returns True if any nodes are stoppable (i.e. non-spot and EBS-backed)
1048 """
1049 nodes = self.nodes
1050 if not nodes:
1051 return False
1052 for node in nodes:
1053 if node.is_stoppable():
1054 return True
1055 return False
1056
1058 """
1059 Returns true if all instances are Cluster/GPU Compute type
1060 """
1061 nodes = self.nodes
1062 if not nodes:
1063 return False
1064 for node in nodes:
1065 if not node.is_cluster_compute():
1066 return False
1067 return True
1068
1074
1076 """
1077 Check that all nodes are 'running' and that ssh is up on all nodes
1078 This method will return False if any spot requests are in an 'open'
1079 state.
1080 """
1081 spots = self.spot_requests
1082 active_spots = filter(lambda x: x.state == 'active', spots)
1083 if len(spots) != len(active_spots):
1084 return False
1085 nodes = self.nodes
1086 if not nodes:
1087 return False
1088 for node in nodes:
1089 if not node.is_up():
1090 return False
1091 return True
1092
1094 """
1095 Logs a status msg, starts a spinner, and returns the spinner object.
1096 This is useful for long running processes:
1097
1098 s = self.get_spinner("Long running process running...")
1099 (do something)
1100 s.stop()
1101 """
1102 s = spinner.Spinner()
1103 log.info(msg, extra=dict(__nonewline__=True))
1104 s.start()
1105 return s
1106
    @property

1120 """
1121 Wait for all open spot requests for this cluster to transition to
1122 'active'.
1123 """
1124 spots = spots or self.spot_requests
1125 open_spots = [spot for spot in spots if spot.state == "open"]
1126 if open_spots:
1127 pbar = self.progress_bar.reset()
1128 log.info('Waiting for open spot requests to become active...')
1129 pbar.maxval = len(spots)
1130 pbar.update(0)
1131 while not pbar.finished:
1132 active_spots = filter(lambda x: x.state == "active", spots)
1133 pbar.maxval = len(spots)
1134 pbar.update(len(active_spots))
1135 if not pbar.finished:
1136 time.sleep(self.refresh_interval)
1137 spots = self.get_spot_requests_or_raise()
1138 pbar.reset()
1139
1141 """
1142 Wait indefinitely for cluster nodes to show up.
1143 """
1144 nodes = nodes or self.nodes
1145 if len(nodes) == 0:
1146 s = self.get_spinner("Waiting for instances to activate...")
1147 while len(nodes) == 0:
1148 time.sleep(self.refresh_interval)
1149 nodes = self.nodes
1150 s.stop()
1151
1169
1171 """
1172 Wait until all cluster nodes are in a 'running' state
1173 """
1174 log.info("Waiting for SSH to come up on all nodes...")
1175 nodes = nodes or self.get_nodes_or_raise()
1176 pbar = self.progress_bar.reset()
1177 pbar.maxval = len(nodes)
1178 pbar.update(0)
1179 while not pbar.finished:
1180 active_nodes = filter(lambda n: n.is_up(), nodes)
1181 pbar.maxval = len(nodes)
1182 pbar.update(len(active_nodes))
1183 if not pbar.finished:
1184 time.sleep(self.refresh_interval)
1185 nodes = self.get_nodes_or_raise()
1186 pbar.finish()
1187
1189 """
1190 Wait for cluster to come up and display progress bar. Waits for all
1191 spot requests to become 'active', all instances to be in a 'running'
1192 state, and for all SSH daemons to come up.
1193
1194 msg - custom message to print out before waiting on the cluster
1195 """
1196 interval = self.refresh_interval
1197 log.info("%s %s" % (msg, "(updating every %ds)" % interval))
1198 self.wait_for_active_spots()
1199 self.wait_for_active_instances()
1200 self.wait_for_running_instances()
1201 self.wait_for_ssh()
1202
1204 """
1205 Check whether all nodes are in the 'stopped' state
1206 """
1207 nodes = self.nodes
1208 if not nodes:
1209 return False
1210 for node in nodes:
1211 if node.state != 'stopped':
1212 return False
1213 return True
1214
1216 """
1217 Check whether all nodes are in a 'terminated' state
1218 """
1219 states = filter(lambda x: x != 'terminated', static.INSTANCE_STATES)
1220 filters = {'group-name': self._security_group,
1221 'instance-state-name': states}
1222 insts = self.ec2.get_all_instances(filters=filters)
1223 return len(insts) == 0
1224
1226 """
1227 Attach each volume to the master node
1228 """
1229 for vol in self.volumes:
1230 volume = self.volumes.get(vol)
1231 device = volume.get('device')
1232 vol_id = volume.get('volume_id')
1233 vol = self.ec2.get_volume(vol_id)
1234 if vol.attach_data.instance_id == self.master_node.id:
1235 log.info("Volume %s already attached to master...skipping" % \
1236 vol.id)
1237 continue
1238 if vol.status != "available":
1239 log.error(('Volume %s not available...' +
1240 'please check and try again') % vol.id)
1241 continue
1242 log.info("Attaching volume %s to master node on %s ..." % (vol.id,
1243 device))
1244 resp = vol.attach(self.master_node.id, device)
1245 log.debug("resp = %s" % resp)
1246 while True:
1247 vol.update()
1248 if vol.attachment_state() == 'attached':
1249 break
1250 time.sleep(5)
1251

    @print_timing('Restarting cluster')

1277 """
1278 Shutdown this cluster by detaching all volumes and 'stopping' all nodes
1279
1280 In general, all nodes in the cluster must be 'stoppable' meaning all
1281 nodes are backed by flat-rate EBS-backed instances. If any
1282 'unstoppable' nodes are found an exception is raised. A node is
1283 'unstoppable' if it is backed by either a spot or S3-backed instance.
1284
1285 If the cluster contains a mix of 'stoppable' and 'unstoppable' nodes
1286 you can stop all stoppable nodes and terminate any unstoppable nodes by
1287 setting terminate_unstoppable=True.
1288
1289 This will stop all nodes that can be stopped and terminate the rest.
1290 """
1291 nodes = self.nodes
1292 if not nodes:
1293 raise exception.ClusterValidationError("No running nodes found")
1294 if not self.is_stoppable():
1295 has_stoppable_nodes = self.has_stoppable_nodes()
1296 if not terminate_unstoppable and has_stoppable_nodes:
1297 raise exception.InvalidOperation(
1298 "Cluster contains nodes that are not stoppable")
1299 if not has_stoppable_nodes:
1300 raise exception.InvalidOperation(
1301 "Cluster does not contain any stoppable nodes")
1302 try:
1303 self.run_plugins(method_name="on_shutdown", reverse=True)
1304 except exception.MasterDoesNotExist, e:
1305 log.error("Cannot run plugins: %s" % e)
1306 self.detach_volumes()
1307 for node in nodes:
1308 node.shutdown()
1309
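    # Hypothetical usage: stop every stoppable node and terminate the rest
    # (e.g. spot or S3-backed instances) in a single call:
    #
    #     cl.stop_cluster(terminate_unstoppable=True)
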
    def start(self, create=True, create_only=False, validate=True,
              validate_only=False, validate_running=False):
        """
        Creates and configures a cluster from this cluster template's
        settings.

        create - create new nodes when starting the cluster. set to False to
                 use existing nodes
        create_only - only create the cluster node instances, don't configure
                      the cluster
        validate - whether or not to validate the cluster settings used.
                   False will ignore validate_only and validate_running
                   keywords and is effectively the same as running _start
        validate_only - only validate cluster settings, do not create or
                        configure cluster
        validate_running - whether or not to validate the existing instances
                           being used against this cluster's settings
        """
        if validate:
            if not create and validate_running:
                try:
                    self._validate_running_instances()
                except exception.ClusterValidationError, e:
                    msg = "Existing nodes are not compatible with cluster "
                    msg += "settings:\n"
                    e.msg = msg + e.msg
                    raise
            elif create:
                self._validate()
            if validate_only:
                return
        else:
            log.warn("SKIPPING VALIDATION - USE AT YOUR OWN RISK")
        return self._start(create=create, create_only=create_only)

    @print_timing("Starting cluster")
    def _start(self, create=True, create_only=False):
        """
        Create and configure a cluster from this cluster template's settings
        (Does not attempt to validate before running)

        create - create new nodes when starting the cluster. set to False to
                 use existing nodes
        create_only - only create the cluster node instances, don't configure
                      the cluster
        """
        log.info("Starting cluster...")
        if create:
            self.create_cluster()
        else:
            assert self.master_node is not None
            for node in self.stopped_nodes:
                log.info("Starting stopped node: %s" % node.alias)
                node.start()
        if create_only:
            return
        self._setup_cluster()


    def run_plugins(self, plugins=None, method_name="run", node=None,
                    reverse=False):
        """
        Run all plugins specified in this Cluster object's self.plugins list.
        Uses the plugins list instead of self.plugins if specified.

        plugins must be a list of tuples: the first element is the plugin's
        name, the second element is the plugin object (a subclass of
        ClusterSetup)
        """
        plugs = plugins or self.plugins
        if reverse:
            plugs = plugs[:]
            plugs.reverse()
        for plug in plugs:
            name, plugin = plug
            self.run_plugin(plugin, name, method_name=method_name, node=node)

    def run_plugin(self, plugin, name='', method_name='run', node=None):
        """
        Run a StarCluster plugin.

        plugin - an instance of the plugin's class
        name - a user-friendly label for the plugin
        method_name - the method to run within the plugin (default: "run")
        node - optional node to pass as first argument to plugin method (used
               for on_add_node/on_remove_node)
        """
        plugin_name = name or str(plugin)
        try:
            func = getattr(plugin, method_name, None)
            if not func:
                log.warn("Plugin %s has no %s method...skipping" %
                         (plugin_name, method_name))
                return
            args = [self.nodes, self.master_node, self.cluster_user,
                    self.cluster_shell, self.volumes]
            if node:
                args.insert(0, node)
            log.info("Running plugin %s" % plugin_name)
            func(*args)
        except NotImplementedError:
            log.debug("method %s not implemented by plugin %s" % (method_name,
                                                                  plugin_name))
        except exception.MasterDoesNotExist:
            raise
        except Exception, e:
            log.error("Error occurred while running plugin '%s':" %
                      plugin_name)
            if isinstance(e, exception.ThreadPoolException):
                e.print_excs()
                log.debug(e.format_excs())
            else:
                traceback.print_exc()
                log.debug(traceback.format_exc())

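    # Hypothetical usage: replay the on_add_node hook of every configured
    # plugin for a freshly added node:
    #
    #     node = cl.get_node_by_alias('node001')
    #     cl.run_plugins(method_name="on_add_node", node=node)
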


1509 """
1510 Returns True if all cluster template settings are valid
1511 """
1512 try:
1513 self._validate()
1514 return True
1515 except exception.ClusterValidationError, e:
1516 log.error(e.msg)
1517 return False
1518

    def _validate_cluster_size(self):
        try:
            int(self.cluster_size)
            if self.cluster_size < 1:
                raise ValueError
        except (ValueError, TypeError):
            raise exception.ClusterValidationError(
                'cluster_size must be an integer >= 1')
        num_itypes = sum([i.get('size') for i in self.node_instance_types])
        num_nodes = self.cluster_size - 1
        if num_itypes > num_nodes:
            raise exception.ClusterValidationError(
                ("total number of nodes specified in node_instance_type (%s)"
                 " must be <= cluster_size-1 (%s)") % (num_itypes, num_nodes))
        return True

    def _validate_shell_setting(self):
        cluster_shell = self.cluster_shell
        if not self.__available_shells.get(cluster_shell):
            raise exception.ClusterValidationError(
                'Invalid user shell specified. Options are %s' %
                ' '.join(self.__available_shells.keys()))
        return True


    def _validate_availability_zone(self):
        availability_zone = self.availability_zone
        if availability_zone:
            zone = self.ec2.get_zone(availability_zone)
            if not zone:
                azone = self.availability_zone
                raise exception.ClusterValidationError(
                    'availability_zone = %s does not exist' % azone)
            if zone.state != 'available':
                log.warn('The availability_zone = %s '
                         'is not available at this time' % zone)
        return True


    def _validate_instance_types(self):
        master_image_id = self.master_image_id
        node_image_id = self.node_image_id
        master_instance_type = self.master_instance_type
        node_instance_type = self.node_instance_type
        instance_types = self.__instance_types
        instance_type_list = ', '.join(instance_types.keys())
        if node_instance_type not in instance_types:
            raise exception.ClusterValidationError(
                ("You specified an invalid node_instance_type %s\n"
                 "Possible options are:\n%s") %
                (node_instance_type, instance_type_list))
        elif master_instance_type:
            if master_instance_type not in instance_types:
                raise exception.ClusterValidationError(
                    ("You specified an invalid master_instance_type %s\n"
                     "Possible options are:\n%s") %
                    (master_instance_type, instance_type_list))
        try:
            self.__check_platform(node_image_id, node_instance_type)
        except exception.ClusterValidationError, e:
            raise exception.ClusterValidationError(
                'Incompatible node_image_id and node_instance_type:\n' + e.msg)
        if master_image_id and not master_instance_type:
            try:
                self.__check_platform(master_image_id, node_instance_type)
            except exception.ClusterValidationError, e:
                raise exception.ClusterValidationError(
                    'Incompatible master_image_id and '
                    'node_instance_type\n' + e.msg)
        elif master_image_id and master_instance_type:
            try:
                self.__check_platform(master_image_id, master_instance_type)
            except exception.ClusterValidationError, e:
                raise exception.ClusterValidationError(
                    'Incompatible master_image_id and '
                    'master_instance_type\n' + e.msg)
        elif master_instance_type and not master_image_id:
            try:
                self.__check_platform(node_image_id, master_instance_type)
            except exception.ClusterValidationError, e:
                raise exception.ClusterValidationError(
                    'Incompatible node_image_id and '
                    'master_instance_type\n' + e.msg)
        for itype in self.node_instance_types:
            type = itype.get('type')
            img = itype.get('image') or node_image_id
            if type not in instance_types:
                raise exception.ClusterValidationError(
                    ("You specified an invalid instance type %s\n"
                     "Possible options are:\n%s") % (type, instance_type_list))
            try:
                self.__check_platform(img, type)
            except exception.ClusterValidationError, e:
                raise exception.ClusterValidationError(
                    "Invalid settings for node_instance_type %s: %s" %
                    (type, e.msg))
        return True


    def _validate_ebs_aws_settings(self):
        """
        Verify that each EBS volume exists and that its zone matches this
        cluster's zone setting.
        """
        for vol in self.volumes:
            v = self.volumes.get(vol)
            vol_id = v.get('volume_id')
            vol = self.ec2.get_volume(vol_id)
            if vol.status != 'available':
                if self.master_node:
                    if vol.attach_data.instance_id == self.master_node.id:
                        continue
                msg = "volume %s is not available (status: %s)" % (vol_id,
                                                                   vol.status)
                raise exception.ClusterValidationError(msg)

    def _validate_permission_settings(self):
        permissions = self.permissions
        for perm in permissions:
            permission = permissions.get(perm)
            protocol = permission.get('ip_protocol')
            if protocol not in self.__protocols:
                raise exception.InvalidProtocol(protocol)
            from_port = permission.get('from_port')
            to_port = permission.get('to_port')
            try:
                from_port = int(from_port)
                to_port = int(to_port)
            except ValueError:
                raise exception.InvalidPortRange(
                    from_port, to_port, reason="integer range required")
            if from_port < 0 or to_port < 0:
                raise exception.InvalidPortRange(
                    from_port, to_port,
                    reason="from/to must be positive integers")
            if from_port > to_port:
                raise exception.InvalidPortRange(
                    from_port, to_port,
                    reason="'from_port' must be <= 'to_port'")
            cidr_ip = permission.get('cidr_ip')
            if not iptools.validate_cidr(cidr_ip):
                raise exception.InvalidCIDRSpecified(cidr_ip)

1727 """
1728 Check EBS vols for missing/duplicate DEVICE/PARTITION/MOUNT_PATHs
1729 and validate these settings. Does not require AWS credentials.
1730 """
1731 volmap = {}
1732 devmap = {}
1733 mount_paths = []
1734 for vol in self.volumes:
1735 vol_name = vol
1736 vol = self.volumes.get(vol)
1737 vol_id = vol.get('volume_id')
1738 device = vol.get('device')
1739 partition = vol.get('partition')
1740 mount_path = vol.get("mount_path")
1741 vmap = volmap.get(vol_id, {})
1742 devices = vmap.get('device', [])
1743 partitions = vmap.get('partition', [])
1744 if devices and device not in devices:
1745 raise exception.ClusterValidationError(
1746 "Can't attach volume %s to more than one device" % vol_id)
1747 elif partitions and partition in partitions:
1748 raise exception.ClusterValidationError(
1749 "Multiple configurations for %s\n"
1750 "Either pick one or specify a separate partition for "
1751 "each configuration" % vol_id)
1752 vmap['partition'] = partitions + [partition]
1753 vmap['device'] = devices + [device]
1754 volmap[vol_id] = vmap
1755 dmap = devmap.get(device, {})
1756 vol_ids = dmap.get('volume_id', [])
1757 if vol_ids and vol_id not in vol_ids:
1758 raise exception.ClusterValidationError(
1759 "Can't attach more than one volume on device %s" % device)
1760 dmap['volume_id'] = vol_ids + [vol_id]
1761 devmap[device] = dmap
1762 mount_paths.append(mount_path)
1763 if not device:
1764 raise exception.ClusterValidationError(
1765 'Missing DEVICE setting for volume %s' % vol_name)
1766 if not utils.is_valid_device(device):
1767 raise exception.ClusterValidationError(
1768 "Invalid DEVICE value for volume %s" % vol_name)
1769 if partition:
1770 if not utils.is_valid_partition(partition):
1771 raise exception.ClusterValidationError(
1772 "Invalid PARTITION value for volume %s" % vol_name)
1773 if not partition.startswith(device):
1774 raise exception.ClusterValidationError(
1775 "Volume PARTITION must start with %s" % device)
1776 if not mount_path:
1777 raise exception.ClusterValidationError(
1778 'Missing MOUNT_PATH setting for volume %s' % vol_name)
1779 if not mount_path.startswith('/'):
1780 raise exception.ClusterValidationError(
1781 "MOUNT_PATH for volume %s should start with /" % vol_name)
1782 for path in mount_paths:
1783 if mount_paths.count(path) > 1:
1784 raise exception.ClusterValidationError(
1785 "Can't mount more than one volume on %s" % path)
1786 return True
1787
    def _has_all_required_settings(self):
        has_all_required = True
        for opt in self.__cluster_settings:
            requirements = self.__cluster_settings[opt]
            name = opt
            required = requirements[1]
            if required and self.get(name.lower()) is None:
                log.warn('Missing required setting %s' % name)
                has_all_required = False
        return has_all_required



if __name__ == "__main__":
    from starcluster.config import StarClusterConfig
    cfg = StarClusterConfig().load()
    sc = cfg.get_cluster_template('smallcluster', 'mynewcluster')
    if sc.is_valid():
        sc.start(create=True)