1
2 import os
3 import re
4 import time
5 import zlib
6 import string
7 import pprint
8 import base64
9 import inspect
10 import cPickle
11 import traceback
12
13 from starcluster import utils
14 from starcluster import static
15 from starcluster import spinner
16 from starcluster import iptools
17 from starcluster import managers
18 from starcluster import exception
19 from starcluster import progressbar
20 from starcluster import clustersetup
21 from starcluster.node import Node
22 from starcluster.utils import print_timing
23 from starcluster.templates import user_msgs
24 from starcluster.logger import log
28 """
29 Manager class for Cluster objects
30 """
32 return "<ClusterManager: %s>" % self.ec2.region.name
33
34 def get_cluster(self, cluster_name, group=None, load_receipt=True,
35 load_plugins=True):
55
61
63 """
64 Returns a new Cluster object using the settings from the cluster
65 template template_name
66
67 If tag_name is passed, the Cluster object's cluster_tag setting will
68 be set to tag_name
69 """
70 cl = self.cfg.get_cluster_template(template_name, tag_name=tag_name,
71 ec2_conn=self.ec2)
72 return cl
73
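# Editor's usage sketch (not part of the original module; the template and
# tag names are hypothetical):
#
#   cl = cm.get_cluster_template('smallcluster', tag_name='mycluster')
#   if not cm.get_cluster_or_none('mycluster'):
#       cl.start(create=True)
#
# where cm is an already-constructed ClusterManager.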
75 """
76 Same as get_cluster but returns None instead of throwing an exception
77 if the cluster does not exist
78 """
79 try:
80 return self.get_cluster(cluster_name)
81 except exception.ClusterDoesNotExist:
82 pass
83
85 """
86 Returns True if cluster exists
87 """
88 return self.get_cluster_or_none(tag_name) is not None
89
91 """
92 ssh to master node of cluster_name
93
94 user keyword specifies an alternate user to login as
95 """
96 cluster = self.get_cluster(cluster_name)
97 cluster.ssh_to_master(user=user)
98
100 """
101 ssh to a node in cluster_name that has either an id,
102 dns name, or alias matching node_id
103
104 user keyword specifies an alternate user to login as
105 """
106 cluster = self.get_cluster(cluster_name)
107 cluster.ssh_to_node(node_id, user=user)
108
116
117 def add_node(self, cluster_name, alias=None):
120
121 def add_nodes(self, cluster_name, num_nodes, aliases=None):
122 """
123 Add one or more nodes to cluster
124 """
125 cl = self.get_cluster(cluster_name)
126 cl.add_nodes(num_nodes, aliases=aliases)
127
137
144
146 """
147 Stops cluster_name if it's an EBS cluster, otherwise terminates the
148 cluster
149 """
150 cl = self.get_cluster(cluster_name)
151 cl.stop_cluster()
152
159
166
174
176 """
177 Returns the cluster tag name from a security group name that starts
178 with static.SECURITY_GROUP_PREFIX
179
180 Example:
181 sg = '@sc-mycluster'
182 print get_tag_from_sg(sg)
183 mycluster
184 """
185 regex = re.compile(static.SECURITY_GROUP_PREFIX + '-(.*)')
186 match = regex.match(sg)
187 if match:
188 return match.groups()[0]
189
190 def list_clusters(self, cluster_groups=None, show_ssh_status=False):
191 """
192 Prints a summary for each active cluster on EC2
193 """
194 if not cluster_groups:
195 cluster_groups = self.get_cluster_security_groups()
196 if not cluster_groups:
197 log.info("No clusters found...")
198 else:
199 try:
200 cluster_groups = [self.get_cluster_security_group(g) for g \
201 in cluster_groups]
202 except exception.SecurityGroupDoesNotExist:
203 raise exception.ClusterDoesNotExist(g)
204 for scg in cluster_groups:
205 tag = self.get_tag_from_sg(scg.name)
206 try:
207 cl = self.get_cluster(tag, group=scg, load_plugins=False)
208 except exception.IncompatibleCluster, e:
209 sep = '*' * 60
210 log.error('\n'.join([sep, e.msg, sep]),
211 extra=dict(__textwrap__=True))
212 continue
213 header = '%s (security group: %s)' % (tag, scg.name)
214 print '-' * len(header)
215 print header
216 print '-' * len(header)
217 nodes = cl.nodes
218 try:
219 n = nodes[0]
220 except IndexError:
221 n = None
222 state = getattr(n, 'state', None)
223 ltime = 'N/A'
224 uptime = 'N/A'
225 if state in ['pending', 'running']:
226 ltime = getattr(n, 'local_launch_time', 'N/A')
227 uptime = getattr(n, 'uptime', 'N/A')
228 print 'Launch time: %s' % ltime
229 print 'Uptime: %s' % uptime
230 print 'Zone: %s' % getattr(n, 'placement', 'N/A')
231 print 'Keypair: %s' % getattr(n, 'key_name', 'N/A')
232 ebs_nodes = [n for n in nodes if n.attached_vols]
233 if ebs_nodes:
234 print 'EBS volumes:'
235 for node in ebs_nodes:
236 devices = node.attached_vols
237 node_id = node.alias or node.id
238 for dev in devices:
239 d = devices.get(dev)
240 vol_id = d.volume_id
241 status = d.status
242 print ' %s on %s:%s (status: %s)' % \
243 (vol_id, node_id, dev, status)
244 else:
245 print 'EBS volumes: N/A'
246 if nodes:
247 print 'Cluster nodes:'
248 for node in nodes:
249 nodeline = " %7s %s %s %s" % (node.alias, node.state,
250 node.id, node.dns_name)
251 if node.spot_id:
252 nodeline += ' (spot %s)' % node.spot_id
253 if show_ssh_status:
254 ssh_status = {True: 'Up', False: 'Down'}
255 nodeline += ' (SSH: %s)' % ssh_status[node.is_up()]
256 print nodeline
257 print 'Total nodes: %d' % len(nodes)
258 else:
259 print 'Cluster nodes: N/A'
260 print
261
275
278 def __init__(self,
279 ec2_conn=None,
280 spot_bid=None,
281 cluster_tag=None,
282 cluster_description=None,
283 cluster_size=None,
284 cluster_user=None,
285 cluster_shell=None,
286 master_image_id=None,
287 master_instance_type=None,
288 node_image_id=None,
289 node_instance_type=None,
290 node_instance_types=[],
291 availability_zone=None,
292 keyname=None,
293 key_location=None,
294 volumes=[],
295 plugins=[],
296 permissions=[],
297 refresh_interval=30,
298 disable_queue=False,
299 disable_threads=False,
300 cluster_group=None,
301 **kwargs):
302
303 now = time.strftime("%Y%m%d%H%M")
304
305 self.ec2 = ec2_conn
306 self.spot_bid = spot_bid
307 self.cluster_tag = cluster_tag
308 self.cluster_description = cluster_description
309 if self.cluster_tag is None:
310 self.cluster_tag = "cluster%s" % now
311 if cluster_description is None:
312 self.cluster_description = "Cluster created at %s" % now
313 self.cluster_size = cluster_size or 0
314 self.cluster_user = cluster_user
315 self.cluster_shell = cluster_shell
316 self.master_image_id = master_image_id
317 self.master_instance_type = master_instance_type
318 self.node_image_id = node_image_id
319 self.node_instance_type = node_instance_type
320 self.node_instance_types = node_instance_types
321 self.availability_zone = availability_zone
322 self.keyname = keyname
323 self.key_location = key_location
324 self.volumes = self.load_volumes(volumes)
325 self.plugins = self.load_plugins(plugins)
326 self.permissions = permissions
327 self.refresh_interval = refresh_interval
328 self.disable_queue = disable_queue
329 self.disable_threads = disable_threads
330
331 self.__instance_types = static.INSTANCE_TYPES
332 self.__cluster_settings = static.CLUSTER_SETTINGS
333 self.__available_shells = static.AVAILABLE_SHELLS
334 self.__protocols = static.PROTOCOLS
335 self._progress_bar = None
336 self._master_reservation = None
337 self._node_reservation = None
338 self._nodes = []
339 self._master = None
340 self._zone = None
341 self._plugins = plugins
342 self._cluster_group = None
343 self._placement_group = None
344
346 return '<Cluster: %s (%s-node)>' % (self.cluster_tag,
347 self.cluster_size)
348
349 @property
351 """
352 If volumes are specified, this method determines the common
353 availability zone between those volumes. If an availability zone
354 is explicitly specified in the config and does not match the common
355 availability zone of the volumes, an exception is raised. If the
356 volumes are not all in the same availability zone, an exception is
357 raised. If no volumes are specified, returns the user-specified
358 availability zone if it exists.
359 """
360 if not self._zone:
361 zone = None
362 if self.availability_zone:
363 zone = self.ec2.get_zone(self.availability_zone).name
364 common_zone = None
365 for volume in self.volumes:
366 volid = self.volumes.get(volume).get('volume_id')
367 vol = self.ec2.get_volume(volid)
368 if not common_zone:
369 common_zone = vol.zone
370 elif vol.zone != common_zone:
371 vols = [self.volumes.get(v).get('volume_id')
372 for v in self.volumes]
373 raise exception.VolumesZoneError(vols)
374 if common_zone and zone and zone != common_zone:
375 raise exception.InvalidZone(zone, common_zone)
376 if not zone and common_zone:
377 zone = common_zone
378 self._zone = zone
379 return self._zone
380
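# Worked example of the zone resolution above (zone names are hypothetical):
# with two volumes both in 'us-east-1c' and no availability_zone in the
# config, the cluster zone becomes 'us-east-1c'. If the config explicitly
# sets availability_zone = 'us-east-1a' instead, InvalidZone is raised; if
# the volumes themselves span two zones, VolumesZoneError is raised.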
382 """
383 Iterate through vols and set device/partition settings automatically if
384 not specified.
385
386 This method assigns the first volume to /dev/sdz, the second to /dev/sdy,
387 and so on, for all volumes that do not include a device/partition setting.
388 """
389 devices = ['/dev/sd%s' % s for s in string.lowercase]
390 for volname in vols:
391 vol = vols.get(volname)
392 dev = vol.get('device')
393 if dev in devices:
394 # remove user-assigned devices from the auto-assignment pool
395 devices.remove(dev)
396 volumes = {}
397 for volname in vols:
398 vol = vols.get(volname)
399 device = vol.get('device')
400 if not device:
401 device = devices.pop()
402 if not utils.is_valid_device(device):
403 raise exception.InvalidDevice(device)
404 v = volumes[volname] = utils.AttributeDict()
405 v.update(vol)
406 v['device'] = device
407 part = vol.get('partition')
408 if part:
409 partition = device + str(part)
410 if not utils.is_valid_partition(partition):
411 raise exception.InvalidPartition(part)
412 v['partition'] = partition
413 return volumes
414
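# Editor's sketch of the device auto-assignment above (volume ids and names
# are hypothetical):
#
#   vols = {'data': {'volume_id': 'vol-12345678', 'mount_path': '/data'},
#           'apps': {'volume_id': 'vol-87654321', 'mount_path': '/apps',
#                    'device': '/dev/sdd', 'partition': 1}}
#   volumes = self.load_volumes(vols)
#   volumes['data']['device']     # -> '/dev/sdz' (auto-assigned)
#   volumes['apps']['partition']  # -> '/dev/sdd1'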
416 plugs = []
417 for plugin in plugins:
418 setup_class = plugin.get('setup_class')
419 plugin_name = plugin.get('__name__').split()[-1]
420 mod_name = '.'.join(setup_class.split('.')[:-1])
421 class_name = setup_class.split('.')[-1]
422 try:
423 mod = __import__(mod_name, globals(), locals(), [class_name])
424 except SyntaxError, e:
425 raise exception.PluginSyntaxError(
426 "Plugin %s (%s) contains a syntax error at line %s" % \
427 (plugin_name, e.filename, e.lineno))
428 except ImportError, e:
429 raise exception.PluginLoadError(
430 "Failed to import plugin %s: %s" % \
431 (plugin_name, e.message))
432 klass = getattr(mod, class_name, None)
433 if not klass:
434 raise exception.PluginError(
435 'Plugin class %s does not exist' % setup_class)
436 if not issubclass(klass, clustersetup.ClusterSetup):
437 raise exception.PluginError(
438 ("Plugin %s must be a subclass of " + \
439 "starcluster.clustersetup.ClusterSetup") % setup_class)
440 (args, varargs,
441 keywords, defaults) = inspect.getargspec(klass.__init__)
442 log.debug('plugin args = %s' % args)
443 log.debug('plugin varargs = %s' % varargs)
444 log.debug('plugin keywords = %s' % keywords)
445 log.debug('plugin defaults = %s' % str(defaults))
446 args = args[1:]
447 nargs = len(args)
448 ndefaults = 0
449 if defaults:
450 ndefaults = len(defaults)
451 nrequired = nargs - ndefaults
452 kwargs = args[nrequired:]
453 args = args[:nrequired]
454 config_args = []
455 for arg in args:
456 if arg in plugin:
457 config_args.append(plugin.get(arg))
458 config_kwargs = {}
459 for arg in kwargs:
460 if arg in plugin:
461 config_kwargs[arg] = plugin.get(arg)
462 log.debug("config_args = %s" % config_args)
463 log.debug("config_kwargs = %s" % config_kwargs)
464 if nrequired > len(config_args):
465 raise exception.PluginError(
466 "Not enough settings provided for plugin %s" % plugin_name)
467 plugs.append((plugin_name, klass(*config_args, **config_kwargs)))
468 return plugs
469
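# Editor's sketch of the plugin dict this method consumes (all values are
# hypothetical, not taken from a real config):
#
#   plugin = {'__name__': 'plugin myplugin',
#             'setup_class': 'mypackage.myplugin.MySetup',
#             'my_arg': '/opt/data'}
#
# load_plugins imports mypackage.myplugin, verifies that MySetup subclasses
# clustersetup.ClusterSetup, fills MySetup.__init__'s arguments from the
# matching keys in the dict (here my_arg), and returns
# [('myplugin', <configured MySetup instance>)].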
471 for key in kwargs.keys():
472 if hasattr(self, key):
473 self.__dict__[key] = kwargs[key]
474
476 """
477 Validate existing instances against this template's settings
478 """
479 self._validate_instance_types()
480 num_running = len(self.nodes)
481 if num_running != self.cluster_size:
482 raise exception.ClusterValidationError(
483 "Number of existing instances (%s) != cluster_size (%s)" % \
484 (num_running, self.cluster_size))
485 mtype = self.master_node.instance_type
486 mastertype = self.master_instance_type or self.node_instance_type
487 if mtype != mastertype:
488 raise exception.ClusterValidationError(
489 "The existing master node's instance type (%s) != %s" % \
490 (mtype, mastertype))
491 masterimage = self.master_image_id or self.node_image_id
492 mimage = self.master_node.image_id
493 if mimage != masterimage:
494 raise exception.ClusterValidationError(
495 "The existing master node's image id (%s) != %s" % \
496 (mimage, masterimage))
497 mkey = self.master_node.key_name
498 if mkey != self.keyname:
499 raise exception.ClusterValidationError(
500 "The existing master's keypair (%s) != %s" % \
501 (mkey, self.keyname))
502 try:
503 nodes = self.nodes[1:]
504 except IndexError:
505 raise exception.ClusterValidationError(
506 "Cluster has no running instances")
507 mazone = self.master_node.placement
508 id_start = 0
509 for itype in self.node_instance_types:
510 size = itype['size']
511 image = itype['image'] or self.node_image_id
512 type = itype['type'] or self.node_instance_type
513 for i in range(id_start, id_start + size):
514 n = nodes[i]
515 ntype = n.instance_type
516 if ntype != type:
517 raise exception.ClusterValidationError(
518 "Running node's instance type (%s) != %s" % \
519 (ntype, type))
520 nimage = n.image_id
521 if nimage != image:
522 raise exception.ClusterValidationError(
523 "Running node's image id (%s) != %s" % \
524 (nimage, image))
525 id_start += 1
526 for n in nodes[id_start:]:
527 ntype = n.instance_type
528 if n.instance_type != self.node_instance_type:
529 raise exception.ClusterValidationError(
530 "Running node's instance type (%s) != %s" % \
531 (ntype, self.node_instance_type))
532 nimage = n.image_id
533 if nimage != self.node_image_id:
534 raise exception.ClusterValidationError(
535 "Running node's image id (%s) != %s" % \
536 (nimage, self.node_image_id))
537 for n in nodes:
538 if n.key_name != self.keyname:
539 raise exception.ClusterValidationError(
540 "Running node's key_name (%s) != %s" % \
541 (n.key_name, self.keyname))
542 nazone = n.placement
543 if mazone != nazone:
544 raise exception.ClusterValidationError(
545 ("Running master's zone (%s) " + \
546 "does not match node zone (%s)") % \
547 (mazone, nazone))
548
549 self._zone = None
550 if self.zone and self.zone != mazone:
551 raise exception.ClusterValidationError(
552 "Running cluster's availability_zone (%s) != %s" % \
553 (mazone, self.zone))
554
555 def get(self, name):
556 return self.__dict__.get(name)
557
561
563 """
564 Load the original settings used to launch this cluster into this
565 Cluster object. The settings are loaded from the cluster group's
566 description field.
567 """
568 try:
569 desc = self.cluster_group.description
570 version, b64data = desc.split('-', 1)
571 if utils.program_version_greater(version, static.VERSION):
572 d = dict(cluster=self.cluster_tag, old_version=static.VERSION,
573 new_version=version)
574 msg = user_msgs.version_mismatch % d
575 sep = '*' * 60
576 log.warn('\n'.join([sep, msg, sep]), extra={'__textwrap__': 1})
577 compressed_data = base64.b64decode(b64data)
578 pkl_data = zlib.decompress(compressed_data)
579 cluster_settings = cPickle.loads(str(pkl_data)).__dict__
580 except (cPickle.PickleError, zlib.error, ValueError, TypeError,
581 EOFError, IndexError), e:
582 log.debug('load receipt exception: ', exc_info=True)
583 raise exception.IncompatibleCluster(self.cluster_group)
584 except Exception, e:
585 raise exception.ClusterReceiptError(
586 'failed to load cluster receipt: %s' % e)
587 for key in cluster_settings:
588 if hasattr(self, key):
589 setattr(self, key, cluster_settings.get(key))
590 if load_plugins:
591 try:
592 self.plugins = self.load_plugins(self._plugins)
593 except exception.PluginError, e:
594 log.warn(e)
595 log.warn("An error occured while loading plugins")
596 log.warn("Not running any plugins")
597 except Exception, e:
598 raise exception.ClusterReceiptError(
599 'failed to load cluster receipt: %s' % e)
600 return True
601
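# Editor's sketch of the receipt format parsed above (the same encoding is
# produced by the cluster_group property below; cl stands for a Cluster
# instance): the security group description is
# '<version>-<base64(zlib(pickle(cluster)))>'.
#
#   desc = '-'.join([static.VERSION,
#                    base64.b64encode(zlib.compress(cPickle.dumps(cl)))])
#   version, b64data = desc.split('-', 1)
#   settings = cPickle.loads(zlib.decompress(base64.b64decode(b64data)))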
603 cfg = {}
604 exclude = ['key_location', 'plugins']
605 include = ['_zone', '_plugins']
606 for key in self.__dict__.keys():
607 private = key.startswith('_')
608 if (not private or key in include) and not key in exclude:
609 val = getattr(self, key)
610 if type(val) in [str, unicode, bool, int, float, list, dict]:
611 cfg[key] = val
612 elif type(val) is utils.AttributeDict:
613 cfg[key] = dict(val)
614 return cfg
615
616 @property
619
620 @property
622 if self._cluster_group is None:
623 desc = base64.b64encode(zlib.compress(cPickle.dumps(self)))
624 desc = '-'.join([static.VERSION, desc])
625 sg = self.ec2.get_or_create_group(self._security_group,
626 desc,
627 auth_ssh=True,
628 auth_group_traffic=True)
629 for p in self.permissions:
630 perm = self.permissions.get(p)
631 ip_protocol = perm.get('ip_protocol', 'tcp')
632 from_port = perm.get('from_port')
633 to_port = perm.get('to_port')
634 cidr_ip = perm.get('cidr_ip', '0.0.0.0/0')
635 if not self.ec2.has_permission(sg, ip_protocol, from_port,
636 to_port, cidr_ip):
637 log.info("Opening %s port range %s-%s for CIDR %s" %
638 (ip_protocol, from_port, to_port, cidr_ip))
639 sg.authorize(ip_protocol, from_port, to_port, cidr_ip)
640 self._cluster_group = sg
641 return self._cluster_group
642
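# Editor's example of a permission entry handled above (values are
# hypothetical): opening HTTP to the world could be expressed as
#
#   self.permissions = {'www': {'ip_protocol': 'tcp', 'from_port': 80,
#                               'to_port': 80, 'cidr_ip': '0.0.0.0/0'}}
#
# which the property translates into sg.authorize('tcp', 80, 80,
# '0.0.0.0/0') if an equivalent rule is not already present.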
643 @property
649
650 @property
657
658 @property
660 states = ['pending', 'running', 'stopping', 'stopped']
661 filters = {'group-id': self._security_group,
662 'instance-state-name': states}
663 nodes = self.ec2.get_all_instances(filters=filters)
664
665 current_ids = map(lambda n: n.id, nodes)
666 remove_nodes = filter(lambda n: n.id not in current_ids, self._nodes)
667 map(lambda n: self._nodes.remove(n), remove_nodes)
668
669 existing_nodes = dict(map(lambda x: (x.id, x), self._nodes))
670 log.debug('existing nodes: %s' % existing_nodes)
671 for node in nodes:
672 if node.id in existing_nodes:
673 log.debug('updating existing node %s in self._nodes' % node.id)
674 enode = existing_nodes.get(node.id)
675 enode.key_location = self.key_location
676 enode.instance = node
677 else:
678 log.debug('adding node %s to self._nodes list' % node.id)
679 n = Node(node, self.key_location)
680 if n.is_master():
681 self._master = n
682 self._nodes.insert(0, n)
683 else:
684 self._nodes.append(n)
685 self._nodes.sort(key=lambda n: n.alias)
686 log.debug('returning self._nodes = %s' % self._nodes)
687 return self._nodes
688
694
700
706
708 return filter(lambda x: x.state in states, self.nodes)
709
710 @property
713
714 @property
717
718 @property
723
724 def create_node(self, alias, image_id=None, instance_type=None, zone=None,
725 placement_group=None):
729
730 def create_nodes(self, aliases, image_id=None, instance_type=None, count=1,
731 zone=None, placement_group=None):
732 """
733 Convenience method for requesting instances with this cluster's
734 settings
735 """
736 cluster_sg = self.cluster_group.name
737 if instance_type in static.CLUSTER_TYPES:
738 placement_group = self.placement_group.name
739 response = self.ec2.request_instances(
740 image_id or self.node_image_id,
741 price=self.spot_bid,
742 instance_type=instance_type or self.node_instance_type,
743 min_count=count, max_count=count, count=count,
744 key_name=self.keyname,
745 security_groups=[cluster_sg],
746 availability_zone_group=cluster_sg,
747 launch_group=cluster_sg,
748 placement=zone or self.zone,
749 user_data='|'.join(aliases),
750 placement_group=placement_group)
751 return response
752
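# Editor's sketch of a typical call (aliases are hypothetical): launch two
# workers in one request, passing the aliases through user_data so each
# instance can later identify itself:
#
#   resp = self.create_nodes(['node001', 'node002'], count=2)
#   # user_data sent with the request: 'node001|node002'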
754 nodes = self._nodes_in_states(['pending', 'running'])
755 nodes = filter(lambda x: not x.is_master(), nodes)
756 highest = 0
757 for n in nodes:
758 try:
759 highest = max(highest, int(n.alias[4:8]))
760 except ValueError:
761 pass
762 next = highest + 1
763 log.debug("Highest node number is %d. choosing %d." % (highest, next))
764 return next
765
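# Editor's example of the numbering scheme (aliases are hypothetical): with
# 'node001' and 'node003' running, int(alias[4:8]) yields 1 and 3, so the
# highest is 3 and the next alias assigned will be 'node004'.
#
#   >>> int('node003'[4:8])
#   3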
767 """
768 Add a single node to this cluster
769 """
770 aliases = None
771 if alias:
772 aliases = [alias]
773 self.add_nodes(1, aliases=aliases)
774
775 def add_nodes(self, num_nodes, aliases=None):
776 """
777 Add new nodes to this cluster
778
779 aliases - list of aliases to assign to new nodes (len must equal
780 num_nodes)
781 """
782 running_pending = self._nodes_in_states(['pending', 'running'])
783 aliases = aliases or []
784 if not aliases:
785 next_node_id = self._get_next_node_num()
786 for i in range(next_node_id, next_node_id + num_nodes):
787 alias = 'node%.3d' % i
788 aliases.append(alias)
789 assert len(aliases) == num_nodes
790 if "master" in aliases:
791 raise exception.ClusterValidationError(
792 "worker nodes cannot have master as an alias")
793 for node in running_pending:
794 if node.alias in aliases:
795 raise exception.ClusterValidationError(
796 "node with alias %s already exists" % node.alias)
797 log.debug("Adding node(s): %s" % aliases)
798 log.info("Launching node(s): %s" % ', '.join(aliases))
799 print self.create_nodes(aliases, count=len(aliases))
800 self.wait_for_cluster(msg="Waiting for node(s) to come up...")
801 default_plugin = clustersetup.DefaultClusterSetup(self.disable_queue,
802 self.disable_threads)
803 for alias in aliases:
804 node = self.get_node_by_alias(alias)
805 default_plugin.on_add_node(
806 node, self.nodes, self.master_node,
807 self.cluster_user, self.cluster_shell,
808 self.volumes)
809 self.run_plugins(method_name="on_add_node", node=node)
810
812 """
813 Remove a single node from this cluster
814 """
815 return self.remove_nodes([node])
816
836
838 """
839 Groups all node aliases that share the same instance type/image id.
840 Returns a dictionary that's used to launch all nodes of the same
841 instance type and image id in a single request. Example return value:
842
843 {('c1.xlarge', 'ami-a5c02dcc'): ['node001', 'node002'],
844 ('m1.large', 'ami-a5c02dcc'): ['node003'],
845 ('m1.small', 'ami-17b15e7e'): ['master', 'node005', 'node006'],
846 ('m1.small', 'ami-19e17a2b'): ['node004']}
847 """
848 lmap = {}
849 mtype = self.master_instance_type or self.node_instance_type
850 mimage = self.master_image_id or self.node_image_id
851 lmap[(mtype, mimage)] = ['master']
852 id_start = 1
853 for itype in self.node_instance_types:
854 count = itype['size']
855 image_id = itype['image'] or self.node_image_id
856 type = itype['type'] or self.node_instance_type
857 if not (type, image_id) in lmap:
858 lmap[(type, image_id)] = []
859 for id in range(id_start, id_start + count):
860 alias = 'node%.3d' % id
861 log.debug("Launch map: %s (ami: %s, type: %s)..." % \
862 (alias, image_id, type))
863 lmap[(type, image_id)].append(alias)
864 id_start += 1
865 ntype = self.node_instance_type
866 nimage = self.node_image_id
867 if not (ntype, nimage) in lmap:
868 lmap[(ntype, nimage)] = []
869 for id in range(id_start, self.cluster_size):
870 alias = 'node%.3d' % id
871 log.debug("Launch map: %s (ami: %s, type: %s)..." % \
872 (alias, nimage, ntype))
873 lmap[(ntype, nimage)].append(alias)
874 return lmap
875
877 """
878 Returns (instance_type, image_id) for a given alias based
879 on the map returned from self._get_launch_map
880 """
881 lmap = self._get_launch_map()
882 for (type, image) in lmap:
883 key = (type, image)
884 if alias in lmap.get(key):
885 return key
886
888 """
889 Launches all EC2 instances based on this cluster's settings.
890 """
891 log.info("Launching a %d-node cluster..." % self.cluster_size)
892 mtype = self.master_instance_type or self.node_instance_type
893 self.master_instance_type = mtype
894 if self.spot_bid:
895 self._create_spot_cluster()
896 else:
897 self._create_flat_rate_cluster()
898
900 """
901 Launches cluster using flat-rate instances. This method attempts to
902 minimize the number of launch requests by grouping nodes of the same
903 type/ami and launching each group simultaneously within a single launch
904 request. This is especially important for Cluster Compute instances
905 given that Amazon *highly* recommends requesting all CCI in a single
906 launch request.
907 """
908 log.info("Launching a %d-node cluster..." % self.cluster_size)
909 lmap = self._get_launch_map()
910 zone = None
911 master_map = None
912 for (type, image) in lmap:
913 # launch the group of instances containing the master node first
914 aliases = lmap.get((type, image))
915 if 'master' in aliases:
916 master_map = (type, image)
917 for alias in aliases:
918 log.debug("Launching %s (ami: %s, type: %s)" % \
919 (alias, image, type))
920 master_response = self.create_nodes(aliases, image_id=image,
921 instance_type=type,
922 count=len(aliases))
923 zone = master_response.instances[0].placement
924 print master_response
925 lmap.pop(master_map)
926 if self.cluster_size <= 1:
927 return
928 for (type, image) in lmap:
929 aliases = lmap.get((type, image))
930 for alias in aliases:
931 log.debug("Launching %s (ami: %s, type: %s)" % \
932 (alias, image, type))
933 node_response = self.create_nodes(aliases, image_id=image,
934 instance_type=type,
935 count=len(aliases), zone=zone)
936 print node_response
937
939 """
940 Launches cluster using all spot instances. This method makes a single
941 spot request for each node in the cluster since spot instances
942 *always* have an ami_launch_index of 0. This is needed in order to
943 correctly assign aliases to nodes.
944 """
945 (mtype, mimage) = self._get_type_and_image_id('master')
946 log.info("Launching master node (ami: %s, type: %s)..." % \
947 (mimage, mtype))
948 master_response = self.create_node('master',
949 image_id=mimage,
950 instance_type=mtype)
951 print master_response[0]
952 if self.cluster_size <= 1:
953 return
954
955 launch_spec = master_response[0].launch_specification
956 zone = launch_spec.placement
957 for id in range(1, self.cluster_size):
958 alias = 'node%.3d' % id
959 (ntype, nimage) = self._get_type_and_image_id(alias)
960 log.info("Launching %s (ami: %s, type: %s)" % \
961 (alias, nimage, ntype))
962 node_response = self.create_node(alias,
963 image_id=nimage,
964 instance_type=ntype,
965 zone=zone)
966 print node_response[0]
967
969 """
970 Returns True if any instances in the cluster are EBS-backed
971 """
972 for node in self.nodes:
973 if node.is_ebs_backed():
974 return True
975 return False
976
978 """
979 Returns True if any instances are a Cluster Compute type
980
981 If no instances are currently running, this method checks the
982 original settings used to launch this cluster and returns True
983 if any of the instance type settings specified a Cluster Compute
984 instance type.
985 """
986 for node in self.nodes:
987 if node.is_cluster_compute():
988 return True
989 lmap = self._get_launch_map()
990 for (type, image) in lmap:
991 if type in static.CLUSTER_COMPUTE_TYPES:
992 return True
993 return False
994
996 """
997 Check that all nodes are 'running' and that SSH is up on all nodes.
998 This method returns False if any spot requests are in an 'open'
999 state.
1000 """
1001 spots = self.spot_requests
1002 active_spots = filter(lambda x: x.state == 'active', spots)
1003 if len(spots) != len(active_spots):
1004 return False
1005 nodes = self.nodes
1006 if not nodes:
1007 return False
1008 for node in nodes:
1009 if not node.is_up():
1010 return False
1011 return True
1012
1014 """
1015 Logs a status msg, starts a spinner, and returns the spinner object.
1016 This is useful for long running processes:
1017
1018 s = self.get_spinner("Long running process running...")
1019 (do something)
1020 s.stop()
1021 """
1022 s = spinner.Spinner()
1023 log.info(msg, extra=dict(__nonewline__=True))
1024 s.start()
1025 return s
1026
1027 @property
1038
1040 """
1041 Wait for cluster to come up and display progress bar. Waits for all
1042 spot requests to become 'active', all instances to be in a 'running'
1043 state, and for all SSH daemons to come up.
1044
1045 msg - custom message to print out before waiting on the cluster
1046 """
1047 interval = self.refresh_interval
1048 log.info("%s %s" % (msg, "(updating every %ds)" % interval))
1049 pbar = self.progress_bar.reset()
1050 spots = self.spot_requests
1051 if spots:
1052 log.info('Waiting for open spot requests to become active...')
1053 pbar.maxval = len(spots)
1054 pbar.update(0)
1055 while not pbar.finished:
1056 active_spots = filter(lambda x: x.state == "active", spots)
1057 pbar.maxval = len(spots)
1058 pbar.update(len(active_spots))
1059 if not pbar.finished:
1060 time.sleep(interval)
1061 spots = self.spot_requests
1062 pbar.reset()
1063 nodes = self.nodes
1064 if len(nodes) == 0:
1065 s = self.get_spinner("Waiting for instances to activate...")
1066 while len(nodes) == 0:
1067 time.sleep(interval)
1068 nodes = self.nodes
1069 s.stop()
1070 log.info("Waiting for all nodes to be in a 'running' state...")
1071 pbar.maxval = len(nodes)
1072 pbar.update(0)
1073 while not pbar.finished:
1074 running_nodes = filter(lambda x: x.state == "running", nodes)
1075 pbar.maxval = len(nodes)
1076 pbar.update(len(running_nodes))
1077 if not pbar.finished:
1078 time.sleep(interval)
1079 nodes = self.nodes
1080 pbar.reset()
1081 log.info("Waiting for SSH to come up on all nodes...")
1082 pbar.maxval = len(nodes)
1083 pbar.update(0)
1084 while not pbar.finished:
1085 active_nodes = filter(lambda n: n.is_up(), nodes)
1086 pbar.maxval = len(nodes)
1087 pbar.update(len(active_nodes))
1088 if not pbar.finished:
1089 time.sleep(interval)
1090 nodes = self.nodes
1091 pbar.finish()
1092
1094 """
1095 Check whether all nodes are in the 'stopped' state
1096 """
1097 return len(self.stopped_nodes) == self.cluster_size
1098
1100 """
1101 Check whether all nodes are in a 'terminated' state
1102 """
1103 states = filter(lambda x: x != 'terminated', static.INSTANCE_STATES)
1104 filters = {'group-id': self._security_group,
1105 'instance-state-name': states}
1106 insts = self.ec2.get_all_instances(filters=filters)
1107 return len(insts) == 0
1108
1110 """
1111 Attach each volume to the master node
1112 """
1113 for vol in self.volumes:
1114 volume = self.volumes.get(vol)
1115 device = volume.get('device')
1116 vol_id = volume.get('volume_id')
1117 vol = self.ec2.get_volume(vol_id)
1118 if vol.attach_data.instance_id == self.master_node.id:
1119 log.info("Volume %s already attached to master...skipping" % \
1120 vol.id)
1121 continue
1122 if vol.status != "available":
1123 log.error(('Volume %s not available...' +
1124 'please check and try again') % vol.id)
1125 continue
1126 log.info("Attaching volume %s to master node on %s ..." % (vol.id,
1127 device))
1128 resp = vol.attach(self.master_node.id, device)
1129 log.debug("resp = %s" % resp)
1130 while True:
1131 vol.update()
1132 if vol.attachment_state() == 'attached':
1133 break
1134 time.sleep(5)
1135
1142
1143 @print_timing('Restarting cluster')
1159
1161 """
1162 Stop this cluster by detaching all volumes, stopping/terminating
1163 all instances, cancelling all spot requests (if any), and removing this
1164 cluster's security group.
1165
1166 If a node is a spot instance, it will be terminated. Spot
1167 instances cannot be 'stopped'; they must be terminated.
1168 """
1169 self.run_plugins(method_name="on_shutdown", reverse=True)
1170 self.detach_volumes()
1171 for node in self.nodes:
1172 node.shutdown()
1173 for spot in self.spot_requests:
1174 if spot.state not in ['cancelled', 'closed']:
1175 log.info("Cancelling spot instance request: %s" % spot.id)
1176 spot.cancel()
1177 if self.spot_bid or not self.is_ebs_cluster():
1178 log.info("Removing %s security group" % self._security_group)
1179 self.cluster_group.delete()
1180
1182 """
1183 Stop this cluster by first detaching all volumes, shutting down all
1184 instances, cancelling all spot requests (if any), removing this
1185 cluster's placement group (if any), and removing this cluster's
1186 security group.
1187 """
1188 self.run_plugins(method_name="on_shutdown", reverse=True)
1189 self.detach_volumes()
1190 for node in self.nodes:
1191 node.terminate()
1192 for spot in self.spot_requests:
1193 if spot.state not in ['cancelled', 'closed']:
1194 log.info("Cancelling spot instance request: %s" % spot.id)
1195 spot.cancel()
1196 log.info("Removing %s security group" % self._security_group)
1197 self.cluster_group.delete()
1198 pg = self.ec2.get_placement_group_or_none(self._security_group)
1199 if pg:
1200 s = self.get_spinner("Waiting for cluster to terminate...")
1201 while not self.is_cluster_terminated():
1202 time.sleep(5)
1203 s.stop()
1204 log.info("Removing %s placement group" % pg.name)
1205 pg.delete()
1206
1207 def start(self, create=True, create_only=False, validate=True,
1208 validate_only=False, validate_running=False):
1209 """
1210 Validates, creates, and configures a cluster.
1211
1212 Passing validate=False ignores the validate_only and validate_running
1213 keywords and is effectively the same as calling _start directly.
1214 """
1215 if validate:
1216 retval = self._validate(validate_running=validate_running)
1217 if validate_only:
1218 return retval
1219 return self._start(create, create_only)
1220
1221 @print_timing("Starting cluster")
1222 def _start(self, create=True, create_only=False):
1223 """
1224 Start the cluster using this cluster template's settings.
1225 Handles creating and configuring the cluster.
1226 Does not attempt to validate before running.
1227 """
1228 log.info("Starting cluster...")
1229 if create:
1230 self.create_cluster()
1231 else:
1232 for node in self.stopped_nodes:
1233 log.info("Starting stopped node: %s" % node.alias)
1234 node.start()
1235 if create_only:
1236 return
1237 self._setup_cluster()
1238 log.info(user_msgs.cluster_started_msg % {
1239 'master': self.master_node.dns_name,
1240 'user': self.cluster_user,
1241 'key': self.key_location,
1242 'tag': self.cluster_tag,
1243 }, extra=dict(__textwrap__=True, __raw__=True))
1244
1261
1262 def run_plugins(self, plugins=None, method_name="run", node=None,
1263 reverse=False):
1264 """
1265 Run all plugins in this Cluster object's self.plugins list, or in the
1266 plugins argument instead if one is given.
1267
1268 Each entry must be a tuple: the first element is the plugin's name,
1269 the second element is the plugin object (a subclass of ClusterSetup).
1270 """
1271 plugs = plugins or self.plugins
1272 if reverse:
1273 plugs = plugs[:]
1274 plugs.reverse()
1275 for plug in plugs:
1276 name, plugin = plug
1277 self.run_plugin(plugin, name, method_name=method_name, node=node)
1278
1279 def run_plugin(self, plugin, name='', method_name='run', node=None):
1280 """
1281 Run a StarCluster plugin.
1282
1283 plugin - an instance of the plugin's class
1284 name - a user-friendly label for the plugin
1285 method_name - the method to run within the plugin (default: "run")
1286 node - optional node to pass as first argument to plugin method (used
1287 for on_add_node/on_remove_node)
1288 """
1289 plugin_name = name or str(plugin)
1290 try:
1291 func = getattr(plugin, method_name, None)
1292 if not func:
1293 log.warn("Plugin %s has no %s method...skipping" % \
1294 (plugin_name, method_name))
1295 return
1296 args = [self.nodes, self.master_node, self.cluster_user,
1297 self.cluster_shell, self.volumes]
1298 if node:
1299 args.insert(0, node)
1300 log.info("Running plugin %s" % plugin_name)
1301 func(*args)
1302 except NotImplementedError:
1303 log.debug("method %s not implemented by plugin %s" % (method_name,
1304 plugin_name))
1305 except Exception, e:
1306 log.error("Error occured while running plugin '%s':" % plugin_name)
1307 if isinstance(e, exception.ThreadPoolException):
1308 e.print_excs()
1309 log.debug(e.format_excs())
1310 else:
1311 traceback.print_exc()
1312 log.debug(traceback.format_exc())
1313
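# Editor's sketch of a minimal plugin compatible with run_plugin above (the
# class and its argument are hypothetical; the method signatures mirror the
# args list built in run_plugin):
#
#   class ExamplePlugin(clustersetup.ClusterSetup):
#       def __init__(self, my_arg=None):
#           self.my_arg = my_arg
#
#       def run(self, nodes, master, user, user_shell, volumes):
#           log.info("configuring %d node(s)" % len(nodes))
#
#       def on_add_node(self, node, nodes, master, user, user_shell,
#                       volumes):
#           log.info("adding node %s" % node.alias)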
1325
1326 def _validate(self, validate_running=False):
1357
1359 """
1360 Returns True if all cluster template settings are valid
1361 """
1362 try:
1363 self._validate()
1364 return True
1365 except exception.ClusterValidationError, e:
1366 log.error(e.msg)
1367 return False
1368
1378
1380 try:
1381 int(self.cluster_size)
1382 if self.cluster_size < 1:
1383 raise ValueError
1384 except (ValueError, TypeError):
1385 raise exception.ClusterValidationError(
1386 'cluster_size must be an integer >= 1')
1387 num_itypes = sum([i.get('size') for i in self.node_instance_types])
1388 num_nodes = self.cluster_size - 1
1389 if num_itypes > num_nodes:
1390 raise exception.ClusterValidationError(
1391 ("total number of nodes specified in node_instance_type (%s)" +
1392 " must be <= cluster_size-1 (%s)") % (num_itypes, num_nodes))
1393 return True
1394
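# Editor's worked example of the size check above (numbers are
# hypothetical): with cluster_size = 4 there are 3 worker nodes besides the
# master, so the sizes listed in node_instance_types may sum to at most 3;
# a sum of 4 raises ClusterValidationError.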
1396 cluster_shell = self.cluster_shell
1397 if not self.__available_shells.get(cluster_shell):
1398 raise exception.ClusterValidationError(
1399 'Invalid user shell specified. Options are %s' % \
1400 ' '.join(self.__available_shells.keys()))
1401 return True
1402
1417
1419 availability_zone = self.availability_zone
1420 if availability_zone:
1421 zone = self.ec2.get_zone(availability_zone)
1422 if not zone:
1423 azone = self.availability_zone
1424 raise exception.ClusterValidationError(
1425 'availability_zone = %s does not exist' % azone)
1426 if zone.state != 'available':
1427 log.warn('The availability_zone = %s ' % zone +
1428 'is not available at this time')
1429 return True
1430
1461
1463 master_image_id = self.master_image_id
1464 node_image_id = self.node_image_id
1465 master_instance_type = self.master_instance_type
1466 node_instance_type = self.node_instance_type
1467 instance_types = self.__instance_types
1468 instance_type_list = ', '.join(instance_types.keys())
1469 if not node_instance_type in instance_types:
1470 raise exception.ClusterValidationError(
1471 ("You specified an invalid node_instance_type %s \n" +
1472 "Possible options are:\n%s") % \
1473 (node_instance_type, instance_type_list))
1474 elif master_instance_type:
1475 if not master_instance_type in instance_types:
1476 raise exception.ClusterValidationError(
1477 ("You specified an invalid master_instance_type %s\n" + \
1478 "Possible options are:\n%s") % \
1479 (master_instance_type, instance_type_list))
1480 try:
1481 self.__check_platform(node_image_id, node_instance_type)
1482 except exception.ClusterValidationError, e:
1483 raise exception.ClusterValidationError(
1484 'Incompatible node_image_id and node_instance_type:\n' + e.msg)
1485 if master_image_id and not master_instance_type:
1486 try:
1487 self.__check_platform(master_image_id, node_instance_type)
1488 except exception.ClusterValidationError, e:
1489 raise exception.ClusterValidationError(
1490 'Incompatible master_image_id and ' +
1491 'node_instance_type\n' + e.msg)
1492 elif master_image_id and master_instance_type:
1493 try:
1494 self.__check_platform(master_image_id, master_instance_type)
1495 except exception.ClusterValidationError, e:
1496 raise exception.ClusterValidationError(
1497 'Incompatible master_image_id and ' +
1498 'master_instance_type\n' + e.msg)
1499 elif master_instance_type and not master_image_id:
1500 try:
1501 self.__check_platform(node_image_id, master_instance_type)
1502 except exception.ClusterValidationError, e:
1503 raise exception.ClusterValidationError(
1504 'Incompatible node_image_id and ' +
1505 'master_instance_type\n' + e.msg)
1506 for itype in self.node_instance_types:
1507 type = itype.get('type')
1508 img = itype.get('image') or node_image_id
1509 if not type in instance_types:
1510 raise exception.ClusterValidationError(
1511 ("You specified an invalid instance type %s \n" +
1512 "Possible options are:\n%s") % (type, instance_type_list))
1513 try:
1514 self.__check_platform(img, type)
1515 except exception.ClusterValidationError, e:
1516 raise exception.ClusterValidationError(
1517 "Invalid settings for node_instance_type %s: %s" %
1518 (type, e.msg))
1519 return True
1520
1531
1533 """
1534 Verify EBS volumes exist on Amazon and that each volume's zone matches
1535 this cluster's zone setting. Requires AWS credentials.
1536 """
1537 for vol in self.volumes:
1538 v = self.volumes.get(vol)
1539 vol_id = v.get('volume_id')
1540 vol = self.ec2.get_volume(vol_id)
1541 if vol.status != 'available':
1542 if self.master_node:
1543 if vol.attach_data.instance_id == self.master_node.id:
1544 continue
1545 msg = "volume %s is not available (status: %s)" % (vol_id,
1546 vol.status)
1547 raise exception.ClusterValidationError(msg)
1548
1550 permissions = self.permissions
1551 for perm in permissions:
1552 permission = permissions.get(perm)
1553 protocol = permission.get('ip_protocol')
1554 if protocol not in self.__protocols:
1555 raise exception.InvalidProtocol(protocol)
1556 from_port = permission.get('from_port')
1557 to_port = permission.get('to_port')
1558 try:
1559 from_port = int(from_port)
1560 to_port = int(to_port)
1561 except ValueError:
1562 raise exception.InvalidPortRange(
1563 from_port, to_port, reason="integer range required")
1564 if from_port < 0 or to_port < 0:
1565 raise exception.InvalidPortRange(
1566 from_port, to_port,
1567 reason="from/to must be positive integers")
1568 if from_port > to_port:
1569 raise exception.InvalidPortRange(
1570 from_port, to_port,
1571 reason="'from_port' must be <= 'to_port'")
1572 cidr_ip = permission.get('cidr_ip')
1573 if not iptools.validate_cidr(cidr_ip):
1574 raise exception.InvalidCIDRSpecified(cidr_ip)
1575
1577 """
1578 Check EBS vols for missing/duplicate DEVICE/PARTITION/MOUNT_PATHs
1579 and validate these settings. Does not require AWS credentials.
1580 """
1581 vol_ids = []
1582 devices = []
1583 mount_paths = []
1584 for vol in self.volumes:
1585 vol_name = vol
1586 vol = self.volumes.get(vol)
1587 vol_id = vol.get('volume_id')
1588 device = vol.get('device')
1589 partition = vol.get('partition')
1590 mount_path = vol.get("mount_path")
1591 mount_paths.append(mount_path)
1592 devices.append(device)
1593 vol_ids.append(vol_id)
1594 if not device:
1595 raise exception.ClusterValidationError(
1596 'Missing DEVICE setting for volume %s' % vol_name)
1597 if not utils.is_valid_device(device):
1598 raise exception.ClusterValidationError(
1599 "Invalid DEVICE value for volume %s" % vol_name)
1600 if partition:
1601 if not utils.is_valid_partition(partition):
1602 raise exception.ClusterValidationError(
1603 "Invalid PARTITION value for volume %s" % vol_name)
1604 if not partition.startswith(device):
1605 raise exception.ClusterValidationError(
1606 "Volume PARTITION must start with %s" % device)
1607 if not mount_path:
1608 raise exception.ClusterValidationError(
1609 'Missing MOUNT_PATH setting for volume %s' % vol_name)
1610 if not mount_path.startswith('/'):
1611 raise exception.ClusterValidationError(
1612 "MOUNT_PATH for volume %s should start with /" % vol_name)
1613 for vol_id in vol_ids:
1614 if vol_ids.count(vol_id) > 1:
1615 raise exception.ClusterValidationError(
1616 ("Multiple configurations for volume %s specified. " + \
1617 "Please choose one") % vol_id)
1618 for dev in devices:
1619 if devices.count(dev) > 1:
1620 raise exception.ClusterValidationError(
1621 "Can't attach more than one volume on device %s" % dev)
1622 for path in mount_paths:
1623 if mount_paths.count(path) > 1:
1624 raise exception.ClusterValidationError(
1625 "Can't mount more than one volume on %s" % path)
1626 return True
1627
1629 has_all_required = True
1630 for opt in self.__cluster_settings:
1631 requirements = self.__cluster_settings[opt]
1632 name = opt
1633 required = requirements[1]
1634 if required and self.get(name.lower()) is None:
1635 log.warn('Missing required setting %s' % name)
1636 has_all_required = False
1637 return has_all_required
1638
1644
1670
1673
1681
1682 if __name__ == "__main__":
1683 from starcluster.config import StarClusterConfig
1684 cfg = StarClusterConfig().load()
1685 sc = cfg.get_cluster_template('smallcluster', 'mynewcluster')
1686 if sc.is_valid():
1687 sc.start(create=True)
1688