import os
import re
import time
import string
import platform
import pprint
import inspect
import cPickle

from starcluster import ssh
from starcluster import awsutils
from starcluster import clustersetup
from starcluster import static
from starcluster import exception
from starcluster import utils
from starcluster.utils import print_timing
from starcluster.spinner import Spinner
from starcluster.logger import log, INFO_NO_NEWLINE
from starcluster.node import Node

def get_cluster_or_none(cluster_name, cfg):
    """
    Same as get_cluster, only returns None instead of raising an exception
    if the cluster is not found
    """
    try:
        return get_cluster(cluster_name, cfg)
    except Exception:
        pass

def _get_node_number(alias):
    """
    Maps aliases master, node001, etc to 0, 1, etc.

    Returns an integer (>=0) representing the node "number" if successful,
    and returns None otherwise
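
    Example (doctest-style, illustrative):
        >>> _get_node_number("master")
        0
        >>> _get_node_number("node001")
        1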
65 """
66 if alias == "master":
67 return 0
68 else:
69 pattern = re.compile(r"node([0-9][0-9][0-9])")
70 if pattern.match(alias) and len(alias) == 7:
71 return int(pattern.match(alias).groups()[0])
72
122 """
123 Returns the cluster tag name from a security group name that starts with
124 static.SECURITY_GROUP_PREFIX
125
126 Example:
127 sg = '@sc-mycluster'
128 print get_tag_from_sg(sg)
129 mycluster
130 """
131 regex = re.compile(static.SECURITY_GROUP_PREFIX + '-(.*)')
132 match = regex.match(sg)
133 if match:
134 return match.groups()[0]
135

class Cluster(object):
    def __init__(self,
                 aws_access_key_id=None,
                 aws_secret_access_key=None,
                 aws_port=None,
                 aws_is_secure=True,
                 aws_ec2_path='/',
                 aws_s3_path='/',
                 aws_region_name=None,
                 aws_region_host=None,
                 spot_bid=None,
                 cluster_tag=None,
                 cluster_description=None,
                 cluster_size=None,
                 cluster_user=None,
                 cluster_shell=None,
                 master_image_id=None,
                 master_instance_type=None,
                 node_image_id=None,
                 node_instance_type=None,
                 availability_zone=None,
                 keyname=None,
                 key_location=None,
                 volumes=None,
                 plugins=None,
                 **kwargs):
        # avoid mutable default arguments shared between instances
        volumes = volumes or []
        plugins = plugins or []

        now = time.strftime("%Y%m%d%H%M")

        self.ec2 = awsutils.EasyEC2(
            aws_access_key_id, aws_secret_access_key,
            aws_port=aws_port, aws_is_secure=aws_is_secure,
            aws_ec2_path=aws_ec2_path, aws_s3_path=aws_s3_path,
            aws_region_name=aws_region_name,
            aws_region_host=aws_region_host,
        )
        self.spot_bid = spot_bid
        self.cluster_tag = cluster_tag
        self.cluster_description = cluster_description
        if self.cluster_tag is None:
            self.cluster_tag = now
        if cluster_description is None:
            self.cluster_description = "Cluster created at %s" % now
        self.cluster_size = cluster_size
        self.cluster_user = cluster_user
        self.cluster_shell = cluster_shell
        self.master_image_id = master_image_id
        self.master_instance_type = master_instance_type
        self.node_image_id = node_image_id
        self.node_instance_type = node_instance_type
        self.availability_zone = availability_zone
        self.keyname = keyname
        self.key_location = key_location
        self.volumes = self.load_volumes(volumes)
        self.plugins = plugins

        self.__instance_types = static.INSTANCE_TYPES
        self.__cluster_settings = static.CLUSTER_SETTINGS
        self.__available_shells = static.AVAILABLE_SHELLS
        self._master_reservation = None
        self._node_reservation = None
        self._nodes = None
        self._master = None
        self._plugins = self.load_plugins(plugins)
        self._zone = None

    @property
    def zone(self):
        """
        If volumes are specified, this method determines the common
        availability zone between those volumes. If an availability zone is
        explicitly specified in the config and does not match the common
        availability zone of the volumes, an exception is raised. If the
        volumes are not all in the same availability zone, an exception is
        raised. If no volumes are specified, returns the user-specified
        availability zone if it exists.
        """
        if not self._zone:
            zone = None
            if self.availability_zone:
                zone = self.ec2.get_zone(self.availability_zone).name
            common_zone = None
            for volume in self.volumes:
                volid = self.volumes.get(volume).get('volume_id')
                vol = self.ec2.get_volume(volid)
                if not common_zone:
                    common_zone = vol.zone
                elif vol.zone != common_zone:
                    vols = [self.volumes.get(v).get('volume_id')
                            for v in self.volumes]
                    raise exception.VolumesZoneError(vols)
            if common_zone and zone and zone != common_zone:
                raise exception.InvalidZone(zone, common_zone)
            if not zone and common_zone:
                zone = common_zone
            self._zone = zone
        return self._zone

    def load_volumes(self, vols):
        """
        Iterate through vols and set device/partition settings automatically
        if not specified.

        This method assigns the first volume to /dev/sdz, second to /dev/sdy,
        etc. for all volumes that do not include a device/partition setting
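
        For example (illustrative), two volumes configured without a 'device'
        setting are assigned /dev/sdz and /dev/sdy respectively, while a
        volume that already specifies 'device' keeps its setting.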
287 """
288 devices = [ '/dev/sd%s' % s for s in string.lowercase ]
289 for volname in vols:
290 vol = vols.get(volname)
291 dev = vol.get('device')
292 if dev in devices:
293
294 devices.remove(dev)
295 volumes = {}
296 for volname in vols:
297 vol = vols.get(volname)
298 device = vol.get('device')
299 if not device:
300 device = devices.pop()
301 if not utils.is_valid_device(device):
302 raise exception.InvalidDevice(device)
303 v = volumes[volname] = utils.AttributeDict()
304 v.update(vol)
305 v['device'] = device
306 part = vol.get('partition',1)
307 partition = device + str(part)
308 if not utils.is_valid_partition(partition):
309 raise exception.InvalidPartition(part)
310 v['partition'] = partition
311 return volumes

    def load_plugins(self, plugins):
        plugs = []
        for plugin in plugins:
            setup_class = plugin.get('setup_class')
            plugin_name = plugin.get('__name__')
            mod_name = '.'.join(setup_class.split('.')[:-1])
            class_name = setup_class.split('.')[-1]
            try:
                mod = __import__(mod_name, globals(), locals(), [class_name])
            except SyntaxError, e:
                raise exception.PluginSyntaxError(
                    "Plugin %s (%s) contains a syntax error at line %s" %
                    (plugin_name, e.filename, e.lineno))
            except ImportError, e:
                raise exception.PluginLoadError(
                    "Failed to import plugin %s: %s" %
                    (plugin_name, e.message))
            klass = getattr(mod, class_name, None)
            if not klass:
                raise exception.PluginError(
                    'Plugin class %s does not exist' % setup_class)
            if not issubclass(klass, clustersetup.ClusterSetup):
                raise exception.PluginError(
                    "Plugin %s must be a subclass of "
                    "starcluster.clustersetup.ClusterSetup" % setup_class)
            argspec = inspect.getargspec(klass.__init__)
            args = argspec.args[1:]
            nargs = len(args)
            ndefaults = 0
            if argspec.defaults:
                ndefaults = len(argspec.defaults)
            nrequired = nargs - ndefaults
            config_args = []
            for arg in argspec.args:
                if arg in plugin:
                    config_args.append(plugin.get(arg))
            log.debug("config_args = %s" % config_args)
            log.debug("args = %s" % argspec.args)
            # only fail when fewer settings than required args are provided;
            # optional (defaulted) args may legitimately be configured too
            if len(config_args) < nrequired:
                raise exception.PluginError(
                    "Not enough settings provided for plugin %s" %
                    plugin_name)
            plugs.append((plugin_name, klass(*config_args)))
        return plugs
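
    # An illustrative plugin config entry as consumed by load_plugins
    # (the keys are taken from the lookups above; the module path and the
    # extra argument are hypothetical):
    #
    #     {'__name__': 'myplugin',
    #      'setup_class': 'mypackage.myplugin.MyPluginSetup',
    #      'my_arg': 'value'}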

    def update(self, kwargs):
        for key in kwargs.keys():
            if hasattr(self, key):
                self.__dict__[key] = kwargs[key]

    def _validate_running_instances(self):
        """
        Validate existing instances against this template's settings
        """
        self._validate_instance_types()
        num_running = len(self.nodes)
        if num_running != self.cluster_size:
            raise exception.ClusterValidationError(
                "Number of pending/running instances (%s) != %s" %
                (num_running, self.cluster_size))
        mtype = self.master_node.instance_type
        mastertype = self.master_instance_type or self.node_instance_type
        if mtype != mastertype:
            raise exception.ClusterValidationError(
                "The running master node's instance type (%s) != %s" %
                (mtype, mastertype))
        masterimage = self.master_image_id or self.node_image_id
        mimage = self.master_node.image_id
        if mimage != masterimage:
            raise exception.ClusterValidationError(
                "The running master node's image id (%s) != %s" %
                (mimage, masterimage))
        mkey = self.master_node.key_name
        if mkey != self.keyname:
            raise exception.ClusterValidationError(
                "The running master's keypair (%s) != %s" %
                (mkey, self.keyname))
        # slicing never raises IndexError, so check for an empty cluster
        # explicitly
        if not self.nodes:
            raise exception.ClusterValidationError(
                "Cluster has no running instances")
        nodes = self.nodes[1:self.cluster_size]
        mazone = self.master_node.placement
        for n in nodes:
            ntype = n.instance_type
            if ntype != self.node_instance_type:
                raise exception.ClusterValidationError(
                    "Running node's instance type (%s) != %s" %
                    (ntype, self.node_instance_type))
            nimage = n.image_id
            if nimage != self.node_image_id:
                raise exception.ClusterValidationError(
                    "Running node's image id (%s) != %s" %
                    (nimage, self.node_image_id))
            if n.key_name != self.keyname:
                raise exception.ClusterValidationError(
                    "Running node's key_name (%s) != %s" %
                    (n.key_name, self.keyname))
            nazone = n.placement
            if mazone != nazone:
                raise exception.ClusterValidationError(
                    "Running master zone (%s) does not match node zone (%s)" %
                    (mazone, nazone))
        # reset and recompute the zone setting from the template
        self._zone = None
        if self.zone and self.zone != mazone:
            raise exception.ClusterValidationError(
                "Running cluster's availability_zone (%s) != %s" %
                (mazone, self.zone))

    def get(self, name):
        return self.__dict__.get(name)

    def __str__(self):
        cfg = {}
        for key in self.__dict__.keys():
            if not key.startswith('_'):
                cfg[key] = getattr(self, key)
        return pprint.pformat(cfg)

    # ... several @property definitions elided ...

    @property
    def nodes(self):
        if not self._nodes:
            nodes = self.cluster_group.instances()
            self._nodes = []
            master = self.master_node
            nodeid = 1
            for node in nodes:
                if node.state not in ['pending', 'running']:
                    continue
                if node.id == master.id:
                    self._nodes.insert(0, master)
                    continue
                self._nodes.append(Node(node, self.key_location,
                                        'node%.3d' % nodeid))
                nodeid += 1
        else:
            for node in self._nodes:
                log.debug('refreshing instance %s' % node.id)
                node.update()
        return self._nodes

    # ... additional methods and properties elided ...

    def run_instances(self, price=None, image_id=None,
                      instance_type='m1.small', min_count=1, max_count=1,
                      count=1, key_name=None, security_groups=None,
                      launch_group=None, availability_zone_group=None,
                      placement=None):
        conn = self.ec2
        if price:
            return conn.request_spot_instances(
                price, image_id,
                instance_type=instance_type,
                count=count,
                launch_group=launch_group,
                key_name=key_name,
                security_groups=security_groups,
                availability_zone_group=availability_zone_group,
                placement=placement)
        else:
            return conn.run_instances(image_id, instance_type=instance_type,
                                      min_count=min_count,
                                      max_count=max_count,
                                      key_name=key_name,
                                      security_groups=security_groups,
                                      placement=placement)
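
    # Illustrative calls (all values hypothetical): a non-None price turns
    # the request into a spot request, otherwise on-demand instances are
    # launched:
    #
    #     self.run_instances(0.50, image_id='ami-12345678', count=2,
    #                        security_groups=['@sc-mycluster'])
    #     self.run_instances(None, image_id='ami-12345678')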

    def create_cluster(self):
        log.info("Launching a %d-node cluster..." % self.cluster_size)
        if self.master_image_id is None:
            self.master_image_id = self.node_image_id
        if self.master_instance_type is None:
            self.master_instance_type = self.node_instance_type
        log.info("Launching master node...")
        log.info("Master AMI: %s" % self.master_image_id)
        master_sg = self.master_group.name
        cluster_sg = self.cluster_group.name
        zone = self.zone
        master_response = self.run_instances(
            self.spot_bid,
            image_id=self.master_image_id,
            instance_type=self.master_instance_type,
            min_count=1, max_count=1, count=1,
            key_name=self.keyname,
            security_groups=[master_sg, cluster_sg],
            availability_zone_group=cluster_sg,
            launch_group=cluster_sg,
            placement=zone)
        print master_response

        # launch worker nodes in the same zone the master was placed in
        if self.spot_bid:
            launch_spec = master_response[0].launch_specification
            zone = launch_spec.placement
        else:
            zone = master_response.instances[0].placement
        if self.cluster_size > 1:
            log.info("Launching worker nodes...")
            log.info("Node AMI: %s" % self.node_image_id)
            instances_response = self.run_instances(
                self.spot_bid,
                image_id=self.node_image_id,
                instance_type=self.node_instance_type,
                min_count=max((self.cluster_size - 1) / 2, 1),
                max_count=max(self.cluster_size - 1, 1),
                count=max(self.cluster_size - 1, 1),
                key_name=self.keyname,
                security_groups=[cluster_sg],
                availability_zone_group=cluster_sg,
                launch_group=cluster_sg,
                placement=zone)
            print instances_response

    def is_cluster_up(self):
        """
        Check whether there are cluster_size nodes running, that ssh
        (port 22) is up on all nodes, and that each node has an internal
        ip address associated with it
        """
        nodes = self.running_nodes
        if len(nodes) != self.cluster_size:
            return False
        for node in nodes:
            if not node.is_up():
                return False
        return True

    def attach_volumes_to_master(self):
        for vol in self.volumes:
            volume = self.volumes.get(vol)
            device = volume.get('device')
            vol_id = volume.get('volume_id')
            vol = self.ec2.get_volume(vol_id)
            log.info("Attaching volume %s to master node on %s ..." %
                     (vol.id, device))
            if vol.status != "available":
                log.error('Volume %s not available...'
                          'please check and try again' % vol.id)
                continue
            resp = vol.attach(self.master_node.id, device)
            log.debug("resp = %s" % resp)
            # poll until the volume reports itself attached
            while True:
                vol.update()
                if vol.attachment_state() == 'attached':
                    break
                time.sleep(5)

    def detach_volumes(self):
        for vol in self.volumes:
            vol_id = self.volumes.get(vol).get('volume_id')
            vol = self.ec2.get_volume(vol_id)
            log.info("Detaching volume %s from master" % vol.id)
            vol.detach()

    @print_timing
    def start(self, create=True):
        log.info("Starting cluster...")
        if create:
            self.create_cluster()
        s = Spinner()
        log.log(INFO_NO_NEWLINE, "Waiting for cluster to start...")
        s.start()
        while not self.is_cluster_up():
            time.sleep(60)
        s.stop()

        log.info("The master node is %s" % self.master_node.dns_name)

        if self.volumes:
            self.attach_volumes_to_master()

        log.info("Setting up the cluster...")
        clustersetup.DefaultClusterSetup().run(
            self.nodes, self.master_node,
            self.cluster_user, self.cluster_shell,
            self.volumes)
        self.create_receipt()
        for plugin in self._plugins:
            plugin_name = plugin[0]
            plug = plugin[1]
            try:
                log.info("Running plugin %s" % plugin_name)
                plug.run(self.nodes, self.master_node, self.cluster_user,
                         self.cluster_shell, self.volumes)
            except Exception, e:
                log.error("Error occurred while running plugin '%s':" %
                          plugin_name)
                print e

        log.info("""

The cluster has been started and configured.

Login to the master node as root by running:

    $ starcluster sshmaster %(tag)s

or manually as %(user)s:

    $ ssh -i %(key)s %(user)s@%(master)s

When you are finished using the cluster, run:

    $ starcluster stop %(tag)s

to shut down the cluster and stop paying for service

""" % {
            'master': self.master_node.dns_name,
            'user': self.cluster_user,
            'key': self.key_location,
            'tag': self.cluster_tag,
        })
    def _validate_cluster_size(self):
        # check the type first; in Python 2, None <= 0 silently compares True
        if not isinstance(self.cluster_size, int) or self.cluster_size <= 0:
            raise exception.ClusterValidationError(
                'cluster_size must be a positive integer.')
        return True

    def _validate_shell_setting(self):
        cluster_shell = self.cluster_shell
        if not self.__available_shells.get(cluster_shell):
            raise exception.ClusterValidationError(
                'Invalid user shell specified. Options are %s' %
                ' '.join(self.__available_shells.keys()))
        return True

    def _validate_availability_zone(self):
        availability_zone = self.availability_zone
        if availability_zone:
            zone = self.ec2.get_zone(availability_zone)
            if not zone:
                raise exception.ClusterValidationError(
                    'availability_zone = %s does not exist' %
                    availability_zone)
            if zone.state != 'available':
                log.warn('The availability_zone = %s '
                         'is not available at this time' % zone)
        return True

    def _validate_instance_types(self):
        master_image_id = self.master_image_id
        node_image_id = self.node_image_id
        master_instance_type = self.master_instance_type
        node_instance_type = self.node_instance_type
        instance_types = self.__instance_types
        instance_type_list = ' '.join(instance_types.keys())
        if not instance_types.has_key(node_instance_type):
            raise exception.ClusterValidationError(
                ("You specified an invalid node_instance_type %s\n" +
                 "Possible options are:\n%s") %
                (node_instance_type, instance_type_list))
        elif master_instance_type:
            if not instance_types.has_key(master_instance_type):
                raise exception.ClusterValidationError(
                    ("You specified an invalid master_instance_type %s\n" +
                     "Possible options are:\n%s") %
                    (master_instance_type, instance_type_list))
        try:
            self.__check_platform(node_image_id, node_instance_type)
        except exception.ClusterValidationError, e:
            raise exception.ClusterValidationError(
                'Incompatible node_image_id and node_instance_type\n' +
                e.msg)
        if master_image_id and not master_instance_type:
            try:
                self.__check_platform(master_image_id, node_instance_type)
            except exception.ClusterValidationError, e:
                raise exception.ClusterValidationError(
                    'Incompatible master_image_id and '
                    'node_instance_type\n' + e.msg)
        elif master_image_id and master_instance_type:
            try:
                self.__check_platform(master_image_id, master_instance_type)
            except exception.ClusterValidationError, e:
                raise exception.ClusterValidationError(
                    'Incompatible master_image_id and '
                    'master_instance_type\n' + e.msg)
        elif master_instance_type and not master_image_id:
            try:
                self.__check_platform(node_image_id, master_instance_type)
            except exception.ClusterValidationError, e:
                raise exception.ClusterValidationError(
                    'Incompatible node_image_id and '
                    'master_instance_type\n' + e.msg)
        return True

    def _validate_ebs_aws_settings(self):
        """
        Verify that the EBS volumes exist on Amazon and that each volume's
        zone matches this cluster's zone setting. Requires AWS credentials.
        """
        zone = self.zone
        for vol in self.volumes:
            v = self.volumes.get(vol)
            vol_id = v.get('volume_id')
            vol = self.ec2.get_volume(vol_id)
            if vol.status != 'available':
                msg = "volume %s is not available (status: %s)" % \
                    (vol_id, vol.status)
                raise exception.ClusterValidationError(msg)
            # ensure the volume resides in this cluster's zone, as the
            # docstring promises
            if zone and vol.zone != zone:
                raise exception.ClusterValidationError(
                    "volume %s is not in zone %s" % (vol_id, zone))

    def _validate_ebs_settings(self):
        """
        Check EBS vols for missing/duplicate DEVICE/PARTITION/MOUNT_PATHs
        and validate these settings. Does not require AWS credentials.
        """
        vol_ids = []
        devices = []
        mount_paths = []
        for vol_name in self.volumes:
            vol = self.volumes.get(vol_name)
            vol_id = vol.get('volume_id')
            device = vol.get('device')
            partition = vol.get('partition')
            mount_path = vol.get("mount_path")
            mount_paths.append(mount_path)
            devices.append(device)
            vol_ids.append(vol_id)
            if not device:
                raise exception.ClusterValidationError(
                    'Missing DEVICE setting for volume %s' % vol_name)
            if not utils.is_valid_device(device):
                raise exception.ClusterValidationError(
                    "Invalid DEVICE value for volume %s" % vol_name)
            if not partition:
                raise exception.ClusterValidationError(
                    'Missing PARTITION setting for volume %s' % vol_name)
            if not utils.is_valid_partition(partition):
                raise exception.ClusterValidationError(
                    "Invalid PARTITION value for volume %s" % vol_name)
            if not partition.startswith(device):
                raise exception.ClusterValidationError(
                    "Volume PARTITION must start with %s" % device)
            if not mount_path:
                raise exception.ClusterValidationError(
                    'Missing MOUNT_PATH setting for volume %s' % vol_name)
            if not mount_path.startswith('/'):
                raise exception.ClusterValidationError(
                    "MOUNT_PATH for volume %s should start with /" %
                    vol_name)
        for vol_id in vol_ids:
            if vol_ids.count(vol_id) > 1:
                raise exception.ClusterValidationError(
                    ("Multiple configurations for volume %s specified. " +
                     "Please choose one") % vol_id)
        for dev in devices:
            if devices.count(dev) > 1:
                raise exception.ClusterValidationError(
                    "Can't attach more than one volume on device %s" % dev)
        for path in mount_paths:
            if mount_paths.count(path) > 1:
                raise exception.ClusterValidationError(
                    "Can't mount more than one volume on %s" % path)
        return True
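
    # An illustrative volume entry that passes the checks above (all values
    # hypothetical):
    #
    #     {'volume_id': 'vol-12345678', 'device': '/dev/sdz',
    #      'partition': '/dev/sdz1', 'mount_path': '/data'}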

    def _has_all_required_settings(self):
        has_all_required = True
        for opt in self.__cluster_settings:
            requirements = self.__cluster_settings[opt]
            name = opt
            required = requirements[1]
            if required and self.get(name.lower()) is None:
                log.warn('Missing required setting %s' % name)
                has_all_required = False
        return has_all_required


if __name__ == "__main__":
    from starcluster.config import StarClusterConfig
    cfg = StarClusterConfig()
    cfg.load()
    sc = cfg.get_cluster_template('smallcluster', 'mynewcluster')
    if sc.is_valid():
        sc.start(create=True)