1 import os
2 import time
3 import stat
4 import base64
5 import posixpath
6
7 from starcluster import ssh
8 from starcluster import utils
9 from starcluster import static
10 from starcluster import awsutils
11 from starcluster import managers
12 from starcluster import exception
13 from starcluster.logger import log
17 """
18 Manager class for Node objects
19 """
20 - def ssh_to_node(self, node_id, user='root', command=None):
30
31 - def get_node(self, node_id, user='root'):
47
48
49 -class Node(object):
50 """
51 This class represents a single compute node in a StarCluster.
52
53 It contains all useful metadata for the node such as the internal/external
54 hostnames, ips, etc. as well as a paramiko ssh object for executing
55 commands, creating/modifying files on the node.
56
57 'instance' arg must be an instance of boto.ec2.instance.Instance
58
59 'key_location' arg is a string that contains the full path to the
60 private key corresponding to the keypair used to launch this node
61
62 'alias' keyword arg optionally names the node. If no alias is provided,
63 the alias is retrieved from the node's user_data based on the node's
64 launch index
65
66 'user' keyword optionally specifies user to ssh as (defaults to root)
67 """
def __init__(self, instance, key_location, alias=None, user='root'):
    """
    Set up a node wrapper around an EC2 instance.

    instance - the instance object to wrap; its 'connection' attribute is
    reused as the connection of this node's EasyEC2 helper
    key_location - path to the private key used when connecting to the node
    alias - optional name for the node (stored as-is in self._alias)
    user - user to ssh as (defaults to root)
    """
    # Build the EC2 helper without credentials and point it at the
    # connection the instance already carries.
    ec2 = awsutils.EasyEC2(None, None)
    ec2._conn = instance.connection
    self.instance = instance
    self.ec2 = ec2
    self.key_location = key_location
    self.user = user
    self._alias = alias
    # Cached values, all unset until first needed.
    self._groups = self._ssh = self._num_procs = self._memory = None
79
81 return '<Node: %s (%s)>' % (self.alias, self.id)
82
84 tries = range(tries)
85 last_try = tries[-1]
86 for i in tries:
87 try:
88 user_data = self.ec2.get_instance_user_data(self.id)
89 return user_data
90 except exception.InstanceDoesNotExist:
91 if i == last_try:
92 log.debug("failed fetching user data")
93 raise
94 log.debug("InvalidInstanceID.NotFound: "
95 "retrying fetching user data (tries: %s)" % (i + 1))
96 time.sleep(5)
97
98 @property
100 """
101 Fetches the node's alias stored in a tag from either the instance
102 or the instance's parent spot request. If no alias tag is found an
103 exception is raised.
104 """
105 if not self._alias:
106 alias = self.tags.get('alias')
107 if not alias:
108 user_data = self._get_user_data(tries=5)
109 aliases = user_data.split('|')
110 index = self.ami_launch_index
111 try:
112 alias = aliases[index]
113 except IndexError:
114 log.debug(
115 "invalid user_data: %s (index: %d)" % (aliases, index))
116 alias = None
117 if not alias:
118 raise exception.BaseException(
119 "instance %s has no alias" % self.id)
120 self.add_tag('alias', alias)
121 name = self.tags.get('Name')
122 if not name:
123 self.add_tag('Name', alias)
124 self._alias = alias
125 return self._alias
126
131
132 @property
135
def add_tag(self, key, value=None):
    """
    Add a tag to the underlying instance.

    key - the tag's name
    value - optional tag value (defaults to None)

    Returns whatever the instance's own add_tag call returns.
    """
    instance = self.instance
    result = instance.add_tag(key, value)
    return result
138
141
142 @property
148
149 @property
153
154 @property
163
164 @property
166 if not self._num_procs:
167 self._num_procs = int(
168 self.ssh.execute(
169 'cat /proc/cpuinfo | grep processor | wc -l')[0])
170 return self._num_procs
171
172 @property
174 if not self._memory:
175 self._memory = float(
176 self.ssh.execute(
177 "free -m | grep -i mem | awk '{print $2}'")[0])
178 return self._memory
179
180 @property
183
184 @property
187
188 @property
191
192 @property
195
196 @property
199
200 @property
202 return self.instance.id
203
204 @property
207
208 @property
211
212 @property
214 return self.instance.state
215
216 @property
219
220 @property
224
225 @property
228
229 @property
231 try:
232 return int(self.instance.ami_launch_index)
233 except TypeError:
234 log.error("instance %s (state: %s) has no ami_launch_index" %
235 (self.id, self.state))
236 log.error("returning 0 as ami_launch_index...")
237 return 0
238
239 @property
242
243 @property
245 return self.instance.architecture
246
247 @property
249 return self.instance.kernel
250
251 @property
254
255 @property
258
259 @property
262
263 @property
266
267 @property
269 return self.instance.region
270
271 @property
274
275 @property
278
289
291 """
292 Returns dictionary where keys are remote group names and values are
293 grp.struct_grp objects from the standard grp module
294
295 key_by_gid=True will use the integer gid as the returned dictionary's
296 keys instead of the group's name
297 """
298 grp_file = self.ssh.remote_file('/etc/group', 'r')
299 groups = [l.strip().split(':') for l in grp_file.readlines()]
300 grp_file.close()
301 grp_map = {}
302 for group in groups:
303 name, passwd, gid, mems = group
304 gid = int(gid)
305 mems = mems.split(',')
306 key = name
307 if key_by_gid:
308 key = gid
309 grp_map[key] = utils.struct_group([name, passwd, gid, mems])
310 return grp_map
311
313 """
314 Returns dictionary where keys are remote usernames and values are
315 pwd.struct_passwd objects from the standard pwd module
316
317 key_by_uid=True will use the integer uid as the returned dictionary's
318 keys instead of the user's login name
319 """
320 etc_passwd = self.ssh.remote_file('/etc/passwd', 'r')
321 users = [l.strip().split(':') for l in etc_passwd.readlines()]
322 etc_passwd.close()
323 user_map = {}
324 for user in users:
325 name, passwd, uid, gid, gecos, home, shell = user
326 uid = int(uid)
327 gid = int(gid)
328 key = name
329 if key_by_uid:
330 key = uid
331 user_map[key] = utils.struct_passwd([name, passwd, uid, gid,
332 gecos, home, shell])
333 return user_map
334
336 """
337 Remote version of the getgrgid method in the standard grp module
338
339 returns a grp.struct_group
340 """
341 gmap = self.get_group_map(key_by_gid=True)
342 return gmap.get(gid)
343
345 """
346 Remote version of the getgrnam method in the standard grp module
347
348 returns a grp.struct_group
349 """
350 gmap = self.get_group_map()
351 return gmap.get(groupname)
352
354 """
355 Remote version of the getpwuid method in the standard pwd module
356
357 returns a pwd.struct_passwd
358 """
359 umap = self.get_user_map(key_by_uid=True)
360 return umap.get(uid)
361
363 """
364 Remote version of the getpwnam method in the standard pwd module
365
366 returns a pwd.struct_passwd
367 """
368 umap = self.get_user_map()
369 return umap.get(username)
370
def add_user(self, name, uid=None, gid=None, shell="bash"):
    """
    Create a new user account on the remote system.

    name - login name for the new user (also used as the group name when
    a gid is given)
    uid - optional user id for the new user
    gid - optional group id; when given, a group called 'name' is created
    with this id before the user is added
    shell - optional login shell, resolved remotely via `which`
    (default: bash)
    """
    if gid:
        # The group must exist before useradd can reference it with -g.
        self.ssh.execute('groupadd -o -g %s %s' % (gid, name))
    pieces = ['useradd -o ']
    if uid:
        pieces.append('-u %s ' % uid)
    if gid:
        pieces.append('-g %s ' % gid)
    if shell:
        pieces.append('-s `which %s` ' % shell)
    # -m creates the user's home directory
    pieces.append("-m %s" % name)
    self.ssh.execute(''.join(pieces))
391
def generate_key_for_user(self, username, ignore_existing=False,
                          auth_new_key=False, auth_conn_key=False):
    """
    Generates an id_rsa/id_rsa.pub keypair combo for a user on the remote
    machine and optionally authorizes keys for that user.

    username - remote user to generate the key for; the key files live in
    the .ssh folder of that user's home directory

    ignore_existing - if False, an existing id_rsa is loaded and reused
    rather than generating (and writing) a new RSA key

    auth_new_key - if True, add the (new or reused) public key to the
    remote user's authorized_keys file

    auth_conn_key - if True, add the public key used to establish this ssh
    connection to the remote user's authorized_keys

    auth_new_key and auth_conn_key are independent: either one alone is
    enough to update authorized_keys.

    Returns the RSA key object that was loaded or generated.
    """
    user = self.getpwnam(username)
    ssh_folder = posixpath.join(user.pw_dir, '.ssh')
    if not self.ssh.isdir(ssh_folder):
        self.ssh.mkdir(ssh_folder)
    private_key = posixpath.join(ssh_folder, 'id_rsa')
    public_key = private_key + '.pub'
    authorized_keys = posixpath.join(ssh_folder, 'authorized_keys')
    key_exists = self.ssh.isfile(private_key)
    reuse_key = key_exists and not ignore_existing
    if reuse_key:
        log.info("Using existing key: %s" % private_key)
        key = self.ssh.load_remote_rsa_key(private_key)
    else:
        key = self.ssh.generate_rsa_key()
    pubkey_contents = self.ssh.get_public_key(key)
    if not reuse_key:
        # A fresh key was generated: persist both halves, owned by the
        # user and read-only for the owner (0400).
        pub_key = self.ssh.remote_file(public_key, 'w')
        try:
            pub_key.write(pubkey_contents)
            pub_key.chown(user.pw_uid, user.pw_gid)
            pub_key.chmod(stat.S_IRUSR)
        finally:
            pub_key.close()
        priv_key = self.ssh.remote_file(private_key, 'w')
        try:
            key.write_private_key(priv_key)
            priv_key.chown(user.pw_uid, user.pw_gid)
            priv_key.chmod(stat.S_IRUSR)
        finally:
            priv_key.close()
    # Only skip authorized_keys when *neither* option was requested; each
    # flag must work on its own.
    if not (auth_new_key or auth_conn_key):
        return key
    auth_keys_contents = ''
    if self.ssh.isfile(authorized_keys):
        auth_keys = self.ssh.remote_file(authorized_keys, 'r')
        try:
            auth_keys_contents = auth_keys.read()
        finally:
            auth_keys.close()
    auth_keys = self.ssh.remote_file(authorized_keys, 'a')
    try:
        if auth_new_key and pubkey_contents not in auth_keys_contents:
            log.debug("adding auth_key_contents")
            new_line = '%s\n' % pubkey_contents
            auth_keys.write(new_line)
            # Remember what was just appended so the connection key is
            # not written a second time if it is the same key.
            auth_keys_contents += new_line
        if auth_conn_key and self.ssh._pkey:
            conn_pubkey_contents = self.ssh.get_public_key(self.ssh._pkey)
            if conn_pubkey_contents not in auth_keys_contents:
                log.debug("adding conn_pubkey_contents")
                auth_keys.write('%s\n' % conn_pubkey_contents)
        auth_keys.chown(user.pw_uid, user.pw_gid)
        # owner read/write only (0600)
        auth_keys.chmod(stat.S_IRUSR | stat.S_IWUSR)
    finally:
        auth_keys.close()
    return key
460
462 """
463 Populate user's known_hosts file with pub keys from hosts in nodes list
464
465 username - name of the user to add to known hosts for
466 nodes - the nodes to add to the user's known hosts file
467 add_self - add this Node to known_hosts in addition to nodes
468 """
469 user = self.getpwnam(username)
470 known_hosts_file = posixpath.join(user.pw_dir, '.ssh', 'known_hosts')
471 self.remove_from_known_hosts(username, nodes)
472 khosts = []
473 if add_self and self not in nodes:
474 nodes.append(self)
475 for node in nodes:
476 server_pkey = node.ssh.get_server_public_key()
477 node_names = {}.fromkeys([node.alias, node.private_dns_name,
478 node.private_dns_name_short],
479 node.private_ip_address)
480 node_names[node.public_dns_name] = node.ip_address
481 for name, ip in node_names.items():
482 name_ip = "%s,%s" % (name, ip)
483 khosts.append(' '.join([name_ip, server_pkey.get_name(),
484 base64.b64encode(str(server_pkey))]))
485 khostsf = self.ssh.remote_file(known_hosts_file, 'a')
486 khostsf.write('\n'.join(khosts) + '\n')
487 khostsf.chown(user.pw_uid, user.pw_gid)
488 khostsf.close()
489
504
506 """
507 Configure passwordless ssh for user between this Node and nodes
508 """
509 user = self.getpwnam(username)
510 ssh_folder = posixpath.join(user.pw_dir, '.ssh')
511 priv_key_file = posixpath.join(ssh_folder, 'id_rsa')
512 pub_key_file = priv_key_file + '.pub'
513 known_hosts_file = posixpath.join(ssh_folder, 'known_hosts')
514 auth_key_file = posixpath.join(ssh_folder, 'authorized_keys')
515 self.add_to_known_hosts(username, nodes)
516
517 nodes = filter(lambda n: n.id != self.id, nodes)
518
519 self.copy_remote_file_to_nodes(priv_key_file, nodes)
520 self.copy_remote_file_to_nodes(pub_key_file, nodes)
521
522 self.copy_remote_file_to_nodes(auth_key_file, nodes)
523 self.copy_remote_file_to_nodes(known_hosts_file, nodes)
524
527
529 """
530 Copies a remote file from this Node instance to another Node instance
531 without passwordless ssh between the two.
532
533 dest - path to store the data in on the node (defaults to remote_file)
534 """
535 if not dest:
536 dest = remote_file
537 rf = self.ssh.remote_file(remote_file, 'r')
538 contents = rf.read()
539 sts = rf.stat()
540 mode = stat.S_IMODE(sts.st_mode)
541 uid = sts.st_uid
542 gid = sts.st_gid
543 rf.close()
544 for node in nodes:
545 if self.id == node.id and remote_file == dest:
546 log.warn("src and destination are the same: %s, skipping" %
547 remote_file)
548 continue
549 nrf = node.ssh.remote_file(dest, 'w')
550 nrf.write(contents)
551 nrf.chown(uid, gid)
552 nrf.chmod(mode)
553 nrf.close()
554
556 """
557 Remove a user from the remote system
558 """
559 self.ssh.execute('userdel %s' % name)
560 self.ssh.execute('groupdel %s' % name)
561
563 """
564 Export each path in export_paths to each node in nodes via NFS
565
566 nodes - list of nodes to export each path to
567 export_paths - list of paths on this remote host to export to each node
568
569 Example:
570 # export /home and /opt/sge6 to each node in nodes
571 $ node.start_nfs_server()
572 $ node.export_fs_to_nodes(\
573 nodes=[node1,node2], export_paths=['/home', '/opt/sge6']
574 """
575
576 nfs_export_settings = "(async,no_root_squash,no_subtree_check,rw)"
577 etc_exports = self.ssh.remote_file('/etc/exports', 'w')
578 for node in nodes:
579 for path in export_paths:
580 etc_exports.write(' '.join([path, node.alias +
581 nfs_export_settings + '\n']))
582 etc_exports.close()
583 self.ssh.execute('exportfs -fra')
584
586 """
587 Removes nodes from this node's /etc/exports
588
589 nodes - list of nodes to stop
590
591 Example:
592 $ node.remove_export_fs_to_nodes(nodes=[node1,node2])
593 """
594 regex = '|'.join(map(lambda x: x.alias, nodes))
595 self.ssh.remove_lines_from_file('/etc/exports', regex)
596 self.ssh.execute('exportfs -fra')
597
599 self.ssh.execute('/etc/init.d/portmap start')
600 self.ssh.execute('mount -t rpc_pipefs sunrpc /var/lib/nfs/rpc_pipefs/',
601 ignore_exit_status=True)
602 self.ssh.execute('/etc/init.d/nfs start')
603 self.ssh.execute('/usr/sbin/exportfs -fra')
604
606 """
607 Mount each path in remote_paths from the remote server_node
608
609 server_node - remote server node that is sharing the remote_paths
610 remote_paths - list of remote paths to mount from server_node
611 """
612 self.ssh.execute('/etc/init.d/portmap start')
613
614 self.ssh.execute('mount -t devpts none /dev/pts',
615 ignore_exit_status=True)
616 remote_paths_regex = '|'.join(map(lambda x: x.center(len(x) + 2),
617 remote_paths))
618 self.ssh.remove_lines_from_file('/etc/fstab', remote_paths_regex)
619 fstab = self.ssh.remote_file('/etc/fstab', 'a')
620 for path in remote_paths:
621 fstab.write('%s:%s %s nfs vers=3,user,rw,exec,noauto 0 0\n' %
622 (server_node.alias, path, path))
623 fstab.close()
624 for path in remote_paths:
625 if not self.ssh.path_exists(path):
626 self.ssh.makedirs(path)
627 self.ssh.execute('mount %s' % path)
628
630 mount_map = {}
631 mount_lines = self.ssh.execute('mount')
632 for line in mount_lines:
633 dev, on_label, path, type_label, fstype, options = line.split()
634 mount_map[dev] = [path, fstype, options]
635 return mount_map
636
638 """
639 Mount device to path
640 """
641 self.ssh.remove_lines_from_file('/etc/fstab',
642 path.center(len(path) + 2))
643 master_fstab = self.ssh.remote_file('/etc/fstab', mode='a')
644 master_fstab.write("%s %s auto noauto,defaults 0 0\n" %
645 (device, path))
646 master_fstab.close()
647 if not self.ssh.path_exists(path):
648 self.ssh.makedirs(path)
649 self.ssh.execute('mount %s' % path)
650
660
662 """
663 Remove all network names for node in nodes arg from this node's
664 /etc/hosts file
665 """
666 aliases = map(lambda x: x.alias, nodes)
667 self.ssh.remove_lines_from_file('/etc/hosts', '|'.join(aliases))
668
670 """
671 Set this node's hostname to self.alias
672
673 hostname - optional hostname to set (defaults to self.alias)
674 """
675 hostname = hostname or self.alias
676 hostname_file = self.ssh.remote_file("/etc/hostname", "w")
677 hostname_file.write(hostname)
678 hostname_file.close()
679 self.ssh.execute('hostname -F /etc/hostname')
680
681 @property
690
691 @property
704
706 """
707 Detaches all volumes returned by self.attached_vols
708 """
709 block_devs = self.attached_vols
710 for dev in block_devs:
711 vol_id = block_devs[dev].volume_id
712 vol = self.ec2.get_volume(vol_id)
713 log.info("Detaching volume %s from %s" % (vol.id, self.alias))
714 if vol.status not in ['available', 'detaching']:
715 vol.detach()
716
718 """
719 Detach and destroy EBS root volume (EBS-backed node only)
720 """
721 if not self.is_ebs_backed():
722 return
723 root_vol = self.block_device_mapping[self.root_device_name]
724 vol_id = root_vol.volume_id
725 vol = self.ec2.get_volume(vol_id)
726 vol.detach()
727 while vol.update() != 'available':
728 time.sleep(5)
729 log.info("Deleting node %s's root volume" % self.alias)
730 root_vol.delete()
731
732 @property
734 if self.instance.spot_instance_request_id:
735 return self.instance.spot_instance_request_id
736
742
744 return self.alias == "master"
745
748
751
754
757
760
762 return self.spot_id is not None
763
766
768 return self.state == "stopped"
769
771 """
772 Starts EBS-backed instance and puts it in the 'running' state.
773 Only works if this node is EBS-backed, raises
774 exception.InvalidOperation otherwise.
775 """
776 if not self.is_ebs_backed():
777 raise exception.InvalidOperation(
778 "Only EBS-backed instances can be started")
779 return self.instance.start()
780
782 """
783 Shutdown EBS-backed instance and put it in the 'stopped' state.
784 Only works if this node is EBS-backed, raises
785 exception.InvalidOperation otherwise.
786
787 NOTE: The EBS root device will *not* be deleted and the instance can
788 be 'started' later on.
789 """
790 if self.is_spot():
791 raise exception.InvalidOperation(
792 "spot instances can not be stopped")
793 elif not self.is_ebs_backed():
794 raise exception.InvalidOperation(
795 "Only EBS-backed instances can be stopped")
796 if not self.is_stopped():
797 log.info("Stopping node: %s (%s)" % (self.alias, self.id))
798 return self.instance.stop()
799 else:
800 log.info("Node '%s' is already stopped" % self.alias)
801
803 """
804 Shutdown and destroy this instance. For EBS-backed nodes, this
805 will also destroy the node's EBS root device. Puts this node
806 into a 'terminated' state.
807 """
808 log.info("Terminating node: %s (%s)" % (self.alias, self.id))
809 return self.instance.terminate()
810
812 """
813 Shutdown this instance. This method will terminate traditional
814 instance-store instances and stop EBS-backed instances
815 (i.e. not destroy EBS root dev)
816 """
817 if self.is_stoppable():
818 self.stop()
819 else:
820 self.terminate()
821
823 """
824 Reboot this instance.
825 """
826 self.instance.reboot()
827
833
835 if self.update() != 'running':
836 return False
837 if not self.is_ssh_up():
838 return False
839 if self.private_ip_address is None:
840 log.debug("instance %s has no private_ip_address" % self.id)
841 log.debug("attempting to determine private_ip_address for "
842 "instance %s" % self.id)
843 try:
844 private_ip = self.ssh.execute(
845 'python -c '
846 '"import socket; print socket.gethostbyname(\'%s\')"' %
847 self.private_dns_name)[0].strip()
848 log.debug("determined instance %s's private ip to be %s" %
849 (self.id, private_ip))
850 self.instance.private_ip_address = private_ip
851 except Exception, e:
852 print e
853 return False
854 return True
855
860
861 @property
863 if not self._ssh:
864 self._ssh = ssh.SSHClient(self.instance.dns_name,
865 username=self.user,
866 private_key=self.key_location)
867 return self._ssh
868
def shell(self, user=None):
    """
    Open an interactive shell on this node.

    The system's ssh client is used when it is available; otherwise the
    pure-python ssh client's interactive shell is used instead.

    user - optional user to log in as (defaults to self.user)

    Raises exception.InstanceNotRunning if the node's state is not
    'running'.
    """
    if self.update() != 'running':
        # The alias lookup can itself fail; fall back to the instance id.
        try:
            alias = self.alias
        except exception.BaseException:
            alias = None
        if alias == "master":
            label = "master"
        elif alias:
            label = "node"
        else:
            label = 'instance'
        instance_id = alias or self.id
        raise exception.InstanceNotRunning(instance_id, self.state,
                                           label=label)
    user = user or self.user
    if not utils.has_required(['ssh']):
        log.debug("using pure-python ssh client")
        self.ssh.interactive_shell(user=user)
        return
    log.debug("using system's ssh client")
    template_args = (self.key_location, user, self.dns_name)
    ssh_cmd = static.SSH_TEMPLATE % template_args
    log.debug("ssh_cmd: %s" % ssh_cmd)
    os.system(ssh_cmd)
898
def get_hosts_entry(self):
    """
    Returns the /etc/hosts entry for this node: the INTERNAL_IP and
    INTERNAL_ALIAS values from self.network_names separated by a space
    """
    return "%(INTERNAL_IP)s %(INTERNAL_ALIAS)s" % self.network_names
904
906 """
907 Run an apt-get command with all the necessary options for
908 non-interactive use (DEBIAN_FRONTEND=noninteractive, -y, --force-yes, etc)
909 """
910 dpkg_opts = "Dpkg::Options::='--force-confnew'"
911 cmd = "apt-get -o %s -y --force-yes %s" % (dpkg_opts, cmd)
912 cmd = "DEBIAN_FRONTEND='noninteractive' " + cmd
913 self.ssh.execute(cmd)
914
916 """
917 Install a set of packages via apt-get.
918
919 pkgs is a string that contains one or more packages separated by a
920 space
921 """
922 self.apt_command('install %s' % pkgs)
923
925 if self._ssh:
926 self._ssh.close()
927