1
2 import os
3 import pwd
4 import grp
5 import time
6 import stat
7 import base64
8 import posixpath
9
10 from starcluster import ssh
11 from starcluster import utils
12 from starcluster import static
13 from starcluster import awsutils
14 from starcluster import managers
15 from starcluster import exception
16 from starcluster.logger import log
20 """
21 Manager class for Node objects
22 """
26
27 - def get_node(self, node_id, user='root'):
43
44
45 -class Node(object):
46 """
47 This class represents a single compute node in a StarCluster.
48
49 It contains all useful metadata for the node such as the internal/external
50 hostnames, ips, etc as well as a paramiko ssh object for executing
51 commands, creating/modifying files on the node.
52
53 'instance' arg must be an instance of boto.ec2.instance.Instance
54
55 'key_location' arg is a string that contains the full path to the
56 private key corresponding to the keypair used to launch this node
57
58 'alias' keyword arg optionally names the node. If no alias is provided,
59 the alias is retrieved from the node's user_data based on the node's
60 launch index
61
62 'user' keyword optionally specifies user to ssh as (defaults to root)
63 """
def __init__(self, instance, key_location, alias=None, user='root'):
    """
    Wrap an EC2 instance as a Node.

    instance - the instance object this node represents; its
    ``connection`` attribute is reused as this node's EC2 connection

    key_location - stored on the node and later handed to the ssh client
    as the private key

    alias - optional name for the node (cached in self._alias)

    user - user to ssh as (defaults to root)
    """
    # values that are computed on first use and cached afterwards
    self._ssh = None
    self._groups = None
    self._num_procs = None
    self._memory = None
    self._alias = alias
    # connection settings
    self.user = user
    self.key_location = key_location
    self.instance = instance
    # create the EC2 helper without credentials and point it at the
    # connection the instance already holds
    ec2 = awsutils.EasyEC2(None, None)
    ec2._conn = instance.connection
    self.ec2 = ec2
75
77 return '<Node: %s (%s)>' % (self.alias, self.id)
78
80 tries = range(tries)
81 last_try = tries[-1]
82 for i in tries:
83 try:
84 user_data = self.ec2.get_instance_user_data(self.id)
85 return user_data
86 except exception.InstanceDoesNotExist:
87 if i == last_try:
88 log.debug("failed fetching user data")
89 raise
90 log.debug("InvalidInstanceID.NotFound: "
91 "retrying fetching user data (tries: %s)" % (i + 1))
92 time.sleep(5)
93
94 @property
96 """
97 Fetches the node's alias stored in a tag from either the instance
98 or the instance's parent spot request. If no alias tag is found an
99 exception is raised.
100 """
101 if not self._alias:
102 alias = self.tags.get('alias')
103 if not alias:
104 user_data = self._get_user_data(tries=5)
105 aliases = user_data.split('|')
106 index = self.ami_launch_index
107 try:
108 alias = aliases[index]
109 except IndexError:
110 log.debug(
111 "invalid user_data: %s (index: %d)" % (aliases, index))
112 alias = None
113 if not alias:
114 raise exception.BaseException(
115 "instance %s has no alias" % self.id)
116 self.add_tag('alias', alias)
117 name = self.tags.get('Name')
118 if not name:
119 self.add_tag('Name', alias)
120 self._alias = alias
121 return self._alias
122
127
128 @property
131
def add_tag(self, key, value=None):
    """
    Add a tag to the underlying instance.

    key - name of the tag
    value - optional value of the tag (defaults to None)

    Returns whatever the instance's add_tag call returns.
    """
    result = self.instance.add_tag(key, value)
    return result
134
137
138 @property
144
145 @property
149
150 @property
152 if not self._num_procs:
153 self._num_procs = int(
154 self.ssh.execute(
155 'cat /proc/cpuinfo | grep processor | wc -l')[0])
156 return self._num_procs
157
158 @property
160 if not self._memory:
161 self._memory = float(
162 self.ssh.execute(
163 "free -m | grep -i mem | awk '{print $2}'")[0])
164 return self._memory
165
166 @property
169
170 @property
173
174 @property
177
178 @property
181
182 @property
185
186 @property
188 return self.instance.id
189
190 @property
193
194 @property
197
198 @property
200 return self.instance.state
201
202 @property
205
206 @property
210
211 @property
214
215 @property
217 try:
218 return int(self.instance.ami_launch_index)
219 except TypeError:
220 log.error("instance %s (state: %s) has no ami_launch_index" % \
221 (self.id, self.state))
222 log.error("returning 0 as ami_launch_index...")
223 return 0
224
225 @property
228
229 @property
231 return self.instance.architecture
232
233 @property
235 return self.instance.kernel
236
237 @property
240
241 @property
244
245 @property
248
249 @property
252
253 @property
256
257 @property
260
276
278 """
279 Returns dictionary where keys are remote group names and values are
280 grp.struct_grp objects from the standard grp module
281
282 key_by_gid=True will use the integer gid as the returned dictionary's
283 keys instead of the group's name
284 """
285 grp_file = self.ssh.remote_file('/etc/group', 'r')
286 groups = [l.strip().split(':') for l in grp_file.readlines()]
287 grp_file.close()
288 grp_map = {}
289 for group in groups:
290 name, passwd, gid, mems = group
291 gid = int(gid)
292 mems = mems.split(',')
293 key = name
294 if key_by_gid:
295 key = gid
296 grp_map[key] = grp.struct_group([name, passwd, gid, mems])
297 return grp_map
298
300 """
301 Returns dictionary where keys are remote usernames and values are
302 pwd.struct_passwd objects from the standard pwd module
303
304 key_by_uid=True will use the integer uid as the returned dictionary's
305 keys instead of the user's login name
306 """
307 etc_passwd = self.ssh.remote_file('/etc/passwd', 'r')
308 users = [l.strip().split(':') for l in etc_passwd.readlines()]
309 etc_passwd.close()
310 user_map = {}
311 for user in users:
312 name, passwd, uid, gid, gecos, home, shell = user
313 uid = int(uid)
314 gid = int(gid)
315 key = name
316 if key_by_uid:
317 key = uid
318 user_map[key] = pwd.struct_passwd([name, passwd, uid, gid,
319 gecos, home, shell])
320 return user_map
321
323 """
324 Remote version of the getgrgid method in the standard grp module
325
326 returns a grp.struct_group
327 """
328 gmap = self.get_group_map(key_by_gid=True)
329 return gmap.get(gid)
330
332 """
333 Remote version of the getgrnam method in the standard grp module
334
335 returns a grp.struct_group
336 """
337 gmap = self.get_group_map()
338 return gmap.get(groupname)
339
341 """
342 Remote version of the getpwuid method in the standard pwd module
343
344 returns a pwd.struct_passwd
345 """
346 umap = self.get_user_map(key_by_uid=True)
347 return umap.get(uid)
348
350 """
351 Remote version of the getpwnam method in the standard pwd module
352
353 returns a pwd.struct_passwd
354 """
355 umap = self.get_user_map()
356 return umap.get(username)
357
def add_user(self, name, uid=None, gid=None, shell="bash"):
    """
    Create a user account on the remote system.

    name - login name of the new user
    uid - optional user id for the new user
    gid - optional group id; when given, a group called `name` with this
    id is created first and used as the user's group
    shell - optional shell for the new user (default: bash), resolved on
    the remote host with `which`
    """
    if gid:
        self.ssh.execute('groupadd -o -g %s %s' % (gid, name))
    # assemble the useradd command from its optional pieces
    parts = ['useradd', '-o']
    if uid:
        parts.append('-u %s' % uid)
    if gid:
        parts.append('-g %s' % gid)
    if shell:
        parts.append('-s `which %s`' % shell)
    parts.append("-m %s" % name)
    self.ssh.execute(' '.join(parts))
378
def generate_key_for_user(self, username, ignore_existing=False,
                          auth_new_key=False, auth_conn_key=False):
    """
    Generates an id_rsa/id_rsa.pub keypair combo for a user on the remote
    machine and returns the RSA key object.

    username - remote user whose <home>/.ssh folder holds the keypair

    ignore_existing - if False, an existing id_rsa is loaded and used
    rather than generating a new RSA key. If True, a new key is always
    generated and written over id_rsa/id_rsa.pub

    auth_new_key - if True, add the user's public key (newly generated or
    loaded) to the remote user's authorized_keys file

    auth_conn_key - if True, add the public key used to establish this ssh
    connection to the remote user's authorized_keys

    The two auth_* flags are independent: authorized_keys is only left
    untouched when neither of them is set.
    """
    user = self.getpwnam(username)
    ssh_folder = posixpath.join(user.pw_dir, '.ssh')
    if not self.ssh.isdir(ssh_folder):
        self.ssh.mkdir(ssh_folder)
    private_key = posixpath.join(ssh_folder, 'id_rsa')
    public_key = private_key + '.pub'
    authorized_keys = posixpath.join(ssh_folder, 'authorized_keys')
    reuse_existing = self.ssh.isfile(private_key) and not ignore_existing
    if reuse_existing:
        log.info("Using existing key: %s" % private_key)
        key = self.ssh.load_remote_rsa_key(private_key)
    else:
        key = self.ssh.generate_rsa_key()
    pubkey_contents = self.ssh.get_public_key(key)
    if not reuse_existing:
        # write the new keypair, owned by the user and owner-read-only
        # (stat.S_IRUSR == 0400)
        pub_key = self.ssh.remote_file(public_key, 'w')
        pub_key.write(pubkey_contents)
        pub_key.chown(user.pw_uid, user.pw_gid)
        pub_key.chmod(stat.S_IRUSR)
        pub_key.close()
        priv_key = self.ssh.remote_file(private_key, 'w')
        key.write_private_key(priv_key)
        priv_key.chown(user.pw_uid, user.pw_gid)
        priv_key.chmod(stat.S_IRUSR)
        priv_key.close()
    # only skip authorized_keys when *neither* flag was requested
    if not (auth_new_key or auth_conn_key):
        return key
    auth_keys_contents = ''
    if self.ssh.isfile(authorized_keys):
        auth_keys = self.ssh.remote_file(authorized_keys, 'r')
        auth_keys_contents = auth_keys.read()
        auth_keys.close()
    auth_keys = self.ssh.remote_file(authorized_keys, 'a')
    try:
        if auth_new_key and pubkey_contents not in auth_keys_contents:
            log.debug("adding auth_key_contents")
            auth_keys.write('%s\n' % pubkey_contents)
            # remember what was appended so the same key is not
            # written a second time below
            auth_keys_contents += '%s\n' % pubkey_contents
        if auth_conn_key and self.ssh._pkey:
            # authorize the key this ssh connection was made with
            conn_pubkey_contents = self.ssh.get_public_key(self.ssh._pkey)
            if conn_pubkey_contents not in auth_keys_contents:
                log.debug("adding conn_pubkey_contents")
                auth_keys.write('%s\n' % conn_pubkey_contents)
        auth_keys.chown(user.pw_uid, user.pw_gid)
        # owner read/write only (0600)
        auth_keys.chmod(stat.S_IRUSR | stat.S_IWUSR)
    finally:
        auth_keys.close()
    return key
447
449 """
450 Populate user's known_hosts file with pub keys from hosts in nodes list
451
452 username - name of the user to add to known hosts for
453 nodes - the nodes to add to the user's known hosts file
454 add_self - add this Node to known_hosts in addition to nodes
455
456 NOTE: this node's hostname will also be added to the known_hosts
457 file
458 """
459 user = self.getpwnam(username)
460 known_hosts_file = posixpath.join(user.pw_dir, '.ssh', 'known_hosts')
461 self.remove_from_known_hosts(username, nodes)
462 khosts = []
463 for node in nodes:
464 server_pkey = node.ssh.get_server_public_key()
465 khosts.append(' '.join([node.alias, server_pkey.get_name(),
466 base64.b64encode(str(server_pkey))]))
467 if add_self and self not in nodes:
468 server_pkey = self.ssh.get_server_public_key()
469 khosts.append(' '.join([self.alias, server_pkey.get_name(),
470 base64.b64encode(str(server_pkey))]))
471 khostsf = self.ssh.remote_file(known_hosts_file, 'a')
472 khostsf.write('\n'.join(khosts) + '\n')
473 khostsf.chown(user.pw_uid, user.pw_gid)
474 khostsf.close()
475
477 """
478 Remove all network names for nodes from username's known_hosts file
479 on this Node
480 """
481 user = self.getpwnam(username)
482 known_hosts_file = posixpath.join(user.pw_dir, '.ssh', 'known_hosts')
483 hostnames = map(lambda n: n.alias, nodes)
484 if self.ssh.isfile(known_hosts_file):
485 regex = '|'.join(hostnames)
486 self.ssh.remove_lines_from_file(known_hosts_file, regex)
487
489 """
490 Configure passwordless ssh for user between this Node and nodes
491 """
492 user = self.getpwnam(username)
493 ssh_folder = posixpath.join(user.pw_dir, '.ssh')
494 priv_key_file = posixpath.join(ssh_folder, 'id_rsa')
495 pub_key_file = priv_key_file + '.pub'
496 known_hosts_file = posixpath.join(ssh_folder, 'known_hosts')
497 auth_key_file = posixpath.join(ssh_folder, 'authorized_keys')
498 self.add_to_known_hosts(username, nodes)
499
500 nodes = filter(lambda n: n.id != self.id, nodes)
501
502 self.copy_remote_file_to_nodes(priv_key_file, nodes)
503 self.copy_remote_file_to_nodes(pub_key_file, nodes)
504
505 self.copy_remote_file_to_nodes(auth_key_file, nodes)
506 self.copy_remote_file_to_nodes(known_hosts_file, nodes)
507
510
512 """
513 Copies a remote file from this Node instance to another Node instance
514 without passwordless ssh between the two.
515
516 dest - path to store the data in on the node (defaults to remote_file)
517 """
518 if not dest:
519 dest = remote_file
520 rf = self.ssh.remote_file(remote_file, 'r')
521 contents = rf.read()
522 sts = rf.stat()
523 mode = stat.S_IMODE(sts.st_mode)
524 uid = sts.st_uid
525 gid = sts.st_gid
526 rf.close()
527 for node in nodes:
528 if self.id == node.id and remote_file == dest:
529 log.warn("src and destination are the same: %s, skipping" %
530 remote_file)
531 continue
532 nrf = node.ssh.remote_file(dest, 'w')
533 nrf.write(contents)
534 nrf.chown(uid, gid)
535 nrf.chmod(mode)
536 nrf.close()
537
539 """
540 Remove a user from the remote system
541 """
542 self.ssh.execute('userdel %s' % name)
543 self.ssh.execute('groupdel %s' % name)
544
546 """
547 Export each path in export_paths to each node in nodes via NFS
548
549 nodes - list of nodes to export each path to
550 export_paths - list of paths on this remote host to export to each node
551
552 Example:
553 # export /home and /opt/sge6 to each node in nodes
554 $ node.start_nfs_server()
555 $ node.export_fs_to_nodes(\
556 nodes=[node1,node2], export_paths=['/home', '/opt/sge6']
557 """
558
559 nfs_export_settings = "(async,no_root_squash,no_subtree_check,rw)"
560 etc_exports = self.ssh.remote_file('/etc/exports')
561 for node in nodes:
562 for path in export_paths:
563 etc_exports.write(' '.join([path, node.alias + \
564 nfs_export_settings + '\n']))
565 etc_exports.close()
566 self.ssh.execute('exportfs -a')
567
569 """
570 Removes nodes from this node's /etc/exportfs
571
572 nodes - list of nodes to stop
573
574 Example:
575 $ node.remove_export_fs_to_nodes(nodes=[node1,node2])
576 """
577 regex = '|'.join(map(lambda x: x.alias, nodes))
578 self.ssh.remove_lines_from_file('/etc/exports', regex)
579 self.ssh.execute('exportfs -a')
580
582 self.ssh.execute('/etc/init.d/portmap start')
583 self.ssh.execute('mount -t rpc_pipefs sunrpc /var/lib/nfs/rpc_pipefs/',
584 ignore_exit_status=True)
585 self.ssh.execute('/etc/init.d/nfs start')
586 self.ssh.execute('/usr/sbin/exportfs -r')
587
589 """
590 Mount each path in remote_paths from the remote server_node
591
592 server_node - remote server node that is sharing the remote_paths
593 remote_paths - list of remote paths to mount from server_node
594 """
595 self.ssh.execute('/etc/init.d/portmap start')
596
597 self.ssh.execute('mount -t devpts none /dev/pts',
598 ignore_exit_status=True)
599 remote_paths_regex = '|'.join(map(lambda x: x.center(len(x) + 2),
600 remote_paths))
601 self.ssh.remove_lines_from_file('/etc/fstab', remote_paths_regex)
602 fstab = self.ssh.remote_file('/etc/fstab', 'a')
603 for path in remote_paths:
604 fstab.write('%s:%s %s nfs user,rw,exec,noauto 0 0\n' %
605 (server_node.alias, path, path))
606 fstab.close()
607 for path in remote_paths:
608 if not self.ssh.path_exists(path):
609 self.ssh.makedirs(path)
610 self.ssh.execute('mount %s' % path)
611
613 mount_map = {}
614 mount_lines = self.ssh.execute('mount')
615 for line in mount_lines:
616 dev, on_label, path, type_label, fstype, options = line.split()
617 mount_map[dev] = [path, fstype, options]
618 return mount_map
619
621 """
622 Mount device to path
623 """
624 self.ssh.remove_lines_from_file('/etc/fstab',
625 path.center(len(path) + 2))
626 master_fstab = self.ssh.remote_file('/etc/fstab', mode='a')
627 master_fstab.write("%s %s auto noauto,defaults 0 0\n" % \
628 (device, path))
629 master_fstab.close()
630 if not self.ssh.path_exists(path):
631 self.ssh.makedirs(path)
632 self.ssh.execute('mount %s' % path)
633
643
645 """
646 Remove all network names for node in nodes arg from this node's
647 /etc/hosts file
648 """
649 aliases = map(lambda x: x.alias, nodes)
650 self.ssh.remove_lines_from_file('/etc/hosts', '|'.join(aliases))
651
653 """
654 Set this node's hostname to self.alias
655
656 hostname - optional hostname to set (defaults to self.alias)
657 """
658 hostname = hostname or self.alias
659 hostname_file = self.ssh.remote_file("/etc/hostname", "w")
660 hostname_file.write(hostname)
661 hostname_file.close()
662 self.ssh.execute('hostname -F /etc/hostname')
663
664 @property
673
674 @property
687
689 """
690 Detaches all volumes returned by self.attached_vols
691 """
692 block_devs = self.attached_vols
693 for dev in block_devs:
694 vol_id = block_devs[dev].volume_id
695 vol = self.ec2.get_volume(vol_id)
696 log.info("Detaching volume %s from %s" % (vol.id, self.alias))
697 if vol.status not in ['available', 'detaching']:
698 vol.detach()
699
701 """
702 Detach and destroy EBS root volume (EBS-backed node only)
703 """
704 if not self.is_ebs_backed():
705 return
706 root_vol = self.block_device_mapping[self.root_device_name]
707 vol_id = root_vol.volume_id
708 vol = self.ec2.get_volume(vol_id)
709 vol.detach()
710 while vol.update() != 'availabile':
711 time.sleep(5)
712 log.info("Deleting node %s's root volume" % self.alias)
713 root_vol.delete()
714
715 @property
717 if self.instance.spot_instance_request_id:
718 return self.instance.spot_instance_request_id
719
725
727 return self.alias == "master"
728
731
734
737
740
743
745 return self.spot_id is not None
746
749
751 return self.state == "stopped"
752
754 """
755 Starts EBS-backed instance and puts it in the 'running' state.
756 Only works if this node is EBS-backed, raises
757 exception.InvalidOperation otherwise.
758 """
759 if not self.is_ebs_backed():
760 raise exception.InvalidOperation(
761 "Only EBS-backed instances can be started")
762 return self.instance.start()
763
765 """
766 Shutdown EBS-backed instance and put it in the 'stopped' state.
767 Only works if this node is EBS-backed, raises
768 exception.InvalidOperation otherwise.
769
770 NOTE: The EBS root device will *not* be deleted and the instance can
771 be 'started' later on.
772 """
773 if self.is_spot():
774 raise exception.InvalidOperation(
775 "spot instances can not be stopped")
776 elif not self.is_ebs_backed():
777 raise exception.InvalidOperation(
778 "Only EBS-backed instances can be stopped")
779 if not self.is_stopped():
780 log.info("Stopping node: %s (%s)" % (self.alias, self.id))
781 return self.instance.stop()
782 else:
783 log.info("Node '%s' is already stopped" % self.alias)
784
786 """
787 Shutdown and destroy this instance. For EBS-backed nodes, this
788 will also destroy the node's EBS root device. Puts this node
789 into a 'terminated' state.
790 """
791 log.info("Terminating node: %s (%s)" % (self.alias, self.id))
792 return self.instance.terminate()
793
795 """
796 Shutdown this instance. This method will terminate traditional
797 instance-store instances and stop EBS-backed instances
798 (ie not destroy EBS root dev)
799 """
800 if self.is_stoppable():
801 self.stop()
802 else:
803 self.terminate()
804
806 """
807 Reboot this instance.
808 """
809 self.instance.reboot()
810
816
818 if self.update() != 'running':
819 return False
820 if not self.is_ssh_up():
821 return False
822 if self.private_ip_address is None:
823 log.debug("instance %s has no private_ip_address" % self.id)
824 log.debug(("attempting to determine private_ip_address for" + \
825 "instance %s") % self.id)
826 try:
827 private_ip = self.ssh.execute((
828 'python -c ' + \
829 '"import socket; print socket.gethostbyname(\'%s\')"') % \
830 self.private_dns_name)[0].strip()
831 log.debug("determined instance %s's private ip to be %s" % \
832 (self.id, private_ip))
833 self.instance.private_ip_address = private_ip
834 except Exception, e:
835 print e
836 return False
837 return True
838
843
844 @property
846 if not self._ssh:
847 self._ssh = ssh.SSHClient(self.instance.dns_name,
848 username=self.user,
849 private_key=self.key_location)
850 return self._ssh
851
def shell(self, user=None):
    """
    Open an interactive shell on this node.

    The system's ssh command is used when it is available; otherwise the
    pure-python ssh client is used instead.

    user - optional user to log in as (defaults to self.user)

    Raises exception.InstanceNotRunning if the node is not in the
    'running' state.
    """
    if self.update() != 'running':
        # the alias lookup itself can fail, fall back to the instance id
        try:
            alias = self.alias
        except exception.BaseException:
            alias = None
        if alias == "master":
            label = "master"
        elif alias:
            label = "node"
        else:
            label = 'instance'
        raise exception.InstanceNotRunning(alias or self.id, self.state,
                                           label=label)
    user = user or self.user
    if not utils.has_required(['ssh']):
        log.debug("using pure-python ssh client")
        self.ssh.interactive_shell(user=user)
        return
    log.debug("using system's ssh client")
    ssh_cmd = static.SSH_TEMPLATE % (self.key_location, user,
                                     self.dns_name)
    log.debug("ssh_cmd: %s" % ssh_cmd)
    os.system(ssh_cmd)
881
def get_hosts_entry(self):
    """
    Returns the /etc/hosts entry for this node: the INTERNAL_IP and
    INTERNAL_ALIAS values from self.network_names separated by a space
    """
    return "%(INTERNAL_IP)s %(INTERNAL_ALIAS)s" % self.network_names
887
889 if self._ssh:
890 self._ssh.close()
891