1
2 import os
3 import pwd
4 import grp
5 import time
6 import stat
7 import base64
8 import posixpath
9
10 from starcluster import ssh
11 from starcluster import utils
12 from starcluster import static
13 from starcluster import awsutils
14 from starcluster import managers
15 from starcluster import exception
16 from starcluster.logger import log
20 """
21 Manager class for Node objects
22 """
23 - def ssh_to_node(self, node_id, user='root', command=None):
33
34 - def get_node(self, node_id, user='root'):
50
51
52 -class Node(object):
53 """
54 This class represents a single compute node in a StarCluster.
55
56 It contains all useful metadata for the node such as the internal/external
57 hostnames, ips, etc as well as a paramiko ssh object for executing
58 commands, creating/modifying files on the node.
59
60 'instance' arg must be an instance of boto.ec2.instance.Instance
61
62 'key_location' arg is a string that contains the full path to the
63 private key corresponding to the keypair used to launch this node
64
65 'alias' keyword arg optionally names the node. If no alias is provided,
66 the alias is retrieved from the node's user_data based on the node's
67 launch index
68
69 'user' keyword optionally specifies user to ssh as (defaults to root)
70 """
def __init__(self, instance, key_location, alias=None, user='root'):
    """
    Bind this Node to an EC2 instance and remember how to ssh to it.

    instance - instance object backing this node; its `connection`
               attribute is reused for this node's EC2 calls
    key_location - path to the private key used when connecting
    alias - optional name for the node (stored in self._alias)
    user - user to ssh as (defaults to root)
    """
    self.instance = instance
    # build the EC2 helper without credentials and hand it the connection
    # the instance already carries
    ec2 = awsutils.EasyEC2(None, None)
    ec2._conn = instance.connection
    self.ec2 = ec2
    self.key_location, self.user = key_location, user
    self._alias = alias
    # cached values start out unset
    self._groups = self._ssh = self._num_procs = self._memory = None
82
84 return '<Node: %s (%s)>' % (self.alias, self.id)
85
87 tries = range(tries)
88 last_try = tries[-1]
89 for i in tries:
90 try:
91 user_data = self.ec2.get_instance_user_data(self.id)
92 return user_data
93 except exception.InstanceDoesNotExist:
94 if i == last_try:
95 log.debug("failed fetching user data")
96 raise
97 log.debug("InvalidInstanceID.NotFound: "
98 "retrying fetching user data (tries: %s)" % (i + 1))
99 time.sleep(5)
100
101 @property
103 """
104 Fetches the node's alias stored in a tag from either the instance
105 or the instance's parent spot request. If no alias tag is found an
106 exception is raised.
107 """
108 if not self._alias:
109 alias = self.tags.get('alias')
110 if not alias:
111 user_data = self._get_user_data(tries=5)
112 aliases = user_data.split('|')
113 index = self.ami_launch_index
114 try:
115 alias = aliases[index]
116 except IndexError:
117 log.debug(
118 "invalid user_data: %s (index: %d)" % (aliases, index))
119 alias = None
120 if not alias:
121 raise exception.BaseException(
122 "instance %s has no alias" % self.id)
123 self.add_tag('alias', alias)
124 name = self.tags.get('Name')
125 if not name:
126 self.add_tag('Name', alias)
127 self._alias = alias
128 return self._alias
129
134
135 @property
138
def add_tag(self, key, value=None):
    """
    Attach a tag to the instance backing this node.

    key - name of the tag
    value - optional tag value (defaults to None)

    Returns whatever the instance's own add_tag call returns.
    """
    instance = self.instance
    return instance.add_tag(key, value)
141
144
145 @property
151
152 @property
156
157 @property
159 if not self._num_procs:
160 self._num_procs = int(
161 self.ssh.execute(
162 'cat /proc/cpuinfo | grep processor | wc -l')[0])
163 return self._num_procs
164
165 @property
167 if not self._memory:
168 self._memory = float(
169 self.ssh.execute(
170 "free -m | grep -i mem | awk '{print $2}'")[0])
171 return self._memory
172
173 @property
176
177 @property
180
181 @property
184
185 @property
188
189 @property
192
193 @property
195 return self.instance.id
196
197 @property
200
201 @property
204
205 @property
207 return self.instance.state
208
209 @property
212
213 @property
217
218 @property
221
222 @property
224 try:
225 return int(self.instance.ami_launch_index)
226 except TypeError:
227 log.error("instance %s (state: %s) has no ami_launch_index" % \
228 (self.id, self.state))
229 log.error("returning 0 as ami_launch_index...")
230 return 0
231
232 @property
235
236 @property
238 return self.instance.architecture
239
240 @property
242 return self.instance.kernel
243
244 @property
247
248 @property
251
252 @property
255
256 @property
259
260 @property
263
264 @property
267
278
280 """
281 Returns dictionary where keys are remote group names and values are
282 grp.struct_grp objects from the standard grp module
283
284 key_by_gid=True will use the integer gid as the returned dictionary's
285 keys instead of the group's name
286 """
287 grp_file = self.ssh.remote_file('/etc/group', 'r')
288 groups = [l.strip().split(':') for l in grp_file.readlines()]
289 grp_file.close()
290 grp_map = {}
291 for group in groups:
292 name, passwd, gid, mems = group
293 gid = int(gid)
294 mems = mems.split(',')
295 key = name
296 if key_by_gid:
297 key = gid
298 grp_map[key] = grp.struct_group([name, passwd, gid, mems])
299 return grp_map
300
302 """
303 Returns dictionary where keys are remote usernames and values are
304 pwd.struct_passwd objects from the standard pwd module
305
306 key_by_uid=True will use the integer uid as the returned dictionary's
307 keys instead of the user's login name
308 """
309 etc_passwd = self.ssh.remote_file('/etc/passwd', 'r')
310 users = [l.strip().split(':') for l in etc_passwd.readlines()]
311 etc_passwd.close()
312 user_map = {}
313 for user in users:
314 name, passwd, uid, gid, gecos, home, shell = user
315 uid = int(uid)
316 gid = int(gid)
317 key = name
318 if key_by_uid:
319 key = uid
320 user_map[key] = pwd.struct_passwd([name, passwd, uid, gid,
321 gecos, home, shell])
322 return user_map
323
325 """
326 Remote version of the getgrgid method in the standard grp module
327
328 returns a grp.struct_group
329 """
330 gmap = self.get_group_map(key_by_gid=True)
331 return gmap.get(gid)
332
334 """
335 Remote version of the getgrnam method in the standard grp module
336
337 returns a grp.struct_group
338 """
339 gmap = self.get_group_map()
340 return gmap.get(groupname)
341
343 """
344 Remote version of the getpwuid method in the standard pwd module
345
346 returns a pwd.struct_passwd
347 """
348 umap = self.get_user_map(key_by_uid=True)
349 return umap.get(uid)
350
352 """
353 Remote version of the getpwnam method in the standard pwd module
354
355 returns a pwd.struct_passwd
356 """
357 umap = self.get_user_map()
358 return umap.get(username)
359
def add_user(self, name, uid=None, gid=None, shell="bash"):
    """
    Create a user account on the remote system.

    name - login name of the new user; also used as the name of the
           group created when gid is given
    uid - optional user id to assign to the account
    gid - optional group id; when given, a group called `name` is created
          with this id and passed to useradd via -g
    shell - optional login shell, looked up remotely with `which`
            (default: bash)
    """
    run = self.ssh.execute
    if gid:
        # the group is created before the user that refers to it
        run('groupadd -o -g %s %s' % (gid, name))
    optional_flags = (
        ('-u %s ', uid),
        ('-g %s ', gid),
        ('-s `which %s` ', shell),
    )
    pieces = ['useradd -o ']
    for template, setting in optional_flags:
        # unset (falsy) settings contribute nothing to the command line
        if setting:
            pieces.append(template % setting)
    pieces.append("-m %s" % name)
    run(''.join(pieces))
380
def generate_key_for_user(self, username, ignore_existing=False,
                          auth_new_key=False, auth_conn_key=False):
    """
    Generates an id_rsa/id_rsa.pub keypair combo for a user on the remote
    machine and optionally authorizes keys for that user.

    username - remote user whose ~/.ssh folder receives the keypair

    ignore_existing - if False (default) an existing ~/.ssh/id_rsa is loaded
    and reused; if True a new RSA key is generated and the existing
    id_rsa/id_rsa.pub files are overwritten

    auth_new_key - if True, add the (new or reused) public key to the
    remote user's authorized_keys file

    auth_conn_key - if True, add the public key used to establish this ssh
    connection to the remote user's authorized_keys

    auth_new_key and auth_conn_key are independent: either one alone is
    enough for authorized_keys to be updated. Keys already present in
    authorized_keys are not appended a second time.

    Returns the key object that was loaded or generated.
    """
    # same values as the octal modes 0400 / 0600, spelled with stat
    # constants so the code parses on both Python 2 and Python 3
    owner_read = stat.S_IRUSR
    owner_read_write = stat.S_IRUSR | stat.S_IWUSR

    user = self.getpwnam(username)
    home_folder = user.pw_dir
    ssh_folder = posixpath.join(home_folder, '.ssh')
    if not self.ssh.isdir(ssh_folder):
        self.ssh.mkdir(ssh_folder)
    private_key = posixpath.join(ssh_folder, 'id_rsa')
    public_key = private_key + '.pub'
    authorized_keys = posixpath.join(ssh_folder, 'authorized_keys')
    key_exists = self.ssh.isfile(private_key)
    if key_exists and not ignore_existing:
        log.info("Using existing key: %s" % private_key)
        key = self.ssh.load_remote_rsa_key(private_key)
    else:
        key = self.ssh.generate_rsa_key()
    pubkey_contents = self.ssh.get_public_key(key)
    if not key_exists or ignore_existing:
        # copy public key to remote machine
        pub_key = self.ssh.remote_file(public_key, 'w')
        try:
            pub_key.write(pubkey_contents)
            pub_key.chown(user.pw_uid, user.pw_gid)
            pub_key.chmod(owner_read)
        finally:
            pub_key.close()
        # copy private key to remote machine
        priv_key = self.ssh.remote_file(private_key, 'w')
        try:
            key.write_private_key(priv_key)
            priv_key.chown(user.pw_uid, user.pw_gid)
            priv_key.chmod(owner_read)
        finally:
            priv_key.close()
    # only skip the authorized_keys update when *neither* option was
    # requested; each option is handled on its own below
    if not (auth_new_key or auth_conn_key):
        return key
    auth_keys_contents = ''
    if self.ssh.isfile(authorized_keys):
        auth_keys = self.ssh.remote_file(authorized_keys, 'r')
        try:
            auth_keys_contents = auth_keys.read()
        finally:
            auth_keys.close()
    auth_keys = self.ssh.remote_file(authorized_keys, 'a')
    try:
        if auth_new_key:
            # add newly generated public key to user's authorized_keys
            if pubkey_contents not in auth_keys_contents:
                log.debug("adding auth_key_contents")
                auth_keys.write('%s\n' % pubkey_contents)
        if auth_conn_key and self.ssh._pkey:
            # add public key used to create the connection to user's
            # authorized_keys
            conn_key = self.ssh._pkey
            conn_pubkey_contents = self.ssh.get_public_key(conn_key)
            if conn_pubkey_contents not in auth_keys_contents:
                log.debug("adding conn_pubkey_contents")
                auth_keys.write('%s\n' % conn_pubkey_contents)
        auth_keys.chown(user.pw_uid, user.pw_gid)
        auth_keys.chmod(owner_read_write)
    finally:
        auth_keys.close()
    return key
449
451 """
452 Populate user's known_hosts file with pub keys from hosts in nodes list
453
454 username - name of the user to add to known hosts for
455 nodes - the nodes to add to the user's known hosts file
456 add_self - add this Node to known_hosts in addition to nodes
457
458 NOTE: this node's hostname will also be added to the known_hosts
459 file
460 """
461 user = self.getpwnam(username)
462 known_hosts_file = posixpath.join(user.pw_dir, '.ssh', 'known_hosts')
463 self.remove_from_known_hosts(username, nodes)
464 khosts = []
465 for node in nodes:
466 server_pkey = node.ssh.get_server_public_key()
467 khosts.append(' '.join([node.alias, server_pkey.get_name(),
468 base64.b64encode(str(server_pkey))]))
469 if add_self and self not in nodes:
470 server_pkey = self.ssh.get_server_public_key()
471 khosts.append(' '.join([self.alias, server_pkey.get_name(),
472 base64.b64encode(str(server_pkey))]))
473 khostsf = self.ssh.remote_file(known_hosts_file, 'a')
474 khostsf.write('\n'.join(khosts) + '\n')
475 khostsf.chown(user.pw_uid, user.pw_gid)
476 khostsf.close()
477
479 """
480 Remove all network names for nodes from username's known_hosts file
481 on this Node
482 """
483 user = self.getpwnam(username)
484 known_hosts_file = posixpath.join(user.pw_dir, '.ssh', 'known_hosts')
485 hostnames = map(lambda n: n.alias, nodes)
486 if self.ssh.isfile(known_hosts_file):
487 regex = '|'.join(hostnames)
488 self.ssh.remove_lines_from_file(known_hosts_file, regex)
489
491 """
492 Configure passwordless ssh for user between this Node and nodes
493 """
494 user = self.getpwnam(username)
495 ssh_folder = posixpath.join(user.pw_dir, '.ssh')
496 priv_key_file = posixpath.join(ssh_folder, 'id_rsa')
497 pub_key_file = priv_key_file + '.pub'
498 known_hosts_file = posixpath.join(ssh_folder, 'known_hosts')
499 auth_key_file = posixpath.join(ssh_folder, 'authorized_keys')
500 self.add_to_known_hosts(username, nodes)
501
502 nodes = filter(lambda n: n.id != self.id, nodes)
503
504 self.copy_remote_file_to_nodes(priv_key_file, nodes)
505 self.copy_remote_file_to_nodes(pub_key_file, nodes)
506
507 self.copy_remote_file_to_nodes(auth_key_file, nodes)
508 self.copy_remote_file_to_nodes(known_hosts_file, nodes)
509
512
514 """
515 Copies a remote file from this Node instance to another Node instance
516 without passwordless ssh between the two.
517
518 dest - path to store the data in on the node (defaults to remote_file)
519 """
520 if not dest:
521 dest = remote_file
522 rf = self.ssh.remote_file(remote_file, 'r')
523 contents = rf.read()
524 sts = rf.stat()
525 mode = stat.S_IMODE(sts.st_mode)
526 uid = sts.st_uid
527 gid = sts.st_gid
528 rf.close()
529 for node in nodes:
530 if self.id == node.id and remote_file == dest:
531 log.warn("src and destination are the same: %s, skipping" %
532 remote_file)
533 continue
534 nrf = node.ssh.remote_file(dest, 'w')
535 nrf.write(contents)
536 nrf.chown(uid, gid)
537 nrf.chmod(mode)
538 nrf.close()
539
541 """
542 Remove a user from the remote system
543 """
544 self.ssh.execute('userdel %s' % name)
545 self.ssh.execute('groupdel %s' % name)
546
548 """
549 Export each path in export_paths to each node in nodes via NFS
550
551 nodes - list of nodes to export each path to
552 export_paths - list of paths on this remote host to export to each node
553
554 Example:
555 # export /home and /opt/sge6 to each node in nodes
556 $ node.start_nfs_server()
557 $ node.export_fs_to_nodes(\
558 nodes=[node1,node2], export_paths=['/home', '/opt/sge6']
559 """
560
561 nfs_export_settings = "(async,no_root_squash,no_subtree_check,rw)"
562 etc_exports = self.ssh.remote_file('/etc/exports')
563 for node in nodes:
564 for path in export_paths:
565 etc_exports.write(' '.join([path, node.alias + \
566 nfs_export_settings + '\n']))
567 etc_exports.close()
568 self.ssh.execute('exportfs -a')
569
571 """
572 Removes nodes from this node's /etc/exportfs
573
574 nodes - list of nodes to stop
575
576 Example:
577 $ node.remove_export_fs_to_nodes(nodes=[node1,node2])
578 """
579 regex = '|'.join(map(lambda x: x.alias, nodes))
580 self.ssh.remove_lines_from_file('/etc/exports', regex)
581 self.ssh.execute('exportfs -a')
582
584 self.ssh.execute('/etc/init.d/portmap start')
585 self.ssh.execute('mount -t rpc_pipefs sunrpc /var/lib/nfs/rpc_pipefs/',
586 ignore_exit_status=True)
587 self.ssh.execute('/etc/init.d/nfs start')
588 self.ssh.execute('/usr/sbin/exportfs -r')
589
591 """
592 Mount each path in remote_paths from the remote server_node
593
594 server_node - remote server node that is sharing the remote_paths
595 remote_paths - list of remote paths to mount from server_node
596 """
597 self.ssh.execute('/etc/init.d/portmap start')
598
599 self.ssh.execute('mount -t devpts none /dev/pts',
600 ignore_exit_status=True)
601 remote_paths_regex = '|'.join(map(lambda x: x.center(len(x) + 2),
602 remote_paths))
603 self.ssh.remove_lines_from_file('/etc/fstab', remote_paths_regex)
604 fstab = self.ssh.remote_file('/etc/fstab', 'a')
605 for path in remote_paths:
606 fstab.write('%s:%s %s nfs user,rw,exec,noauto 0 0\n' %
607 (server_node.alias, path, path))
608 fstab.close()
609 for path in remote_paths:
610 if not self.ssh.path_exists(path):
611 self.ssh.makedirs(path)
612 self.ssh.execute('mount %s' % path)
613
615 mount_map = {}
616 mount_lines = self.ssh.execute('mount')
617 for line in mount_lines:
618 dev, on_label, path, type_label, fstype, options = line.split()
619 mount_map[dev] = [path, fstype, options]
620 return mount_map
621
623 """
624 Mount device to path
625 """
626 self.ssh.remove_lines_from_file('/etc/fstab',
627 path.center(len(path) + 2))
628 master_fstab = self.ssh.remote_file('/etc/fstab', mode='a')
629 master_fstab.write("%s %s auto noauto,defaults 0 0\n" % \
630 (device, path))
631 master_fstab.close()
632 if not self.ssh.path_exists(path):
633 self.ssh.makedirs(path)
634 self.ssh.execute('mount %s' % path)
635
645
647 """
648 Remove all network names for node in nodes arg from this node's
649 /etc/hosts file
650 """
651 aliases = map(lambda x: x.alias, nodes)
652 self.ssh.remove_lines_from_file('/etc/hosts', '|'.join(aliases))
653
655 """
656 Set this node's hostname to self.alias
657
658 hostname - optional hostname to set (defaults to self.alias)
659 """
660 hostname = hostname or self.alias
661 hostname_file = self.ssh.remote_file("/etc/hostname", "w")
662 hostname_file.write(hostname)
663 hostname_file.close()
664 self.ssh.execute('hostname -F /etc/hostname')
665
666 @property
675
676 @property
689
691 """
692 Detaches all volumes returned by self.attached_vols
693 """
694 block_devs = self.attached_vols
695 for dev in block_devs:
696 vol_id = block_devs[dev].volume_id
697 vol = self.ec2.get_volume(vol_id)
698 log.info("Detaching volume %s from %s" % (vol.id, self.alias))
699 if vol.status not in ['available', 'detaching']:
700 vol.detach()
701
703 """
704 Detach and destroy EBS root volume (EBS-backed node only)
705 """
706 if not self.is_ebs_backed():
707 return
708 root_vol = self.block_device_mapping[self.root_device_name]
709 vol_id = root_vol.volume_id
710 vol = self.ec2.get_volume(vol_id)
711 vol.detach()
712 while vol.update() != 'availabile':
713 time.sleep(5)
714 log.info("Deleting node %s's root volume" % self.alias)
715 root_vol.delete()
716
717 @property
719 if self.instance.spot_instance_request_id:
720 return self.instance.spot_instance_request_id
721
727
729 return self.alias == "master"
730
733
736
739
742
745
747 return self.spot_id is not None
748
751
753 return self.state == "stopped"
754
756 """
757 Starts EBS-backed instance and puts it in the 'running' state.
758 Only works if this node is EBS-backed, raises
759 exception.InvalidOperation otherwise.
760 """
761 if not self.is_ebs_backed():
762 raise exception.InvalidOperation(
763 "Only EBS-backed instances can be started")
764 return self.instance.start()
765
767 """
768 Shutdown EBS-backed instance and put it in the 'stopped' state.
769 Only works if this node is EBS-backed, raises
770 exception.InvalidOperation otherwise.
771
772 NOTE: The EBS root device will *not* be deleted and the instance can
773 be 'started' later on.
774 """
775 if self.is_spot():
776 raise exception.InvalidOperation(
777 "spot instances can not be stopped")
778 elif not self.is_ebs_backed():
779 raise exception.InvalidOperation(
780 "Only EBS-backed instances can be stopped")
781 if not self.is_stopped():
782 log.info("Stopping node: %s (%s)" % (self.alias, self.id))
783 return self.instance.stop()
784 else:
785 log.info("Node '%s' is already stopped" % self.alias)
786
788 """
789 Shutdown and destroy this instance. For EBS-backed nodes, this
790 will also destroy the node's EBS root device. Puts this node
791 into a 'terminated' state.
792 """
793 log.info("Terminating node: %s (%s)" % (self.alias, self.id))
794 return self.instance.terminate()
795
797 """
798 Shutdown this instance. This method will terminate traditional
799 instance-store instances and stop EBS-backed instances
800 (ie not destroy EBS root dev)
801 """
802 if self.is_stoppable():
803 self.stop()
804 else:
805 self.terminate()
806
808 """
809 Reboot this instance.
810 """
811 self.instance.reboot()
812
818
820 if self.update() != 'running':
821 return False
822 if not self.is_ssh_up():
823 return False
824 if self.private_ip_address is None:
825 log.debug("instance %s has no private_ip_address" % self.id)
826 log.debug(("attempting to determine private_ip_address for" + \
827 "instance %s") % self.id)
828 try:
829 private_ip = self.ssh.execute((
830 'python -c ' + \
831 '"import socket; print socket.gethostbyname(\'%s\')"') % \
832 self.private_dns_name)[0].strip()
833 log.debug("determined instance %s's private ip to be %s" % \
834 (self.id, private_ip))
835 self.instance.private_ip_address = private_ip
836 except Exception, e:
837 print e
838 return False
839 return True
840
845
846 @property
848 if not self._ssh:
849 self._ssh = ssh.SSHClient(self.instance.dns_name,
850 username=self.user,
851 private_key=self.key_location)
852 return self._ssh
853
def shell(self, user=None):
    """
    Open an interactive shell on this node.

    The system's ssh command is used when it is available; otherwise the
    pure-python ssh client's interactive shell is used instead.

    user - optional user to log in as (defaults to self.user)

    Raises exception.InstanceNotRunning if the node's updated state is
    not 'running'.
    """
    if self.update() == 'running':
        login = user or self.user
        if not utils.has_required(['ssh']):
            log.debug("using pure-python ssh client")
            self.ssh.interactive_shell(user=login)
            return
        log.debug("using system's ssh client")
        ssh_cmd = static.SSH_TEMPLATE % (self.key_location, login,
                                         self.dns_name)
        log.debug("ssh_cmd: %s" % ssh_cmd)
        os.system(ssh_cmd)
        return

    # not running: work out how to describe this node in the error
    try:
        alias = self.alias
    except exception.BaseException:
        # no alias could be determined; fall back to the instance id
        alias = None
    if alias == "master":
        label = "master"
    elif alias:
        label = "node"
    else:
        label = 'instance'
    raise exception.InstanceNotRunning(alias or self.id, self.state,
                                       label=label)
883
def get_hosts_entry(self):
    """
    Build the /etc/hosts line for this node.

    Returns a string made of the INTERNAL_IP and INTERNAL_ALIAS values
    from self.network_names, separated by a single space.
    """
    names = self.network_names
    return "%(INTERNAL_IP)s %(INTERNAL_ALIAS)s" % names
889
891 if self._ssh:
892 self._ssh.close()
893