Package starcluster :: Module clustersetup

Source Code for Module starcluster.clustersetup

#!/usr/bin/env python

"""
clustersetup.py
"""
import re
import posixpath

from starcluster import threadpool
from starcluster.utils import print_timing
from starcluster.templates import sge
from starcluster.logger import log


class ClusterSetup(object):
    """
    ClusterSetup Interface

    This is the base class for all StarCluster plugins. A plugin should
    implement at least one, if not all, of these methods.
    """
    def __init__(self, *args, **kwargs):
        pass

    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
        """
        This method gets executed after a node has been added to the cluster
        """
        raise NotImplementedError('on_add_node method not implemented')

    def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before a node is removed from the cluster
        """
        raise NotImplementedError('on_remove_node method not implemented')

    def on_restart(self, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before the cluster is restarted
        """
        raise NotImplementedError('on_restart method not implemented')

    def on_shutdown(self, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before the cluster is shut down
        """
        raise NotImplementedError('on_shutdown method not implemented')

    def run(self, nodes, master, user, user_shell, volumes):
        """
        Run this plugin's setup routines

        This method gets executed after the default cluster setup routines
        have been performed
        """
        raise NotImplementedError('run method not implemented')

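The docstrings above describe the hook points a plugin can implement; per the
class docstring, a plugin may override only the hooks it needs. The following
is a minimal, hypothetical sketch (the class name and log message are
illustrative only, not part of this module) of a plugin that acts only after
the default setup has finished:

from starcluster.clustersetup import ClusterSetup
from starcluster.logger import log


class HelloPlugin(ClusterSetup):
    """Hypothetical example plugin: greet each node after default setup."""

    def run(self, nodes, master, user, user_shell, volumes):
        # executed after the default cluster setup routines have finished
        for node in nodes:
            log.info("Hello from %s" % node.alias)

Assuming the usual StarCluster configuration convention, such a class would be
referenced from a [plugin] section in ~/.starcluster/config via SETUP_CLASS
and listed in the cluster template's PLUGINS setting.
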
class DefaultClusterSetup(ClusterSetup):
    """
    Default ClusterSetup implementation for StarCluster
    """
    def __init__(self, disable_queue=False, disable_threads=False,
                 num_threads=20):
        self._nodes = None
        self._master = None
        self._user = None
        self._user_shell = None
        self._volumes = None
        self._disable_queue = disable_queue
        self._disable_threads = disable_threads
        self._num_threads = num_threads
        self._pool = None

    @property
    def pool(self):
        if not self._pool:
            self._pool = threadpool.get_thread_pool(self._num_threads,
                                                    self._disable_threads)
        return self._pool

    @property
    def nodes(self):
        return filter(lambda x: not x.is_master(), self._nodes)

    @property
    def running_nodes(self):
        return filter(lambda x: x.state in ['running'], self._nodes)

    def _setup_hostnames(self, nodes=None):
        """
        Set each node's hostname to its alias.
        """
        nodes = nodes or self._nodes
        log.info("Configuring hostnames...")
        for node in nodes:
            self.pool.simple_job(node.set_hostname, (), jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    def _setup_cluster_user(self):
        """
        Create the cluster user on all StarCluster nodes

        This method takes care to examine existing folders in /home and set
        the new cluster user's uid/gid accordingly. This is necessary for the
        case of EBS volumes containing /home with large amounts of data in
        them. It's much less expensive in this case to set the uid/gid of the
        new user to the existing uid/gid of the dir on EBS rather than
        chowning potentially terabytes of data.
        """
        mconn = self._master.ssh
        home_folder = '/home/%s' % self._user
        first_uid = 1000
        uid, gid = first_uid, first_uid
        umap = self._master.get_user_map(key_by_uid=True)
        if mconn.path_exists(home_folder):
            # get /home/user's owner/group uid and create
            # user with that uid/gid
            s = mconn.stat(home_folder)
            uid = s.st_uid
            gid = s.st_gid
        else:
            # get highest uid/gid of dirs in /home/*,
            # increment by 1 and create user with that uid/gid
            uid_db = {}
            files = mconn.ls('/home')
            for file in files:
                if mconn.isdir(file):
                    f = mconn.stat(file)
                    uid_db[f.st_uid] = (file, f.st_gid)
            if uid_db.keys():
                max_uid = max(uid_db.keys())
                max_gid = uid_db[max_uid][1]
                uid, gid = max_uid + 1, max_gid + 1
            # make sure the newly selected uid/gid is >= 1000
            uid = max(uid, first_uid)
            gid = max(gid, first_uid)
            # make sure newly selected uid is not already in /etc/passwd
            while umap.get(uid):
                uid += 1
                gid += 1
        log.info("Creating cluster user: %s (uid: %d, gid: %d)" %
                 (self._user, uid, gid))
        self._add_user_to_nodes(uid, gid, self._nodes)

    def _add_user_to_node(self, uid, gid, node):
        existing_user = node.getpwuid(uid)
        if existing_user:
            username = existing_user.pw_name
            if username != self._user:
                msg = ("user %s exists on %s with same uid/gid as "
                       "cluster user %s...removing user %s")
                log.debug(
                    msg % (username, node.alias, self._user, username))
                node.remove_user(username)
                node.add_user(self._user, uid, gid, self._user_shell)
            log.debug("user %s exists on node %s, no action" %
                      (self._user, node.alias))
        else:
            log.debug("user %s does not exist, creating..." % self._user)
            node.add_user(self._user, uid, gid, self._user_shell)

    def _add_user_to_nodes(self, uid, gid, nodes=None):
        nodes = nodes or self._nodes
        for node in nodes:
            self.pool.simple_job(self._add_user_to_node, (uid, gid, node),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    def _setup_scratch_on_node(self, node):
        nconn = node.ssh
        user_scratch = '/mnt/%s' % self._user
        if not nconn.path_exists(user_scratch):
            nconn.mkdir(user_scratch)
        nconn.execute('chown -R %(user)s:%(user)s /mnt/%(user)s' %
                      {'user': self._user})
        scratch = '/scratch'
        if not nconn.path_exists(scratch):
            nconn.mkdir(scratch)
        if not nconn.path_exists(posixpath.join(scratch, self._user)):
            nconn.execute('ln -s %s %s' % (user_scratch, scratch))

    def _setup_scratch(self, nodes=None):
        """Configure scratch space on all StarCluster nodes"""
        log.info("Configuring scratch space for user: %s" % self._user)
        nodes = nodes or self._nodes
        for node in nodes:
            self.pool.simple_job(self._setup_scratch_on_node, (node,),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    def _setup_etc_hosts(self, nodes=None):
        """Configure /etc/hosts on all StarCluster nodes"""
        log.info("Configuring /etc/hosts on each node")
        nodes = nodes or self._nodes
        for node in nodes:
            self.pool.simple_job(node.add_to_etc_hosts, (nodes, ),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    def _setup_passwordless_ssh(self, nodes=None):
        """
        Properly configure passwordless ssh for root and CLUSTER_USER on all
        StarCluster nodes
        """
        log.info("Configuring passwordless ssh for root")
        master = self._master
        nodes = nodes or self.nodes
        master.generate_key_for_user('root', auth_new_key=True,
                                     auth_conn_key=True)
        master.enable_passwordless_ssh('root', nodes)
        # generate public/private keys, authorized_keys, and known_hosts
        # files for cluster_user once on the master node...NFS takes care
        # of the rest
        log.info("Configuring passwordless ssh for %s" % self._user)
        master.generate_key_for_user(self._user, auth_new_key=True,
                                     auth_conn_key=True)
        master.add_to_known_hosts(self._user, nodes)

    def _setup_ebs_volumes(self):
        """
        Mount EBS volumes, if specified in ~/.starcluster/config, under their
        configured mount paths (e.g. /home)
        """
        # setup /etc/fstab on master to use block device if specified
        master = self._master
        devs = master.ssh.ls('/dev')
        for vol in self._volumes:
            vol = self._volumes[vol]
            vol_id = vol.get("volume_id")
            mount_path = vol.get('mount_path')
            device = vol.get("device")
            volume_partition = vol.get('partition')
            if not (vol_id and device and mount_path):
                log.error("missing required settings for vol %s" % vol)
                continue
            if not master.ssh.path_exists(device):
                log.warn("Cannot find device %s for volume %s" %
                         (device, vol))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This usually means there was a problem "
                         "attaching the EBS volume to the master node")
                continue
            if not volume_partition:
                partitions = filter(lambda x: x.startswith(device), devs)
                if len(partitions) == 1:
                    volume_partition = device
                elif len(partitions) == 2:
                    volume_partition = device + '1'
                else:
                    log.error(
                        "volume has more than one partition, must specify "
                        "a partition to use in the config")
                    continue
            elif not master.ssh.path_exists(volume_partition):
                log.warn("Cannot find partition %s on volume %s" %
                         (volume_partition, vol_id))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This either means that the volume has not "
                         "been partitioned or that the partition "
                         "specified does not exist on the volume")
                continue
            log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
            mount_map = self._master.get_mount_map()
            dev = mount_map.get(volume_partition)
            if dev:
                path, fstype, options = dev
                if path != mount_path:
                    log.error("Volume %s is mounted on %s, not on %s" %
                              (vol_id, path, mount_path))
                else:
                    log.info("Volume %s already mounted on %s...skipping" %
                             (vol_id, mount_path))
                continue
            self._master.mount_device(volume_partition, mount_path)

    def _get_nfs_export_paths(self):
        export_paths = ['/home']
        if not self._disable_queue:
            export_paths.append('/opt/sge6')
        for vol in self._volumes:
            vol = self._volumes[vol]
            mount_path = vol.get('mount_path')
            if mount_path not in export_paths:
                export_paths.append(mount_path)
        return export_paths

    def _mount_nfs_shares(self, nodes):
        # setup /etc/fstab and mount each nfs share on each node
        master = self._master
        export_paths = self._get_nfs_export_paths()
        for node in nodes:
            mount_map = node.get_mount_map()
            mount_paths = []
            for path in export_paths:
                network_device = "%s:%s" % (master.alias, path)
                if network_device in mount_map:
                    mount_path, type, options = mount_map.get(network_device)
                    log.debug('nfs share %s already mounted to %s on '
                              'node %s, skipping...' %
                              (network_device, mount_path, node.alias))
                else:
                    mount_paths.append(path)
            log.info("Mounting shares for node %s" % node.alias)
            self.pool.simple_job(node.mount_nfs_shares, (master, mount_paths),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    @print_timing
    def _setup_nfs(self, nodes=None, start_server=True):
        """
        Share /home, /opt/sge6, and all EBS mount paths via NFS to all nodes
        """
        log.info("Configuring NFS...")
        master = self._master
        if not self._disable_queue and not master.ssh.isdir('/opt/sge6'):
            # copy fresh sge installation files to /opt/sge6
            master.ssh.execute('cp -r /opt/sge6-fresh /opt/sge6')
            master.ssh.execute('chown -R %(user)s:%(user)s /opt/sge6' %
                               {'user': self._user})
        # setup /etc/exports and start nfsd on master node
        nodes = nodes or self.nodes
        export_paths = self._get_nfs_export_paths()
        if start_server:
            master.start_nfs_server()
        master.export_fs_to_nodes(nodes, export_paths)
        self._mount_nfs_shares(nodes)

    def _setup_sge(self):
        """
        Install Sun Grid Engine with a default parallel
        environment on StarCluster
        """
        # generate /etc/profile.d/sge.sh for each node
        for node in self._nodes:
            conn = node.ssh
            conn.execute('pkill -9 sge', ignore_exit_status=True)
            conn.execute('rm /etc/init.d/sge*', ignore_exit_status=True)
            sge_profile = conn.remote_file("/etc/profile.d/sge.sh")
            arch = conn.execute("/opt/sge6/util/arch")[0]
            print >> sge_profile, sge.sgeprofile_template % {'arch': arch}
            sge_profile.close()
        # setup sge auto install file
        master = self._master
        default_cell = '/opt/sge6/default'
        if master.ssh.isdir(default_cell):
            log.info("Removing previous SGE installation...")
            master.ssh.execute('rm -rf %s' % default_cell)
            master.ssh.execute('exportfs -fr')
        mconn = master.ssh
        admin_list = ' '.join(map(lambda n: n.alias, self._nodes))
        exec_list = admin_list
        submit_list = admin_list
        ec2_sge_conf = mconn.remote_file("/opt/sge6/ec2_sge.conf")
        # TODO: add sge section to config values for some of the below
        print >> ec2_sge_conf, sge.sgeinstall_template % \
            (admin_list, exec_list, submit_list)
        ec2_sge_conf.close()
        # installs sge in /opt/sge6 and starts qmaster/schedd on master node
        log.info("Installing Sun Grid Engine...")
        mconn.execute('cd /opt/sge6 && TERM=rxvt ./inst_sge -m -x '
                      '-noremote -auto ./ec2_sge.conf',
                      silent=True, only_printable=True)
        # set all.q shell to bash
        mconn.execute('source /etc/profile && '
                      'qconf -mattr queue shell "/bin/bash" all.q')
        for node in self.nodes:
            master.ssh.execute('source /etc/profile && qconf -ah %s' %
                               node.alias)
            master.ssh.execute('source /etc/profile && qconf -as %s' %
                               node.alias)
            self.pool.simple_job(self._add_to_sge, (node,), jobid=node.alias)
        self.pool.wait(numtasks=len(self.nodes))

        # create sge parallel environment
        # first iterate through each machine and count the number of
        # processors
        num_processors = sum(map(lambda n: n.num_processors, self._nodes))
        parallel_environment = mconn.remote_file("/tmp/pe.txt")
        print >> parallel_environment, sge.sge_pe_template % num_processors
        parallel_environment.close()
        mconn.execute("source /etc/profile && qconf -Ap %s" %
                      parallel_environment.name)
        mconn.execute(
            'source /etc/profile && qconf -mattr queue pe_list "orte" all.q')

    def run(self, nodes, master, user, user_shell, volumes):
        """Start cluster configuration"""
        try:
            self._nodes = nodes
            self._master = master
            self._user = user
            self._user_shell = user_shell
            self._volumes = volumes
            self._setup_hostnames()
            self._setup_ebs_volumes()
            self._setup_cluster_user()
            self._setup_scratch()
            self._setup_etc_hosts()
            self._setup_nfs()
            self._setup_passwordless_ssh()
            if not self._disable_queue:
                self._setup_sge()
        finally:
            self.pool.shutdown()

    def _remove_from_etc_hosts(self, node):
        nodes = filter(lambda x: x.id != node.id, self.running_nodes)
        for n in nodes:
            n.remove_from_etc_hosts([node])

    def _remove_nfs_exports(self, node):
        self._master.stop_exporting_fs_to_nodes([node])

    def _remove_from_known_hosts(self, node):
        nodes = filter(lambda x: x.id != node.id, self.running_nodes)
        for n in nodes:
            n.remove_from_known_hosts('root', [node])
            n.remove_from_known_hosts(self._user, [node])

    def _remove_from_sge(self, node):
        master = self._master
        master.ssh.execute(
            'source /etc/profile && qconf -shgrp @allhosts > /tmp/allhosts')
        hgrp_file = master.ssh.remote_file('/tmp/allhosts', 'r')
        contents = hgrp_file.read().splitlines()
        hgrp_file.close()
        c = []
        for line in contents:
            line = line.replace(node.alias, '')
            c.append(line)
        hgrp_file = master.ssh.remote_file('/tmp/allhosts_new', 'w')
        hgrp_file.writelines('\n'.join(c))
        hgrp_file.close()
        master.ssh.execute(
            'source /etc/profile && qconf -Mhgrp /tmp/allhosts_new')
        master.ssh.execute(
            'source /etc/profile && qconf -sq all.q > /tmp/allq')
        allq_file = master.ssh.remote_file('/tmp/allq', 'r')
        contents = allq_file.read()
        allq_file.close()
        c = [l.strip() for l in contents.splitlines()]
        s = []
        allq = []
        for l in c:
            if l.startswith('slots') or l.startswith('['):
                s.append(l)
            else:
                allq.append(l)
        regex = re.compile(r"\[%s=\d+\],?" % node.alias)
        slots = []
        for line in s:
            line = line.replace('\\', '')
            slots.append(regex.sub('', line))
        allq.append(''.join(slots))
        f = master.ssh.remote_file('/tmp/allq_new', 'w')
        allq[-1] = allq[-1].strip()
        if allq[-1].endswith(','):
            allq[-1] = allq[-1][:-1]
        f.write('\n'.join(allq))
        f.close()
        master.ssh.execute('source /etc/profile && qconf -Mq /tmp/allq_new')
        master.ssh.execute(
            'source /etc/profile && qconf -de %s' % node.alias)
        master.ssh.execute(
            'source /etc/profile && qconf -dconf %s' % node.alias)

    def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
        self._nodes = nodes
        self._master = master
        self._user = user
        self._user_shell = user_shell
        self._volumes = volumes
        log.info("Removing node %s (%s)..." % (node.alias, node.id))
        if not self._disable_queue:
            log.info("Removing %s from SGE" % node.alias)
            self._remove_from_sge(node)
        log.info("Removing %s from known_hosts files" % node.alias)
        self._remove_from_known_hosts(node)
        log.info("Removing %s from /etc/hosts" % node.alias)
        self._remove_from_etc_hosts(node)
        log.info("Removing %s from NFS" % node.alias)
        self._remove_nfs_exports(node)

    def _create_user(self, node):
        user = self._master.getpwnam(self._user)
        uid, gid = user.pw_uid, user.pw_gid
        self._add_user_to_nodes(uid, gid, nodes=[node])

    def _add_to_sge(self, node):
        # generate /etc/profile.d/sge.sh
        master = self._master
        sge_profile = node.ssh.remote_file("/etc/profile.d/sge.sh")
        arch = node.ssh.execute("/opt/sge6/util/arch")[0]
        print >> sge_profile, sge.sgeprofile_template % {'arch': arch}
        sge_profile.close()
        master.ssh.execute('source /etc/profile && qconf -ah %s' %
                           node.alias)
        master.ssh.execute('source /etc/profile && qconf -as %s' %
                           node.alias)
        node.ssh.execute('cd /opt/sge6 && TERM=rxvt ./inst_sge '
                         '-x -noremote -auto ./ec2_sge.conf')

    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
        self._nodes = nodes
        self._master = master
        self._user = user
        self._user_shell = user_shell
        self._volumes = volumes
        self._setup_hostnames(nodes=[node])
        self._setup_etc_hosts(nodes)
        self._setup_nfs(nodes=[node], start_server=False)
        self._create_user(node)
        self._setup_scratch(nodes=[node])
        self._setup_passwordless_ssh(nodes=[node])
        if not self._disable_queue:
            log.info("Adding %s to SGE" % node.alias)
            self._add_to_sge(node)
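
A recurring pattern in DefaultClusterSetup is fanning per-node work out through
the thread pool with pool.simple_job(...) followed by pool.wait(numtasks=...).
A custom plugin can borrow the same idea. The sketch below is a hypothetical
illustration only: the class name, the /scratch2 path, and the thread count are
assumptions, while threadpool.get_thread_pool, pool.simple_job, pool.wait,
pool.shutdown, node.ssh, and log are the same calls used by this module.

from starcluster import threadpool
from starcluster.clustersetup import ClusterSetup
from starcluster.logger import log


class Scratch2Plugin(ClusterSetup):
    """Hypothetical plugin: create /scratch2 on every node in parallel."""

    def __init__(self):
        self._pool = None

    @property
    def pool(self):
        if not self._pool:
            # 20 worker threads, threading enabled (mirrors the defaults
            # used by DefaultClusterSetup above)
            self._pool = threadpool.get_thread_pool(20, False)
        return self._pool

    def _make_scratch(self, node):
        # idempotent per-node task: safe to run on new and existing nodes
        if not node.ssh.path_exists('/scratch2'):
            node.ssh.mkdir('/scratch2')
        log.info("/scratch2 ready on %s" % node.alias)

    def run(self, nodes, master, user, user_shell, volumes):
        try:
            for node in nodes:
                self.pool.simple_job(self._make_scratch, (node,),
                                     jobid=node.alias)
            self.pool.wait(numtasks=len(nodes))
        finally:
            self.pool.shutdown()

    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
        # keep late-added nodes consistent with the rest of the cluster
        self._make_scratch(node)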