
Source Code for Module starcluster.clustersetup

"""
clustersetup.py
"""
import re
import posixpath

from starcluster import threadpool
from starcluster.utils import print_timing
from starcluster.templates import sge
from starcluster.logger import log


class ClusterSetup(object):
    """
    ClusterSetup Interface

    This is the base class for all StarCluster plugins. A plugin should
    implement at least one if not all of these methods.
    """
    def __init__(self, *args, **kwargs):
        pass

    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
        """
        This method gets executed after a node has been added to the cluster
        """
        raise NotImplementedError('on_add_node method not implemented')

    def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before a node is about to be removed from
        the cluster
        """
        raise NotImplementedError('on_remove_node method not implemented')

    def on_restart(self, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before restarting the cluster
        """
        raise NotImplementedError('on_restart method not implemented')

    def on_shutdown(self, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before shutting down the cluster
        """
        raise NotImplementedError('on_shutdown method not implemented')

    def run(self, nodes, master, user, user_shell, volumes):
        """
        Run this plugin's setup routines

        This method gets executed after the default cluster setup routines
        have been performed
        """
        raise NotImplementedError('run method not implemented')

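# Example (illustrative sketch, not part of this module): a minimal plugin
# subclassing the ClusterSetup interface above. StarCluster invokes run()
# after the default cluster setup and on_add_node() for each node added
# later. The class name, the 'htop' package, and the apt-get command are
# hypothetical; node.ssh.execute, node.alias, and log are the same objects
# used throughout this module.
#
# class PackageInstaller(ClusterSetup):
#     def __init__(self, pkg='htop'):
#         self.pkg = pkg
#
#     def run(self, nodes, master, user, user_shell, volumes):
#         for node in nodes:
#             log.info("Installing %s on %s" % (self.pkg, node.alias))
#             node.ssh.execute('apt-get -y install %s' % self.pkg)
#
#     def on_add_node(self, node, nodes, master, user, user_shell, volumes):
#         node.ssh.execute('apt-get -y install %s' % self.pkg)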

class DefaultClusterSetup(ClusterSetup):
    """
    Default ClusterSetup implementation for StarCluster
    """
    def __init__(self, disable_queue=False, disable_threads=False,
                 num_threads=20):
        self._nodes = None
        self._master = None
        self._user = None
        self._user_shell = None
        self._volumes = None
        self._disable_queue = disable_queue
        self._disable_threads = disable_threads
        self._num_threads = num_threads
        self._pool = None

    @property
    def pool(self):
        if not self._pool:
            self._pool = threadpool.get_thread_pool(self._num_threads,
                                                    self._disable_threads)
        return self._pool

    @property
    def nodes(self):
        return filter(lambda x: not x.is_master(), self._nodes)

    @property
    def running_nodes(self):
        return filter(lambda x: x.state in ['running'], self._nodes)

    def _setup_hostnames(self, nodes=None):
        """
        Set each node's hostname to its alias.
        """
        nodes = nodes or self._nodes
        log.info("Configuring hostnames...")
        for node in nodes:
            self.pool.simple_job(node.set_hostname, (), jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    def _setup_cluster_user(self):
        """
        Create cluster user on all StarCluster nodes

        This command takes care to examine existing folders in /home
        and set the new cluster_user's uid/gid accordingly. This is necessary
        for the case of EBS volumes containing /home with large amounts of
        data in them. It's much less expensive in this case to set the
        uid/gid of the new user to be the existing uid/gid of the dir in EBS
        rather than chowning potentially terabytes of data.
        """
        mconn = self._master.ssh
        home_folder = '/home/%s' % self._user
        first_uid = 1000
        uid, gid = first_uid, first_uid
        umap = self._master.get_user_map(key_by_uid=True)
        if mconn.path_exists(home_folder):
            # get /home/user's owner/group uid and create
            # user with that uid/gid
            s = mconn.stat(home_folder)
            uid = s.st_uid
            gid = s.st_gid
        else:
            # get highest uid/gid of dirs in /home/*,
            # increment by 1 and create user with that uid/gid
            uid_db = {}
            files = mconn.ls('/home')
            for file in files:
                if mconn.isdir(file):
                    f = mconn.stat(file)
                    uid_db[f.st_uid] = (file, f.st_gid)
            if uid_db.keys():
                max_uid = max(uid_db.keys())
                max_gid = uid_db[max_uid][1]
                uid, gid = max_uid + 1, max_gid + 1
            # make sure the newly selected uid/gid is >= 1000
            uid = max(uid, first_uid)
            gid = max(gid, first_uid)
            # make sure newly selected uid is not already in /etc/passwd
            while umap.get(uid):
                uid += 1
                gid += 1
        log.info("Creating cluster user: %s (uid: %d, gid: %d)" % (self._user,
                                                                   uid, gid))
        self._add_user_to_nodes(uid, gid, self._nodes)

    def _add_user_to_node(self, uid, gid, node):
        existing_user = node.getpwuid(uid)
        if existing_user:
            username = existing_user.pw_name
            if username != self._user:
                msg = ("user %s exists on %s with same uid/gid as "
                       "cluster user %s...removing user %s")
                log.debug(msg % (username, node.alias, self._user, username))
                node.remove_user(username)
                node.add_user(self._user, uid, gid, self._user_shell)
            log.debug("user %s exists on node %s, no action" %
                      (self._user, node.alias))
        else:
            log.debug("user %s does not exist, creating..." % self._user)
            node.add_user(self._user, uid, gid, self._user_shell)

    def _add_user_to_nodes(self, uid, gid, nodes=None):
        nodes = nodes or self._nodes
        for node in nodes:
            self.pool.simple_job(self._add_user_to_node, (uid, gid, node),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    def _setup_scratch_on_node(self, node):
        nconn = node.ssh
        user_scratch = '/mnt/%s' % self._user
        if not nconn.path_exists(user_scratch):
            nconn.mkdir(user_scratch)
        nconn.execute('chown -R %(user)s:%(user)s /mnt/%(user)s' %
                      {'user': self._user})
        scratch = '/scratch'
        if not nconn.path_exists(scratch):
            nconn.mkdir(scratch)
        if not nconn.path_exists(posixpath.join(scratch, self._user)):
            nconn.execute('ln -s %s %s' % (user_scratch, scratch))

    def _setup_scratch(self, nodes=None):
        """ Configure scratch space on all StarCluster nodes """
        log.info("Configuring scratch space for user: %s" % self._user)
        nodes = nodes or self._nodes
        for node in nodes:
            self.pool.simple_job(self._setup_scratch_on_node, (node,),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    def _setup_etc_hosts(self, nodes=None):
        """ Configure /etc/hosts on all StarCluster nodes """
        log.info("Configuring /etc/hosts on each node")
        nodes = nodes or self._nodes
        for node in nodes:
            self.pool.simple_job(node.add_to_etc_hosts, (nodes, ),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    def _setup_passwordless_ssh(self, nodes=None):
        """
        Properly configure passwordless ssh for root and CLUSTER_USER on all
        StarCluster nodes
        """
        log.info("Configuring passwordless ssh for root")
        master = self._master
        nodes = nodes or self.nodes
        master.generate_key_for_user('root', auth_new_key=True,
                                     auth_conn_key=True)
        master.enable_passwordless_ssh('root', nodes)
        # generate public/private keys, authorized_keys, and known_hosts files
        # for cluster_user once on master node...NFS takes care of the rest
        log.info("Configuring passwordless ssh for %s" % self._user)
        master.generate_key_for_user(self._user, auth_new_key=True,
                                     auth_conn_key=True)
        master.add_to_known_hosts(self._user, nodes)

    def _setup_ebs_volumes(self):
        """
        Mount EBS volumes, if specified in ~/.starcluster/config, to /home
        """
        # setup /etc/fstab on master to use block device if specified
        master = self._master
        devs = master.ssh.ls('/dev')
        for vol in self._volumes:
            vol = self._volumes[vol]
            vol_id = vol.get("volume_id")
            mount_path = vol.get('mount_path')
            device = vol.get("device")
            volume_partition = vol.get('partition')
            if not (vol_id and device and mount_path):
                log.error("missing required settings for vol %s" % vol)
                continue
            dev_exists = master.ssh.path_exists(device)
            if not dev_exists and device.startswith('/dev/sd'):
                # check for "correct" device in unpatched kernels
                device = device.replace('/dev/sd', '/dev/xvd')
                dev_exists = master.ssh.path_exists(device)
            if not dev_exists:
                log.warn("Cannot find device %s for volume %s" %
                         (device, vol_id))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This usually means there was a problem "
                         "attaching the EBS volume to the master node")
                continue
            if not volume_partition:
                partitions = filter(lambda x: x.startswith(device), devs)
                if len(partitions) == 1:
                    volume_partition = device
                elif len(partitions) == 2:
                    volume_partition = device + '1'
                else:
                    log.error(
                        "volume has more than one partition, please specify "
                        "which partition to use (e.g. partition=0, "
                        "partition=1, etc.) in the volume's config")
                    continue
            elif not master.ssh.path_exists(volume_partition):
                log.warn("Cannot find partition %s on volume %s" %
                         (volume_partition, vol_id))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This either means that the volume has not "
                         "been partitioned or that the partition "
                         "specified does not exist on the volume")
                continue
            log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
            mount_map = self._master.get_mount_map()
            dev = mount_map.get(volume_partition)
            if dev:
                path, fstype, options = dev
                if path != mount_path:
                    log.error("Volume %s is mounted on %s, not on %s" %
                              (vol_id, path, mount_path))
                else:
                    log.info("Volume %s already mounted on %s...skipping" %
                             (vol_id, mount_path))
                continue
            self._master.mount_device(volume_partition, mount_path)

    def _get_nfs_export_paths(self):
        export_paths = ['/home']
        if not self._disable_queue:
            export_paths.append('/opt/sge6')
        for vol in self._volumes:
            vol = self._volumes[vol]
            mount_path = vol.get('mount_path')
            if not mount_path in export_paths:
                export_paths.append(mount_path)
        return export_paths

    def _mount_nfs_shares(self, nodes):
        # setup /etc/fstab and mount each nfs share on each node
        master = self._master
        export_paths = self._get_nfs_export_paths()
        for node in nodes:
            mount_map = node.get_mount_map()
            mount_paths = []
            for path in export_paths:
                network_device = "%s:%s" % (master.alias, path)
                if network_device in mount_map:
                    mount_path, type, options = mount_map.get(network_device)
                    log.debug('nfs share %s already mounted to %s on '
                              'node %s, skipping...' %
                              (network_device, mount_path, node.alias))
                else:
                    mount_paths.append(path)
            self.pool.simple_job(node.mount_nfs_shares, (master, mount_paths),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    @print_timing("Setting up NFS")
    def _setup_nfs(self, nodes=None, start_server=True):
        """
        Share /home, /opt/sge6, and all EBS mount paths via NFS to all nodes
        """
        log.info("Configuring NFS...")
        master = self._master
        if not self._disable_queue and not master.ssh.isdir('/opt/sge6'):
            # copy fresh sge installation files to /opt/sge6
            master.ssh.execute('cp -r /opt/sge6-fresh /opt/sge6')
            master.ssh.execute('chown -R %(user)s:%(user)s /opt/sge6' %
                               {'user': self._user})
        # setup /etc/exports and start nfsd on master node
        nodes = nodes or self.nodes
        export_paths = self._get_nfs_export_paths()
        if start_server:
            master.start_nfs_server()
        master.export_fs_to_nodes(nodes, export_paths)
        self._mount_nfs_shares(nodes)

    def _create_sge_pe(self, name="orte", nodes=None, queue="all.q"):
        """
        Create or update an SGE parallel environment

        name - name of parallel environment
        nodes - list of nodes to include in the parallel environment
                (default: all)
        queue - configure queue to use the new parallel environment
        """
        mssh = self._master.ssh
        pe_exists = mssh.get_status('qconf -sp %s' % name, source_profile=True)
        pe_exists = pe_exists == 0
        if not pe_exists:
            log.info("Creating SGE parallel environment '%s'" % name)
        else:
            log.info("Updating SGE parallel environment '%s'" % name)
        # iterate through each machine and count the number of processors
        nodes = nodes or self._nodes
        num_processors = sum(self.pool.map(lambda n: n.num_processors, nodes))
        penv = mssh.remote_file("/tmp/pe.txt")
        print >> penv, sge.sge_pe_template % (name, num_processors)
        penv.close()
        if not pe_exists:
            mssh.execute("qconf -Ap %s" % penv.name, source_profile=True)
        else:
            mssh.execute("qconf -Mp %s" % penv.name, source_profile=True)
        if queue:
            log.info("Adding parallel environment '%s' to queue '%s'" %
                     (name, queue))
            mssh.execute('qconf -mattr queue pe_list "%s" %s' % (name, queue),
                         source_profile=True)

    def _setup_sge(self):
        """
        Install Sun Grid Engine with a default parallel
        environment on StarCluster
        """
        # generate /etc/profile.d/sge.sh for each node
        for node in self._nodes:
            conn = node.ssh
            conn.execute('pkill -9 sge', ignore_exit_status=True)
            conn.execute('rm /etc/init.d/sge*', ignore_exit_status=True)
            sge_profile = conn.remote_file("/etc/profile.d/sge.sh")
            arch = conn.execute("/opt/sge6/util/arch")[0]
            print >> sge_profile, sge.sgeprofile_template % {'arch': arch}
            sge_profile.close()
        # setup sge auto install file
        master = self._master
        default_cell = '/opt/sge6/default'
        if master.ssh.isdir(default_cell):
            log.info("Removing previous SGE installation...")
            master.ssh.execute('rm -rf %s' % default_cell)
            master.ssh.execute('exportfs -fra')
        mconn = master.ssh
        admin_list = ' '.join(map(lambda n: n.alias, self._nodes))
        exec_list = admin_list
        submit_list = admin_list
        ec2_sge_conf = mconn.remote_file("/opt/sge6/ec2_sge.conf")
        # TODO: add sge section to config values for some of the below
        conf = sge.sgeinstall_template % (admin_list, exec_list, submit_list)
        print >> ec2_sge_conf, conf
        ec2_sge_conf.close()
        # installs sge in /opt/sge6 and starts qmaster/schedd on master node
        log.info("Installing Sun Grid Engine...")
        mconn.execute('cd /opt/sge6 && TERM=rxvt ./inst_sge -m -x -noremote '
                      '-auto ./ec2_sge.conf', silent=True, only_printable=True)
        # set all.q shell to bash
        mconn.execute('qconf -mattr queue shell "/bin/bash" all.q',
                      source_profile=True)
        for node in self.nodes:
            self._add_sge_administrative_host(node)
            self._add_sge_submit_host(node)
            self.pool.simple_job(self._add_to_sge, (node,), jobid=node.alias)
        self.pool.wait(numtasks=len(self.nodes))
        self._create_sge_pe()

    def run(self, nodes, master, user, user_shell, volumes):
        """Start cluster configuration"""
        try:
            self._nodes = nodes
            self._master = master
            self._user = user
            self._user_shell = user_shell
            self._volumes = volumes
            self._setup_hostnames()
            self._setup_ebs_volumes()
            self._setup_cluster_user()
            self._setup_scratch()
            self._setup_etc_hosts()
            self._setup_nfs()
            self._setup_passwordless_ssh()
            if not self._disable_queue:
                self._setup_sge()
        finally:
            self.pool.shutdown()

    def _remove_from_etc_hosts(self, node):
        nodes = filter(lambda x: x.id != node.id, self.running_nodes)
        for n in nodes:
            n.remove_from_etc_hosts([node])

    def _remove_nfs_exports(self, node):
        self._master.stop_exporting_fs_to_nodes([node])

    def _remove_from_known_hosts(self, node):
        nodes = filter(lambda x: x.id != node.id, self.running_nodes)
        for n in nodes:
            n.remove_from_known_hosts('root', [node])
            n.remove_from_known_hosts(self._user, [node])

    def _remove_from_sge(self, node):
        master = self._master
        master.ssh.execute('qconf -shgrp @allhosts > /tmp/allhosts',
                           source_profile=True)
        hgrp_file = master.ssh.remote_file('/tmp/allhosts', 'r')
        contents = hgrp_file.read().splitlines()
        hgrp_file.close()
        c = []
        for line in contents:
            line = line.replace(node.alias, '')
            c.append(line)
        hgrp_file = master.ssh.remote_file('/tmp/allhosts_new', 'w')
        hgrp_file.writelines('\n'.join(c))
        hgrp_file.close()
        master.ssh.execute('qconf -Mhgrp /tmp/allhosts_new',
                           source_profile=True)
        master.ssh.execute('qconf -sq all.q > /tmp/allq', source_profile=True)
        allq_file = master.ssh.remote_file('/tmp/allq', 'r')
        contents = allq_file.read()
        allq_file.close()
        c = [l.strip() for l in contents.splitlines()]
        s = []
        allq = []
        for l in c:
            if l.startswith('slots') or l.startswith('['):
                s.append(l)
            else:
                allq.append(l)
        regex = re.compile(r"\[%s=\d+\],?" % node.alias)
        slots = []
        for line in s:
            line = line.replace('\\', '')
            slots.append(regex.sub('', line))
        allq.append(''.join(slots))
        f = master.ssh.remote_file('/tmp/allq_new', 'w')
        allq[-1] = allq[-1].strip()
        if allq[-1].endswith(','):
            allq[-1] = allq[-1][:-1]
        f.write('\n'.join(allq))
        f.close()
        master.ssh.execute('qconf -Mq /tmp/allq_new', source_profile=True)
        master.ssh.execute('qconf -dconf %s' % node.alias, source_profile=True)
        master.ssh.execute('qconf -de %s' % node.alias, source_profile=True)
        node.ssh.execute('pkill -9 sge_execd')
        nodes = filter(lambda n: n.alias != node.alias, self._nodes)
        self._create_sge_pe(nodes=nodes)

    def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
        self._nodes = nodes
        self._master = master
        self._user = user
        self._user_shell = user_shell
        self._volumes = volumes
        log.info("Removing node %s (%s)..." % (node.alias, node.id))
        if not self._disable_queue:
            log.info("Removing %s from SGE" % node.alias)
            self._remove_from_sge(node)
        log.info("Removing %s from known_hosts files" % node.alias)
        self._remove_from_known_hosts(node)
        log.info("Removing %s from /etc/hosts" % node.alias)
        self._remove_from_etc_hosts(node)
        log.info("Removing %s from NFS" % node.alias)
        self._remove_nfs_exports(node)

    def _create_user(self, node):
        user = self._master.getpwnam(self._user)
        uid, gid = user.pw_uid, user.pw_gid
        self._add_user_to_nodes(uid, gid, nodes=[node])

    def _add_sge_submit_host(self, node):
        mssh = self._master.ssh
        mssh.execute('qconf -as %s' % node.alias, source_profile=True)

    def _add_sge_administrative_host(self, node):
        mssh = self._master.ssh
        mssh.execute('qconf -ah %s' % node.alias, source_profile=True)

    def _add_to_sge(self, node):
        # generate /etc/profile.d/sge.sh
        sge_profile = node.ssh.remote_file("/etc/profile.d/sge.sh")
        arch = node.ssh.execute("/opt/sge6/util/arch")[0]
        print >> sge_profile, sge.sgeprofile_template % {'arch': arch}
        sge_profile.close()
        node.ssh.execute('cd /opt/sge6 && TERM=rxvt ./inst_sge -x -noremote '
                         '-auto ./ec2_sge.conf')

    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
        self._nodes = nodes
        self._master = master
        self._user = user
        self._user_shell = user_shell
        self._volumes = volumes
        self._setup_hostnames(nodes=[node])
        self._setup_etc_hosts(nodes)
        self._setup_nfs(nodes=[node], start_server=False)
        self._create_user(node)
        self._setup_scratch(nodes=[node])
        self._setup_passwordless_ssh(nodes=[node])
        if not self._disable_queue:
            log.info("Adding %s to SGE" % node.alias)
            self._add_sge_administrative_host(node)
            self._add_sge_submit_host(node)
            self._add_to_sge(node)
        self._create_sge_pe()
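A plugin such as the sketch shown between the two classes above is normally
wired into a cluster through ~/.starcluster/config (the same file referenced
in _setup_ebs_volumes). Roughly, and with the section name, module path, and
parameter purely illustrative:

    [plugin packageinstaller]
    setup_class = mypackage.PackageInstaller
    pkg = htop

    [cluster smallcluster]
    plugins = packageinstaller

StarCluster instantiates the plugin class with the extra settings from its
[plugin] section passed as keyword arguments, then calls the ClusterSetup
hooks shown above; exact option names should be checked against the
StarCluster configuration documentation.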