"""
clustersetup.py
"""
import re
import posixpath

from starcluster import threadpool
from starcluster.utils import print_timing
from starcluster.templates import sge
from starcluster.logger import log
class ClusterSetup(object):
    """
    ClusterSetup Interface

    This is the base class for all StarCluster plugins. A plugin should
    implement at least one, if not all, of these methods.
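
    Example (a minimal sketch; the plugin class and the command it runs are
    hypothetical and only illustrate the interface):

        from starcluster.clustersetup import ClusterSetup
        from starcluster.logger import log

        class InstallHtop(ClusterSetup):
            def run(self, nodes, master, user, user_shell, volumes):
                # called once after the default cluster setup completes
                for node in nodes:
                    log.info("Installing htop on %s" % node.alias)
                    node.ssh.execute('apt-get -y install htop')

            def on_add_node(self, node, nodes, master, user, user_shell,
                            volumes):
                # called for each node added to a running cluster
                node.ssh.execute('apt-get -y install htop')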
    """

    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
        """
        This method gets executed after a node has been added to the cluster
        """
        raise NotImplementedError('on_add_node method not implemented')

    def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before a node is removed from the cluster
        """
        raise NotImplementedError('on_remove_node method not implemented')

    def on_restart(self, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before the cluster is restarted
        """
        raise NotImplementedError('on_restart method not implemented')

    def on_shutdown(self, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before shutting down the cluster
        """
        raise NotImplementedError('on_shutdown method not implemented')

    def run(self, nodes, master, user, user_shell, volumes):
        """
        Run this plugin's setup routines

        This method gets executed after the default cluster setup routines
        have been performed
        """
        raise NotImplementedError('run method not implemented')

class DefaultClusterSetup(ClusterSetup):
    """
    Default ClusterSetup implementation for StarCluster
    """
    def __init__(self, disable_queue=False, disable_threads=False,
                 num_threads=20):
        self._nodes = None
        self._master = None
        self._user = None
        self._user_shell = None
        self._volumes = None
        self._disable_queue = disable_queue
        self._disable_threads = disable_threads
        self._num_threads = num_threads
        self._pool = None

    @property

    @property
    def nodes(self):
        return filter(lambda x: not x.is_master(), self._nodes)

    @property
    def running_nodes(self):
        return filter(lambda x: x.state in ['running'], self._nodes)


        """
        Create cluster user on all StarCluster nodes

        This command takes care to examine existing folders in /home and to
        set the new cluster_user's uid/gid accordingly. This is necessary
        when an EBS volume containing /home with large amounts of data is
        used. In that case it is much less expensive to give the new user
        the existing uid/gid of the directory on the EBS volume than to
        chown potentially terabytes of data. For example, if /home/<user>
        already exists and is owned by uid/gid 1001, the cluster user is
        simply created with uid/gid 1001.
        """
        mconn = self._master.ssh
        home_folder = '/home/%s' % self._user
        first_uid = 1000
        uid, gid = first_uid, first_uid
        umap = self._master.get_user_map(key_by_uid=True)
        if mconn.path_exists(home_folder):
            # /home/<user> already exists (e.g. on an EBS volume) so reuse
            # its owner's uid/gid for the new cluster user
            s = mconn.stat(home_folder)
            uid = s.st_uid
            gid = s.st_gid
        else:
            # no existing home folder: use a uid/gid one higher than the
            # highest uid/gid currently used by directories in /home
            uid_db = {}
            files = mconn.ls('/home')
            for file in files:
                if mconn.isdir(file):
                    f = mconn.stat(file)
                    uid_db[f.st_uid] = (file, f.st_gid)
            if uid_db.keys():
                max_uid = max(uid_db.keys())
                max_gid = uid_db[max_uid][1]
                uid, gid = max_uid + 1, max_gid + 1
        # never use a uid/gid below the first normal user id (1000)
        uid = max(uid, first_uid)
        gid = max(gid, first_uid)
        # make sure the chosen uid is not already in use on the master
        while umap.get(uid):
            uid += 1
            gid += 1
        log.info("Creating cluster user: %s (uid: %d, gid: %d)" % (self._user,
                                                                   uid, gid))
        self._add_user_to_nodes(uid, gid, self._nodes)

        existing_user = node.getpwuid(uid)
        if existing_user:
            username = existing_user.pw_name
            if username != self._user:
                msg = ("user %s exists on %s with same uid/gid as "
                       "cluster user %s...removing user %s")
                log.debug(
                    msg % (username, node.alias, self._user, username))
                node.remove_user(username)
                node.add_user(self._user, uid, gid, self._user_shell)
            log.debug("user %s exists on node %s, no action" %
                      (self._user, node.alias))
        else:
            log.debug("user %s does not exist, creating..." % self._user)
            node.add_user(self._user, uid, gid, self._user_shell)


        # create a per-user scratch dir under /mnt and expose it as
        # /scratch/<user> via a symlink
        nconn = node.ssh
        user_scratch = '/mnt/%s' % self._user
        if not nconn.path_exists(user_scratch):
            nconn.mkdir(user_scratch)
        nconn.execute('chown -R %(user)s:%(user)s /mnt/%(user)s' %
                      {'user': self._user})
        scratch = '/scratch'
        if not nconn.path_exists(scratch):
            nconn.mkdir(scratch)
        if not nconn.path_exists(posixpath.join(scratch, self._user)):
            nconn.execute('ln -s %s %s' % (user_scratch, scratch))

        """
        Mount EBS volumes, if specified in ~/.starcluster/config, to their
        configured mount paths (e.g. /home)
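
        Each volume entry in the config must provide a volume id, device,
        and mount path, and may specify a partition. An illustrative config
        entry (all values below are made up; key names follow the usual
        StarCluster volume settings):

            [volume mydata]
            VOLUME_ID = vol-xxxxxxxx
            MOUNT_PATH = /home
            DEVICE = /dev/sdz
            PARTITION = 1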
        """

        master = self._master
        devs = master.ssh.ls('/dev')
        for vol in self._volumes:
            vol = self._volumes[vol]
            vol_id = vol.get("volume_id")
            mount_path = vol.get('mount_path')
            device = vol.get("device")
            volume_partition = vol.get('partition')
            if not (vol_id and device and mount_path):
                log.error("missing required settings for vol %s" % vol)
                continue
            dev_exists = master.ssh.path_exists(device)
            if not dev_exists and device.startswith('/dev/sd'):
                # newer kernels expose EBS devices as /dev/xvd* rather than
                # /dev/sd*, so check for the equivalent xvd device
                device = device.replace('/dev/sd', '/dev/xvd')
                dev_exists = master.ssh.path_exists(device)
            if not dev_exists:
                log.warn("Cannot find device %s for volume %s" %
                         (device, vol_id))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This usually means there was a problem "
                         "attaching the EBS volume to the master node")
                continue
            if not volume_partition:
                partitions = filter(lambda x: x.startswith(device), devs)
                if len(partitions) == 1:
                    volume_partition = device
                elif len(partitions) == 2:
                    volume_partition = device + '1'
                else:
                    log.error(
                        "volume has more than one partition, please specify "
                        "which partition to use (e.g. partition=0, "
                        "partition=1, etc.) in the volume's config")
                    continue
            elif not master.ssh.path_exists(volume_partition):
                log.warn("Cannot find partition %s on volume %s" %
                         (volume_partition, vol_id))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This either means that the volume has not "
                         "been partitioned or that the partition "
                         "specified does not exist on the volume")
                continue
            log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
            mount_map = self._master.get_mount_map()
            dev = mount_map.get(volume_partition)
            if dev:
                path, fstype, options = dev
                if path != mount_path:
                    log.error("Volume %s is mounted on %s, not on %s" %
                              (vol_id, path, mount_path))
                else:
                    log.info(
                        "Volume %s already mounted on %s...skipping" %
                        (vol_id, mount_path))
                continue
            self._master.mount_device(volume_partition, mount_path)

        export_paths = ['/home']
        if not self._disable_queue:
            export_paths.append('/opt/sge6')
        for vol in self._volumes:
            vol = self._volumes[vol]
            mount_path = vol.get('mount_path')
            if mount_path not in export_paths:
                export_paths.append(mount_path)
        return export_paths

        master = self._master
        export_paths = self._get_nfs_export_paths()
        for node in nodes:
            mount_map = node.get_mount_map()
            mount_paths = []
            for path in export_paths:
                network_device = "%s:%s" % (master.alias, path)
                if network_device in mount_map:
                    mount_path, type, options = mount_map.get(network_device)
                    log.debug('nfs share %s already mounted to %s on '
                              'node %s, skipping...' %
                              (network_device, mount_path, node.alias))
                else:
                    mount_paths.append(path)
            self.pool.simple_job(node.mount_nfs_shares, (master, mount_paths),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    @print_timing("Setting up NFS")
    def _setup_nfs(self, nodes=None, start_server=True):

        """
        Create or update an SGE parallel environment

        name - name of parallel environment
        nodes - list of nodes to include in the parallel environment
                (default: all)
        queue - configure queue to use the new parallel environment
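
        Example (illustrative; the PE name below is arbitrary and "all.q"
        is the default SGE queue modified elsewhere in this class):

            self._create_sge_pe(name="orte", queue="all.q")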
        """
        mssh = self._master.ssh
        pe_exists = mssh.get_status('qconf -sp %s' % name, source_profile=True)
        pe_exists = pe_exists == 0
        if not pe_exists:
            log.info("Creating SGE parallel environment '%s'" % name)
        else:
            log.info("Updating SGE parallel environment '%s'" % name)
        # total processor count across the nodes participating in the PE
        nodes = nodes or self._nodes
        num_processors = sum(self.pool.map(lambda n: n.num_processors, nodes))
        penv = mssh.remote_file("/tmp/pe.txt")
        print >> penv, sge.sge_pe_template % (name, num_processors)
        penv.close()
        # qconf -Ap adds a new PE from file; qconf -Mp modifies an existing one
        if not pe_exists:
            mssh.execute("qconf -Ap %s" % penv.name, source_profile=True)
        else:
            mssh.execute("qconf -Mp %s" % penv.name, source_profile=True)
        if queue:
            log.info("Adding parallel environment '%s' to queue '%s'" %
                     (name, queue))
            mssh.execute('qconf -mattr queue pe_list "%s" %s' % (name, queue),
                         source_profile=True)

        """
        Install Sun Grid Engine with a default parallel environment on
        StarCluster
        """
        # stop any running SGE daemons, remove old init scripts, and write
        # /etc/profile.d/sge.sh on every node
        for node in self._nodes:
            conn = node.ssh
            conn.execute('pkill -9 sge', ignore_exit_status=True)
            conn.execute('rm /etc/init.d/sge*', ignore_exit_status=True)
            sge_profile = conn.remote_file("/etc/profile.d/sge.sh")
            arch = conn.execute("/opt/sge6/util/arch")[0]
            print >> sge_profile, sge.sgeprofile_template % {'arch': arch}
            sge_profile.close()
        # remove any previous SGE installation (default cell) on the master
        # and re-export the NFS shares
        master = self._master
        default_cell = '/opt/sge6/default'
        if master.ssh.isdir(default_cell):
            log.info("Removing previous SGE installation...")
            master.ssh.execute('rm -rf %s' % default_cell)
            master.ssh.execute('exportfs -fra')
        mconn = master.ssh
        admin_list = ' '.join(map(lambda n: n.alias, self._nodes))
        exec_list = admin_list
        submit_list = admin_list
        ec2_sge_conf = mconn.remote_file("/opt/sge6/ec2_sge.conf")
        # build the SGE auto-install configuration from the template
        conf = sge.sgeinstall_template % (admin_list, exec_list, submit_list)
        print >> ec2_sge_conf, conf
        ec2_sge_conf.close()

        log.info("Installing Sun Grid Engine...")
        mconn.execute('cd /opt/sge6 && TERM=rxvt ./inst_sge -m -x -noremote '
                      '-auto ./ec2_sge.conf', silent=True, only_printable=True)
        # use bash as the default shell for the all.q queue
        mconn.execute('qconf -mattr queue shell "/bin/bash" all.q',
                      source_profile=True)
        # register each node as an SGE administrative and submit host and
        # add it to SGE via the thread pool
        for node in self.nodes:
            self._add_sge_administrative_host(node)
            self._add_sge_submit_host(node)
            self.pool.simple_job(self._add_to_sge, (node,), jobid=node.alias)
        self.pool.wait(numtasks=len(self.nodes))
        self._create_sge_pe()

    def run(self, nodes, master, user, user_shell, volumes):

        master = self._master
        # remove the node from the @allhosts host group
        master.ssh.execute('qconf -shgrp @allhosts > /tmp/allhosts',
                           source_profile=True)
        hgrp_file = master.ssh.remote_file('/tmp/allhosts', 'r')
        contents = hgrp_file.read().splitlines()
        hgrp_file.close()
        c = []
        for line in contents:
            line = line.replace(node.alias, '')
            c.append(line)
        hgrp_file = master.ssh.remote_file('/tmp/allhosts_new', 'w')
        hgrp_file.writelines('\n'.join(c))
        hgrp_file.close()
        master.ssh.execute('qconf -Mhgrp /tmp/allhosts_new',
                           source_profile=True)
        # remove the node's slots entry from the all.q queue configuration
        master.ssh.execute('qconf -sq all.q > /tmp/allq', source_profile=True)
        allq_file = master.ssh.remote_file('/tmp/allq', 'r')
        contents = allq_file.read()
        allq_file.close()
        c = [l.strip() for l in contents.splitlines()]
        s = []
        allq = []
        for l in c:
            if l.startswith('slots') or l.startswith('['):
                s.append(l)
            else:
                allq.append(l)
        regex = re.compile(r"\[%s=\d+\],?" % node.alias)
        slots = []
        for line in s:
            line = line.replace('\\', '')
            slots.append(regex.sub('', line))
        allq.append(''.join(slots))
        f = master.ssh.remote_file('/tmp/allq_new', 'w')
        allq[-1] = allq[-1].strip()
        if allq[-1].endswith(','):
            allq[-1] = allq[-1][:-1]
        f.write('\n'.join(allq))
        f.close()
        master.ssh.execute('qconf -Mq /tmp/allq_new', source_profile=True)
        # deregister the node as an SGE config/exec host and stop sge_execd
        master.ssh.execute('qconf -dconf %s' % node.alias, source_profile=True)
        master.ssh.execute('qconf -de %s' % node.alias, source_profile=True)
        node.ssh.execute('pkill -9 sge_execd')
        # rebuild the parallel environment without the removed node
        nodes = filter(lambda n: n.alias != node.alias, self._nodes)
        self._create_sge_pe(nodes=nodes)

    def on_remove_node(self, node, nodes, master, user, user_shell, volumes):

    def on_add_node(self, node, nodes, master, user, user_shell, volumes):