"""
clustersetup.py
"""
import re
import posixpath

from starcluster import threadpool
from starcluster.utils import print_timing
from starcluster.templates import sge
from starcluster.logger import log


class ClusterSetup(object):
    """
    ClusterSetup Interface

    This is the base class for all StarCluster plugins. A plugin should
    implement at least one if not all of these methods.
    """

    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
        """
        This method gets executed after a node has been added to the cluster
        """
        raise NotImplementedError('on_add_node method not implemented')

    def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before a node is about to be removed from
        the cluster
        """
        raise NotImplementedError('on_remove_node method not implemented')

    def on_restart(self, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before restarting the cluster
        """
        raise NotImplementedError('on_restart method not implemented')

    def on_shutdown(self, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before shutting down the cluster
        """
        raise NotImplementedError('on_shutdown method not implemented')

    def run(self, nodes, master, user, user_shell, volumes):
        """
        Run this plugin's setup routines

        This method gets executed after the default cluster setup routines
        have been performed
        """
        raise NotImplementedError('run method not implemented')
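
# Illustrative sketch (not part of StarCluster itself): a minimal plugin
# built on the ClusterSetup interface above. The class name, constructor
# argument and package list are made up for illustration; a real plugin
# would live in its own module and be referenced from the StarCluster
# config's plugin section.
#
# class InstallPackagesPlugin(ClusterSetup):
#     """Example plugin that installs extra packages on every node"""
#     def __init__(self, packages='htop'):
#         self.packages = packages
#
#     def run(self, nodes, master, user, user_shell, volumes):
#         for node in nodes:
#             log.info("Installing %s on %s" % (self.packages, node.alias))
#             node.ssh.execute('apt-get -y install %s' % self.packages)
#
#     def on_add_node(self, node, nodes, master, user, user_shell, volumes):
#         node.ssh.execute('apt-get -y install %s' % self.packages)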


class DefaultClusterSetup(ClusterSetup):
    """
    Default ClusterSetup implementation for StarCluster
    """
    def __init__(self, disable_queue=False, disable_threads=False,
                 num_threads=20):
        self._nodes = None
        self._master = None
        self._user = None
        self._user_shell = None
        self._volumes = None
        self._disable_queue = disable_queue
        self._disable_threads = disable_threads
        self._num_threads = num_threads
        self._pool = None

    @property
    def pool(self):
        if not self._pool:
            self._pool = threadpool.get_thread_pool(self._num_threads,
                                                    self._disable_threads)
        return self._pool

    @property
    def nodes(self):
        return filter(lambda x: not x.is_master(), self._nodes)

    @property
    def running_nodes(self):
        return filter(lambda x: x.state in ['running'], self._nodes)

    def _setup_cluster_user(self):
        """
        Create the cluster user on all StarCluster nodes

        This method examines any existing home folder in /home and sets the
        new cluster user's uid/gid accordingly. This matters when an EBS
        volume containing /home (with large amounts of data) is attached:
        reusing the existing uid/gid of the directory on the volume is much
        cheaper than chowning potentially terabytes of data.
        """
        mconn = self._master.ssh
        home_folder = '/home/%s' % self._user
        first_uid = 1000
        uid, gid = first_uid, first_uid
        umap = self._master.get_user_map(key_by_uid=True)
        if mconn.path_exists(home_folder):
            # home folder already exists (e.g. on an attached EBS volume);
            # reuse its uid/gid rather than chowning all of its contents
            s = mconn.stat(home_folder)
            uid = s.st_uid
            gid = s.st_gid
        else:
            # no existing home folder; use the next uid/gid above the
            # highest uid currently found under /home
            uid_db = {}
            files = mconn.ls('/home')
            for file in files:
                if mconn.isdir(file):
                    f = mconn.stat(file)
                    uid_db[f.st_uid] = (file, f.st_gid)
            if uid_db.keys():
                max_uid = max(uid_db.keys())
                max_gid = uid_db[max_uid][1]
                uid, gid = max_uid + 1, max_gid + 1
        # never go below the first non-system uid/gid
        uid = max(uid, first_uid)
        gid = max(gid, first_uid)
        # make sure the chosen uid is not already taken by an existing user
        while umap.get(uid):
            uid += 1
            gid += 1
        log.info("Creating cluster user: %s (uid: %d, gid: %d)" %
                 (self._user, uid, gid))
        self._add_user_to_nodes(uid, gid, self._nodes)

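    # The _add_user_to_nodes helper called above is not included in this
    # excerpt. Assuming it simply fans the per-node work out over the
    # thread pool, it would look roughly like this sketch:
    #
    # def _add_user_to_nodes(self, uid, gid, nodes=None):
    #     nodes = nodes or self._nodes
    #     for node in nodes:
    #         self.pool.simple_job(self._add_user_to_node, (uid, gid, node),
    #                              jobid=node.alias)
    #     self.pool.wait(numtasks=len(nodes))
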
    def _add_user_to_node(self, uid, gid, node):
        existing_user = node.getpwuid(uid)
        if existing_user:
            username = existing_user.pw_name
            if username != self._user:
                msg = ("user %s exists on %s with same uid/gid as "
                       "cluster user %s...removing user %s")
                log.debug(
                    msg % (username, node.alias, self._user, username))
                node.remove_user(username)
                node.add_user(self._user, uid, gid, self._user_shell)
            else:
                log.debug("user %s exists on node %s, no action" %
                          (self._user, node.alias))
        else:
            log.debug("user %s does not exist, creating..." % self._user)
            node.add_user(self._user, uid, gid, self._user_shell)

    def _setup_scratch_on_node(self, node):
        nconn = node.ssh
        user_scratch = '/mnt/%s' % self._user
        if not nconn.path_exists(user_scratch):
            nconn.mkdir(user_scratch)
        nconn.execute('chown -R %(user)s:%(user)s /mnt/%(user)s' %
                      {'user': self._user})
        scratch = '/scratch'
        if not nconn.path_exists(scratch):
            nconn.mkdir(scratch)
        if not nconn.path_exists(posixpath.join(scratch, self._user)):
            nconn.execute('ln -s %s %s' % (user_scratch, scratch))

    def _setup_ebs_volumes(self):
        """
        Mount EBS volumes, if specified in ~/.starcluster/config, to /home
        """
        # determine the device/partition for each volume and mount it on
        # the master node
        master = self._master
        devs = master.ssh.ls('/dev')
        for vol in self._volumes:
            vol = self._volumes[vol]
            vol_id = vol.get("volume_id")
            mount_path = vol.get('mount_path')
            device = vol.get("device")
            volume_partition = vol.get('partition')
            if not (vol_id and device and mount_path):
                log.error("missing required settings for vol %s" % vol)
                continue
            if not master.ssh.path_exists(device):
                log.warn("Cannot find device %s for volume %s" % (device,
                                                                  vol))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This usually means there was a problem "
                         "attaching the EBS volume to the master node")
                continue
            if not volume_partition:
                partitions = filter(lambda x: x.startswith(device), devs)
                if len(partitions) == 1:
                    volume_partition = device
                elif len(partitions) == 2:
                    volume_partition = device + '1'
                else:
                    log.error("volume has more than one partition, must "
                              "specify a partition to use in the config")
                    continue
            elif not master.ssh.path_exists(volume_partition):
                log.warn("Cannot find partition %s on volume %s" %
                         (volume_partition, vol_id))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This either means that the volume has not "
                         "been partitioned or that the partition "
                         "specified does not exist on the volume")
                continue
            log.info("Mounting EBS volume %s on %s..." % (vol_id,
                                                          mount_path))
            mount_map = self._master.get_mount_map()
            dev = mount_map.get(volume_partition)
            if dev:
                path, fstype, options = dev
                if path != mount_path:
                    log.error("Volume %s is mounted on %s, not on %s" %
                              (vol_id, path, mount_path))
                else:
                    log.info("Volume %s already mounted on %s...skipping" %
                             (vol_id, mount_path))
                continue
            self._master.mount_device(volume_partition, mount_path)

    def _get_nfs_export_paths(self):
        export_paths = ['/home']
        if not self._disable_queue:
            export_paths.append('/opt/sge6')
        for vol in self._volumes:
            vol = self._volumes[vol]
            mount_path = vol.get('mount_path')
            if mount_path not in export_paths:
                export_paths.append(mount_path)
        return export_paths

    def _mount_nfs_shares(self, nodes):
        # mount each path exported by the master on every node, skipping
        # shares that are already mounted
        master = self._master
        export_paths = self._get_nfs_export_paths()
        for node in nodes:
            mount_map = node.get_mount_map()
            mount_paths = []
            for path in export_paths:
                network_device = "%s:%s" % (master.alias, path)
                if network_device in mount_map:
                    mount_path, fstype, options = \
                        mount_map.get(network_device)
                    log.debug('nfs share %s already mounted to %s on '
                              'node %s, skipping...' %
                              (network_device, mount_path, node.alias))
                else:
                    mount_paths.append(path)
            log.info("Mounting shares for node %s" % node.alias)
            self.pool.simple_job(node.mount_nfs_shares, (master, mount_paths),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    @print_timing
    def _setup_nfs(self, nodes=None, start_server=True):
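        # NOTE: the original body is omitted from this excerpt. The lines
        # below are a sketch of the usual flow, assuming start_nfs_server()
        # and export_fs_to_nodes() exist on the master node object; only
        # _get_nfs_export_paths() and _mount_nfs_shares() are defined in
        # this class.
        log.info("Configuring NFS...")
        nodes = nodes or self.nodes
        export_paths = self._get_nfs_export_paths()
        if start_server:
            self._master.start_nfs_server()
        self._master.export_fs_to_nodes(nodes, export_paths)
        self._mount_nfs_shares(nodes)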

    def _setup_sge(self):
        """
        Install Sun Grid Engine with a default parallel environment on
        StarCluster
        """
        # clean up any previous SGE daemons/init scripts and write
        # /etc/profile.d/sge.sh on every node
        for node in self._nodes:
            conn = node.ssh
            conn.execute('pkill -9 sge', ignore_exit_status=True)
            conn.execute('rm /etc/init.d/sge*', ignore_exit_status=True)
            sge_profile = conn.remote_file("/etc/profile.d/sge.sh")
            arch = conn.execute("/opt/sge6/util/arch")[0]
            print >> sge_profile, sge.sgeprofile_template % {'arch': arch}
            sge_profile.close()

        # remove any previous SGE cell on the master and generate the
        # auto-install configuration file
        master = self._master
        default_cell = '/opt/sge6/default'
        if master.ssh.isdir(default_cell):
            log.info("Removing previous SGE installation...")
            master.ssh.execute('rm -rf %s' % default_cell)
            master.ssh.execute('exportfs -fr')
        mconn = master.ssh
        admin_list = ' '.join(map(lambda n: n.alias, self._nodes))
        exec_list = admin_list
        submit_list = admin_list
        ec2_sge_conf = mconn.remote_file("/opt/sge6/ec2_sge.conf")
        print >> ec2_sge_conf, sge.sgeinstall_template % \
            (admin_list, exec_list, submit_list)
        ec2_sge_conf.close()

        # run the SGE installer (qmaster + execd) on the master node
        log.info("Installing Sun Grid Engine...")
        mconn.execute('cd /opt/sge6 && TERM=rxvt ./inst_sge -m -x '
                      '-noremote -auto ./ec2_sge.conf',
                      silent=True, only_printable=True)

        # set the queue shell to bash, register each node as an admin and
        # submit host, and add each node to SGE in parallel
        mconn.execute('source /etc/profile && '
                      'qconf -mattr queue shell "/bin/bash" all.q')
        for node in self.nodes:
            master.ssh.execute('source /etc/profile && qconf -ah %s' %
                               node.alias)
            master.ssh.execute('source /etc/profile && qconf -as %s' %
                               node.alias)
            self.pool.simple_job(self._add_to_sge, (node,), jobid=node.alias)
        self.pool.wait(numtasks=len(self.nodes))

        # create a parallel environment ("orte") sized to the total number
        # of processors in the cluster and attach it to all.q
        num_processors = sum(map(lambda n: n.num_processors, self._nodes))
        parallel_environment = mconn.remote_file("/tmp/pe.txt")
        print >> parallel_environment, sge.sge_pe_template % num_processors
        parallel_environment.close()
        mconn.execute("source /etc/profile && qconf -Ap %s" %
                      parallel_environment.name)
        mconn.execute(
            'source /etc/profile && qconf -mattr queue pe_list "orte" all.q')

    def run(self, nodes, master, user, user_shell, volumes):
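        # NOTE: the original body is omitted from this excerpt. A sketch of
        # the usual flow using only helpers defined in this class; the real
        # setup routine may run additional steps and in a different order.
        self._nodes = nodes
        self._master = master
        self._user = user
        self._user_shell = user_shell
        self._volumes = volumes
        if self._volumes:
            self._setup_ebs_volumes()
        self._setup_cluster_user()
        self._setup_nfs()
        if not self._disable_queue:
            self._setup_sge()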

    def _remove_from_sge(self, node):
        master = self._master
        # remove the node from the @allhosts host group
        master.ssh.execute(
            'source /etc/profile && qconf -shgrp @allhosts > /tmp/allhosts')
        hgrp_file = master.ssh.remote_file('/tmp/allhosts', 'r')
        contents = hgrp_file.read().splitlines()
        hgrp_file.close()
        c = []
        for line in contents:
            line = line.replace(node.alias, '')
            c.append(line)
        hgrp_file = master.ssh.remote_file('/tmp/allhosts_new', 'w')
        hgrp_file.writelines('\n'.join(c))
        hgrp_file.close()
        master.ssh.execute(
            'source /etc/profile && qconf -Mhgrp /tmp/allhosts_new')
        # drop the node's slots entry from the all.q queue definition
        master.ssh.execute(
            'source /etc/profile && qconf -sq all.q > /tmp/allq')
        allq_file = master.ssh.remote_file('/tmp/allq', 'r')
        contents = allq_file.read()
        allq_file.close()
        c = [l.strip() for l in contents.splitlines()]
        s = []
        allq = []
        for l in c:
            if l.startswith('slots') or l.startswith('['):
                s.append(l)
            else:
                allq.append(l)
        regex = re.compile(r"\[%s=\d+\],?" % node.alias)
        slots = []
        for line in s:
            line = line.replace('\\', '')
            slots.append(regex.sub('', line))
        allq.append(''.join(slots))
        f = master.ssh.remote_file('/tmp/allq_new', 'w')
        allq[-1] = allq[-1].strip()
        if allq[-1].endswith(','):
            allq[-1] = allq[-1][:-1]
        f.write('\n'.join(allq))
        f.close()
        master.ssh.execute('source /etc/profile && qconf -Mq /tmp/allq_new')
        # finally remove the node as an execution host and delete its
        # cluster configuration entry
        master.ssh.execute(
            'source /etc/profile && qconf -de %s' % node.alias)
        master.ssh.execute(
            'source /etc/profile && qconf -dconf %s' % node.alias)

    def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
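        # NOTE: the original body is omitted from this excerpt. A sketch of
        # what this hook needs to do, using helpers defined in this class;
        # the full routine also removes the node from NFS exports,
        # /etc/hosts and known_hosts via helpers not shown here.
        self._nodes = nodes
        self._master = master
        self._user = user
        self._user_shell = user_shell
        self._volumes = volumes
        log.info("Removing node %s..." % node.alias)
        if not self._disable_queue:
            self._remove_from_sge(node)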

    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
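        # NOTE: the original body is omitted from this excerpt. A sketch of
        # the usual flow for a newly added node, using helpers defined or
        # referenced in this class; getpwnam() on the master and the
        # _add_to_sge() helper are assumptions here, as their definitions
        # are not part of this excerpt.
        self._nodes = nodes
        self._master = master
        self._user = user
        self._user_shell = user_shell
        self._volumes = volumes
        log.info("Configuring new node %s..." % node.alias)
        user_ent = self._master.getpwnam(self._user)
        self._add_user_to_node(user_ent.pw_uid, user_ent.pw_gid, node)
        self._setup_scratch_on_node(node)
        self._mount_nfs_shares([node])
        if not self._disable_queue:
            self._add_to_sge(node)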