3 """
4 clustersetup.py
5 """
6 import re
7 import posixpath
8
9 from starcluster import threadpool
10 from starcluster.utils import print_timing
11 from starcluster.templates import sge
12 from starcluster.logger import log
16 """
17 ClusterSetup Interface
18
19 This is the base class for all StarCluster plugins. A plugin should
20 implement at least one if not all of these methods.
21 """
    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
        """
        This method gets executed after a node has been added to the cluster
        """
        raise NotImplementedError('on_add_node method not implemented')

    def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
        """
        This method gets executed just before a node is removed from the
        cluster
        """
        raise NotImplementedError('on_remove_node method not implemented')

    def on_restart(self, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before restarting the cluster
        """
        raise NotImplementedError('on_restart method not implemented')

    def on_shutdown(self, nodes, master, user, user_shell, volumes):
        """
        This method gets executed before shutting down the cluster
        """
        raise NotImplementedError('on_shutdown method not implemented')

    def run(self, nodes, master, user, user_shell, volumes):
        """
        Run this plugin's setup routines

        This method gets executed after the default cluster setup routines
        have been performed
        """
        raise NotImplementedError('run method not implemented')
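
    # Example (illustrative sketch only, not part of this module): a plugin
    # subclasses ClusterSetup and overrides whichever hooks it needs. The
    # class and package names below are hypothetical.
    #
    #     from starcluster.clustersetup import ClusterSetup
    #     from starcluster.logger import log
    #
    #     class PackageInstaller(ClusterSetup):
    #         def __init__(self, pkg_to_install='htop'):
    #             self.pkg_to_install = pkg_to_install
    #
    #         def run(self, nodes, master, user, user_shell, volumes):
    #             for node in nodes:
    #                 log.info("Installing %s on %s" %
    #                          (self.pkg_to_install, node.alias))
    #                 node.ssh.execute('apt-get -y install %s' %
    #                                  self.pkg_to_install)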


class DefaultClusterSetup(ClusterSetup):
    """
    Default ClusterSetup implementation for StarCluster
    """
    def __init__(self, disable_queue=False, disable_threads=False,
                 num_threads=20):
        self._nodes = None
        self._master = None
        self._user = None
        self._user_shell = None
        self._volumes = None
        self._disable_queue = disable_queue
        self._disable_threads = disable_threads
        self._num_threads = num_threads
        self._pool = None

    @property
    def pool(self):
        # Lazily build the worker pool used to run per-node setup tasks in
        # parallel (reconstructed here; assumes the threadpool module's
        # get_thread_pool factory accepts a size and a disable_threads flag)
        if self._pool is None:
            self._pool = threadpool.get_thread_pool(
                self._num_threads, disable_threads=self._disable_threads)
        return self._pool

    @property
    def nodes(self):
        return filter(lambda x: not x.is_master(), self._nodes)

    @property
    def running_nodes(self):
        return filter(lambda x: x.state in ['running'], self._nodes)
102 """
103 Create cluster user on all StarCluster nodes
104
105 This command takes care to examine existing folders in /home
106 and set the new cluster_user's uid/gid accordingly. This is necessary
107 for the case of EBS volumes containing /home with large amounts of data
108 in them. It's much less expensive in this case to set the uid/gid of
109 the new user to be the existing uid/gid of the dir in EBS rather than
110 chowning potentially terabytes of data.
111 """
112 mconn = self._master.ssh
113 home_folder = '/home/%s' % self._user
114 first_uid = 1000
115 uid, gid = first_uid, first_uid
116 umap = self._master.get_user_map(key_by_uid=True)
117 if mconn.path_exists(home_folder):
118
119
120 s = mconn.stat(home_folder)
121 uid = s.st_uid
122 gid = s.st_gid
123 else:
124
125
126 uid_db = {}
127 files = mconn.ls('/home')
128 for file in files:
129 if mconn.isdir(file):
130 f = mconn.stat(file)
131 uid_db[f.st_uid] = (file, f.st_gid)
132 if uid_db.keys():
133 max_uid = max(uid_db.keys())
134 max_gid = uid_db[max_uid][1]
135 uid, gid = max_uid + 1, max_gid + 1
136
137 uid = max(uid, first_uid)
138 gid = max(gid, first_uid)
139
140 while umap.get(uid):
141 uid += 1
142 gid += 1
143 log.info("Creating cluster user: %s (uid: %d, gid: %d)" % (self._user,
144 uid, gid))
145 self._add_user_to_nodes(uid, gid, self._nodes)

    def _add_user_to_node(self, uid, gid, node):
        existing_user = node.getpwuid(uid)
        if existing_user:
            username = existing_user.pw_name
            if username != self._user:
                msg = ("user %s exists on %s with same uid/gid as "
                       "cluster user %s...removing user %s")
                log.debug(
                    msg % (username, node.alias, self._user, username))
                node.remove_user(username)
                node.add_user(self._user, uid, gid, self._user_shell)
            else:
                log.debug("user %s already exists on node %s, no action" %
                          (self._user, node.alias))
        else:
            log.debug("user %s does not exist, creating..." % self._user)
            node.add_user(self._user, uid, gid, self._user_shell)
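
    def _add_user_to_nodes(self, uid, gid, nodes=None):
        # Reconstructed sketch of the helper called by _setup_cluster_user
        # above: fan the per-node user creation out to the worker pool,
        # mirroring the pool.simple_job/pool.wait pattern used elsewhere in
        # this class.
        nodes = nodes or self._nodes
        for node in nodes:
            self.pool.simple_job(self._add_user_to_node, (uid, gid, node),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))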

    def _setup_scratch_on_node(self, node):
        # create /mnt/<user> owned by the cluster user and symlink it into
        # /scratch/<user>
        nconn = node.ssh
        user_scratch = '/mnt/%s' % self._user
        if not nconn.path_exists(user_scratch):
            nconn.mkdir(user_scratch)
        nconn.execute('chown -R %(user)s:%(user)s /mnt/%(user)s' %
                      {'user': self._user})
        scratch = '/scratch'
        if not nconn.path_exists(scratch):
            nconn.mkdir(scratch)
        if not nconn.path_exists(posixpath.join(scratch, self._user)):
            nconn.execute('ln -s %s %s' % (user_scratch, scratch))

    def _setup_ebs_volumes(self):
        """
        Mount EBS volumes, if specified in ~/.starcluster/config, at their
        configured mount paths on the master node
        """
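        # For reference, a volume is typically defined in
        # ~/.starcluster/config with a section roughly like the following
        # (names are illustrative):
        #
        #     [volume mydata]
        #     VOLUME_ID = vol-0123456789
        #     MOUNT_PATH = /mydata
        #     # PARTITION = 1      (optional)
        #
        # which shows up here in self._volumes under the keys 'volume_id',
        # 'mount_path', 'device' and 'partition' used below.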
        master = self._master
        devs = master.ssh.ls('/dev')
        for vol in self._volumes:
            vol = self._volumes[vol]
            vol_id = vol.get("volume_id")
            mount_path = vol.get('mount_path')
            device = vol.get("device")
            volume_partition = vol.get('partition')
            if not (vol_id and device and mount_path):
                log.error("missing required settings for vol %s" % vol)
                continue
            dev_exists = master.ssh.path_exists(device)
            if not dev_exists and device.startswith('/dev/sd'):
                # check for the device under its /dev/xvd* alias (newer
                # kernels expose attached EBS volumes as xvd devices)
                device = device.replace('/dev/sd', '/dev/xvd')
                dev_exists = master.ssh.path_exists(device)
            if not dev_exists:
                log.warn("Cannot find device %s for volume %s" %
                         (device, vol_id))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This usually means there was a problem "
                         "attaching the EBS volume to the master node")
                continue
            if not volume_partition:
                partitions = filter(lambda x: x.startswith(device), devs)
                if len(partitions) == 1:
                    # no partitions, mount the raw device
                    volume_partition = device
                elif len(partitions) == 2:
                    # a single partition, use it
                    volume_partition = device + '1'
                else:
                    log.error(
                        "volume has more than one partition, please specify "
                        "which partition to use (e.g. partition=0, "
                        "partition=1, etc) in the volume's config")
                    continue
            elif not master.ssh.path_exists(volume_partition):
                log.warn("Cannot find partition %s on volume %s" %
                         (volume_partition, vol_id))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This either means that the volume has not "
                         "been partitioned or that the partition "
                         "specified does not exist on the volume")
                continue
            log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
            mount_map = self._master.get_mount_map()
            dev = mount_map.get(volume_partition)
            if dev:
                path, fstype, options = dev
                if path != mount_path:
                    log.error("Volume %s is mounted on %s, not on %s" %
                              (vol_id, path, mount_path))
                else:
                    log.info("Volume %s already mounted on %s...skipping" %
                             (vol_id, mount_path))
                continue
            self._master.mount_device(volume_partition, mount_path)

    def _get_nfs_export_paths(self):
        # /home is always exported; /opt/sge6 is exported unless the queue
        # is disabled, plus the mount path of every configured EBS volume
        export_paths = ['/home']
        if not self._disable_queue:
            export_paths.append('/opt/sge6')
        for vol in self._volumes:
            vol = self._volumes[vol]
            mount_path = vol.get('mount_path')
            if mount_path not in export_paths:
                export_paths.append(mount_path)
        return export_paths

    def _mount_nfs_shares(self, nodes):
        # mount the master's NFS exports on each node, skipping any share
        # that is already mounted
        master = self._master
        export_paths = self._get_nfs_export_paths()
        for node in nodes:
            mount_map = node.get_mount_map()
            mount_paths = []
            for path in export_paths:
                network_device = "%s:%s" % (master.alias, path)
                if network_device in mount_map:
                    mount_path, fstype, options = \
                        mount_map.get(network_device)
                    log.debug('nfs share %s already mounted to %s on '
                              'node %s, skipping...' %
                              (network_device, mount_path, node.alias))
                else:
                    mount_paths.append(path)
            self.pool.simple_job(node.mount_nfs_shares, (master, mount_paths),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
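        # e.g. with export_paths == ['/home', '/opt/sge6'], each node ends
        # up with master:/home on /home and master:/opt/sge6 on /opt/sge6
        # (assuming node.mount_nfs_shares mounts each exported path at the
        # same location locally)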

    @print_timing("Setting up NFS")
    def _setup_nfs(self, nodes=None, start_server=True):
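        # The original body is elided here. A minimal sketch, assuming the
        # master node object provides start_nfs_server() and
        # export_fs_to_nodes() helpers: start nfsd on the master (when
        # start_server is True), export the NFS paths to the nodes, then
        # mount the shares on each node.
        log.info("Configuring NFS...")
        nodes = nodes or self.nodes
        export_paths = self._get_nfs_export_paths()
        if start_server:
            self._master.start_nfs_server()
        self._master.export_fs_to_nodes(nodes, export_paths)
        self._mount_nfs_shares(nodes)
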
336 """
337 Create or update an SGE parallel environment
338
339 name - name of parallel environment
340 nodes - list of nodes to include in the parallel environment
341 (default: all)
342 queue - configure queue to use the new parallel environment
343 """
        mssh = self._master.ssh
        pe_exists = mssh.get_status('qconf -sp %s' % name,
                                    source_profile=True)
        pe_exists = pe_exists == 0
        if not pe_exists:
            log.info("Creating SGE parallel environment '%s'" % name)
        else:
            log.info("Updating SGE parallel environment '%s'" % name)
        # the PE's slot count is the total number of processors in the
        # cluster
        nodes = nodes or self._nodes
        num_processors = sum(self.pool.map(lambda n: n.num_processors, nodes))
        penv = mssh.remote_file("/tmp/pe.txt")
        print >> penv, sge.sge_pe_template % (name, num_processors)
        penv.close()
        if not pe_exists:
            mssh.execute("qconf -Ap %s" % penv.name, source_profile=True)
        else:
            mssh.execute("qconf -Mp %s" % penv.name, source_profile=True)
        if queue:
            log.info("Adding parallel environment '%s' to queue '%s'" %
                     (name, queue))
            mssh.execute('qconf -mattr queue pe_list "%s" %s' % (name, queue),
                         source_profile=True)
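
        # With the default PE attached to the queue, users can then request
        # parallel slots at submission time, e.g. (illustrative):
        #
        #     qsub -pe orte 8 ./my_parallel_job.sh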

    def _setup_sge(self):
        """
        Install Sun Grid Engine with a default parallel
        environment on StarCluster
        """
        # clean out any running SGE daemons/init scripts and generate an
        # /etc/profile.d/sge.sh with the right architecture on every node
        for node in self._nodes:
            conn = node.ssh
            conn.execute('pkill -9 sge', ignore_exit_status=True)
            conn.execute('rm /etc/init.d/sge*', ignore_exit_status=True)
            sge_profile = conn.remote_file("/etc/profile.d/sge.sh")
            arch = conn.execute("/opt/sge6/util/arch")[0]
            print >> sge_profile, sge.sgeprofile_template % {'arch': arch}
            sge_profile.close()
        # remove any previous SGE cell on the master and refresh the NFS
        # exports so the nodes see a clean /opt/sge6
        master = self._master
        default_cell = '/opt/sge6/default'
        if master.ssh.isdir(default_cell):
            log.info("Removing previous SGE installation...")
            master.ssh.execute('rm -rf %s' % default_cell)
            master.ssh.execute('exportfs -fr')
        mconn = master.ssh
        admin_list = ' '.join(map(lambda n: n.alias, self._nodes))
        exec_list = admin_list
        submit_list = admin_list
        ec2_sge_conf = mconn.remote_file("/opt/sge6/ec2_sge.conf")
        # generate the SGE auto-install config listing every node as an
        # admin, exec and submit host
        conf = sge.sgeinstall_template % (admin_list, exec_list, submit_list)
        print >> ec2_sge_conf, conf
        ec2_sge_conf.close()
        # run SGE's automated installer on the master node
        log.info("Installing Sun Grid Engine...")
        mconn.execute('cd /opt/sge6 && TERM=rxvt ./inst_sge -m -x -noremote '
                      '-auto ./ec2_sge.conf', silent=True, only_printable=True)
        # use bash as the default shell for jobs submitted to all.q
        mconn.execute('qconf -mattr queue shell "/bin/bash" all.q',
                      source_profile=True)
        for node in self.nodes:
            self._add_sge_administrative_host(node)
            self._add_sge_submit_host(node)
            self.pool.simple_job(self._add_to_sge, (node,), jobid=node.alias)
        self.pool.wait(numtasks=len(self.nodes))
        self._create_sge_pe()

    def run(self, nodes, master, user, user_shell, volumes):
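        # The original body is elided here. A minimal sketch that wires the
        # arguments onto the instance and invokes the setup routines defined
        # above (the full implementation may perform additional steps that
        # are not shown in this file):
        self._nodes = nodes
        self._master = master
        self._user = user
        self._user_shell = user_shell
        self._volumes = volumes
        self._setup_ebs_volumes()
        self._setup_cluster_user()
        self._setup_nfs()
        if not self._disable_queue:
            self._setup_sge()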

    def _remove_from_sge(self, node):
        master = self._master
        # drop the node from the @allhosts host group
        master.ssh.execute('qconf -shgrp @allhosts > /tmp/allhosts',
                           source_profile=True)
        hgrp_file = master.ssh.remote_file('/tmp/allhosts', 'r')
        contents = hgrp_file.read().splitlines()
        hgrp_file.close()
        c = []
        for line in contents:
            line = line.replace(node.alias, '')
            c.append(line)
        hgrp_file = master.ssh.remote_file('/tmp/allhosts_new', 'w')
        hgrp_file.writelines('\n'.join(c))
        hgrp_file.close()
        master.ssh.execute('qconf -Mhgrp /tmp/allhosts_new',
                           source_profile=True)
        # rewrite all.q so the departing node's [host=N] slots entry is gone
        master.ssh.execute('qconf -sq all.q > /tmp/allq', source_profile=True)
        allq_file = master.ssh.remote_file('/tmp/allq', 'r')
        contents = allq_file.read()
        allq_file.close()
        c = [l.strip() for l in contents.splitlines()]
        s = []
        allq = []
        for l in c:
            if l.startswith('slots') or l.startswith('['):
                s.append(l)
            else:
                allq.append(l)
        # e.g. "slots 2,[master=2],[node001=2]" -> "slots 2,[master=2]"
        # when node001 is removed
        regex = re.compile(r"\[%s=\d+\],?" % node.alias)
        slots = []
        for line in s:
            line = line.replace('\\', '')
            slots.append(regex.sub('', line))
        allq.append(''.join(slots))
        f = master.ssh.remote_file('/tmp/allq_new', 'w')
        allq[-1] = allq[-1].strip()
        if allq[-1].endswith(','):
            allq[-1] = allq[-1][:-1]
        f.write('\n'.join(allq))
        f.close()
        master.ssh.execute('qconf -Mq /tmp/allq_new', source_profile=True)
        # remove the node as an SGE admin/exec host, stop its execd and
        # rebuild the parallel environment without it
        master.ssh.execute('qconf -dconf %s' % node.alias,
                           source_profile=True)
        master.ssh.execute('qconf -de %s' % node.alias, source_profile=True)
        node.ssh.execute('pkill -9 sge_execd')
        nodes = filter(lambda n: n.alias != node.alias, self._nodes)
        self._create_sge_pe(nodes=nodes)

    def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
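        # The original body is elided here. A minimal sketch: refresh the
        # plugin's view of the cluster and withdraw the departing node from
        # SGE (the full implementation presumably also cleans up NFS exports
        # and /etc/hosts entries, which are not shown in this file).
        self._nodes = nodes
        self._master = master
        self._user = user
        self._user_shell = user_shell
        self._volumes = volumes
        log.info("Removing %s from cluster" % node.alias)
        if not self._disable_queue:
            self._remove_from_sge(node)
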
    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
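        # The original body is elided here. A minimal sketch: record the new
        # cluster state, export/mount the NFS shares on the new node and,
        # unless the queue is disabled, hook it into SGE via the helpers
        # referenced in _setup_sge above (the full implementation also has
        # to create the cluster user and scratch space on the node, which is
        # omitted here).
        self._nodes = nodes
        self._master = master
        self._user = user
        self._user_shell = user_shell
        self._volumes = volumes
        log.info("Adding %s to cluster" % node.alias)
        self._setup_nfs(nodes=[node], start_server=False)
        if not self._disable_queue:
            self._add_sge_administrative_host(node)
            self._add_sge_submit_host(node)
            self._add_to_sge(node)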