1
2
3 """
4 clustersetup.py
5 """
6
7 import os
8 import shutil
9 import tempfile
10
11 from starcluster.templates.sgeprofile import sgeprofile_template
12 from starcluster.templates.sgeinstall import sgeinstall_template
13 from starcluster.templates.sge_pe import sge_pe_template
14 from starcluster.logger import log
15
17 """
18 ClusterSetup Interface
19 """
22
def run(self, nodes, master, user, user_shell, volumes):
    """
    Start cluster setup routines.

    This interface-level implementation performs no setup: it ignores all
    of its arguments and always raises NotImplementedError.
    """
    message = 'run method not implemented'
    raise NotImplementedError(message)
26
28 """
29 Default ClusterSetup implementation for StarCluster
30 """
# Nothing about the cluster is known yet: every piece of setup state
# starts out as None.
self._nodes = self._master = None
self._user = self._user_shell = None
self._volumes = None
37
"""
Create cluster user on all StarCluster nodes

This command takes care to examine existing folders in /home
and set the new cluster_user's uid/gid accordingly. This is necessary
for the case of EBS volumes containing /home with large amounts of data
in them. It's much less expensive in this case to set the uid/gid of the
new user to be the existing uid/gid of the dir in EBS rather than
chowning potentially terabytes of data.

How the uid/gid are chosen:
  * /home/<user> already exists on the master: reuse its owner uid/gid.
  * otherwise: look at the entries returned by ls('/home') that are
    directories, take the highest owning uid, add one to that uid and to
    that directory's gid, and never go below 1000.
  * no directories found: fall back to 1000/1000.
"""
mconn = self._master.ssh
home_folder = '/home/%s' % self._user
first_uid = 1000
uid, gid = first_uid, first_uid
if mconn.path_exists(home_folder):
    # Reuse the existing owner so the folder never has to be chowned.
    home_stat = mconn.stat(home_folder)
    uid = home_stat.st_uid
    gid = home_stat.st_gid
else:
    # Map each owning uid to (entry, gid) for the directories in /home.
    uid_db = {}
    for entry in mconn.ls('/home'):
        if mconn.isdir(entry):
            entry_stat = mconn.stat(entry)
            uid_db[entry_stat.st_uid] = (entry, entry_stat.st_gid)
    if uid_db:
        max_uid = max(uid_db)
        # NOTE(review): the gid is taken from the directory owned by the
        # highest uid, which is not necessarily the highest gid in use.
        max_gid = uid_db[max_uid][1]
        uid = max(max_uid + 1, first_uid)
        gid = max(max_gid + 1, first_uid)
# The values are passed as (uid, gid), so the label lists them in that order.
log.debug("Cluster user uid/gid: (%d, %d)" % (uid, gid))
log.info("Creating cluster user: %s" % self._user)
for node in self._nodes:
    nconn = node.ssh
    # -o lets groupadd/useradd accept an id that is already in use.
    nconn.execute('groupadd -o -g %s %s' % (gid, self._user))
    nconn.execute('useradd -o -u %s -g %s -m -s `which %s` %s' %
                  (uid, gid, self._user_shell, self._user))
80
"""
Configure scratch space on all StarCluster nodes

On every node this creates /mnt/<user>, hands it to the cluster user,
creates /scratch and symlinks /mnt/<user> into it.
"""
log.info("Configuring scratch space for user: %s" % self._user)
scratch_commands = (
    'mkdir /mnt/%s' % self._user,
    'chown -R %(user)s:%(user)s /mnt/%(user)s' % {'user':self._user},
    'mkdir /scratch',
    'ln -s /mnt/%s /scratch' % self._user,
)
for node in self._nodes:
    node_ssh = node.ssh
    # Same four commands, in the same order, on each node.
    for command in scratch_commands:
        node_ssh.execute(command)
90
"""
Configure /etc/hosts on all StarCluster nodes

Each node's /etc/hosts receives the same content: a warning comment, the
localhost line, and one get_hosts_entry() line per cluster node.
"""
log.info("Configuring /etc/hosts on each node")
# The entries do not depend on which node is being written, so collect
# them once instead of once per target node.
hosts_entries = [peer.get_hosts_entry() for peer in self._nodes]
for node in self._nodes:
    # NOTE(review): remote_file() is called without a mode here; presumably
    # it opens for writing (the fstab code passes mode='a' to append).
    host_file = node.ssh.remote_file('/etc/hosts')
    try:
        print >> host_file, "# Do not remove the following line or programs that require network functionality will fail"
        print >> host_file, "127.0.0.1 localhost.localdomain localhost"
        for entry in hosts_entries:
            print >> host_file, entry
    finally:
        # Close the remote handle even if one of the writes fails.
        host_file.close()
102
"""
Properly configure passwordless ssh for root and for CLUSTER_USER on all
StarCluster nodes

Root: a fresh RSA keypair is generated on the master, copied to every
node and authorized there; the master then ssh-es to every network name
of every node and the resulting known_hosts is copied to all nodes.

Cluster user: an existing complete keypair in /home/<user>/.ssh on the
master is reused, otherwise a new one is generated; root's
authorized_keys and known_hosts are copied in and the user's public key
is appended to authorized_keys.
"""
log.info("Configuring passwordless ssh for root")
mconn = self._master.ssh

# Always start from a fresh root keypair on the master; the rm is allowed
# to fail when no key exists yet.
mconn.execute('rm /root/.ssh/id_rsa*', ignore_exit_status=True)
mconn.execute('ssh-keygen -q -t rsa -f /root/.ssh/id_rsa -P ""')

# The keys travel through a local scratch directory. It holds root's
# private key, so it is removed again no matter how this step ends.
tempdir = tempfile.mkdtemp(prefix="starcluster-")
try:
    temprsa = os.path.join(tempdir, 'id_rsa')
    temprsa_pub = os.path.join(tempdir, 'id_rsa.pub')
    tempknown_hosts = os.path.join(tempdir, 'known_hosts')
    mconn.get('/root/.ssh/id_rsa', temprsa)
    mconn.get('/root/.ssh/id_rsa.pub', temprsa_pub)

    # Install the shared keypair on every node and authorize it.
    for node in self._nodes:
        conn = node.ssh
        conn.put(temprsa, '/root/.ssh/id_rsa')
        conn.put(temprsa_pub, '/root/.ssh/id_rsa.pub')
        conn.execute('chmod 400 /root/.ssh/id_rsa*')
        conn.execute('cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys')

    # Connect once to every name of every node with host-key checking
    # disabled; presumably this is what fills the master's known_hosts
    # file that is fetched right below.
    for node in self._nodes:
        for name in node.network_names.values():
            mconn.execute('ssh -o "StrictHostKeyChecking=no" %s hostname' % name)

    # Give every node the same known_hosts as the master.
    mconn.get('/root/.ssh/known_hosts', tempknown_hosts)
    for node in self._nodes:
        node.ssh.put(tempknown_hosts, '/root/.ssh/known_hosts')
finally:
    shutil.rmtree(tempdir, ignore_errors=True)

log.info("Configuring passwordless ssh for user: %s" % self._user)

mconn.execute('mkdir -p /home/%s/.ssh' % self._user)
pkfiles_list = ("/home/%(user)s/.ssh/id_rsa /home/%(user)s/.ssh/id_rsa.pub" %
                {'user':self._user}).split()

# The remote test prints the word True or False; compare the text instead
# of eval()-ing output that comes from a remote machine.
pkfiles_exist = [
    mconn.execute('test -f %s && echo "True" || echo "False"' % path)[0].strip() == "True"
    for path in pkfiles_list
]
has_all_pkfiles = all(pkfiles_exist)

if not has_all_pkfiles:
    # An incomplete keypair is useless: drop whatever half is present and
    # generate a new pair.
    for path, exists in zip(pkfiles_list, pkfiles_exist):
        log.debug('Checking for orphaned private key file: %s | exists = %s' % (path, exists))
        if exists:
            log.debug('Removing orphaned private key file: %s' % path)
            mconn.execute('rm %s' % path)
    log.info("Generating local RSA ssh keys for user: %s" % self._user)
    mconn.execute('ssh-keygen -q -t rsa -f /home/%s/.ssh/id_rsa -P ""' %
                  self._user)
else:
    log.info("Using existing RSA ssh keys found for user: %s" %
             self._user)

# NOTE(review): the user's files are only written on the master;
# presumably /home is shared with the other nodes afterwards - confirm.
mconn.execute('cp /root/.ssh/authorized_keys /home/%s/.ssh/' %
              self._user)
mconn.execute('cp /root/.ssh/known_hosts /home/%s/.ssh/' % self._user)
mconn.execute('chown -R %(user)s:%(user)s /home/%(user)s/.ssh' %
              {'user':self._user})
mconn.execute('chmod 400 /home/%s/.ssh/id_rsa*' % self._user)
mconn.execute('cat /home/%(user)s/.ssh/id_rsa.pub >> /home/%(user)s/.ssh/authorized_keys' %
              {'user':self._user})
178
"""
Mount each EBS volume in self._volumes on the master node

A volume is only considered when it has a volume_id, a partition and a
mount_path. If its device or partition cannot be found on the master, a
warning is logged and the volume is skipped. Otherwise an fstab entry is
appended, the mount point is created and the volume is mounted.
"""
for vol_name in self._volumes:
    vol = self._volumes[vol_name]
    vol_id = vol.get("volume_id")
    device = vol.get("device")
    volume_partition = vol.get('partition')
    mount_path = vol.get('mount_path')
    if not (vol_id and volume_partition and mount_path):
        # Incompletely described volume: nothing to mount.
        continue
    log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
    mconn = self._master.ssh
    if not mconn.path_exists(device):
        # Report the volume id rather than dumping the whole settings dict.
        log.warn("Cannot find device %s for volume %s" % (device, vol_id))
        log.warn("Not mounting %s on %s" % (vol_id, mount_path))
        log.warn("This usually means there was a problem "
                 "attaching the EBS volume to the master node")
        continue
    if not mconn.path_exists(volume_partition):
        log.warn("Cannot find partition %s on volume %s" %
                 (volume_partition, vol_id))
        log.warn("Not mounting %s on %s" % (vol_id, mount_path))
        log.warn("This either means that the volume has not been "
                 "partitioned or that the partition specified "
                 "does not exist on the volume")
        continue
    # Append (mode='a') so the existing fstab entries are preserved.
    master_fstab = mconn.remote_file('/etc/fstab', mode='a')
    try:
        print >> master_fstab, "%s %s auto noauto,defaults 0 0 " % (
            volume_partition, mount_path)
    finally:
        master_fstab.close()
    mconn.execute('mkdir -p %s' % mount_path)
    mconn.execute('mount %s' % mount_path)
214
"""
Share /home and /opt/sge6 via nfs to all nodes

Volumes in self._volumes whose mount_path is set and is neither /home nor
/opt/sge6 are exported by the master and mounted on the other nodes as
well. Volumes without a mount_path are skipped.
"""
log.info("Configuring NFS...")

master = self._master
mconn = master.ssh

# Replace /opt/sge6 on the master with a copy of /opt/sge6-fresh owned by
# the cluster user.
mconn.execute('rm -rf /opt/sge6')
mconn.execute('cp -r /opt/sge6-fresh /opt/sge6')
mconn.execute('chown -R %(user)s:%(user)s /opt/sge6' % {'user':
                                                       self._user})

# Mount points to share in addition to /home and /opt/sge6. Computed once
# because it is the same for every node. A volume with no mount_path
# cannot be exported or mounted, so it is left out instead of breaking the
# string concatenation below.
extra_mounts = []
for vol_name in (self._volumes or ()):
    mount_path = self._volumes[vol_name].get('mount_path')
    if mount_path and mount_path not in ('/home', '/opt/sge6'):
        extra_mounts.append(mount_path)

# One export line per shared path per non-master node.
nfs_export_settings = "(async,no_root_squash,no_subtree_check,rw)"
etc_exports = mconn.remote_file('/etc/exports')
try:
    for node in self._nodes:
        if node.is_master():
            continue
        client = node.private_dns_name + nfs_export_settings + '\n'
        etc_exports.write('/home ' + client)
        etc_exports.write('/opt/sge6 ' + client)
        for mount_path in extra_mounts:
            etc_exports.write(mount_path + ' ' + client)
finally:
    etc_exports.close()

mconn.execute('/etc/init.d/portmap start')
mconn.execute('mount -t rpc_pipefs sunrpc /var/lib/nfs/rpc_pipefs/')
mconn.execute('/etc/init.d/nfs start')
mconn.execute('/usr/sbin/exportfs -r')
# Best effort: the exit status of this mount is deliberately ignored.
mconn.execute('mount -t devpts none /dev/pts', ignore_exit_status=True)

# Mount the master's exports on every other node.
for node in self._nodes:
    if node.is_master():
        continue
    nconn = node.ssh
    nconn.execute('/etc/init.d/portmap start')
    nconn.execute('mkdir /opt/sge6')
    nconn.execute('chown -R %(user)s:%(user)s /opt/sge6' % {'user':self._user})
    nconn.execute('echo "%s:/home /home nfs user,rw,exec 0 0" >> /etc/fstab' % master.private_dns_name)
    nconn.execute('echo "%s:/opt/sge6 /opt/sge6 nfs user,rw,exec 0 0" >> /etc/fstab' % master.private_dns_name)
    nconn.execute('mount /home')
    nconn.execute('mount /opt/sge6')
    # Best effort, as on the master.
    nconn.execute('mount -t devpts none /dev/pts',
                  ignore_exit_status=True)
    for mount_path in extra_mounts:
        nconn.execute(
            'echo "%s:%s %s nfs user,rw,exec 0 0" >> /etc/fstab' %
            (master.private_dns_name, mount_path, mount_path))
        nconn.execute('mkdir -p %s' % mount_path)
        nconn.execute('mount %s' % mount_path)
273
"""
Install Sun Grid Engine with a default parallel environment on StarCluster

Steps: write /etc/profile.d/sge.sh on every node, generate the automatic
install configuration on the master and run inst_sge with it, set the
shell of all.q to /bin/bash, then register a parallel environment sized
by the processor count summed over all nodes and attach "orte" to all.q.
"""
log.info("Installing Sun Grid Engine...")

# Per-node SGE profile, filled in with the architecture string reported by
# that node's /opt/sge6/util/arch.
for node in self._nodes:
    node_ssh = node.ssh
    profile = node_ssh.remote_file("/etc/profile.d/sge.sh")
    detected_arch = node_ssh.execute("/opt/sge6/util/arch")[0]
    print >> profile, sgeprofile_template % {'arch': detected_arch}
    profile.close()

master_ssh = self._master.ssh

# Every node is used as admin, exec and submit host alike. Each name is
# prefixed with a space, so the list starts with one.
host_list = ''.join([" " + node.private_dns_name for node in self._nodes])

install_conf = master_ssh.remote_file("/opt/sge6/ec2_sge.conf")
print >> install_conf, sgeinstall_template % (host_list, host_list, host_list)
install_conf.close()

# Unattended installation driven by the configuration written above.
master_ssh.execute('cd /opt/sge6 && TERM=rxvt ./inst_sge -m -x -auto ./ec2_sge.conf', silent=True, only_printable=True)

master_ssh.execute('source /etc/profile && qconf -mattr queue shell "/bin/bash" all.q')

# Sum the "processor" lines of /proc/cpuinfo over all nodes.
total_processors = 0
for node in self._nodes:
    count_output = node.ssh.execute('cat /proc/cpuinfo | grep processor | wc -l')
    total_processors += int(count_output[0])

pe_file = master_ssh.remote_file("/tmp/pe.txt")
print >> pe_file, sge_pe_template % total_processors
pe_file.close()
# Only the name of the (already closed) remote file is needed here.
master_ssh.execute("source /etc/profile && qconf -Ap %s" % pe_file.name)

master_ssh.execute('source /etc/profile && qconf -mattr queue pe_list "orte" all.q')

log.info("Done Configuring Sun Grid Engine")
326
327 - def run(self, nodes, master, user, user_shell, volumes):
341