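"""StarCluster plugin that configures and starts a Hadoop cluster
(HDFS + MapReduce) using Cloudera's hadoop-0.20 packages on the
cluster nodes, and registers the install with the dumbo Python
MapReduce library via /etc/dumbo.conf.
"""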
import posixpath

from starcluster import threadpool
from starcluster.clustersetup import ClusterSetup
from starcluster.logger import log

core_site_templ = """\
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
<!-- In: conf/core-site.xml -->
<property>
  <name>hadoop.tmp.dir</name>
  <value>%(hadoop_tmpdir)s</value>
  <description>A base for other temporary directories.</description>
</property>

<property>
  <name>fs.default.name</name>
  <value>hdfs://%(master)s:54310</value>
  <description>The name of the default file system. A URI whose
  scheme and authority determine the FileSystem implementation. The
  uri's scheme determines the config property (fs.SCHEME.impl) naming
  the FileSystem implementation class. The uri's authority is used to
  determine the host, port, etc. for a filesystem.</description>
</property>

</configuration>
"""

hdfs_site_templ = """\
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
<!-- In: conf/hdfs-site.xml -->
<property>
  <name>dfs.permissions</name>
  <value>false</value>
</property>
<property>
  <name>dfs.replication</name>
  <value>%(replication)d</value>
  <description>Default block replication.
  The actual number of replications can be specified when the file is
  created. The default is used if replication is not specified at
  create time.
  </description>
</property>
</configuration>
"""

mapred_site_templ = """\
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
<!-- In: conf/mapred-site.xml -->
<property>
  <name>mapred.job.tracker</name>
  <value>%(master)s:54311</value>
  <description>The host and port that the MapReduce job tracker runs
  at. If "local", then jobs are run in-process as a single map
  and reduce task.
  </description>
</property>
</configuration>
"""
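
# The three templates above are plain %-style format strings. A quick
# sketch of how they render (the values here are illustrative; the real
# cfg dict is built in Hadoop._configure_hadoop below):
#
#     cfg = {'master': 'master', 'replication': 3,
#            'hadoop_tmpdir': '/mnt/hadoop/hadoop-${user.name}'}
#     core_site_templ % cfg     # fs.default.name -> hdfs://master:54310
#     mapred_site_templ % cfg   # mapred.job.tracker -> master:54311
#     hdfs_site_templ % cfg     # dfs.replication -> 3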

class Hadoop(ClusterSetup):
    """
    Configures Hadoop using Cloudera packages on StarCluster
    """

    def __init__(self, hadoop_tmpdir='/mnt/hadoop'):
        self.hadoop_tmpdir = hadoop_tmpdir
        self.hadoop_home = '/usr/lib/hadoop'
        self.hadoop_conf = '/etc/hadoop-0.20/conf.starcluster'
        self.empty_conf = '/etc/hadoop-0.20/conf.empty'
        self.centos_java_home = '/usr/lib/jvm/java'
        self.centos_alt_cmd = 'alternatives'
        self.ubuntu_javas = ['/usr/lib/jvm/java-6-sun/jre',
                             '/usr/lib/jvm/java-6-openjdk/jre']
        self.ubuntu_alt_cmd = 'update-alternatives'
        self._pool = None

    @property
    def pool(self):
        if self._pool is None:
            self._pool = threadpool.get_thread_pool(20, disable_threads=False)
        return self._pool

    def _get_java_home(self, node):
        # check for CentOS, otherwise default to Ubuntu 10.04's JAVA_HOME
        if node.ssh.isfile('/etc/redhat-release'):
            return self.centos_java_home
        for java in self.ubuntu_javas:
            if node.ssh.isdir(java):
                return java
        raise Exception("Can't find Java JRE")

    def _get_alternatives_cmd(self, node):
        # check for CentOS, otherwise default to Ubuntu 10.04
        if node.ssh.isfile('/etc/redhat-release'):
            return self.centos_alt_cmd
        return self.ubuntu_alt_cmd

    def _setup_hadoop_user(self, node, user):
        # add the cluster user to the hadoop group
        node.ssh.execute('gpasswd -a %s hadoop' % user)

    def _install_empty_conf(self, node):
        # copy the stock empty config and register it with the
        # alternatives system at priority 50
        node.ssh.execute('cp -r %s %s' % (self.empty_conf, self.hadoop_conf))
        alternatives_cmd = self._get_alternatives_cmd(node)
        cmd = '%s --install /etc/hadoop-0.20/conf ' % alternatives_cmd
        cmd += 'hadoop-0.20-conf %s 50' % self.hadoop_conf
        node.ssh.execute(cmd)

    def _configure_env(self, node):
        # point JAVA_HOME in hadoop-env.sh at the JRE found on the node
        env_file_sh = posixpath.join(self.hadoop_conf, 'hadoop-env.sh')
        node.ssh.remove_lines_from_file(env_file_sh, 'JAVA_HOME')
        env_file = node.ssh.remote_file(env_file_sh, 'a')
        env_file.write('export JAVA_HOME=%s\n' % self._get_java_home(node))
        env_file.close()

    def _configure_mapreduce_site(self, node, cfg):
        mapred_site_xml = posixpath.join(self.hadoop_conf, 'mapred-site.xml')
        mapred_site = node.ssh.remote_file(mapred_site_xml)
        mapred_site.write(mapred_site_templ % cfg)
        mapred_site.close()

    def _configure_core(self, node, cfg):
        core_site_xml = posixpath.join(self.hadoop_conf, 'core-site.xml')
        core_site = node.ssh.remote_file(core_site_xml)
        core_site.write(core_site_templ % cfg)
        core_site.close()

    def _configure_hdfs_site(self, node, cfg):
        hdfs_site_xml = posixpath.join(self.hadoop_conf, 'hdfs-site.xml')
        hdfs_site = node.ssh.remote_file(hdfs_site_xml)
        hdfs_site.write(hdfs_site_templ % cfg)
        hdfs_site.close()

    def _configure_masters(self, node, master):
        masters_file = posixpath.join(self.hadoop_conf, 'masters')
        masters_file = node.ssh.remote_file(masters_file)
        masters_file.write(master.alias)
        masters_file.close()

    def _configure_slaves(self, node, node_aliases):
        slaves_file = posixpath.join(self.hadoop_conf, 'slaves')
        slaves_file = node.ssh.remote_file(slaves_file)
        slaves_file.write('\n'.join(node_aliases))
        slaves_file.close()

    def _setup_hdfs(self, node, user):
        self._setup_hadoop_dir(node, self.hadoop_tmpdir, 'hdfs', 'hadoop')
        mapred_dir = posixpath.join(self.hadoop_tmpdir, 'hadoop-mapred')
        self._setup_hadoop_dir(node, mapred_dir, 'mapred', 'hadoop')
        userdir = posixpath.join(self.hadoop_tmpdir, 'hadoop-%s' % user)
        self._setup_hadoop_dir(node, userdir, user, 'hadoop')
        hdfsdir = posixpath.join(self.hadoop_tmpdir, 'hadoop-hdfs')
        if not node.ssh.isdir(hdfsdir):
            # only format the namenode on first run, while the hdfs
            # directory does not exist yet
            node.ssh.execute("su hdfs -c 'hadoop namenode -format'")
        self._setup_hadoop_dir(node, hdfsdir, 'hdfs', 'hadoop')

    def _setup_dumbo(self, node):
        if not node.ssh.isfile('/etc/dumbo.conf'):
            f = node.ssh.remote_file('/etc/dumbo.conf')
            f.write('[hadoops]\nstarcluster: %s\n' % self.hadoop_home)
            f.close()
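
    # _setup_dumbo above leaves a minimal /etc/dumbo.conf on each node:
    #
    #     [hadoops]
    #     starcluster: /usr/lib/hadoop
    #
    # Assuming dumbo's documented config lookup, this lets users pass
    # "-hadoop starcluster" to dumbo jobs instead of spelling out the
    # Hadoop home path.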

    def _configure_hadoop(self, master, nodes, user):
        log.info("Configuring Hadoop...")
        log.info("Adding user %s to hadoop group" % user)
        for node in nodes:
            self.pool.simple_job(self._setup_hadoop_user, (node, user),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        node_aliases = [n.alias for n in nodes]
        cfg = {'master': master.alias, 'replication': 3,
               'hadoop_tmpdir': posixpath.join(self.hadoop_tmpdir,
                                               'hadoop-${user.name}')}
        log.info("Installing configuration templates...")
        for node in nodes:
            self.pool.simple_job(self._install_empty_conf, (node,),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring environment...")
        for node in nodes:
            self.pool.simple_job(self._configure_env, (node,),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring MapReduce Site...")
        for node in nodes:
            self.pool.simple_job(self._configure_mapreduce_site, (node, cfg),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring Core Site...")
        for node in nodes:
            self.pool.simple_job(self._configure_core, (node, cfg),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring HDFS Site...")
        for node in nodes:
            self.pool.simple_job(self._configure_hdfs_site, (node, cfg),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring masters file...")
        for node in nodes:
            self.pool.simple_job(self._configure_masters, (node, master),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring slaves file...")
        for node in nodes:
            self.pool.simple_job(self._configure_slaves, (node, node_aliases),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring HDFS...")
        for node in nodes:
            self.pool.simple_job(self._setup_hdfs, (node, user),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring dumbo...")
        for node in nodes:
            self.pool.simple_job(self._setup_dumbo, (node,), jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

    def _setup_hadoop_dir(self, node, path, user, group, permission='775'):
        if not node.ssh.isdir(path):
            node.ssh.mkdir(path)
        node.ssh.execute("chown -R %s:%s %s" % (user, group, path))
        node.ssh.execute("chmod -R %s %s" % (permission, path))

    def _start_datanode(self, node):
        node.ssh.execute('/etc/init.d/hadoop-0.20-datanode restart')

    def _start_tasktracker(self, node):
        node.ssh.execute('/etc/init.d/hadoop-0.20-tasktracker restart')

    def _start_hadoop(self, master, nodes):
        log.info("Starting namenode...")
        master.ssh.execute('/etc/init.d/hadoop-0.20-namenode restart')
        log.info("Starting secondary namenode...")
        master.ssh.execute('/etc/init.d/hadoop-0.20-secondarynamenode restart')
        for node in nodes:
            log.info("Starting datanode on %s..." % node.alias)
            self.pool.simple_job(self._start_datanode, (node,),
                                 jobid=node.alias)
        self.pool.wait()
        log.info("Starting jobtracker...")
        master.ssh.execute('/etc/init.d/hadoop-0.20-jobtracker restart')
        for node in nodes:
            log.info("Starting tasktracker on %s..." % node.alias)
            self.pool.simple_job(self._start_tasktracker, (node,),
                                 jobid=node.alias)
        self.pool.wait()

    def _open_ports(self, master):
        # expose the namenode (50070) and jobtracker (50030) web UIs
        ports = [50070, 50030]
        ec2 = master.ec2
        for group in master.cluster_groups:
            for port in ports:
                has_perm = ec2.has_permission(group, 'tcp', port, port,
                                              '0.0.0.0/0')
                if not has_perm:
                    group.authorize('tcp', port, port, '0.0.0.0/0')

    def run(self, nodes, master, user, user_shell, volumes):
        try:
            self._configure_hadoop(master, nodes, user)
            self._start_hadoop(master, nodes)
            self._open_ports(master)
            log.info("Job tracker status: http://%s:50030" % master.dns_name)
            log.info("Namenode status: http://%s:50070" % master.dns_name)
        finally:
            self.pool.shutdown()
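
# A minimal sketch of enabling this plugin in a StarCluster config file
# (~/.starcluster/config); the plugin section name "hadoop" and cluster
# template "smallcluster" are illustrative:
#
#     [plugin hadoop]
#     setup_class = starcluster.plugins.hadoop.Hadoop
#     hadoop_tmpdir = /mnt/hadoop
#
#     [cluster smallcluster]
#     plugins = hadoop
#
# StarCluster passes plugin settings such as hadoop_tmpdir to
# Hadoop.__init__ as keyword arguments.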