Package starcluster :: Package plugins :: Module hadoop

Source Code for Module starcluster.plugins.hadoop

  1  import posixpath 
  2   
  3  from starcluster import threadpool 
  4  from starcluster.clustersetup import ClusterSetup 
  5  from starcluster.logger import log 
  6   
  7  core_site_templ = """\ 
  8  <?xml version="1.0"?> 
  9  <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> 
 10   
 11  <!-- Put site-specific property overrides in this file. --> 
 12   
 13  <configuration> 
 14  <!-- In: conf/core-site.xml --> 
 15  <property> 
 16    <name>hadoop.tmp.dir</name> 
 17    <value>%(hadoop_tmpdir)s</value> 
 18    <description>A base for other temporary directories.</description> 
 19  </property> 
 20   
 21  <property> 
 22    <name>fs.default.name</name> 
 23    <value>hdfs://%(master)s:54310</value> 
 24    <description>The name of the default file system.  A URI whose 
 25    scheme and authority determine the FileSystem implementation.  The 
 26    uri's scheme determines the config property (fs.SCHEME.impl) naming 
 27    the FileSystem implementation class.  The uri's authority is used to 
 28    determine the host, port, etc. for a filesystem.</description> 
 29  </property> 
 30   
 31  </configuration> 
 32  """ 
 33   
 34  hdfs_site_templ = """\ 
 35  <?xml version="1.0"?> 
 36  <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> 
 37   
 38  <!-- Put site-specific property overrides in this file. --> 
 39   
 40  <configuration> 
 41  <!-- In: conf/hdfs-site.xml --> 
 42  <property> 
 43    <name>dfs.permissions</name> 
 44    <value>false</value> 
 45  </property> 
 46  <property> 
 47    <name>dfs.replication</name> 
 48    <value>%(replication)d</value> 
 49    <description>Default block replication. 
 50    The actual number of replications can be specified when the file is created. 
 51    The default is used if replication is not specified in create time. 
 52    </description> 
 53  </property> 
 54  </configuration> 
 55  """ 
 56   
 57  mapred_site_templ = """\ 
 58  <?xml version="1.0"?> 
 59  <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> 
 60   
 61  <!-- Put site-specific property overrides in this file. --> 
 62   
 63  <configuration> 
 64  <!-- In: conf/mapred-site.xml --> 
 65  <property> 
 66    <name>mapred.job.tracker</name> 
 67    <value>%(master)s:54311</value> 
 68    <description>The host and port that the MapReduce job tracker runs 
 69    at.  If "local", then jobs are run in-process as a single map 
 70    and reduce task. 
 71    </description> 
 72  </property> 
 73  </configuration> 
 74  """ 
 75
 76
 77  class Hadoop(ClusterSetup):
 78      """
 79      Configures Hadoop using Cloudera packages on StarCluster
 80      """
 81
 82      def __init__(self, hadoop_tmpdir='/mnt/hadoop'):
 83          self.hadoop_tmpdir = hadoop_tmpdir
 84          self.hadoop_home = '/usr/lib/hadoop'
 85          self.hadoop_conf = '/etc/hadoop-0.20/conf.starcluster'
 86          self.empty_conf = '/etc/hadoop-0.20/conf.empty'
 87          self.centos_java_home = '/usr/lib/jvm/java'
 88          self.centos_alt_cmd = 'alternatives'
 89          self.ubuntu_javas = ['/usr/lib/jvm/java-6-sun/jre',
 90                               '/usr/lib/jvm/java-6-openjdk/jre']
 91          self.ubuntu_alt_cmd = 'update-alternatives'
 92          self._pool = None
 93
 94      @property
 95      def pool(self):
 96          if self._pool is None:
 97              self._pool = threadpool.get_thread_pool(20, disable_threads=False)
 98          return self._pool
 99
100      def _get_java_home(self, node):
101          # check for CentOS, otherwise default to Ubuntu 10.04's JAVA_HOME
102          if node.ssh.isfile('/etc/redhat-release'):
103              return self.centos_java_home
104          for java in self.ubuntu_javas:
105              if node.ssh.isdir(java):
106                  return java
107          raise Exception("Cant find JAVA jre")
108
109      def _get_alternatives_cmd(self, node):
110          # check for CentOS, otherwise default to Ubuntu 10.04
111          if node.ssh.isfile('/etc/redhat-release'):
112              return self.centos_alt_cmd
113          return self.ubuntu_alt_cmd
114
115      def _setup_hadoop_user(self, node, user):
116          node.ssh.execute('gpasswd -a %s hadoop' % user)
117
118      def _install_empty_conf(self, node):
119          node.ssh.execute('cp -r %s %s' % (self.empty_conf, self.hadoop_conf))
120          alternatives_cmd = self._get_alternatives_cmd(node)
121          cmd = '%s --install /etc/hadoop-0.20/conf ' % alternatives_cmd
122          cmd += 'hadoop-0.20-conf %s 50' % self.hadoop_conf
123          node.ssh.execute(cmd)
124
125      def _configure_env(self, node):
126          env_file_sh = posixpath.join(self.hadoop_conf, 'hadoop-env.sh')
127          node.ssh.remove_lines_from_file(env_file_sh, 'JAVA_HOME')
128          env_file = node.ssh.remote_file(env_file_sh, 'a')
129          env_file.write('export JAVA_HOME=%s\n' % self._get_java_home(node))
130          env_file.close()
131
132      def _configure_mapreduce_site(self, node, cfg):
133          mapred_site_xml = posixpath.join(self.hadoop_conf, 'mapred-site.xml')
134          mapred_site = node.ssh.remote_file(mapred_site_xml)
135          mapred_site.write(mapred_site_templ % cfg)
136          mapred_site.close()
137
138      def _configure_core(self, node, cfg):
139          core_site_xml = posixpath.join(self.hadoop_conf, 'core-site.xml')
140          core_site = node.ssh.remote_file(core_site_xml)
141          core_site.write(core_site_templ % cfg)
142          core_site.close()
143
144      def _configure_hdfs_site(self, node, cfg):
145          hdfs_site_xml = posixpath.join(self.hadoop_conf, 'hdfs-site.xml')
146          hdfs_site = node.ssh.remote_file(hdfs_site_xml)
147          hdfs_site.write(hdfs_site_templ % cfg)
148          hdfs_site.close()
149
150      def _configure_masters(self, node, master):
151          masters_file = posixpath.join(self.hadoop_conf, 'masters')
152          masters_file = node.ssh.remote_file(masters_file)
153          masters_file.write(master.alias)
154          masters_file.close()
155
156      def _configure_slaves(self, node, node_aliases):
157          slaves_file = posixpath.join(self.hadoop_conf, 'slaves')
158          slaves_file = node.ssh.remote_file(slaves_file)
159          slaves_file.write('\n'.join(node_aliases))
160          slaves_file.close()
161
162      def _setup_hdfs(self, node, user):
163          self._setup_hadoop_dir(node, self.hadoop_tmpdir, 'hdfs', 'hadoop')
164          mapred_dir = posixpath.join(self.hadoop_tmpdir, 'hadoop-mapred')
165          self._setup_hadoop_dir(node, mapred_dir, 'mapred', 'hadoop')
166          userdir = posixpath.join(self.hadoop_tmpdir, 'hadoop-%s' % user)
167          self._setup_hadoop_dir(node, userdir, user, 'hadoop')
168          hdfsdir = posixpath.join(self.hadoop_tmpdir, 'hadoop-hdfs')
169          if not node.ssh.isdir(hdfsdir):
170              node.ssh.execute("su hdfs -c 'hadoop namenode -format'")
171          self._setup_hadoop_dir(node, hdfsdir, 'hdfs', 'hadoop')
172
173      def _setup_dumbo(self, node):
174          if not node.ssh.isfile('/etc/dumbo.conf'):
175              f = node.ssh.remote_file('/etc/dumbo.conf')
176              f.write('[hadoops]\nstarcluster: %s\n' % self.hadoop_home)
177              f.close()
178
179      def _configure_hadoop(self, master, nodes, user):
180          log.info("Configuring Hadoop...")
181          log.info("Adding user %s to hadoop group" % user)
182          for node in nodes:
183              self.pool.simple_job(self._setup_hadoop_user, (node, user),
184                                   jobid=node.alias)
185          self.pool.wait(numtasks=len(nodes))
186          node_aliases = map(lambda n: n.alias, nodes)
187          cfg = {'master': master.alias, 'replication': 3,
188                 'hadoop_tmpdir': posixpath.join(self.hadoop_tmpdir,
189                                                 'hadoop-${user.name}')}
190          log.info("Installing configuration templates...")
191          for node in nodes:
192              self.pool.simple_job(self._install_empty_conf, (node,),
193                                   jobid=node.alias)
194          self.pool.wait(numtasks=len(nodes))
195          log.info("Configuring environment...")
196          for node in nodes:
197              self.pool.simple_job(self._configure_env, (node,),
198                                   jobid=node.alias)
199          self.pool.wait(numtasks=len(nodes))
200          log.info("Configuring MapReduce Site...")
201          for node in nodes:
202              self.pool.simple_job(self._configure_mapreduce_site, (node, cfg),
203                                   jobid=node.alias)
204          self.pool.wait(numtasks=len(nodes))
205          log.info("Configuring Core Site...")
206          for node in nodes:
207              self.pool.simple_job(self._configure_core, (node, cfg),
208                                   jobid=node.alias)
209          self.pool.wait(numtasks=len(nodes))
210          log.info("Configuring HDFS Site...")
211          for node in nodes:
212              self.pool.simple_job(self._configure_hdfs_site, (node, cfg),
213                                   jobid=node.alias)
214          self.pool.wait(numtasks=len(nodes))
215          log.info("Configuring masters file...")
216          for node in nodes:
217              self.pool.simple_job(self._configure_masters, (node, master),
218                                   jobid=node.alias)
219          self.pool.wait(numtasks=len(nodes))
220          log.info("Configuring slaves file...")
221          for node in nodes:
222              self.pool.simple_job(self._configure_slaves, (node, node_aliases),
223                                   jobid=node.alias)
224          self.pool.wait(numtasks=len(nodes))
225          log.info("Configuring HDFS...")
226          for node in nodes:
227              self.pool.simple_job(self._setup_hdfs, (node, user),
228                                   jobid=node.alias)
229          self.pool.wait(numtasks=len(nodes))
230          log.info("Configuring dumbo...")
231          for node in nodes:
232              self.pool.simple_job(self._setup_dumbo, (node,), jobid=node.alias)
233          self.pool.wait(numtasks=len(nodes))
234
235      def _setup_hadoop_dir(self, node, path, user, group, permission="775"):
236          if not node.ssh.isdir(path):
237              node.ssh.mkdir(path)
238          node.ssh.execute("chown -R %s:%s %s" % (user, group, path))
239          node.ssh.execute("chmod -R %s %s" % (permission, path))
240
241      def _start_datanode(self, node):
242          node.ssh.execute('/etc/init.d/hadoop-0.20-datanode restart')
243
244      def _start_tasktracker(self, node):
245          node.ssh.execute('/etc/init.d/hadoop-0.20-tasktracker restart')
246
247      def _start_hadoop(self, master, nodes):
248          log.info("Starting namenode...")
249          master.ssh.execute('/etc/init.d/hadoop-0.20-namenode restart')
250          log.info("Starting secondary namenode...")
251          master.ssh.execute('/etc/init.d/hadoop-0.20-secondarynamenode restart')
252
253          for node in nodes:
254              log.info("Starting datanode on %s..." % node.alias)
255              self.pool.simple_job(self._start_datanode, (node,),
256                                   jobid=node.alias)
257          self.pool.wait()
258          log.info("Starting jobtracker...")
259          master.ssh.execute('/etc/init.d/hadoop-0.20-jobtracker restart')
260          for node in nodes:
261              log.info("Starting tasktracker on %s..." % node.alias)
262              self.pool.simple_job(self._start_tasktracker, (node,),
263                                   jobid=node.alias)
264          self.pool.wait()
265
266      def _open_ports(self, master):
267          ports = [50070, 50030]
268          ec2 = master.ec2
269          for group in master.cluster_groups:
270              for port in ports:
271                  has_perm = ec2.has_permission(group, 'tcp', port, port,
272                                                '0.0.0.0/0')
273                  if not has_perm:
274                      group.authorize('tcp', port, port, '0.0.0.0/0')
275
276      def run(self, nodes, master, user, user_shell, volumes):
277          try:
278              self._configure_hadoop(master, nodes, user)
279              self._start_hadoop(master, nodes)
280              self._open_ports(master)
281              log.info("Job tracker status: http://%s:50030" % master.dns_name)
282              log.info("Namenode status: http://%s:50070" % master.dns_name)
283          finally:
284              self.pool.shutdown()
285
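
The plugin above is normally driven by StarCluster itself: the framework instantiates Hadoop from the user's config file and calls run() once all nodes are up. As a quick illustration of how the XML templates at the top of the module are used, the sketch below (not part of the module; the alias and tmpdir values are made up) fills them the same way _configure_hadoop() does, with a plain dict and old-style %-interpolation:

    # Sketch only: render the site templates as _configure_hadoop() would.
    from starcluster.plugins.hadoop import (core_site_templ, hdfs_site_templ,
                                            mapred_site_templ)

    cfg = {'master': 'master',        # master.alias in the plugin
           'replication': 3,          # hard-coded in _configure_hadoop()
           'hadoop_tmpdir': '/mnt/hadoop/hadoop-${user.name}'}

    print(core_site_templ % cfg)    # core-site.xml: hadoop.tmp.dir, fs.default.name
    print(hdfs_site_templ % cfg)    # hdfs-site.xml: dfs.permissions, dfs.replication
    print(mapred_site_templ % cfg)  # mapred-site.xml: mapred.job.tracker

For completeness, a hypothetical direct use of the class looks like the following; in practice StarCluster constructs the object from a plugin section in its config file and supplies the nodes, master, and user arguments itself:

    # Hypothetical direct construction, e.g. from a test harness.
    from starcluster.plugins.hadoop import Hadoop

    plugin = Hadoop(hadoop_tmpdir='/mnt/hadoop')   # same default as __init__
    # plugin.run(nodes, master, user, user_shell, volumes) would install the
    # Cloudera configuration, format HDFS on first use, start the daemons,
    # and open ports 50030/50070 on the cluster's security groups.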