Package csb :: Package apps :: Module buildhmm
[frames | no frames]

Source Code for Module csb.apps.buildhmm

  1  """ 
  2  Build an HMM from a FASTA sequence. This program is a proxy to buildali.pl 
  3  and hhmake from the HHpred package. 
  4   
  5  @note: assuming you have the full HHpred package installed and configured. 
  6  """ 
  7   
  8   
  9  import os 
 10  import abc 
 11   
 12  import csb.apps 
 13  import csb.core 
 14  import csb.io 
 15   
 16  from csb.bio.io.wwpdb import StructureParser 
 17  from csb.bio.io.hhpred import HHProfileParser 
 18  from csb.bio.io.fasta import FASTAOutputBuilder 
 19  from csb.bio.sequence import ChainSequence 
class ExitCodes(csb.apps.ExitCodes):
    """
    Exit codes reported by this application, on top of the ones inherited
    from C{csb.apps.ExitCodes}.
    """

    # a build I/O error occurred, or an external command was invalid
    IO_ERROR = 2

    # the input file or the query ID could not be used
    INVALID_DATA = 3

    # an external tool returned a bad exit code or did not produce its output
    EXT_TOOL_FAILURE = 4
28
class AppRunner(csb.apps.AppRunner):
    """
    Command line front-end of the program: names the application class to
    run and declares the options it understands.
    """

    @property
    def target(self):
        """
        The application class driven by this runner.

        @rtype: type
        """
        return BuildProfileApp

    def command_line(self):
        """
        Declare the command line interface of the program.

        @return: an argument handler with all options and the positional
                 C{query} argument registered
        @rtype: L{csb.apps.ArgHandler}
        """
        # The adjacent literals are concatenated into a single help string,
        # so every fragment except the last must end with a space.
        query_id_help = ('ID of the query, in PDB-like format (accessionCHAIN). '
                         'Used for naming the output files. Also, if the input is a PDB file with '
                         'multiple chains, CHAIN is used to pull the required chain from the file.')

        # __doc__ resolves to the module docstring here and serves as the
        # program description.
        cmd = csb.apps.ArgHandler(self.program, __doc__)

        cmd.add_scalar_option('query-id', 'q', str, query_id_help, required=True)
        cmd.add_scalar_option('tk_root', 't', str, 'path to the ToolkitRoot folder in your HHpred setup',
                              default='/ebio/abt1_toolkit/share/wye')
        cmd.add_scalar_option('cpu', None, int, 'maximum degree of parallelism', default=1)

        cmd.add_boolean_option('no-ss', None, 'do not include secondary structure', default=False)
        cmd.add_boolean_option('no-pseudo', None, 'do not add emission and transition pseudocounts', default=False)
        cmd.add_boolean_option('no-calibration', None, 'do not calibrate the profile', default=False)

        cmd.add_positional_argument('query', str, 'input sequence (FASTA or PDB file)')

        return cmd
53
class BuildProfileApp(csb.apps.Application):
    """
    The application itself: builds the alignment, makes the profile HMM and
    optionally calibrates it, translating every failure into an exit code.
    """

    def main(self):
        """
        Run the build pipeline for the query given on the command line.

        If C{<query_id>.hhm} already exists, the application exit routine is
        called with C{ExitCodes.CLEAN}. Build errors are reported through the
        same routine with the matching L{ExitCodes} value.
        """
        args = self.args
        query_id = args.query_id

        if os.path.isfile(query_id + '.hhm'):
            BuildProfileApp.exit('# Profile "{0}" already exists, skipping'.format(query_id),
                                 ExitCodes.CLEAN)

        try:
            self.log('# Building profile HMM for {0}...'.format(args.query))
            builder = ProfileBuilder.create(args.query, query_id, args.tk_root,
                                            pseudo=not args.no_pseudo,
                                            ss=not args.no_ss,
                                            cpu=args.cpu)

            builder.build_alignment()
            builder.make_hmm()

            if not args.no_calibration:
                builder.calibrate_hmm()

        except BuildArgError as arg_error:
            BuildProfileApp.exit(str(arg_error), ExitCodes.INVALID_DATA)

        except BuildIOError as io_error:
            BuildProfileApp.exit(str(io_error), ExitCodes.IO_ERROR)

        except csb.io.InvalidCommandError as cmd_error:
            message = '{0!s}: {0.cmd}'.format(cmd_error)
            BuildProfileApp.exit(message, ExitCodes.IO_ERROR)

        except NoOutputError as output_error:
            template = 'Expected file {0} not produced by: {1.cmd}.\nSTDERR: {1.stderr}\nSTDOUT: {1.stdout}'
            message = template.format(output_error.expected, output_error.context)
            BuildProfileApp.exit(message, ExitCodes.EXT_TOOL_FAILURE)

        except csb.io.ProcessError as process_error:
            template = 'Bad exit code #{0.code} from: {0.cmd}.\nSTDERR: {0.stderr}\nSTDOUT: {0.stdout}'
            message = template.format(process_error.context)
            BuildProfileApp.exit(message, ExitCodes.EXT_TOOL_FAILURE)

        self.log(' successfully created profile "{0}"'.format(query_id))
94
class BuildError(Exception):
    """
    Root of the exception hierarchy used by the profile builders.
    """
98
class BuildIOError(BuildError):
    """
    Raised when a file needed for the build cannot be found or read.
    """
101
class BuildArgError(BuildError):
    """
    Raised when the input is unusable: unknown file format or a chain that
    is not present in the input structure.
    """
104
class NoOutputError(BuildError):
    """
    Raised when an external tool finished but the file it was expected to
    write does not exist.

    @param expected: name of the file that should have been produced
    @param context: result object of the shell call that should have produced
                    it (read by the application for C{cmd}, C{stderr} and
                    C{stdout})
    @param args: forwarded unchanged to the base exception
    """

    def __init__(self, expected, context, *args):

        super(NoOutputError, self).__init__(*args)

        self.expected = expected
        self.context = context
112
class ProfileBuilder(object):
    """
    Base class of the profile builders. Drives the external HHpred programs
    (C{buildali.pl}, C{hhmake}, C{hhsearch}) found under C{tk_root} and
    keeps track of the files produced at each step. Subclasses implement
    L{configure_input} to prepare the FASTA file fed to C{buildali.pl}.

    Use L{ProfileBuilder.create} to get the builder matching an input file.
    """

    # Python 2 style metaclass declaration; Python 3 ignores this attribute.
    __metaclass__ = abc.ABCMeta

    # extra hhmake arguments appended when pseudocounts are enabled
    EMISSION_PSEUDO = '-pcm 4 -pca 2.5 -pcb 0.5 -pcc 1.0'
    TRANSITION_PSEUDO = '-gapb 1.0 -gapd 0.15 -gape 1.0 -gapf 0.6 -gapg 0.6 -gapi 0.6'

    @staticmethod
    def create(query, target_id, tk_root, pseudo=True, ss=True, cpu=1):
        """
        Factory: inspect the first non-blank line of C{query} and return the
        matching builder.

        @param query: path to the input file (FASTA or PDB)
        @type query: str
        @param target_id: query ID; the last character is taken as the chain,
                          everything before it as the accession
        @type target_id: str
        @param tk_root: path to the HHpred toolkit root folder
        @type tk_root: str
        @param pseudo: add emission and transition pseudocounts
        @type pseudo: bool
        @param ss: include secondary structure
        @type ss: bool
        @param cpu: value passed to the C{-cpu} option of the tools
        @type cpu: int

        @return: a L{FASTAProfileBuilder} if the line starts with C{>}, a
                 L{PDBProfileBuilder} if it starts with C{HEADER} or C{ATOM}
        @rtype: L{ProfileBuilder}

        @raise BuildIOError: if C{query} is not an existing file
        @raise BuildArgError: if the format is not recognized or the file
                              contains no non-blank line
        """
        if not os.path.isfile(query):
            raise BuildIOError('File not found: ' + query)

        builder = None

        # Only the first non-blank line is needed; the context manager makes
        # sure the file is closed on every exit path.
        with open(query) as stream:
            for line in stream:
                if not line.strip():
                    continue

                if line.startswith('>'):
                    builder = FASTAProfileBuilder
                elif line.startswith(('HEADER', 'ATOM')):
                    builder = PDBProfileBuilder
                else:
                    raise BuildArgError('Unknown input file format')
                break

        if builder is None:
            # Empty or whitespace-only input: fail here instead of handing
            # None back to the caller.
            raise BuildArgError('Empty input file: ' + query)

        return builder(query, target_id, tk_root, pseudo, ss, cpu)

    def __init__(self, query, target_id, tk_root, pseudo=True, ss=True, cpu=1):
        """
        @param query: path to the input file
        @type query: str
        @param target_id: query ID; the last character is taken as the chain,
                          everything before it as the accession
        @type target_id: str
        @param tk_root: path to the HHpred toolkit root folder
        @type tk_root: str
        @param pseudo: add emission and transition pseudocounts in L{make_hmm}
        @type pseudo: bool
        @param ss: include secondary structure in L{build_alignment}
        @type ss: bool
        @param cpu: value passed to the C{-cpu} option of the tools
        @type cpu: int

        @raise BuildArgError: if C{target_id} is empty
        """
        if not target_id:
            # target_id[-1] below would otherwise fail with a bare IndexError
            raise BuildArgError('Empty query ID')

        self.tk_root = tk_root
        # Export TK_ROOT only when the environment does not already define it.
        if 'TK_ROOT' not in os.environ or not os.environ['TK_ROOT']:
            os.putenv('TK_ROOT', tk_root)

        self.query = query
        self.accession = target_id[:-1]
        self.chain = target_id[-1]
        self.pseudo = bool(pseudo)
        self.ss = bool(ss)
        self.cpu = cpu

        self._input = None
        # _ali is the attribute written by build_alignment() and checked by
        # make_hmm(); it must exist up front so that the assertion in
        # make_hmm() can fire. _a3m is kept only because it was initialised
        # here before; nothing in this class reads it.
        self._ali = None
        self._a3m = None
        self._hhm = None

        self.configure_input()

    def run(self):
        """
        Run the whole pipeline: alignment, HMM, calibration.
        """
        self.build_alignment()
        self.make_hmm()
        self.calibrate_hmm()

    @property
    def target_id(self):
        """
        Accession and chain joined back together; used as the base name of
        every file produced.

        @rtype: str
        """
        return self.accession + self.chain

    @abc.abstractmethod
    def configure_input(self):
        """
        Prepare the input for L{build_alignment}. Implementations are
        expected to set C{self._input}, which L{build_alignment} asserts on.
        """
        pass

    def build_alignment(self):
        """
        Run C{buildali.pl} on the configured input.

        @return: name of the alignment file, C{<target_id>.a3m}
        @rtype: str

        @raise csb.io.ProcessError: on a non-zero exit code
        @raise NoOutputError: if the alignment file was not produced
        """
        assert self._input is not None

        program = os.path.join(self.tk_root, 'bioprogs', 'hhpred', 'buildali.pl')
        noss = '' if self.ss else '-noss'

        cmd = 'perl {0} {1} -cpu {2} {3}'.format(program, noss, self.cpu, self._input)
        bali = csb.io.Shell.run(cmd)

        ali = self.target_id + '.a3m'
        if bali.code != 0:
            raise csb.io.ProcessError(bali)
        if not os.path.isfile(ali):
            raise NoOutputError(ali, bali)

        self._ali = ali
        return ali

    def make_hmm(self):
        """
        Run C{hhmake} on the alignment built by L{build_alignment}.

        @return: name of the profile file, C{<target_id>.hhm}
        @rtype: str

        @raise csb.io.ProcessError: on a non-zero exit code
        @raise NoOutputError: if the profile file was not produced
        """
        assert self._ali is not None

        program = os.path.join(self.tk_root, 'bioprogs', 'hhpred', 'hhmake')
        hhm = self.target_id + '.hhm'
        cmd = '{0} -i {1} -o {2}'.format(program, self._ali, hhm)

        if self.pseudo:
            cmd = '{0} {1} {2}'.format(cmd, ProfileBuilder.EMISSION_PSEUDO, ProfileBuilder.TRANSITION_PSEUDO)

        hhmake = csb.io.Shell.run(cmd)
        if hhmake.code != 0:
            raise csb.io.ProcessError(hhmake)
        if not os.path.isfile(hhm):
            raise NoOutputError(hhm, hhmake)

        self._hhm = hhm
        return hhm

    def calibrate_hmm(self):
        """
        Run C{hhsearch -cal} on C{<target_id>.hhm} against
        C{<tk_root>/databases/hhpred/cal.hhm}. Failures are left to
        C{csb.io.Shell.runstrict}.
        """
        assert self._hhm is not None

        program = os.path.join(self.tk_root, 'bioprogs', 'hhpred', 'hhsearch')
        caldb = os.path.join(self.tk_root, 'databases', 'hhpred', 'cal.hhm')

        cmd = '{0} -i {1}.hhm -d {2} -cal -cpu {3}'.format(program, self.target_id, caldb, self.cpu)
        csb.io.Shell.runstrict(cmd)
220
class FASTAProfileBuilder(ProfileBuilder):
    """
    Profile builder for queries that already are FASTA files.
    """

    def configure_input(self):
        """
        Copy the query file to C{<target_id>.fa} and register the copy as the
        input of the alignment step.

        @return: name of the FASTA file written
        @rtype: str

        @raise BuildIOError: if the query file does not exist
        """
        if not os.path.isfile(self.query):
            raise BuildIOError('File not found: ' + self.query)

        fasta = self.target_id + '.fa'

        # Read the query completely BEFORE opening the destination: when the
        # query already is '<target_id>.fa', opening it for writing first
        # could wipe it before it is read.
        # NOTE(review): depends on how EntryWriter opens its target - confirm.
        with open(self.query) as source:
            content = source.read()

        with csb.io.EntryWriter(fasta) as out:
            out.write(content)

        self._input = fasta
        return fasta
237
class PDBProfileBuilder(ProfileBuilder):
    """
    Profile builder for queries given as PDB files. The sequence of the
    requested chain is written to a FASTA file, and after the profile has
    been made a structure file is written next to it.
    """

    def configure_input(self):
        """
        Parse the query structure, pick the chain named by the query ID and
        write its sequence to C{<target_id>.fa}.

        @return: name of the FASTA file written
        @rtype: str

        @raise BuildArgError: if the chain is not found in the structure
        @raise BuildIOError: if parsing the structure raises an IOError
        """
        try:
            s = StructureParser(self.query).parse()
            chain = s.chains[self.chain]
        except csb.core.ItemNotFoundError:
            raise BuildArgError('Chain {0.chain} not found in {0.query}'.format(self))
        except IOError as ioe:
            raise BuildIOError(str(ioe))

        fasta = self.target_id + '.fa'

        with open(fasta, 'w') as f:
            sequence = ChainSequence.create(chain)
            FASTAOutputBuilder(f).add_sequence(sequence)

        self._input = fasta
        return fasta

    def make_hmm(self):
        """
        Make the profile as the base class does, then write the matching
        structure file with L{format_structure}.

        @return: name of the profile file, as returned by the base
                 implementation
        @rtype: str
        """
        # Hand the path back to the caller, like ProfileBuilder.make_hmm().
        hhm = super(PDBProfileBuilder, self).make_hmm()
        self.format_structure()
        return hhm

    def format_structure(self):
        """
        Write C{<target_id>.pdb} by calling C{format_structure} of an
        L{HHProfileParser} opened on the profile.

        @return: name of the structure file
        @rtype: str
        """
        assert self._hhm is not None

        pdb = self.target_id + '.pdb'

        parser = HHProfileParser(self._hhm)
        parser.format_structure(self.query, self.chain, pdb)

        self._pdb = pdb
        return pdb
if __name__ == '__main__':

    # Executed as a script: hand control over to the application runner.
    runner = AppRunner()
    runner.run()