Package csb :: Package apps :: Module hhfrag
[frames | no frames]

Source Code for Module csb.apps.hhfrag

  1  """ 
  2  HHfrag: build a dynamic variable-length fragment library for protein structure 
  3  prediction with Rosetta AbInitio. 
  4  """ 
  5   
  6  import os 
  7  import multiprocessing 
  8   
  9  import csb.apps 
 10  import csb.apps.hhsearch as hhsearch 
 11   
 12  import csb.bio.io.hhpred 
 13  import csb.bio.fragments 
 14  import csb.bio.fragments.rosetta as rosetta 
 15  import csb.bio.structure 
 16   
 17  import csb.io 
 18  import csb.core 
19 20 21 -class ExitCodes(csb.apps.ExitCodes):
22 23 IO_ERROR = 2 24 INVALID_DATA = 3 25 HHSEARCH_FAILURE = 4 26 NO_OUTPUT = 5
27
28 29 -class AppRunner(csb.apps.AppRunner):
30 31 @property
32 - def target(self):
33 return HHfragApp
34
35 - def command_line(self):
36 37 cmd = csb.apps.ArgHandler(self.program, __doc__) 38 cpu = multiprocessing.cpu_count() 39 40 cmd.add_scalar_option('hhsearch', 'H', str, 'path to the HHsearch executable', default='hhsearch') 41 cmd.add_scalar_option('database', 'd', str, 'database directory (containing PDBS25.hhm)', required=True) 42 43 cmd.add_scalar_option('min', 'm', int, 'minimum query segment length', default=6) 44 cmd.add_scalar_option('max', 'M', int, 'maximum query segment length', default=21) 45 cmd.add_scalar_option('step', 's', int, 'query segmentation step', default=3) 46 cmd.add_scalar_option('cpu', 'c', int, 'maximum degree of parallelism', default=cpu) 47 48 cmd.add_scalar_option('verbosity', 'v', int, 'verbosity level', default=2) 49 cmd.add_scalar_option('output', 'o', str, 'output directory', default='.') 50 cmd.add_scalar_option('gap-filling', 'g', str, 'path to a Rosetta 9-mer fragment file, that will be used ' 51 'to complement gaps in the fragment map (if specified, a joint fragment file will be produced)') 52 cmd.add_boolean_option('filtered-map', 'f', 'make an additional filtered fragment map', default=False) 53 cmd.add_boolean_option('c-alpha', None, 'include also C-alpha vectors in the output', default=False) 54 55 cmd.add_positional_argument('QUERY', str, 'query profile HMM (e.g. created with csb.apps.buildhmm)') 56 57 return cmd
58
59 60 -class HHfragApp(csb.apps.Application):
61
62 - def main(self):
63 if not os.path.isdir(self.args.output): 64 HHfragApp.exit('Output directory does not exist', code=ExitCodes.INVALID_DATA, usage=True) 65 66 if self.args.c_alpha: 67 builder = rosetta.ExtendedOutputBuilder 68 else: 69 builder = rosetta.OutputBuilder 70 71 try: 72 hhf = HHfrag(self.args.QUERY, self.args.hhsearch, self.args.database, logger=self) 73 output = os.path.join(self.args.output, hhf.query.id) 74 75 hhf.slice_query(self.args.min, self.args.max, self.args.step, self.args.cpu) 76 frags = hhf.extract_fragments() 77 78 if len(frags) == 0: 79 HHfragApp.exit('No fragments found!', code=ExitCodes.NO_OUTPUT) 80 81 fragmap = hhf.build_fragment_map() 82 fragmap.dump(output + '.hhfrags.09', builder) 83 84 if self.args.filtered_map: 85 fragmap = hhf.build_filtered_map() 86 fragmap.dump(output + '.filtered.09', builder) 87 88 if self.args.gap_filling: 89 fragmap = hhf.build_combined_map(self.args.gap_filling) 90 fragmap.dump(output + '.complemented.09', builder) 91 92 self.log('\nDONE.') 93 94 except ArgumentIOError as ae: 95 HHfragApp.exit(str(ae), code=ExitCodes.IO_ERROR) 96 97 except ArgumentError as ae: 98 HHfragApp.exit(str(ae), code=ExitCodes.INVALID_DATA, usage=True) 99 100 except csb.io.InvalidCommandError as ose: 101 msg = '{0!s}: {0.program}'.format(ose) 102 HHfragApp.exit(msg, ExitCodes.IO_ERROR) 103 104 except csb.bio.io.hhpred.HHProfileFormatError as hfe: 105 msg = 'Corrupt HMM: {0!s}'.format(hfe) 106 HHfragApp.exit(msg, code=ExitCodes.INVALID_DATA) 107 108 except csb.io.ProcessError as pe: 109 message = 'Bad exit code from HHsearch: #{0.code}.\nSTDERR: {0.stderr}\nSTDOUT: {0.stdout}'.format(pe.context) 110 HHfragApp.exit(message, ExitCodes.HHSEARCH_FAILURE)
111
112 - def log(self, message, ending='\n', level=1):
113 114 if level <= self.args.verbosity: 115 super(HHfragApp, self).log(message, ending)
116
117 118 -class ArgumentError(ValueError):
119 pass
120
121 -class ArgumentIOError(ArgumentError):
122 pass
123
124 -class InvalidOperationError(ValueError):
125 pass
126
127 128 -class HHfrag(object):
129 130 PDBS = 'pdbs25.hhm' 131
132 - def __init__(self, query, binary, database, logger=None):
133 134 try: 135 self._query = csb.bio.io.HHProfileParser(query).parse() 136 except IOError as io: 137 raise ArgumentIOError(str(io)) 138 self._hsqs = None 139 self._matches = None 140 141 self._app = logger 142 self._database = None 143 self._pdbs25 = None 144 self._output = None 145 self._aligner = None 146 147 self.database = database 148 self.aligner = hhsearch.HHsearch(binary, self.pdbs25, cpu=2)
149 150 @property
151 - def query(self):
152 return self._query
153 154 @property
155 - def pdbs25(self):
156 return self._pdbs25
157 158 @property
159 - def database(self):
160 return self._database
161 @database.setter
162 - def database(self, value):
163 database = value 164 pdbs25 = os.path.join(value, HHfrag.PDBS) 165 if not os.path.isfile(pdbs25): 166 raise ArgumentError('PDBS25 not found here: ' + pdbs25) 167 self._database = database 168 self._pdbs25 = pdbs25
169 170 @property
171 - def aligner(self):
172 return self._aligner
173 @aligner.setter
174 - def aligner(self, value):
175 if hasattr(value, 'run') and hasattr(value, 'runmany'): 176 self._aligner = value 177 else: 178 raise TypeError(value)
179
180 - def log(self, *a, **ka):
181 182 if self._app: 183 self._app.log(*a, **ka)
184
185 - def slice_query(self, min=6, max=21, step=3, cpu=None):
186 187 if not 0 < min <= max: 188 raise ArgumentError('min and max must be positive numbers, with max >= min') 189 if not 0 < step: 190 raise ArgumentError('step must be positive number') 191 192 self.log('\n# Processing profile HMM "{0}"...'.format(self.query.id)) 193 self.log('', level=2) 194 qp = self.query 195 hsqs = [] 196 197 if not cpu: 198 cpu = max - min + 1 199 200 for start in range(1, qp.layers.length - min + 1 + 1, step): 201 202 self.log('{0:3}. '.format(start), ending='', level=1) 203 probes = [] 204 205 for end in range(start + min - 1, start + max): 206 if end > qp.layers.length: 207 break 208 context = SliceContext(qp.segment(start, end), start, end) 209 probes.append(context) 210 211 probes = self.aligner.runmany(probes, workers=cpu) 212 probes.sort() 213 214 if len(probes) > 0: 215 rep = probes[-1] 216 hsqs.append(rep) 217 self.log('{0.start:3} {0.end:3} ({0.length:2} aa) {0.recurrence:3} hits'.format(rep), level=1) 218 else: 219 self.log(' no hits', level=1) 220 221 self._hsqs = hsqs 222 return tuple(hsqs)
223
224 - def extract_fragments(self):
225 226 self.log('\n# Extracting fragments...') 227 228 if self._hsqs is None: 229 raise InvalidOperationError('The query has to be sliced first') 230 231 fragments = [] 232 233 for si in self._hsqs: 234 self.log('\nSEGMENT: {0.start:3} {0.end:3} ({0.recurrence})'.format(si), level=2) 235 236 for hit in si.hits: 237 cn = 0 238 for chunk in hit.alignment.segments: 239 chunk.qstart = chunk.qstart + si.start - 1 240 chunk.qend = chunk.qend + si.start - 1 241 cn += 1 242 self.log(' {0.id:5} L{0.qstart:3} {0.qend:3} {0.length:2}aa P:{0.probability:5.3f}'.format(chunk), ending='', level=2) 243 244 sourcefile = os.path.join(self.database, hit.id + '.pdb') 245 if not os.path.isfile(sourcefile): 246 self.log(' missing', level=2) 247 continue 248 source = csb.bio.io.StructureParser(sourcefile).parse().first_chain 249 assert hit.id[-1] == source.id 250 251 source.compute_torsion() 252 try: 253 fragment = csb.bio.fragments.Assignment(source, 254 chunk.start, chunk.end, hit.id, 255 chunk.qstart, chunk.qend, chunk.probability, 256 rmsd=None, tm_score=None) 257 fragments.append(fragment) 258 if cn > 1: 259 self.log(' (chunk #{0})'.format(cn), level=2) 260 else: 261 self.log('', level=2) 262 263 except csb.bio.structure.Broken3DStructureError: 264 self.log(' corrupt', level=2) 265 continue 266 267 self._matches = fragments 268 return tuple(fragments)
269
270 - def _plot_lengths(self):
271 272 self.log('\n {0} ungapped assignments'.format(len(self._matches))) 273 self.log('', level=2) 274 275 histogram = {} 276 for f in self._matches: 277 histogram[f.length] = histogram.get(f.length, 0) + 1 278 279 for length in sorted(histogram): 280 281 percent = histogram[length] * 100.0 / len(self._matches) 282 seg = b'\xe2\x9f\xa4'.decode('utf-8') 283 bar = b'{0:3} |{1} {2:5.1f}%'.decode('utf-8') 284 bar = bar.format(length, seg * int(percent), percent) 285 self.log(bar, level=2)
286
287 - def build_fragment_map(self):
288 289 self.log('\n# Building dynamic fragment map...') 290 291 if self._matches is None: 292 raise InvalidOperationError('You need to extract some fragments first') 293 294 self._plot_lengths() 295 296 target = csb.bio.fragments.Target.from_profile(self.query) 297 target.assignall(self._matches) 298 299 factory = csb.bio.fragments.RosettaFragsetFactory() 300 return factory.make_fragset(target)
301
302 - def _filter_event_handler(self, ri):
303 if ri.rep is None: 304 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3} - - -'.format(ri, ri.rep), level=2) 305 else: 306 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3} {1.id:5} {1.start:3} {1.end:3}'.format(ri, ri.rep), level=2)
307
308 - def build_filtered_map(self):
309 310 self.log('\n# Building filtered map...') 311 self.log('\n Confidence Count Representative', level=2) 312 313 target = csb.bio.fragments.Target.from_profile(self.query) 314 target.assignall(self._matches) 315 316 factory = csb.bio.fragments.RosettaFragsetFactory() 317 return factory.make_filtered(target, extend=True, 318 callback=self._filter_event_handler)
319
320 - def _merge_event_handler(self, rei):
321 if rei.confidence is None: 322 self.log('{0.rank:3}. - {0.count:3}'.format(rei), level=2) 323 else: 324 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3}'.format(rei), level=2)
325
326 - def build_combined_map(self, fragfile, top=25):
327 328 self.log('\n# Building complemented map...') 329 330 try: 331 filling = rosetta.RosettaFragmentMap.read(fragfile, top=top) 332 except IOError as io: 333 raise ArgumentIOError(str(io)) 334 335 self.log('\n {0} rosetta fragments loaded'.format(filling.size)) 336 self.log(' Confidence Count', level=2) 337 338 target = csb.bio.fragments.Target.from_profile(self.query) 339 target.assignall(self._matches) 340 341 factory = csb.bio.fragments.RosettaFragsetFactory() 342 return factory.make_combined(target, filling, threshold=0.5, 343 callback=self._merge_event_handler)
344
345 346 -class SliceContext(hhsearch.Context):
347
348 - def __init__(self, segment, start, end):
349 350 self.start = start 351 self.end = end 352 353 if not isinstance(segment, csb.core.string): 354 segment = segment.to_hmm(convert_scores=True) 355 356 super(SliceContext, self).__init__(segment)
357 358 @property
359 - def length(self):
360 return self.end - self.start + 1
361 362 @property
363 - def hits(self):
364 return self.result
365 366 @property
367 - def recurrence(self):
368 return len(self.result)
369
370 - def __lt__(self, other):
371 372 if self.recurrence == other.recurrence: 373 return self.length < other.length 374 else: 375 return self.recurrence < other.recurrence
376 377 378 379 380 if __name__ == '__main__': 381 382 AppRunner().run() 383