1 """
2 HHfrag: build a dynamic variable-length fragment library for protein structure
3 prediction with Rosetta AbInitio.
4 """
5
6 import os
7 import multiprocessing
8
9 import csb.apps
10 import csb.apps.hhsearch as hhsearch
11
12 import csb.bio.io.hhpred
13 import csb.bio.fragments
14 import csb.bio.fragments.rosetta as rosetta
15 import csb.bio.structure
16
17 import csb.io
18 import csb.core
27
30
31 @property
34
36
37 cmd = csb.apps.ArgHandler(self.program, __doc__)
38 cpu = multiprocessing.cpu_count()
39
40 cmd.add_scalar_option('hhsearch', 'H', str, 'path to the HHsearch executable', default='hhsearch')
41 cmd.add_scalar_option('database', 'd', str, 'database directory (containing PDBS25.hhm)', required=True)
42
43 cmd.add_scalar_option('min', 'm', int, 'minimum query segment length', default=6)
44 cmd.add_scalar_option('max', 'M', int, 'maximum query segment length', default=21)
45 cmd.add_scalar_option('step', 's', int, 'query segmentation step', default=3)
46 cmd.add_scalar_option('cpu', 'c', int, 'maximum degree of parallelism', default=cpu)
47
48 cmd.add_scalar_option('verbosity', 'v', int, 'verbosity level', default=2)
49 cmd.add_scalar_option('output', 'o', str, 'output directory', default='.')
50 cmd.add_scalar_option('gap-filling', 'g', str, 'path to a Rosetta 9-mer fragment file, that will be used '
51 'to complement gaps in the fragment map (if specified, a joint fragment file will be produced)')
52 cmd.add_boolean_option('filtered-map', 'f', 'make an additional filtered fragment map', default=False)
53 cmd.add_boolean_option('c-alpha', None, 'include also C-alpha vectors in the output', default=False)
54
55 cmd.add_positional_argument('QUERY', str, 'query profile HMM (e.g. created with csb.apps.buildhmm)')
56
57 return cmd
58
61
63 if not os.path.isdir(self.args.output):
64 HHfragApp.exit('Output directory does not exist', code=ExitCodes.INVALID_DATA, usage=True)
65
66 if self.args.c_alpha:
67 builder = rosetta.ExtendedOutputBuilder
68 else:
69 builder = rosetta.OutputBuilder
70
71 try:
72 hhf = HHfrag(self.args.QUERY, self.args.hhsearch, self.args.database, logger=self)
73 output = os.path.join(self.args.output, hhf.query.id)
74
75 hhf.slice_query(self.args.min, self.args.max, self.args.step, self.args.cpu)
76 frags = hhf.extract_fragments()
77
78 if len(frags) == 0:
79 HHfragApp.exit('No fragments found!', code=ExitCodes.NO_OUTPUT)
80
81 fragmap = hhf.build_fragment_map()
82 fragmap.dump(output + '.hhfrags.09', builder)
83
84 if self.args.filtered_map:
85 fragmap = hhf.build_filtered_map()
86 fragmap.dump(output + '.filtered.09', builder)
87
88 if self.args.gap_filling:
89 fragmap = hhf.build_combined_map(self.args.gap_filling)
90 fragmap.dump(output + '.complemented.09', builder)
91
92 self.log('\nDONE.')
93
94 except ArgumentIOError as ae:
95 HHfragApp.exit(str(ae), code=ExitCodes.IO_ERROR)
96
97 except ArgumentError as ae:
98 HHfragApp.exit(str(ae), code=ExitCodes.INVALID_DATA, usage=True)
99
100 except csb.io.InvalidCommandError as ose:
101 msg = '{0!s}: {0.program}'.format(ose)
102 HHfragApp.exit(msg, ExitCodes.IO_ERROR)
103
104 except csb.bio.io.hhpred.HHProfileFormatError as hfe:
105 msg = 'Corrupt HMM: {0!s}'.format(hfe)
106 HHfragApp.exit(msg, code=ExitCodes.INVALID_DATA)
107
108 except csb.io.ProcessError as pe:
109 message = 'Bad exit code from HHsearch: #{0.code}.\nSTDERR: {0.stderr}\nSTDOUT: {0.stdout}'.format(pe.context)
110 HHfragApp.exit(message, ExitCodes.HHSEARCH_FAILURE)
111
112 - def log(self, message, ending='\n', level=1):
116
119
122
125
128
129 PDBS = 'pdbs25.hhm'
130
131 - def __init__(self, query, binary, database, logger=None):
148
149 @property
152
153 @property
156
157 @property
159 return self._database
160 @database.setter
168
169 @property
172 @aligner.setter
174 if hasattr(value, 'run') and hasattr(value, 'runmany'):
175 self._aligner = value
176 else:
177 raise TypeError(value)
178
def log(self, *a, **ka):
    """
    Forward a log request to the attached application object.

    All positional and keyword arguments are passed through unchanged to
    ``self._app.log``. When ``self._app`` is falsy (e.g. no application was
    attached), the call is silently ignored.
    """
    app = self._app
    if not app:
        # Nothing to report to: logging is optional for this object.
        return

    app.log(*a, **ka)
183
def slice_query(self, min=6, max=21, step=3, cpu=None):
    """
    Probe the query with a sliding window of variable-length segments and
    keep the top-ranking segment for every window start.

    Start positions are 1-based and advance by ``step``. For each start,
    every segment of length ``min`` .. ``max`` that still fits into the
    query is wrapped in a SliceContext and submitted to
    ``self.aligner.runmany``. The returned contexts are sorted in place and
    the last (greatest) one becomes the representative for that start.

    Args:
        min: shortest segment length to probe (must be positive).
        max: longest segment length to probe (must be >= min).
        step: distance between consecutive start positions (must be
            positive).
        cpu: number of workers handed to the aligner; when falsy,
            ``max - min + 1`` is used instead.

    Returns:
        A tuple with the representative context of every start position
        that produced at least one result. The same items are stored as a
        list in ``self._hsqs``.

    Raises:
        ArgumentError: if min/max/step violate the constraints above.
    """
    # Local aliases: the parameter names shadow the min()/max() builtins.
    shortest, longest = min, max

    if not (shortest > 0 and longest >= shortest):
        raise ArgumentError('min and max must be positive numbers, with max >= min')
    if not step > 0:
        raise ArgumentError('step must be positive number')

    self.log('\n# Processing profile HMM "{0}"...'.format(self.query.id))
    self.log('', level=2)

    profile = self.query
    total = profile.layers.length
    workers = cpu or (longest - shortest + 1)
    representatives = []

    # Last start position from which a segment of the minimum length fits.
    final_start = total - shortest + 1

    for start in range(1, final_start + 1, step):

        self.log('{0:3}. '.format(start), ending='', level=1)

        # Segment ends are inclusive and must not run past the query end.
        upper = start + longest
        if upper > total + 1:
            upper = total + 1

        candidates = [SliceContext(profile.segment(start, stop), start, stop)
                      for stop in range(start + shortest - 1, upper)]

        ranked = self.aligner.runmany(candidates, workers=workers)
        ranked.sort()

        if len(ranked) == 0:
            self.log(' no hits', level=1)
            continue

        # After sorting, the greatest context sits at the end.
        best = ranked[-1]
        representatives.append(best)
        self.log('{0.start:3} {0.end:3} ({0.length:2} aa) {0.recurrence:3} hits'.format(best), level=1)

    self._hsqs = representatives
    return tuple(representatives)
222
224
225 self.log('\n# Extracting fragments...')
226
227 if self._hsqs is None:
228 raise InvalidOperationError('The query has to be sliced first')
229
230 fragments = []
231
232 for si in self._hsqs:
233 self.log('\nSEGMENT: {0.start:3} {0.end:3} ({0.recurrence})'.format(si), level=2)
234
235 for hit in si.hits:
236 cn = 0
237 for chunk in hit.alignment.segments:
238 chunk.qstart = chunk.qstart + si.start - 1
239 chunk.qend = chunk.qend + si.start - 1
240 cn += 1
241 self.log(' {0.id:5} L{0.qstart:3} {0.qend:3} {0.length:2}aa P:{0.probability:5.3f}'.format(chunk), ending='', level=2)
242
243 sourcefile = os.path.join(self.database, hit.id + '.pdb')
244 if not os.path.isfile(sourcefile):
245 self.log(' missing', level=2)
246 continue
247 source = csb.bio.io.StructureParser(sourcefile).parse().first_chain
248 assert hit.id[-1] == source.id
249
250 source.compute_torsion()
251 try:
252 fragment = csb.bio.fragments.Assignment(source,
253 chunk.start, chunk.end, hit.id,
254 chunk.qstart, chunk.qend, chunk.probability,
255 rmsd=None, tm_score=None)
256 fragments.append(fragment)
257 if cn > 1:
258 self.log(' (chunk #{0})'.format(cn), level=2)
259 else:
260 self.log('', level=2)
261
262 except csb.bio.structure.Broken3DStructureError:
263 self.log(' corrupt', level=2)
264 continue
265
266 self._matches = fragments
267 return tuple(fragments)
268
270
271 self.log('\n {0} ungapped assignments'.format(len(self._matches)))
272 self.log('', level=2)
273
274 histogram = {}
275 for f in self._matches:
276 histogram[f.length] = histogram.get(f.length, 0) + 1
277
278 for length in sorted(histogram):
279
280 percent = histogram[length] * 100.0 / len(self._matches)
281 bar = '{0:3} |{1} {2:5.1f}%'.format(length, 'o' * int(percent), percent)
282 self.log(bar, level=2)
283
298
300 if ri.rep is None:
301 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3} - - -'.format(ri, ri.rep), level=2)
302 else:
303 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3} {1.id:5} {1.start:3} {1.end:3}'.format(ri, ri.rep), level=2)
304
316
318 if rei.confidence is None:
319 self.log('{0.rank:3}. - {0.count:3}'.format(rei), level=2)
320 else:
321 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3}'.format(rei), level=2)
322
341
342
343 -class SliceContext(hhsearch.Context):
344
def __init__(self, segment, start, end):
    """
    Create a context for one query segment.

    Args:
        segment: either a string (``csb.core.string``), which is passed to
            the base constructor as is, or an object providing
            ``to_hmm(convert_scores=True)``, whose result is passed on
            instead.
        start: start position of the segment; stored as ``self.start``.
        end: end position of the segment; stored as ``self.end``.
    """
    self.start, self.end = start, end

    if isinstance(segment, csb.core.string):
        payload = segment
    else:
        # Not a string yet: render the segment through its to_hmm() method.
        payload = segment.to_hmm(convert_scores=True)

    super(SliceContext, self).__init__(payload)
354
355 @property
357 return self.end - self.start + 1
358
359 @property
362
@property
def recurrence(self):
    """Number of items in ``self.result``."""
    outcome = self.result
    return len(outcome)
366
def __lt__(self, other):
    """
    Order contexts by recurrence; ties are broken by segment length.

    Returns True when this context has a lower recurrence than ``other``,
    or the same recurrence and a shorter length.
    """
    mine, theirs = self.recurrence, other.recurrence

    if mine != theirs:
        return mine < theirs

    # Same recurrence: the shorter segment ranks lower.
    return self.length < other.length
373
374
375
376
if __name__ == '__main__':

    # Script entry point: build the application runner and start it.
    runner = AppRunner()
    runner.run()
380