1 """
2 HHfrag: build a dynamic variable-length fragment library for protein structure
3 prediction with Rosetta AbInitio.
4 """
5
6 import os
7 import multiprocessing
8
9 import csb.apps
10 import csb.apps.hhsearch as hhsearch
11
12 import csb.bio.io.hhpred
13 import csb.bio.fragments
14 import csb.bio.fragments.rosetta as rosetta
15 import csb.bio.structure
16
17 import csb.io
18 import csb.core
27
30
31 @property
34
36
37 cmd = csb.apps.ArgHandler(self.program, __doc__)
38 cpu = multiprocessing.cpu_count()
39
40 cmd.add_scalar_option('hhsearch', 'H', str, 'path to the HHsearch executable', default='hhsearch')
41 cmd.add_scalar_option('database', 'd', str, 'database directory (containing PDBS25.hhm)', required=True)
42
43 cmd.add_scalar_option('min', 'm', int, 'minimum query segment length', default=6)
44 cmd.add_scalar_option('max', 'M', int, 'maximum query segment length', default=21)
45 cmd.add_scalar_option('step', 's', int, 'query segmentation step', default=3)
46 cmd.add_scalar_option('cpu', 'c', int, 'maximum degree of parallelism', default=cpu)
47
48 cmd.add_scalar_option('verbosity', 'v', int, 'verbosity level', default=2)
49 cmd.add_scalar_option('output', 'o', str, 'output directory', default='.')
50 cmd.add_scalar_option('gap-filling', 'g', str, 'path to a Rosetta 9-mer fragment file, that will be used '
51 'to complement gaps in the fragment map (if specified, a joint fragment file will be produced)')
52 cmd.add_boolean_option('filtered-map', 'f', 'make an additional filtered fragment map', default=False)
53 cmd.add_boolean_option('c-alpha', None, 'include also C-alpha vectors in the output', default=False)
54
55 cmd.add_positional_argument('QUERY', str, 'query profile HMM (e.g. created with csb.apps.buildhmm)')
56
57 return cmd
58
61
63 if not os.path.isdir(self.args.output):
64 HHfragApp.exit('Output directory does not exist', code=ExitCodes.INVALID_DATA, usage=True)
65
66 if self.args.c_alpha:
67 builder = rosetta.ExtendedOutputBuilder
68 else:
69 builder = rosetta.OutputBuilder
70
71 try:
72 hhf = HHfrag(self.args.QUERY, self.args.hhsearch, self.args.database, logger=self)
73 output = os.path.join(self.args.output, hhf.query.id)
74
75 hhf.slice_query(self.args.min, self.args.max, self.args.step, self.args.cpu)
76 frags = hhf.extract_fragments()
77
78 if len(frags) == 0:
79 HHfragApp.exit('No fragments found!', code=ExitCodes.NO_OUTPUT)
80
81 fragmap = hhf.build_fragment_map()
82 fragmap.dump(output + '.hhfrags.09', builder)
83
84 if self.args.filtered_map:
85 fragmap = hhf.build_filtered_map()
86 fragmap.dump(output + '.filtered.09', builder)
87
88 if self.args.gap_filling:
89 fragmap = hhf.build_combined_map(self.args.gap_filling)
90 fragmap.dump(output + '.complemented.09', builder)
91
92 self.log('\nDONE.')
93
94 except ArgumentIOError as ae:
95 HHfragApp.exit(str(ae), code=ExitCodes.IO_ERROR)
96
97 except ArgumentError as ae:
98 HHfragApp.exit(str(ae), code=ExitCodes.INVALID_DATA, usage=True)
99
100 except csb.io.InvalidCommandError as ose:
101 msg = '{0!s}: {0.program}'.format(ose)
102 HHfragApp.exit(msg, ExitCodes.IO_ERROR)
103
104 except csb.bio.io.hhpred.HHProfileFormatError as hfe:
105 msg = 'Corrupt HMM: {0!s}'.format(hfe)
106 HHfragApp.exit(msg, code=ExitCodes.INVALID_DATA)
107
108 except csb.io.ProcessError as pe:
109 message = 'Bad exit code from HHsearch: #{0.code}.\nSTDERR: {0.stderr}\nSTDOUT: {0.stdout}'.format(pe.context)
110 HHfragApp.exit(message, ExitCodes.HHSEARCH_FAILURE)
111
112 - def log(self, message, ending='\n', level=1):
116
120
123
126
129
130 PDBS = 'pdbs25.hhm'
131
132 - def __init__(self, query, binary, database, logger=None):
149
150 @property
153
154 @property
157
158 @property
160 return self._database
161 @database.setter
169
170 @property
173 @aligner.setter
175 if hasattr(value, 'run') and hasattr(value, 'runmany'):
176 self._aligner = value
177 else:
178 raise TypeError(value)
179
def log(self, *a, **ka):
    """
    Forward a log message to the attached application, if there is one.

    All positional and keyword arguments are handed over unchanged to the
    C{log} method of C{self._app}. When C{self._app} is falsy (no logger
    attached), the call is a silent no-op.
    """
    app = self._app
    if not app:
        return

    app.log(*a, **ka)
184
def slice_query(self, min=6, max=21, step=3, cpu=None):
    """
    Slice the query profile into overlapping candidate segments and keep
    one representative segment per start position.

    Start positions run from 1 in increments of C{step}. At each start,
    every segment whose length is between C{min} and C{max} and which
    still fits inside the query is wrapped in a L{SliceContext} and
    submitted to C{self.aligner.runmany}. The returned contexts are
    sorted using their own ordering and the greatest one is recorded as
    the representative for that start position.

    @param min: minimum segment length (must be positive)
    @param max: maximum segment length (must be >= C{min})
    @param step: distance between consecutive start positions
                 (must be positive)
    @param cpu: number of workers passed to the aligner; when omitted
                (or otherwise falsy) it defaults to C{max - min + 1}

    @return: the representative contexts, in start-position order; the
             same items are also stored in C{self._hsqs}
    @rtype: tuple

    @raise ArgumentError: on invalid C{min}, C{max} or C{step}
    """
    if min <= 0 or min > max:
        raise ArgumentError('min and max must be positive numbers, with max >= min')
    if step <= 0:
        raise ArgumentError('step must be positive number')

    query = self.query
    self.log('\n# Processing profile HMM "{0}"...'.format(query.id))
    self.log('', level=2)

    # by default use one worker per candidate segment length
    workers = cpu if cpu else max - min + 1
    total = query.layers.length
    representatives = []

    for start in range(1, total - min + 2, step):

        self.log('{0:3}. '.format(start), ending='', level=1)

        # the last admissible end is capped by both the maximum segment
        # length and the end of the query
        last = start + max - 1
        if last > total:
            last = total

        candidates = [SliceContext(query.segment(start, end), start, end)
                      for end in range(start + min - 1, last + 1)]

        results = self.aligner.runmany(candidates, workers=workers)
        results.sort()

        if len(results) == 0:
            self.log(' no hits', level=1)
            continue

        # after sorting, the greatest context is the last one
        best = results[-1]
        representatives.append(best)
        self.log('{0.start:3} {0.end:3} ({0.length:2} aa) {0.recurrence:3} hits'.format(best), level=1)

    self._hsqs = representatives
    return tuple(representatives)
223
225
226 self.log('\n# Extracting fragments...')
227
228 if self._hsqs is None:
229 raise InvalidOperationError('The query has to be sliced first')
230
231 fragments = []
232
233 for si in self._hsqs:
234 self.log('\nSEGMENT: {0.start:3} {0.end:3} ({0.recurrence})'.format(si), level=2)
235
236 for hit in si.hits:
237 cn = 0
238 for chunk in hit.alignment.segments:
239 chunk.qstart = chunk.qstart + si.start - 1
240 chunk.qend = chunk.qend + si.start - 1
241 cn += 1
242 self.log(' {0.id:5} L{0.qstart:3} {0.qend:3} {0.length:2}aa P:{0.probability:5.3f}'.format(chunk), ending='', level=2)
243
244 sourcefile = os.path.join(self.database, hit.id + '.pdb')
245 if not os.path.isfile(sourcefile):
246 self.log(' missing', level=2)
247 continue
248 source = csb.bio.io.StructureParser(sourcefile).parse().first_chain
249 assert hit.id[-1] == source.id
250
251 source.compute_torsion()
252 try:
253 fragment = csb.bio.fragments.Assignment(source,
254 chunk.start, chunk.end, hit.id,
255 chunk.qstart, chunk.qend, chunk.probability,
256 rmsd=None, tm_score=None)
257 fragments.append(fragment)
258 if cn > 1:
259 self.log(' (chunk #{0})'.format(cn), level=2)
260 else:
261 self.log('', level=2)
262
263 except csb.bio.structure.Broken3DStructureError:
264 self.log(' corrupt', level=2)
265 continue
266
267 self._matches = fragments
268 return tuple(fragments)
269
271
272 self.log('\n {0} ungapped assignments'.format(len(self._matches)))
273 self.log('', level=2)
274
275 histogram = {}
276 for f in self._matches:
277 histogram[f.length] = histogram.get(f.length, 0) + 1
278
279 for length in sorted(histogram):
280
281 percent = histogram[length] * 100.0 / len(self._matches)
282 seg = b'\xe2\x9f\xa4'.decode('utf-8')
283 bar = b'{0:3} |{1} {2:5.1f}%'.decode('utf-8')
284 bar = bar.format(length, seg * int(percent), percent)
285 self.log(bar, level=2)
286
301
303 if ri.rep is None:
304 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3} - - -'.format(ri, ri.rep), level=2)
305 else:
306 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3} {1.id:5} {1.start:3} {1.end:3}'.format(ri, ri.rep), level=2)
307
319
321 if rei.confidence is None:
322 self.log('{0.rank:3}. - {0.count:3}'.format(rei), level=2)
323 else:
324 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3}'.format(rei), level=2)
325
344
345
346 -class SliceContext(hhsearch.Context):
347
def __init__(self, segment, start, end):
    """
    Create a search context for a single query segment.

    @param segment: the segment to search with; a C{csb.core.string} is
                    used as is, any other object is converted by calling
                    its C{to_hmm(convert_scores=True)} method
    @param start: start position of the segment, stored as C{self.start}
    @param end: end position of the segment, stored as C{self.end}
    """
    self.start = start
    self.end = end

    if isinstance(segment, csb.core.string):
        query = segment
    else:
        query = segment.to_hmm(convert_scores=True)

    super(SliceContext, self).__init__(query)
357
358 @property
360 return self.end - self.start + 1
361
362 @property
365
@property
def recurrence(self):
    """
    Number of items in this context's C{result}.
    """
    hits = self.result
    return len(hits)
369
def __lt__(self, other):
    """
    Order contexts by C{recurrence}; contexts with equal recurrence are
    ordered by C{length}.
    """
    if self.recurrence != other.recurrence:
        return self.recurrence < other.recurrence

    # same recurrence: the shorter segment compares as smaller
    return self.length < other.length
376
377
378
379
if __name__ == '__main__':

    # Script entry point: create the application runner and start it.
    runner = AppRunner()
    runner.run()
383