1 """
2 Build an HMM from a FASTA sequence. This program is a proxy to buildali.pl
3 and hhmake from the HHpred package.
4
5 @note: assuming you have the full HHpred package installed and configured.
6 """
7
8
9 import os
10 import abc
11
12 import csb.apps
13 import csb.core
14 import csb.io
15
16 from csb.bio.io.wwpdb import StructureParser
17 from csb.bio.io.hhpred import HHProfileParser
18 from csb.bio.io.fasta import FASTAOutputBuilder
19 from csb.bio.sequence import ChainSequence
28
30
31 @property
34
36
37 cmd = csb.apps.ArgHandler(self.program, __doc__)
38
39 cmd.add_scalar_option('query-id', 'q', str, 'ID of the query, in PDB-like format (accessionCHAIN).'
40 'Used for naming the output files. Also, if the input is a PDB file with '
41 'multiple chains, CHAIN is used to pull the required chain from the file.',
42 required=True)
43 cmd.add_scalar_option('tk_root', 't', str, 'path to the ToolkitRoot folder in your HHpred setup', default='/ebio/abt1_toolkit/share/wye')
44 cmd.add_scalar_option('cpu', None, int, 'maximum degree of parallelism', default=1)
45
46 cmd.add_boolean_option('no-ss', None, 'do not include secondary structure', default=False)
47 cmd.add_boolean_option('no-pseudo', None, 'do not add emission and transition pseudocounts', default=False)
48 cmd.add_boolean_option('no-calibration', None, 'do not calibrate the profile', default=False)
49
50 cmd.add_positional_argument('query', str, 'input sequence (FASTA or PDB file)')
51
52 return cmd
53
56
58
59 if os.path.isfile(self.args.query_id + '.hhm'):
60 BuildProfileApp.exit('# Profile "{0}" already exists, skipping'.format(self.args.query_id),
61 ExitCodes.CLEAN)
62
63 try:
64 self.log('# Building profile HMM for {0}...'.format(self.args.query))
65 pb = ProfileBuilder.create(self.args.query, self.args.query_id, self.args.tk_root,
66 pseudo=not self.args.no_pseudo, ss=not self.args.no_ss, cpu=self.args.cpu)
67
68 pb.build_alignment()
69
70 pb.make_hmm()
71
72 if not self.args.no_calibration:
73 pb.calibrate_hmm()
74
75 except BuildArgError as ae:
76 BuildProfileApp.exit(str(ae), ExitCodes.INVALID_DATA)
77
78 except BuildIOError as ioe:
79 BuildProfileApp.exit(str(ioe), ExitCodes.IO_ERROR)
80
81 except csb.io.InvalidCommandError as ose:
82 msg = '{0!s}: {0.cmd}'.format(ose)
83 BuildProfileApp.exit(msg, ExitCodes.IO_ERROR)
84
85 except NoOutputError as noe:
86 msg = 'Expected file {0} not produced by: {1.cmd}.\nSTDERR: {1.stderr}\nSTDOUT: {1.stdout}'.format(noe.expected, noe.context)
87 BuildProfileApp.exit(msg, ExitCodes.EXT_TOOL_FAILURE)
88
89 except csb.io.ProcessError as pe:
90 msg = 'Bad exit code #{0.code} from: {0.cmd}.\nSTDERR: {0.stderr}\nSTDOUT: {0.stdout}'.format(pe.context)
91 BuildProfileApp.exit(msg, ExitCodes.EXT_TOOL_FAILURE)
92
93 self.log(' successfully created profile "{0}"'.format(self.args.query_id))
94
98
101
104
106
107 - def __init__(self, expected, context, *args):
112
115
116 __metaclass__ = abc.ABCMeta
117
118 EMISSION_PSEUDO = '-pcm 4 -pca 2.5 -pcb 0.5 -pcc 1.0'
119 TRANSITION_PSEUDO = '-gapb 1.0 -gapd 0.15 -gape 1.0 -gapf 0.6 -gapg 0.6 -gapi 0.6'
120
@staticmethod
def create(query, target_id, tk_root, pseudo=True, ss=True, cpu=1):
    """
    Factory method: pick a profile builder based on the format of the
    input file. The format is detected from the first non-blank line of
    C{query}: a line starting with '>' selects the FASTA builder, a line
    starting with 'HEADER' or 'ATOM' selects the PDB builder.

    @param query: path to the input file (FASTA or PDB)
    @param target_id: ID of the query in PDB-like format (accessionCHAIN)
    @param tk_root: path to the ToolkitRoot folder of the HHpred setup
    @param pseudo: passed on to the builder (pseudocounts switch)
    @param ss: passed on to the builder (secondary structure switch)
    @param cpu: passed on to the builder (degree of parallelism)

    @return: a L{FASTAProfileBuilder} or a L{PDBProfileBuilder}

    @raise BuildIOError: if C{query} is not an existing file
    @raise BuildArgError: if the first non-blank line matches no known
                          format, or the file has no non-blank line
    """
    if not os.path.isfile(query):
        raise BuildIOError('File not found: ' + query)

    # Context manager guarantees the handle is released on every exit
    # path (return or raise), which the bare open() in a for-loop did not.
    with open(query) as stream:
        for line in stream:

            if not line.strip():
                continue

            if line.startswith('>'):
                return FASTAProfileBuilder(query, target_id, tk_root, pseudo, ss, cpu)
            elif line.startswith('HEADER') or line.startswith('ATOM'):
                return PDBProfileBuilder(query, target_id, tk_root, pseudo, ss, cpu)
            else:
                raise BuildArgError('Unknown input file format')

    # Reached only when the file is empty or contains blank lines only.
    # Fail explicitly instead of silently returning None to the caller.
    raise BuildArgError('Unknown input file format (empty file): ' + query)
138
def __init__(self, query, target_id, tk_root, pseudo=True, ss=True, cpu=1):
    """
    Store the build settings, export TK_ROOT if it is not already set and
    let the concrete builder prepare its input via C{configure_input}.

    @param query: path to the input file
    @param target_id: query ID; the last character is taken as the chain
                      and everything before it as the accession
    @param tk_root: path to the ToolkitRoot folder of the HHpred setup
    @param pseudo: pseudocounts switch, stored as bool
    @param ss: secondary structure switch, stored as bool
    @param cpu: degree of parallelism, stored as given
    """
    self.tk_root = tk_root
    if not os.environ.get('TK_ROOT'):
        # Assign through os.environ rather than os.putenv(): the mapping
        # assignment also calls putenv(), but putenv() alone leaves
        # os.environ stale, so this very check would never see the value.
        os.environ['TK_ROOT'] = tk_root

    self.query = query
    self.accession = target_id[:-1]
    self.chain = target_id[-1]
    self.pseudo = bool(pseudo)
    self.ss = bool(ss)
    self.cpu = cpu

    # Working file names, filled in by later steps
    self._input = None
    self._a3m = None
    self._hhm = None

    self.configure_input()
157
163
164 @property
167
168 @abc.abstractmethod
171
173 assert self._input is not None
174
175 program = os.path.join(self.tk_root, 'bioprogs', 'hhpred', 'buildali.pl')
176
177 if not self.ss:
178 noss = '-noss'
179 else:
180 noss = ''
181 cmd = 'perl {0} {1} -cpu {2} {3}'.format(program, noss, self.cpu, self._input)
182 bali = csb.io.Shell.run(cmd)
183
184 ali = self.target_id + '.a3m'
185 if bali.code != 0:
186 raise csb.io.ProcessError(bali)
187 if not os.path.isfile(ali):
188 raise NoOutputError(ali, bali)
189
190 self._ali = ali
191 return ali
192
211
220
237
275
276
277
# Script entry point: only runs when the file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':

    AppRunner().run()
281