1 """
2 Python application for robust structure superposition of two structures.
3 bfit models non-rigid displacements in protein ensembles with outlier-tolerant
4 probability distributions.
5 """
6 import numpy
7
8 import csb.apps
9 import csb.bio.utils
10
11 from csb.bio.io.wwpdb import LegacyStructureParser
12 from csb.bio.utils import probabilistic_fit
13 from csb.statistics.scalemixture import ScaleMixture, GammaPrior, InvGammaPrior
14 from csb.statistics.scalemixture import GammaPosteriorMAP, InvGammaPosteriorMAP
15 from csb.bio.sequence import SequenceAlignment
21
23
24 @property
27
29
30 cmd = csb.apps.ArgHandler(self.program, __doc__)
31
32
33 cmd.add_positional_argument('pdb1', str,
34 'full path to the first structure')
35
36 cmd.add_positional_argument('pdb2', str,
37 'full path to the second structure')
38
39
40 cmd.add_scalar_option('chain1', 'c', str,
41 'Chain of the first structure',
42 default='A')
43
44 cmd.add_scalar_option('chain2', 'd', str,
45 'Chain of the second structure',
46 default='A')
47
48 cmd.add_scalar_option('scalemixture', 's', str,
49 'Scale mixture distribution',
50 default='student',
51 choices=['student', 'k'])
52
53
54 cmd.add_scalar_option('alignment', 'a', str,
55 'Alignment in fasta format defining equivalent positions\n'
56 + 'Assumes that chain1 is the first sequence of '
57 + 'the alignment and chain2 the second sequence')
58
59 cmd.add_scalar_option('outfile', 'o', str,
60 'file to which the rotated second ' +
61 'structure will be written',
62 default='bfit.pdb')
63
64 cmd.add_scalar_option('niter', 'n', int,
65 'Number of optimization steps',
66 default=200)
67
68 cmd.add_boolean_option('em', None,
69 'Use the EM algorithm for optimsation',
70 default = False)
71
72 return cmd
73
74
75
class BFitApp(csb.apps.Application):
    """
    Python application for robust structure superposition of two protein structures
    """

    def main(self):
        """
        Fit the second structure onto the first and write it to a PDB file.

        Everything is driven by ``self.args``: ``pdb1``/``pdb2`` and
        ``chain1``/``chain2`` select the C-alpha coordinates, ``alignment``
        optionally pairs them up, ``scalemixture`` and ``em`` choose the prior
        and its estimator, ``niter`` bounds the refinement loop, and
        ``outfile`` receives the transformed second structure.

        Calls ``self.exit`` with ``ExitCodes.IO_ERROR`` when a PDB file cannot
        be read, and with ``ExitCodes.INPUT_ERROR`` when the two coordinate
        sets cannot be paired one-to-one.
        """
        try:
            reference = LegacyStructureParser(self.args.pdb1).parse()
            mobile = LegacyStructureParser(self.args.pdb2).parse()
        except IOError as e:
            # IOError has no ``value`` attribute; format the exception itself
            # so the real cause reaches the user instead of an AttributeError.
            self.exit('PDB file parsing failed\n' + str(e), ExitCodes.IO_ERROR)

        X = numpy.array(reference[self.args.chain1].list_coordinates(['CA'], True))
        Y = numpy.array(mobile[self.args.chain2].list_coordinates(['CA'], True))

        if self.args.alignment is not None:
            matches = self._matched_positions(self.args.alignment)

            if len(matches) == 0:
                # Without this guard the fit below would run on empty arrays.
                self.exit('Alignment contains no ungapped columns',
                          ExitCodes.INPUT_ERROR)

            X = X[matches[:, 0], :]
            Y = Y[matches[:, 1], :]

        if len(X) != len(Y):
            self.exit('Structures are of different lengths,' +
                      ' please specify an alignment',
                      ExitCodes.INPUT_ERROR)

        if self.args.scalemixture == 'student':
            prior = GammaPrior()
            if self.args.em:
                prior.estimator = GammaPosteriorMAP()

        elif self.args.scalemixture == 'k':
            prior = InvGammaPrior()
            if self.args.em:
                prior.estimator = InvGammaPosteriorMAP()

        # One scale per coordinate pair.
        mixture = ScaleMixture(scales=X.shape[0],
                               prior=prior, d=3)

        # Starting transformation, refined by the loop below.
        R, t = csb.bio.utils.fit(X, Y)

        for _ in range(self.args.niter):
            # Per-position Euclidean distance between X and the transformed Y.
            residuals = X - numpy.dot(Y, numpy.transpose(R)) - t
            distances = numpy.sum(residuals ** 2, axis=-1) ** (1. / 2)

            mixture.estimate(distances)

            # Re-fit using the freshly estimated scales.
            R, t = probabilistic_fit(X, Y, mixture.scales)

        mobile.transform(R, t)
        mobile.to_pdb(self.args.outfile)

    def _matched_positions(self, path):
        """
        Return the zero-based position pairs of all ungapped alignment columns.

        @param path: alignment file; only its first two sequences are used
        @type path: str

        @return: integer array of shape (n, 2); column 0 indexes the first
                 sequence, column 1 the second. ``n`` is 0 when every column
                 contains a gap.
        @rtype: numpy.ndarray
        """
        # ``file()`` exists only in Python 2; ``open`` in a ``with`` block works
        # on both major versions and closes the handle deterministically.
        with open(path) as stream:
            align = SequenceAlignment.parse(stream.read())

        align = align[:2, :]

        # Alignment columns are addressed from 1; ranks are shifted to 0-based.
        pairs = [[align.columns[i][0].rank - 1,
                  align.columns[i][1].rank - 1]
                 for i in range(1, align.length + 1)
                 if not align.gap_at(i)]

        # reshape keeps the result two-dimensional even when no column matched.
        return numpy.array(pairs, dtype=int).reshape(-1, 2)
141
142
143
if __name__ == '__main__':
    # Script entry point: build the application runner and start it.
    runner = AppRunner()
    runner.run()
146