1 """
2 I-Sites fragment library APIs.
3
4 @warning: This module is no longer developed and is provided as is.
5 """
6
7 import sys
8 import collections
9 import operator
10 import math
11
12 import csb.io
13 import csb.core
14 import csb.bio.fragments
15 import csb.bio.structure as structure
16 import csb.bio.sequence
17 import csb.bio.hmm as hmm
25
30
32 """
33 Represents a single PDB chain sequence entry with torsion angles
34 """
35
44
45 @property
47 if hasattr(self, 'torsion'):
48 return len(self.torsion) > 0
49 else:
50 return False
51
52 @property
54 id = self.accession
55 if self.id is not None:
56 id += self.id
57 return id
58
59 @property
64
66 """
67 Describes an I-Sites protein profile/PSSM.
68
69 @param background: background amino acid frequencies
70 @type background: dict
71 @param matrix: initialization array with target frequencies
72 @type matrix: dict
73 @param alpha: fixed "pseudocount" number for this cluster/profile
74 @type alpha: float
75 """
76
77 BackgroundFreqs = {
78 'A': 0.077049999999999993, 'C': 0.016140000000000002, 'D': 0.058470000000000001,
79 'E': 0.065640000000000004, 'F': 0.041090000000000002, 'G': 0.072179999999999994,
80 'H': 0.023630000000000002, 'I': 0.058400000000000001, 'L': 0.089560000000000001,
81 'K': 0.061460000000000001, 'M': 0.021680000000000001, 'N': 0.045400000000000003,
82 'P': 0.045179999999999998, 'Q': 0.037080000000000002, 'R': 0.049790000000000001,
83 'S': 0.061870000000000001, 'T': 0.055660000000000001, 'V': 0.069790000000000005,
84 'W': 0.014319999999999999, 'Y': 0.035610000000000003,
85
86 'B': 0.051935000000000002, 'Z': 0.051360000000000003, 'X': 1.000000000000000000,
87
88 'O': sys.float_info.min, 'U': sys.float_info.min
89 }
90
102
103
105 """
106 Append a new column to the profile.
107
108 Keyword arguments are used to pass the individual amino acid target frequencies,
109 e.g. A=0.12, B=0.0, etc. If an amino acid is omitted, its probability will be
110 set automatically to 0.
111
112 @param target_frequencies: amino acid frequencies in that column
113 @type target_frequencies: dict
114
115 @return: the index of the new column
116 @rtype: int
117
118 @raise KeyError: if the target frequencies dict contains an unknown amino acid
119 with respect to C{profile.background}
120 """
121 if not set(target_frequencies.keys()).issubset(self.background):
122 raise KeyError('Unrecognized residue name(s)')
123
124 profcol = { }
125 pssmcol = { }
126
127 for aa in self.background:
128
129 assert self.background[aa] > 0
130
131 if aa in target_frequencies:
132 assert target_frequencies[aa] >= 0
133 profcol[aa] = target_frequencies[aa]
134 else:
135 profcol[aa] = 0.0
136
137 Qa = self.background[aa]
138 Pja = profcol[aa]
139 alpha = self.alpha
140
141 pssmcol[aa] = math.log( (Pja + alpha * Qa) / (Qa * (1 + alpha)) )
142
143 self._matrix.append(profcol)
144 self._pssm.append(pssmcol)
145 self._maxscore += max(pssmcol.values())
146
147 return len(self._matrix) - 1
148
149 @property
151 return len(self._matrix)
152
153 @property
156
157 @property
160
161 @property
163 return self._maxscore
164
166
167 - def __init__(self, fragment, qstart, qend, probability, rmsd, tm_score, qlength):
176
177 @property
180
181 @property
184
185 @property
188
190 """
191 Describes a profile-query match
192
193 @param id: id of the match
194 @type id: str
195 @param start: start position of the match
196 @type start: int
197 @param score: score
198 @type score: float
199 @param rmsd: RMSD between the query and the representative structure of the
200 fragment withing the matching region
201 @type rmsd: float
202 @param rmsd_dih: circular (torsion) RMSD between the query and the representative
203 structure of the fragment withing the matching region
204 @type rmsd_dih: float
205 @param tm_score: TM-Score between the query and the representative
206 structure of the fragment withing the matching region
207 @type tm_score: float
208 @param tm_scoreout: TM-Score, but calculated with local structural alignment
209 @type tm_scoreout: float
210 """
211
212 - def __init__(self, id, start, score, rmsd=None, rmsd_dih=None, tm_score=None, tm_scoreout=None):
213 self.id = id
214 self.start = start
215 self.score = score
216 self.rmsd = rmsd
217 self.rmsd_dih = rmsd_dih
218 self.tm_score = tm_score
219 self.tm_scoreout = tm_scoreout
220
222 return "<ProfileMatch: {0.id} at {0.start}, s={0.score}, rms={0.rmsd}, tm={0.tm_score},{0.tm_scoreout} >".format(self)
223
226
228 """
229 Convert the raw profile score to probability.
230
231 @param b0: C{cluster.linearfit[0]}
232 @type b0: float
233 @param b1: C{cluster.linearfit[1]}
234 @type b1: float
235 @param b2: C{cluster.linearfit[2]}
236 @type b2: float
237 @param b3: C{cluster.linearfit[3]}
238 @type b3: float
239
240 @return: probability for the hit being a true positive
241 @rtype: float
242 """
243
244 if self.score is None:
245 return 0
246 return ((1 / math.pi) * math.atan(b0 + b1 * self.score) + b2) * b3
247
249 """
250 A collection of profile-database hits.
251
252 @param size: target maximum size of the hitlist
253 @type size: int
254 @param cutoff: inclusion score cutoff
255 @type cutoff: float
256 @param rmsd_cutoff: inclusion RMSD cutoff
257 @type rmsd_cutoff: float
258
259 @raise ValueError: on invalid hitlist C{size}
260 """
261
262 - def __init__(self, size=20, cutoff=0.5, rmsd_cutoff=0.5):
263 if not size > 0:
264 raise ValueError('The maximum size of the matches table must be a positive number.')
265
266 self._matches = [ ]
267 self.list_size = int(size)
268 self.cutoff = float(cutoff)
269 self.rmsd_cutoff = float(rmsd_cutoff)
270
271 self._sorted = True
272
274 if not self._sorted:
275 self._matches.sort(key=operator.attrgetter('score'), reverse=True)
276 self._sorted = True
277
278 - def add(self, id, start, score, rmsd=None):
279 """
280 Add a new match to the collection of hits if its score satisfies the cutoff.
281
282 @param id: id of the match
283 @type id: str
284 @param start: start position of the match
285 @type start: int
286 @param score: score
287 @type score: float
288 @param rmsd: RMSD between the query and the representative structure of the
289 fragment withing the matching region
290 @type rmsd: float
291
292 @raise value_error: on invalid start position
293 """
294 if not start > 0:
295 raise ValueError('Match start position must be greater than zero.')
296 score = float(score)
297 start = int(start)
298
299 self._sorted = False
300
301 if len(self._matches) < self.list_size:
302 if score >= self.cutoff:
303 match = ProfileMatch(id, start, score, rmsd)
304 self._matches.append(match)
305
306 else:
307 best_match = self._matches[-1]
308 if score > best_match.score:
309 self._matches.pop()
310 match = ProfileMatch(id, start, score, rmsd)
311 self._matches.append(match)
312 self._sort()
313
314 @property
316 return len(self._matches)
317
318 @property
320 if self.length > 0:
321 self._sort()
322 return self._matches
323 else:
324 return None
325
326 @property
328 if self.length > 0:
329 return self.hits[0]
330 else:
331 return None
332
334 """
335 I-Site instance hitlist.
336
337 @param query: the query fragment
338 @type query: L{Cluster}
339 @param hits: a list of L{ProfileMatch}es
340 @type hits: list
341 """
342
347
350
352 return iter(self.hits)
353
367
369 """
370 @param n: number of top hits
371 @type n: int
372
373 @return: top C{n} hits in terms of profile score
374 @rtype: list of L{ProfileMatch}
375 """
376
377 hits = list(self.hits)
378 hits.sort()
379 return hits[-n:]
380
382 """
383 Representation of an I-Sites peptide fragment library.
384 Provides dictionary-like access to the underlying L{Cluster} objects by
385 either C{cluster.id} or C{cluster.representative.accesion}.
386 """
387
389
390 self.name = 'ISITES'
391 self.version = None
392 self.centroids = None
393 self.documentation = None
394
395 self._clusters = None
396 self._ondemand = False
397 self._byid = None
398 self._byrep = None
399
400 self.__iscfiles = []
401
403 if self._ondemand:
404 raise AttributeError("ID/rep-based access to clusters is not available in Delay-parsed libraries.")
405
406 if isinstance(item, csb.core.string):
407 return self._byrep[item]
408 else:
409 return self._byid[item]
410
412 try:
413 self[key]
414 return True
415 except KeyError:
416 return False
417
418 @property
420 return self._clusters
421
422 @clusters.setter
447
449 """
450 Compact the library by excluding unimportant information:
451
452 - documentation
453 - clusters: file, program, covariance
454
455 @raise AttributeError: on attempt to compact a delay-parsed library
456 """
457 if self._ondemand:
458 raise AttributeError("Delay-parsed libraries cannot be compacted.")
459
460 self.documentation = None
461 for c in self.clusters:
462 c.file = None
463 c.program = None
464 c.covariance = None
465
467 """
468 Serialize the library to a file.
469
470 @param file: output file name
471 @type file: str
472
473 @raise AttributeError: on attempt to serialize a delay-parsed library
474 """
475 if self._ondemand:
476 raise AttributeError("Delay-parsed libraries cannot be serialized.")
477
478 with open(file, 'wb') as output:
479 csb.io.Pickle.dump(self, output, protocol=csb.io.Pickle.HIGHEST_PROTOCOL)
480
481 @staticmethod
483 """
484 Load I-sites from a serialization file.
485
486 @param file: source file name
487 @type file: str
488
489 @return: the deserialized library object
490 @rtype: L{Library}
491 """
492
493 with open(file, 'rb') as output:
494 library = csb.io.Pickle.load(output)
495
496 return library
497
499 """
500 Object representation of an I-Sites library peptide fragment.
501 """
502
504
505 self.id = None
506 self.motiflen = None
507 self.profilelen = None
508 self.overhang = None
509 self.file = None
510 self.program = None
511 self.mda = None
512 self.dme = None
513 self.pseudocount = 0.2
514 self.linearfit = None
515 self.representative = None
516 self.covarweight = None
517 self.profile = ProteinProfile(ProteinProfile.BackgroundFreqs, alpha=self.pseudocount)
518 self.covariance = None
519
521 return self.id < other.id
522
523 @property
526
527 @property
529 return self.profile.length
530
531 @property
533 if self.representative.structure:
534 return True
535 return False
536
538 """
539 Serialize the cluster to a file.
540
541 @param file: output file name
542 @type file: str
543 """
544
545 with open(file, 'wb') as output:
546 csb.io.Pickle.dump(self, output, protocol=csb.io.Pickle.HIGHEST_PROTOCOL)
547
548 @staticmethod
550 """
551 Load a cluster from a serialized object.
552
553 @param file: source file name
554 @type file: str
555
556 @return: deserialized fragment object
557 @rtype: L{Cluster}
558 """
559
560 with open(file, 'rb') as output:
561 cluster = csb.io.Pickle.load(output)
562
563 return cluster
564
566 """
567 Parse the paradigm's pdb_file and attach structural data to
568 C{cluster.representative} (C{source} and C{structure} attributes).
569
570 @param pdb_file: source PDB file containing the structure of the paradigm
571 @type pdb_file: str
572
573 @raise InternalISitesError: on parse and/or structure assignment errors
574 @raise InvalidParadigmError: when the torsion angles in the cluster do not
575 match the angles computed from the PDB file
576 """
577 from csb.bio.io import StructureParser
578 rep = self.representative
579 s = StructureParser(pdb_file).parse_structure()
580
581 if not rep.chain:
582 if s.chains.length > 1:
583 raise InternalISitesError('Cluster {0} does not define any chain, but multiple choices exist.'.format(self.id))
584 else:
585 rep.chain = s.chains.keys()[0]
586 if rep.chain not in s.chains:
587 raise InternalISitesError('Structure {0} does not contain chain {1}'.format(rep.accession, rep.chain))
588
589 s.chains[rep.chain].compute_torsion()
590 start_residue = s.chains[rep.chain].find(rep.start)
591
592 current_rank = start_residue.rank
593 for i, rep_angles in enumerate(rep.angles, start=rep.angles.start_index):
594
595 current_residue = s.chains[rep.chain].residues[current_rank]
596 torsion_delta = 0
597
598 if current_residue.torsion.phi:
599 torsion_delta += abs(current_residue.torsion.phi - rep_angles.phi)
600 if current_residue.torsion.psi:
601 torsion_delta += abs(current_residue.torsion.psi - rep_angles.psi)
602
603 assert (i + rep.start - 1) == int(current_residue.id)
604
605 if torsion_delta > 1:
606 raise InvalidParadigmError('delta_phi + delta_psi for '
607 '{3.accession} {3.chain}[{4}] is {0}: phi: {1.phi} vs {2.phi}, psi: {1.psi} vs {2.psi}'.format(
608 torsion_delta, current_residue.torsion, rep_angles, rep, current_rank))
609 current_rank += 1
610
611 start = start_residue.rank - self.overhang
612 stop = start + self.profile.length - 1
613
614 if start < s.chains[rep.chain].residues.start_index or stop > s.chains[rep.chain].residues.last_index:
615 raise InternalISitesError('With the ovehangs included, fragment '
616 '{0} exceeds the boundaries of the chain: {1}..{2} vs {3.start_index}..{3.last_index}'.format(
617 self.id, start, stop, s.chains[rep.chain].residues))
618
619 rep.structure = s.chains[rep.chain].subregion(start, stop)
620
621 assert rep.structure.length == self.profile.length
622
623 backbone = set(['N', 'CA', 'C'])
624 for residue in rep.structure.residues:
625 if not (residue.has_structure and set(residue.atoms).issuperset(backbone)):
626 rep.structure = None
627 raise InternalISitesError('Fragment {0} is built on discontinuous structure: {1} {4} at {2}-{3}'.format(
628 self.id, rep.accession, start, stop, residue))
629
630 rep.source = ChainSequence()
631 rep.source.sequence = s.chains[rep.chain].sequence
632 rep.source.accession = s.accession
633 rep.source.id = rep.chain
634 rep.source.type = s.chains[rep.chain].type
635 rep.source.torsion = s.chains[rep.chain].torsion
636
638 """
639 Compute the log-odds score of C{sequence} matching C{self}'s profile.
640
641 @param sequence: subject sequence
642 @type sequence: str
643 @param start: do the comparison starting with this offset in the sequence
644 @type start: int
645
646 @return: log-odds score
647 @rtype: float
648 """
649 score = 0
650 window = self.profile.length
651
652 sequence = sequence[start : start + window]
653 assert len(sequence) == self.profile.length
654
655 for column, aa in enumerate(sequence):
656 aa = aa.upper()
657 if aa in self.profile.pssm[column]:
658 score += self.profile.pssm[column][aa]
659 else:
660 raise ValueError('Unknown residue {0}'.format(aa))
661
662 return score
663
665 """
666 Compute the similarity score of C{profile} matching C{self}'s profile (see
667 Bystroff 2001, 2008).
668
669 @param profile: subject profile, built with alpha=0.5
670 @type profile: L{ProteinProfile}
671 @param start: do the comparison starting with this offset in the subject
672 @type start: int
673
674 @return: similarity score calculated with Bystroff's metric
675 @rtype: float
676
677 @raise ValueError: on mismatch in the amino acid content at a given column
678 in both profiles; or when the subject has not been created
679 with alpha=0.5
680 """
681 if profile.alpha != 0.5:
682 raise ValueError('The target profile must be built with alpha=0.5')
683
684 score = 0
685 window = self.profile.length
686
687 for column in range(0, window):
688
689 if set(self.profile.pssm[column]) != set(profile.pssm[column + start]):
690 raise ValueError('Both profiles must contain identical amino acids at a given column')
691
692 for aa in self.profile.pssm[column]:
693 score += self.profile.pssm[column][aa] * profile.pssm[column + start][aa]
694
695 return score
696
698 """
699 Compute the similarity score of C{profile} matching C{self}'s profile using
700 the cross entropy method.
701
702 @param profile: subject profile
703 @type profile: L{ProteinProfile}
704 @param start: do the comparison starting with this offset in the subject
705 @type start: int
706
707 @return: the cross entropy between the profiles
708 @rtype: float
709 """
710 score = 0
711 window = self.profile.length
712
713 for column in range(0, window):
714 for aa in self.profile.pssm[column]:
715 score += self.profile.pssm[column][aa] * math.exp( profile.pssm[column + start][aa] )
716
717 return score
718
720 """
721 Compute the similarity score of C{profile} matching C{self}'s profile using
722 the log-sum-of-odds method (Soeding).
723
724 @param profile: subject profile
725 @type profile: L{ProteinProfile}
726 @param start: do the comparison starting with this offset in the subject
727 @type start: int
728
729 @return: the log-sum-of-odds similarity between the profiles
730 @rtype: float
731 """
732 score = 1
733 window = self.profile.length
734
735 for column in range(0, window):
736
737 dot_product = 0
738
739 for aa in self.profile.pssm[column]:
740 q_emission = self.profile.matrix[column][aa]
741 s_emission = profile.matrix[column + start][aa]
742 dot_product += ((q_emission * s_emission) / self.profile.background[aa])
743
744 score *= dot_product
745
746 return math.log(score or sys.float_info.min)
747
749 """
750 Compute the similarity score of C{profile} matching C{self}'s profile using
751 the exponential city block metric (see Baker 1995).
752
753 @param profile: subject profile
754 @type profile: L{ProteinProfile}
755 @param start: do the comparison starting with this offset in the subject
756 @type start: int
757
758 @return: the city block similarity between the profiles
759 @rtype: float
760 """
761
762 score = 0
763 window = self.profile.length
764
765 for column in range(0, window):
766 for aa in self.profile.pssm[column]:
767 diff = math.exp( -self.profile.matrix[column][aa] ) - math.exp( -profile.matrix[column + start][aa] )
768 score += abs(diff)
769
770 return score
771
799
801 """
802 Find profile and structural matches by sliding the fragment over a specified
803 C{target} chain.
804
805 @param target: subject chain to be scanned to fragment instances
806 @type target: L{structure.Chain}
807 @param mode: fragment instance searching mode - a L{FragmentMatching} member
808 @type mode: L{csb.core.EnumItem}
809
810 @return: a list of L{ProfileMatch}es
811 @rtype: list
812 """
813
814 compare = self.fragment_comparer(mode)
815
816 if hasattr(target, 'entry_id'):
817 entry_id = target.entry_id
818 else:
819 entry_id = target.id
820
821 window = self.profile.length
822 rmsd_window = window - 2 * self.overhang
823 matches = [ ]
824 query_residues = structure.Chain(self.representative.chain,
825 residues=self.representative.structure.residues[self.overhang + 1: -self.overhang])
826 target_sequence = target.sequence
827
828 for i in range(0, len(target_sequence) - window + 1):
829
830 start = i + 1
831 rmsd_start = start + self.overhang
832 score, tm, tm_out, rmsd_ca, rmsd_dih = 0, None, None, None, None
833
834 if mode == FragmentMatching.Sequence:
835 score = compare(target_sequence, start=i)
836 else:
837 score = compare(target.profile, start=i)
838
839 subject_residues = structure.Chain(target.id, residues=target.residues[rmsd_start : rmsd_start + rmsd_window])
840
841 try:
842 rmsd_ca = query_residues.rmsd(subject_residues)
843 except structure.Broken3DStructureError:
844 rmsd_ca = None
845 pass
846
847 if score is not None and (rmsd_ca is not None or rmsd_dih is not None):
848 match = ProfileMatch(entry_id, start, score, rmsd_ca, rmsd_dih, tm, tm_out)
849 matches.append(match)
850
851 return matches
852
891
893 """
894 Convert this fragment into a profile HMM segment. Each column in the
895 fragment's profile becomes a Match state with equivalent emissions.
896 However, all Match states are connected with a fixed transition
897 probability of 100%.
898
899 @return: an HMM wrapper around the cluster's profile
900 @rtype: L{hmm.ProfileHMM}
901 """
902
903 factory = hmm.StateFactory()
904
905 p = hmm.ProfileHMM(hmm.ScoreUnits.Probability)
906 p.family = 'is' + str(self.id)
907 p.name = 'is' + str(self.id)
908 p.id = str(self.id)
909 p.pseudocounts = False
910 p.version = '1.6'
911
912 background = dict([ (aa, self.profile.background[aa])
913 for aa in 'ACDEFGHIKLMNPQRSTVWY' ])
914
915 for i, row in enumerate(self.profile.matrix, start=1):
916
917 row = dict([ (aa, row[aa]) for aa in 'ACDEFGHIKLMNPQRSTVWY' ])
918 residue = self.representative.structure.residues[i]
919
920 match = factory.create_match(row, background)
921 match.rank = i
922
923 layer = hmm.HMMLayer(i, residue)
924 layer.append(match)
925
926 p.layers.append(layer)
927
928 for layer in p.layers:
929
930 match = layer[hmm.States.Match]
931
932 if layer.rank < p.layers.last_index:
933 next_match = p.layers[layer.rank + 1][hmm.States.Match]
934 tran = hmm.Transition(match, next_match, 1.0)
935 match.transitions.append(tran)
936
937 last_tran = hmm.Transition(p.layers[-1][hmm.States.Match], p.end, 1.0)
938 p.layers[-1][hmm.States.Match].transitions.append(last_tran)
939
940 start_tran = hmm.Transition(p.start, p.layers[1][hmm.States.Match], 1.0)
941 p.start.transitions.append(start_tran)
942
943 p.length.matches = p.layers.length
944 p.length.layers = p.layers.length
945 p.effective_matches = 10
946
947 rep_sequence = ''.join([ str(l.residue.type) for l in p.layers ])
948 seq = csb.bio.sequence.Sequence('dummy', '>dummy', rep_sequence,
949 type=csb.bio.sequence.SequenceTypes.Protein)
950 p.alignment = csb.bio.sequence.A3MAlignment([seq])
951
952 return p
953
954 @staticmethod
955 - def from_hmm(source, id, start=None, end=None, rep_accession=None,
956 rep_chain=None, alpha=0.2):
1054
1056 """
1057 Container class which describes an I-Sites paradigm structure for a cluster.
1058
1059 @param accession: paradigm's accession number
1060 @type accession: str
1061 @param chain: chain ID
1062 @type chain: str
1063 @param start: the start position in the paradigm's chain where the fragment
1064 instance is located (including the overhangs). In the original
1065 library this is a PDB sequence number, not a SEQRES rank. The
1066 residue rank can be retrieved from the structure instead:
1067
1068 >>> cluster.representative.structure.find(start).rank
1069 residue rank
1070
1071 @type start: int
1072 @param angles: torsion angles of the paradigm, in degrees
1073 @type angles: L{structure.TorsionAnglesCollection}
1074 @param source: the entire paradigm chain with pre-computed torsion angles
1075 @type source: L{ChainSequence}
1076 @param struct: a segment of the paradigm's L{structure.Chain} which
1077 contains the I-Site. Residue ranks and IDs are preserved
1078 as they were in the original chain (no shifting)
1079 @type struct: L{structure.Chain}
1080 """
1081
1082 - def __init__(self, accession, chain, start, angles=None, source=None, struct=None):
1098
1099 @property
1102 @angles.setter
1104 self._angles.update(angles)
1105
1106 @property
1109 @source.setter
1111 if type(chain) not in (structure.Chain, ChainSequence):
1112 raise TypeError("The source property must be a Chain or ChainSequence instance with torsion property pre-calculated.")
1113 self._source = chain
1114
1115 @property
1117 return self._structure
1118 @structure.setter
1120 if not isinstance(chain_fragment, structure.Chain):
1121 raise TypeError("The structure property must be a Chain instance.")
1122 self._structure = chain_fragment
1123