1 """
2 HHpred and Hidden Markov Model APIs.
3
4 This package defines the abstractions for working with HHpred's HMMs and
5 hit lists. L{ProfileHMM} is the most important object of this module.
6 It describes a sequence profile hidden Markov model in the way HHpred
7 sees this concept:
8
9 - a profile is composed of a list of L{HMMLayer}s, which contain a
10 number of L{State}s
11 - these L{States} can be of different types: Match, Insertion Deletion, etc.
12 - a profile contains a multiple alignment, from which it is derived
13 - this multiple alignment is an A3M (condensed) Alignment, where the
14 first sequence is a master sequence
15 - the match states in all layers correspond to the residues of the master
16 sequence
17
18 L{ProfileHMM} objects provide list-like access to their layers:
19
20 >>> hmm.layers[1]
21 <HMMLayer> # first layer: layer at master residue=1
22
23 Every layer provides dictionary-like access to its states:
24
25 >>> layer[States.Match]
26 <Match State>
27
28 and every state provides dictionary-like access to its transitions to other
29 states:
30
31 >>> state = hmm.layers[1][States.match]
32 >>> state.transitions[States.Insertion]
33 <Transition> # Match > Insertion
34 >>> transition.predecessor
35 <Match State> # source state
36 >>> transition.successor
37 <Insertion State> # target state
38
39 Whether this transition points to a state at the same (i) or the next layer
40 (i+1) depends on the semantics of the source and the target states.
41
42 Building HMMs from scratch is supported through a number of C{append} methods
43 at various places:
44
45 >>> layer = HMMLayer(...)
46 >>> layer.append(State(...))
47 >>> hmm.layers.append(layer)
48
49 See L{HMMLayersCollection}, L{HMMLayer}, L{EmissionTable} and L{TransitionTable}
50 for details.
51 """
52
53 import sys
54 import math
55
56 import csb.core
57 import csb.io
58 import csb.bio.structure as structure
59 import csb.bio.sequence as sequence
60
61 from csb.core import Enum
66
69
72
75
78
81
84
87
88
89 -class States(csb.core.enum):
94
96 """
97 Enumeration of HMM emission and transition score units
98 """
99 LogScales='LogScales'; Probability='Probability'
100
101 BACKGROUND = [ 0.076627178753322270, 0.018866884241976509, 0.053996136712517316,
102 0.059788009880742142, 0.034939432842683173, 0.075415244982547675,
103 0.036829356494115069, 0.050485048600600511, 0.059581159080509941,
104 0.099925728794059046, 0.021959667190729986, 0.040107059298840765,
105 0.045310838527464106, 0.032644867589507229, 0.051296350550656143,
106 0.046617000834108295, 0.071051060827250878, 0.072644631719882335,
107 0.012473412286822654, 0.039418044025976547 ]
108 """
109 Background amino acid probabilities
110 """
111
112 RELATIVE_SA = { 'A': 0.02, 'B': 0.14, 'C': 0.33, 'D': 0.55, 'E': 1.00 }
113 """
114 Relative solvent accessibility codes (upper bounds)
115 """
119 """
120 Describes a protein profile Hidden Markov Model.
121 Optional parameters:
122
123 @param units: defines the units of the transition and emission scores
124 @type units: L{ScoreUnits}
125 @param scale: the scaling factor used to convert emission/transition
126 probabilities
127 @type scale: float
128 @param logbase: the base of the logarithm used for scaling the emission and
129 transition probabilities
130 @type logbase: float
131 """
132
134
135 self._name = None
136 self._id = None
137 self._family = None
138 self._length = ProfileLength(0, 0)
139 self._alignment = None
140 self._consensus = None
141 self._dssp = None
142 self._dssp_solvent = None
143 self._psipred = None
144 self._effective_matches = None
145 self._evd = EVDParameters(None, None)
146 self._version = None
147 self._pseudocounts = False
148 self._emission_pseudocounts = False
149 self._transition_pseudocounts = False
150 self._layers = HMMLayersCollection()
151 self._start = State(States.Start)
152 self._start_insertion = None
153 self._end = State(States.End)
154 self._scale = scale
155 self._logbase = logbase
156 if units is None:
157 self._score_units = ScoreUnits.LogScales
158 else:
159 self._score_units = units
160
161 @property
163 """
164 Profile name (NAME)
165 @rtype: str
166 """
167 return self._name
168 @name.setter
169 - def name(self, value):
171
172 @property
174 """
175 Profile entry ID (FILE)
176 @rtype: str
177 """
178 return self._id
179 @id.setter
180 - def id(self, value):
181 self._id = str(value)
182
183 @property
185 """
186 Alternative entry ID (FAM)
187 @rtype: str
188 """
189 return self._family
190 @family.setter
192 self._family = str(value)
193
194 @property
196 """
197 Profile length
198 @rtype: L{ProfileLength}
199 """
200 return self._length
201 @length.setter
206
207 @property
209 """
210 Source multiple alignment
211 @rtype: L{A3MAlignment}
212 """
213 return self._alignment
214 @alignment.setter
219
220 @property
222 """
223 Consensus sequence
224 @rtype: L{AbstractSequence}
225 """
226 return self._consensus
227 @consensus.setter
232
233 @property
235 """
236 DSSP (calculated) secondary structure
237 @rtype: L{SecondaryStructure}
238 """
239 return self._dssp
240 @dssp.setter
241 - def dssp(self, value):
245
246 @property
248 """
249 Solvent accessibility values
250 @rtype: str
251 """
252 return self._dssp_solvent
253 @dssp_solvent.setter
255 self._dssp_solvent = str(value)
256
257 @property
259 """
260 PSIPRED (predicted) secondary structure
261 @rtype: L{SecondaryStructure}
262 """
263 return self._psipred
264 @psipred.setter
269
270 @property
272 """
273 Number of effective matches (NEFF)
274 """
275 return self._effective_matches
276 @effective_matches.setter
278 self._effective_matches = value
279
280 @property
282 """
283 Extreme-value distribution parameters (EVD)
284 @rtype: L{EVDParameters}
285 """
286 return self._evd
287 @evd.setter
288 - def evd(self, value):
292
293 @property
295 """
296 Format version number (HHsearch)
297 @rtype: str
298 """
299 return self._version
300 @version.setter
302 self._version = str(value)
303
304 @property
306 """
307 @rtype: bool
308 """
309 return self._pseudocounts
310 @pseudocounts.setter
312 self._pseudocounts = bool(value)
313
314 @property
316 """
317 @rtype: bool
318 """
319 return self._emission_pseudocounts
320 @emission_pseudocounts.setter
322 self._emission_pseudocounts = bool(value)
323
324 @property
326 """
327 @rtype: bool
328 """
329 return self._transition_pseudocounts
330 @transition_pseudocounts.setter
332 self._transition_pseudocounts = bool(value)
333
334 @property
336 """
337 List-like access to the HMM's layers
338 @rtype: L{HMMLayersCollection}
339 """
340 return self._layers
341
342 @property
344 """
345 Start state (at the start layer)
346 @rtype: L{State}
347 """
348 return self._start
349 @start.setter
355
356 @property
358 """
359 Insertion state at the start layer
360 @rtype: L{State}
361 """
362 return self._start_insertion
363 @start_insertion.setter
369
370 @property
372 """
373 Final state (at the end layer)
374 @rtype: L{State}
375 """
376 return self._end
377 @end.setter
378 - def end(self, value):
383
384 @property
386 """
387 Score scaling factor
388 @rtype: float
389 """
390 return self._scale
391
392 @property
394 """
395 Base of the logarithm used for score scaling
396 @rtype: float
397 """
398 return self._logbase
399
400 @property
402 """
403 Current score units
404 @rtype: L{ScoreUnits} member
405 """
406 return self._score_units
407
408 @property
417
418 @property
420 """
421 A list of layers including start and start_insertion
422 @rtype: list of L{HMMLayer}
423 """
424 complete_layers = []
425 first_layer = HMMLayer(rank=0, residue=None)
426 first_layer.append(self.start)
427 if self.start_insertion:
428 first_layer.append(self.start_insertion)
429 complete_layers.append(first_layer)
430 for layer in self.layers:
431 complete_layers.append(layer)
432
433 return complete_layers
434
435 @property
437 """
438 True if this profile contains structural data
439 @rtype: bool
440 """
441 has = False
442 for layer in self.layers:
443 if layer.residue.has_structure:
444 return True
445 return has
446
448 """
449 Serialize this HMM to a file.
450
451 @param file_name: target file name
452 @type file_name: str
453 """
454 rec = sys.getrecursionlimit()
455 sys.setrecursionlimit(10000)
456 csb.io.Pickle.dump(self, open(file_name, 'wb'))
457 sys.setrecursionlimit(rec)
458
459 @staticmethod
461 """
462 De-serialize an HMM from a file.
463
464 @param file_name: source file name (pickle)
465 @type file_name: str
466 """
467 rec = sys.getrecursionlimit()
468 sys.setrecursionlimit(10000)
469 try:
470 return csb.io.Pickle.load(open(file_name, 'rb'))
471 finally:
472 sys.setrecursionlimit(rec)
473
474 - def _convert(self, units, score, scale, logbase):
485
486 - def to_hmm(self, output_file=None, convert_scores=False):
515
517 """
518 Extract a sub-segment of the profile.
519
520 @param start: start layer of the segment (rank)
521 @type start: int
522 @param end: end layer of the segment (rank)
523 @type end: int
524
525 @return: a deepcopy of the extracted HMM segment
526 @rtype: L{ProfileHMMSegment}
527 """
528 return ProfileHMMSegment(self, start, end)
529
533
540
547
548 - def structure(self, chain_id=None, accession=None):
549 """
550 Extract the structural information from the HMM.
551
552 @param accession: defines the accession number of the structure
553 @type accession: str
554 @param chain_id: defines explicitly the chain identifier
555 @type chain_id: str
556
557 @return: a shallow L{Structure} wrapper around the residues in the HMM.
558 @rtype: L{Structure}
559 """
560 struct = structure.Structure(accession or self.id)
561 chain = self.chain(chain_id)
562 struct.chains.append(chain)
563
564 return struct
565
566 - def chain(self, chain_id=None):
585
587 """
588 Extract the emission scores of all match states in the profile.
589 The metric of the emission scores returned depends on the current
590 hmm.score_units setting - you may need to call hmm.convert_scores()
591 to adjust the hmm to your particular needs.
592
593 @return: a list of dictionaries; each dict key is a single amino acid
594 @rtype: list
595 """
596 profile = []
597
598 for layer in self.layers:
599 emission = {}
600
601 for aa in layer[States.Match].emission:
602 emission[str(aa)] = layer[States.Match].emission[aa] or 0.0
603 profile.append(emission)
604
605 return profile
606
608 """
609 Convert emission and transition scores to the specified units.
610
611 @param units: the target units for the conversion (a member of
612 L{ScoreUnits}).
613 @type units: L{csb.core.EnumItem}
614 @param method: if defined, implements the exact mathematical
615 transformation that will be applied. It must be a
616 function or lambda expression with the following
617 signature::
618 def (target_units, score, scale, logbase)
619
620 and it has to return the score converted to
621 C{target_units}. If method performs a conversion from
622 probabilities to scaled logs, you should also update
623 C{hmm.scale} and C{hmm.logbase}.
624 @type method: function, lambda
625 """
626
627 if self._score_units == units:
628 return
629
630 if method is not None:
631 convert = method
632 else:
633 convert = self._convert
634
635 for layer in self.layers:
636 for state_kind in layer:
637 state = layer[state_kind]
638 if not state.silent:
639 for residue in state.emission:
640 if state.emission[residue] is not None:
641 state.emission.update(residue, convert(
642 units, state.emission[residue],
643 self.scale, self.logbase))
644 for residue in state.background:
645 if state.background[residue] is not None:
646 state.background.update(residue, convert(
647 units, state.background[residue],
648 self.scale, self.logbase))
649 for tran_kind in state.transitions:
650 transition = state.transitions[tran_kind]
651 transition.probability = convert(units, transition.probability,
652 self.scale, self.logbase)
653
654
655
656 if self.start_insertion:
657 for t_it in self.start_insertion.transitions:
658 transition = self.start_insertion.transitions[t_it]
659 transition.probability = convert(units, transition.probability,
660 self.scale, self.logbase)
661
662 for residue in self.start_insertion.emission:
663 state = self.start_insertion
664 if state.emission[residue] is not None:
665 state.emission.update(residue,
666 convert(units, state.emission[residue], self.scale, self.logbase))
667 state.background.update(residue,
668 convert(units, state.background[residue], self.scale, self.logbase))
669
670 for tran_kind in self.start.transitions:
671 transition = self.start.transitions[tran_kind]
672 transition.probability = convert(units,
673 transition.probability, self.scale, self.logbase)
674
675
676 self._score_units = units
677
679 """
680 Compute the Log-sum-of-odds score between the emission tables of self
681 and other (Soeding 2004). If no observable Match state is found at a
682 given layer, the Insertion state is used instead.
683
684 @note: This is not a full implementation of the formula since only
685 emission vectors are involved in the computation and any transition
686 probabilities are ignored.
687
688 @param other: the subject HMM
689 @type other: L{ProfileHMM}
690
691 @return: emission log-sum-of-odds similarity between C{self} and
692 C{other}
693 @rtype: float
694
695 @raise ValueError: when self and other differ in their length, when the
696 score_units are not Probability, or when no
697 observable states are present
698 """
699 score = 1
700
701 if self.layers.length != other.layers.length or self.layers.length < 1:
702 raise ValueError('Both HMMs must have the same nonzero number of layers')
703 if self.score_units != ScoreUnits.Probability or \
704 other.score_units != ScoreUnits.Probability:
705 raise ValueError('Scores must be converted to probabilities first.')
706
707 for q_layer, s_layer in zip(self.layers, other.layers):
708
709 try:
710 if States.Match in q_layer and not q_layer[States.Match].silent:
711 q_state = q_layer[States.Match]
712 else:
713 q_state = q_layer[States.Insertion]
714
715 if States.Match in s_layer and not s_layer[States.Match].silent:
716 s_state = s_layer[States.Match]
717 else:
718 s_state = s_layer[States.Insertion]
719 except csb.core.ItemNotFoundError:
720 raise ValueError('Query and subject must contain observable states '
721 'at each layer')
722
723 emission_dotproduct = 0
724
725 for aa in q_state.emission:
726
727 q_emission = q_state.emission[aa] or sys.float_info.min
728 s_emission = s_state.emission[aa] or sys.float_info.min
729 emission_dotproduct += (q_emission * s_emission /
730 q_state.background[aa])
731
732 score *= emission_dotproduct
733
734 return math.log(score)
735
737 """
738 Attach references from each profile layer to the relevant DSSP secondary
739 structure element.
740 """
741 assert self.dssp is not None
742
743 for motif in self.dssp:
744 for i in range(motif.start, motif.end + 1):
745 self.layers[i].residue.secondary_structure = motif
746
749 """
750 Represents a segment (fragment) of a ProfileHMM.
751
752 @param hmm: source HMM
753 @type hmm: ProfileHMM
754 @param start: start layer of the segment (rank)
755 @type start: int
756 @param end: end layer of the segment (rank)
757 @type end: int
758
759 @raise ValueError: when start or end positions are out of range
760 """
761
763
764 if start < hmm.layers.start_index or start > hmm.layers.last_index:
765 raise IndexError('Start position {0} is out of range'.format(start))
766 if end < hmm.layers.start_index or end > hmm.layers.last_index:
767 raise IndexError('End position {0} is out of range'.format(end))
768
769
770
771 super(ProfileHMMSegment, self).__init__(units=hmm.score_units,
772 scale=hmm.scale, logbase=hmm.logbase)
773 self.id = hmm.id
774 self.family = hmm.family
775 self.name = hmm.name
776 self.pseudocounts = hmm.pseudocounts
777 self.evd = hmm.evd
778 self.version = hmm.version
779 self.source = hmm.id
780 self._source_start = start
781 self._source_end = end
782
783 if hmm.alignment:
784 self.alignment = hmm.alignment.hmm_subregion(start, end)
785 self.consensus = hmm.consensus.subregion(start, end)
786
787 layers = csb.core.deepcopy(hmm.layers[start : end + 1])
788 max_score = 1.0
789 if hmm.score_units != ScoreUnits.Probability:
790 max_score = hmm._convert(hmm.score_units,
791 max_score, hmm.scale, hmm.logbase)
792 self._build_graph(layers, max_score)
793
794 if hmm.dssp:
795 self.dssp = hmm.dssp.subregion(start, end)
796 self._assign_secstructure()
797 if hmm.psipred:
798 self.psipred = hmm.psipred.subregion(start, end)
799
800 self.length.layers = self.layers.length
801 self.length.matches = self.layers.length
802 self.effective_matches = sum([(l.effective_matches or 0.0) for l in self.layers]) / self.layers.length
803
804 @property
806 """
807 Start position of this segment in its source HMM
808 @rtype: int
809 """
810 return self._source_start
811
812 @property
814 """
815 End position of this segment in its source HMM
816 @rtype: int
817 """
818 return self._source_end
819
844
847 """
848 Represents a segment of the Match state emission probabilities of a L{ProfileHMM}.
849 Contains only Match states, connected with equal transition probabilities of 100%.
850 """
851
853
854 factory = StateFactory()
855
856 for rank, source_layer in enumerate(source_layers, start=1):
857
858 emission = source_layer[States.Match].emission
859 background = source_layer[States.Match].background
860
861 match = factory.create_match(emission, background)
862 match.rank = rank
863
864 layer = HMMLayer(rank, source_layer.residue)
865 layer.append(match)
866 self.layers.append(layer)
867
868 if rank == 1:
869 self.start.transitions.append(Transition(self.start, match, 1.0))
870 elif rank < len(source_layers):
871 prev_match = self.layers[rank - 1][States.Match]
872 prev_match.transitions.append(Transition(prev_match, match, 1.0))
873 elif rank == len(source_layers):
874 match.transitions.append(Transition(match, self.end, 1.0))
875 else:
876 assert False
877
909
910 @property
912 """
913 Start position of this segment in its source HMM
914 @rtype: int
915 """
916 return self._source_start
917
918 @property
920 """
921 End position of this segment in its source HMM
922 @rtype: int
923 """
924 return self._source_end
925
932
935
937 self.lamda = lamda
938 self.mu = mu
939
942
944 return (self.lamda is not None or self.mu is not None)
945
948 """
949 Represents a lookup table of emission probabilities. Provides dictionary-like
950 access:
951
952 >>> state.emission[ProteinAlphabet.ALA]
953 emission probability for ALA
954
955 @param emission: an initialization dictionary of emission probabilities
956 @type emission: dict
957 @param restrict: a list of residue types allowed for this emission table.
958 Defaults to the members of L{csb.bio.sequence.ProteinAlphabet}
959 @type restrict: list
960 """
961
964
965 - def append(self, residue, probability):
966 """
967 Append a new emission probability to the table.
968
969 @param residue: residue name (type) - a member of
970 L{csb.bio.sequence.ProteinAlphabet}
971 @type residue: L{csb.core.EnumItem}
972 @param probability: emission score
973 @type probability: float
974
975 @raise EmissionExistsError: if residue is already defined
976 """
977 if residue in self:
978 raise EmissionExistsError('Residue {0} is already defined.'.format(residue))
979
980 super(EmissionTable, self).append(residue, probability)
981
982 - def set(self, table):
983 """
984 Set the emission table using the dictionary provided in the argument.
985
986 @param table: the new emission table
987 @type table: dict
988 """
989 super(EmissionTable, self)._set(table)
990
991 - def update(self, residue, probability):
992 """
993 Update the emission C{probability} of a given emission C{residue}.
994
995 @param residue: name (type) of the residue to be updated
996 @type residue: L{csb.core.EnumItem}
997 @param probability: new emission score
998 @type probability: float
999 """
1000 super(EmissionTable, self)._update({residue: probability})
1001
1004 """
1005 Represents a lookup table of transitions that are possible from within a given state.
1006
1007 Provides dictionary-like access, where dictionary keys are target states.
1008 These are members of the L{States} enumeration, e.g.:
1009
1010 >>> state.transitions[States.Match]
1011 transition info regarding transition from the current state to a Match state
1012 >>> state.transitions[States.Match].predecessor
1013 state
1014 >>> state.transitions[States.Match].successor
1015 the next match state
1016
1017 @param transitions: an initialization dictionary of target L{State}:L{Transition} pairs
1018 @type transitions: dict
1019 @param restrict: a list of target states allowed for this transition table.
1020 Defaults to the L{States} enum members
1021 @type restrict: list
1022 """
1023
1025 super(TransitionTable, self).__init__(transitions, restrict)
1026
1027 @property
1030
1031 - def append(self, transition):
1032 """
1033 Append a new C{transition} to the table.
1034
1035 @param transition: transition info
1036 @type transition: L{Transition}
1037
1038 @raise TransitionExistsError: when a transition to the same target state
1039 already exists for the current state
1040 """
1041
1042 if transition.successor.type in self:
1043 msg = 'Transition to a {0} state is already defined.'
1044 raise TransitionExistsError(msg.format(transition.successor.type))
1045
1046 super(TransitionTable, self).append(transition.successor.type, transition)
1047
1048 - def set(self, table):
1049 """
1050 Set the transition table using the dictionary provided in the argument.
1051
1052 @param table: the new transition table
1053 @type table: dict
1054 """
1055 super(TransitionTable, self)._set(table)
1056
1057 - def update(self, target_statekind, transition):
1058 """
1059 Update the information of a transition, which points to a target
1060 state of the specified L{States} kind.
1061
1062 @param target_statekind: the key of the transition to be updated
1063 @type target_statekind: L{csb.core.EnumItem}
1064 @param transition: new transition info object
1065 @type transition: L{Transition}
1066
1067 @raise ValueError: if I{transition.successor.type} differs from
1068 C{target_statekind}
1069 """
1070 if transition.successor.type != target_statekind:
1071 raise ValueError("Successor's type differs from the specified target state.")
1072
1073 super(TransitionTable, self)._update({target_statekind: transition})
1074
1077 """
1078 Provides consecutive, 1-based access to all of the layers in the profile.
1079 Each profile layer contains a catalog of available states at that index, e.g.:
1080
1081 >>> profile.layers[i]
1082 the catalog at profile layer i
1083 >>> profile.layers[i][States.Deletion]
1084 the deletion state at index i
1085
1086 @param layers: initialization list of L{HMMLayer}s
1087 @type layers: list
1088 """
1091
1092 @property
1095
1096 -class HMMLayer(csb.core.DictionaryContainer):
1097 """
1098 Provides a dictionary-like catalog of the available states at this layer.
1099 Lookup keys are members of the L{States} enumeration, e.g.:
1100
1101 >>> profile.layers[i][States.Deletion]
1102 the deletion state at layer number i
1103
1104 @param rank: layer's number
1105 @type rank: int
1106 @param residue: a representative L{ProteinResidue} that is associated with
1107 this layer
1108 @type residue: L{ProteinResidue}
1109 @param states: initialization dictionary of L{States}.Item:L{State} pairs
1110 @type states: dict
1111 """
1112 - def __init__(self, rank, residue, states=None):
1123
1124 @property
1127
1128 @property
1130 """
1131 Layer's position
1132 @rtype: int
1133 """
1134 return self._rank
1135 @rank.setter
1136 - def rank(self, value):
1137 self._rank = int(value)
1138
1139 @property
1141 """
1142 Representative residue
1143 @rtype: L{Residue}
1144 """
1145 return self._residue
1146 @residue.setter
1151
1152 @property
1154 """
1155 Number of effective matches at this layer
1156 @rtype: int
1157 """
1158 return self._effective_matches
1159 @effective_matches.setter
1161 self._effective_matches = value
1162
1163 @property
1165 """
1166 Number of effective insertions at this layer
1167 @rtype: int
1168 """
1169 return self._effective_insertions
1170 @effective_insertions.setter
1172 self._effective_insertions = value
1173
1174 @property
1176 """
1177 Number of effective deletions at this layer
1178 @rtype: int
1179 """
1180 return self._effective_deletions
1181 @effective_deletions.setter
1183 self._effective_deletions = value
1184
1186 """
1187 Append a new C{state} to the catalog.
1188
1189 @param state: the new state
1190 @type state: L{State}
1191
1192 @raise StateExistsError: when a state of the same type is already defined
1193 """
1194 if state.type in self:
1195 raise StateExistsError(
1196 'State {0} is already defined at this position.'.format(state.type))
1197
1198 super(HMMLayer, self).append(state.type, state)
1199
1200 - def update(self, state_kind, state):
1201 """
1202 Update the sate of the specified kind under the current layer.
1203
1204 @param state_kind: state type (key) - a member of L{States}
1205 @type state_kind: L{csb.core.EnumItem}
1206 @param state: the new state info
1207 @type state: L{State}
1208
1209 @raise ValueError: if state.type differs from state_kind
1210 """
1211 if state.type != state_kind:
1212 raise ValueError("State's type differs from the specified state_kind")
1213
1214 super(HMMLayer, self)._update({state_kind: state})
1215
1216
1217 -class State(object):
1218 """
1219 Describes a Hidden Markov Model state.
1220
1221 @param type: one of the L{States} enumeration values, e.g. States.Match
1222 @type type: L{csb.core.EnumItem}
1223 @param emit: a collection of emittable state names allowed for the state,
1224 e.g. the members of I{SequenceAlphabets.Protein}. If not defined,
1225 the state will be created as a silent (unobservable).
1226 @type emit: list
1227
1228 @raise ValueError: if type is not a member of the States enum
1229 """
1230
1244
1246 return "<HMM {0.type!r} State>".format(self)
1247
1248 @property
1250 """
1251 State type: one of the L{States}
1252 @rtype: enum item
1253 """
1254 return self._type
1255 @type.setter
1256 - def type(self, value):
1260
1261 @property
1263 return self._rank
1264 @rank.setter
1265 - def rank(self, value):
1266 self._rank = int(value)
1267
1268 @property
1270 """
1271 Lookup table with available transitions to other states
1272 @rtype: L{TransitionTable}
1273 """
1274 return self._transitions
1275
1276 @property
1278 """
1279 Lookup table with available emission probabilities
1280 @rtype: L{EmissionTable}
1281 """
1282 if self._emission is None:
1283 raise UnobservableStateError('Silent {0!r} state'.format(self.type))
1284 return self._emission
1285
1286 @property
1288 """
1289 Lookup table with background probabilities
1290 @rtype: L{EmissionTable}
1291 """
1292 return self._background
1293
1294 @property
1296 """
1297 Whether this state can emit something
1298 @rtype: bool
1299 """
1300 try:
1301 return self.emission is None
1302 except UnobservableStateError:
1303 return True
1304
1306 """
1307 Simplifies the construction of protein profile HMM states.
1308 """
1309
1312
1319
1326
1328 return State(States.Deletion)
1329
1332
1336
1338 return '{0}->{1}'.format(self.source_state, self.target_state)
1339
1341 """
1342 Describes a Hidden Markov Model transition between two states.
1343
1344 @param predecessor: source state
1345 @type predecessor: L{State}
1346 @param successor: target state
1347 @type successor: L{State}
1348 @param probability: transition score
1349 @type probability: float
1350 """
1351
1352 - def __init__(self, predecessor, successor, probability):
1363
1365 return '<HMM Transition: {0.type} {0.probability}>'.format(self)
1366
1367 @property
1369 """
1370 Transition source state
1371 @rtype: L{State}
1372 """
1373 return self._predecessor
1374
1375 @property
1377 """
1378 Transition target state
1379 @rtype: L{State}
1380 """
1381 return self._successor
1382
1383 @property
1385 """
1386 Transition score
1387 @rtype: float
1388 """
1389 return self._probability
1390 @probability.setter
1392 if not (value >=0):
1393 raise ValueError('Transition probability must be a positive number.')
1394 self._probability = float(value)
1395
1396 @property
1398 """
1399 Struct, containing information about the source and target state types
1400 @rtype: L{TransitionType}
1401 """
1402 return self._type
1403
1406 """
1407 Represents a query-template alignment in an HHpred result.
1408
1409 @param hit: relevant hit object
1410 @type param: L{HHpredHit}
1411 @param query: the query sequence in the alignment region, with gaps
1412 @type query: str
1413 @param subject: the subject sequence in the alignment region, with gaps
1414 @type subject: str
1415 """
1416
1417 GAP = sequence.ProteinAlphabet.GAP
1418
1419 - def __init__(self, hit, query, subject):
1430
1431 @property
1433 """
1434 Query sequence (with gaps)
1435 @rtype: str
1436 """
1437 return self.rows[1].sequence
1438
1439 @property
1441 """
1442 Subject sequence (with gaps)
1443 @rtype: str
1444 """
1445 return self.rows[2].sequence
1446
1447 @property
1449 """
1450 Find all ungapped query-subject segments in the alignment.
1451 Return a generator over all ungapped alignment segments, represented
1452 by L{HHpredHit} objects
1453
1454 @rtype: generator
1455 """
1456
1457 def make_segment(sstart, send, qstart, qend):
1458
1459 seg = HHpredHit(self._hit.rank, self._hit.id, sstart, send,
1460 qstart, qend, self._hit.probability, self._hit.qlength)
1461
1462 seg.slength = self._hit.slength
1463 seg.evalue = self._hit.evalue
1464 seg.pvalue = self._hit.pvalue
1465 seg.score = self._hit.score
1466 seg.ss_score = self._hit.ss_score
1467 seg.identity = self._hit.identity
1468 seg.similarity = self._hit.similarity
1469 seg.prob_sum = self._hit.prob_sum
1470
1471 return seg
1472
1473 in_segment = False
1474 qs = self._hit.qstart - 1
1475 ss = self._hit.start - 1
1476 qi, si = qs, ss
1477 qe, se = qs, ss
1478
1479 for q, s in zip(self.query, self.subject):
1480
1481 if q != HHpredHitAlignment.GAP:
1482 qi += 1
1483 if s != HHpredHitAlignment.GAP:
1484 si += 1
1485
1486 if HHpredHitAlignment.GAP in (q, s):
1487 if in_segment:
1488 yield make_segment(ss, se, qs, qe)
1489 in_segment = False
1490 qs, ss = 0, 0
1491 qe, se = 0, 0
1492 else:
1493 if not in_segment:
1494 in_segment = True
1495 qs, ss = qi, si
1496
1497 qe, se = qi, si
1498
1499 if in_segment:
1500 yield make_segment(ss, se, qs, qe)
1501
1509
1511 """
1512 Represents a single HHsearch hit.
1513
1514 @param rank: rank of the hit
1515 @type rank: int
1516 @param id: id of the hit
1517 @type id: str
1518 @param start: subject start
1519 @type start: int
1520 @param end: subject end
1521 @type end: int
1522 @param qstart: query start
1523 @type qstart: int
1524 @param qend: query end
1525 @type qend: int
1526 @param probability: probability of the hit
1527 @type probability: float
1528 @param qlength: length of the query
1529 @type qlength: int
1530 """
1531
1532 - def __init__(self, rank, id, start, end, qstart, qend, probability,
1533 qlength):
1534
1535 self._rank = None
1536 self._id = None
1537 self._start = None
1538 self._end = None
1539 self._qstart = None
1540 self._qend = None
1541 self._probability = None
1542 self._qlength = None
1543 self._alignment = None
1544
1545 self._slength = None
1546 self._evalue = None
1547 self._pvalue = None
1548 self._score = None
1549 self._ss_score = None
1550 self._identity = None
1551 self._similarity = None
1552 self._prob_sum = None
1553
1554
1555 self.rank = rank
1556 self.id = id
1557 self.start = start
1558 self.end = end
1559 self.qstart = qstart
1560 self.qend = qend
1561 self.probability = probability
1562 self.qlength = qlength
1563
1565 return "{0.id} {0.probability} {0.start}-{0.end}".format(self)
1566
1568 return "<HHpredHit: {0!s}>".format(self)
1569
1572
1574 """
1575 Return True if C{self} is completely identical to C{other} (same id, same start
1576 and end positions).
1577
1578 @param other: right-hand-term
1579 @type other: HHpredHit
1580
1581 @rtype: bool
1582 """
1583 return (self.id == other.id and self.start == other.start and self.end == other.end)
1584
1586 """
1587 Return True if C{self} is a superior to C{other} in terms of length
1588 and probability. These criteria are applied in the following order:
1589
1590 1. Length (the longer hit is better)
1591 2. Probability (if they have the same length, the one with the higher
1592 probability is better)
1593 3. Address (if they have the same length and probability, the one with
1594 higher memory ID wins; for purely practical reasons)
1595
1596 @param other: right-hand-term
1597 @type other: HHpredHit
1598
1599 @rtype: bool
1600 """
1601 if self.length > other.length:
1602 return True
1603 elif self.length == other.length:
1604 if self.probability > other.probability:
1605 return True
1606 elif self.probability == other.probability:
1607 if id(self) > id(other):
1608 return True
1609 return False
1610
1611 - def includes(self, other, tolerance=1):
1612 """
1613 Return True if C{other} overlaps with C{self}, that means C{other}
1614 is fully or partially included in C{self} when aligned over the query.
1615
1616 @param other: right-hand-term
1617 @type other: HHpredHit
1618 @param tolerance: allow partial overlaps for that number of residues at
1619 either end
1620 @type tolerance: int
1621
1622 @rtype: bool
1623 """
1624 if self.id == other.id:
1625 if other.start >= self.start:
1626 if (other.end - self.end) <= tolerance:
1627 return True
1628 elif other.end <= self.end:
1629 if (self.start - other.start) <= tolerance:
1630 return True
1631
1632 return False
1633
1635 """
1636 Add query/subject alignment to the hit.
1637
1638 @param query: the query sequence within the alignment region, with gaps
1639 @type query: str
1640 @param subject: the subject sequence within the alignment region, with
1641 gaps
1642 @type subject: str
1643 """
1644 self._alignment = HHpredHitAlignment(self, query, subject)
1645
1646 @property
1649 @rank.setter
1650 - def rank(self, value):
1656
1657 @property
1660 @id.setter
1661 - def id(self, value):
1667
1668 @property
1671 @start.setter
1672 - def start(self, value):
1678
1679 @property
1682 @end.setter
1683 - def end(self, value):
1689
1690 @property
1693 @qstart.setter
1700
1701 @property
1704 @qend.setter
1705 - def qend(self, value):
1711
1712 @property
1714 return self._qlength
1715 @qlength.setter
1722
1723 @property
1725 return self._probability
1726 @probability.setter
1728 try:
1729 value = float(value)
1730 except:
1731 raise TypeError('probability must be float, not {0}'.format(type(value)))
1732 self._probability = value
1733
1734 @property
1736 return self._alignment
1737
1738 @property
1740 try:
1741 return self.end - self.start + 1
1742 except:
1743 return 0
1744
1745 @property
1747 return self._slength
1748 @slength.setter
1750 self._slength = value
1751
1752 @property
1755 @evalue.setter
1757 self._evalue = value
1758
1759 @property
1762 @pvalue.setter
1764 self._pvalue = value
1765
1766 @property
1769 @score.setter
1770 - def score(self, value):
1772
1773 @property
1775 return self._ss_score
1776 @ss_score.setter
1778 self._ss_score = value
1779
1780 @property
1782 return self._identity
1783 @identity.setter
1785 self._identity = value
1786
1787 @property
1789 return self._similarity
1790 @similarity.setter
1792 self._similarity = value
1793
1794 @property
1796 return self._prob_sum
1797 @prob_sum.setter
1799 self._prob_sum = value
1800
1803 """
1804 Represents a collection of L{HHpredHit}s.
1805 """
1806
1807 - def __init__(self, hits, query_name='', match_columns=-1, no_of_seqs='',
1808 neff=-1., searched_hmms=-1, date='', command=''):
1826
1827 @property
1829 return self._query_name
1830 @query_name.setter
1832 self._query_name = value
1833
1834 @property
1836 return self._match_columns
1837 @match_columns.setter
1839 self._match_columns = value
1840
1841 @property
1843 return self._no_of_seqs
1844 @no_of_seqs.setter
1846 self._no_of_seqs = value
1847
1848 @property
1851 @neff.setter
1852 - def neff(self, value):
1854
1855 @property
1857 return self._searched_hmms
1858 @searched_hmms.setter
1860 self._searched_hmms = value
1861
1862 @property
1865 @date.setter
1866 - def date(self, value):
1868
1869 @property
1871 return self._command
1872 @command.setter
1874 self._command = value
1875
1877 return "HHpredHitList\n\tquery={0.query_name}\n\tmatch_columns={0.match_columns}\n\tno_of_seqs={0.no_of_seqs}\n\tneff={0.neff}\n\tsearched_hmms={0.searched_hmms}\n\tdate={0.date}\n\tcommand={0.command}".format(self)
1878
1880 return "<HHpredHitList: {0} hits>".format(len(self))
1881
1883 return self._hits[index]
1884
1886 return iter(self._hits)
1887
1889 return len(self._hits)
1890
1892 self._hits.sort(key=lambda i: i.rank)
1893