1 """
2 Sequence and sequence alignment APIs.
3
4 This module defines the base interfaces for biological sequences and alignments:
5 L{AbstractSequence} and L{AbstractAlignment}. These are the central abstractions
6 here. This module also provides a number of useful enumerations, like L{SequenceTypes}
7 and L{SequenceAlphabets}.
8
9 Sequences
10 =========
11 L{AbstractSequence} has a number of implementations. These are of course interchangeable,
12 but have different intents and may differ significantly in performance. The standard
13 L{Sequence} implementation is what you are after if all you need is high performance
14 and efficient storage (e.g. when you are parsing big files). L{Sequence} objects store
15 their underlying sequences as strings. L{RichSequence}s on the other hand will store
16 their residues as L{ResidueInfo} objects, which have the same basic interface as the
17 L{csb.bio.structure.Residue} objects. This of course comes at the expense of degraded
18 performance. A L{ChainSequence} is a special case of a rich sequence, whose residue
19 objects are I{actually} real L{csb.bio.structure.Residue}s.
20
21 Basic usage:
22
23 >>> seq = RichSequence('id', 'desc', 'sequence', SequenceTypes.Protein)
24 >>> seq.residues[1]
25 <ResidueInfo [1]: SER>
26 >>> seq.dump(sys.stdout)
27 >desc
28 SEQUENCE
29
30 See L{AbstractSequence} for details.
31
32 Alignments
33 ==========
34 L{AbstractAlignment} defines a table-like interface to access the data in an
35 alignment:
36
37 >>> ali = SequenceAlignment.parse(">a\\nABC\\n>b\\nA-C")
38 >>> ali[0, 0]
39 <SequenceAlignment> # a new alignment, constructed from row #1, column #1
40 >>> ali[0, 1:3]
41 <SequenceAlignment> # a new alignment, constructed from row #1, columns #2..#3
42
43 which is just a shorthand for using the standard 1-based interface:
44
45 >>> ali.rows[1]
46 <AlignedSequenceAdapter: a, 3> # row #1 (first sequence)
47 >>> ali.columns[1]
48 (<ColumnInfo a [1]: ALA>, <ColumnInfo b [1]: ALA>) # residues at column #1
49
50 See L{AbstractAlignment} for all details and more examples.
51
52 There are a number of L{AbstractAlignment} implementations defined here.
53 L{SequenceAlignment} is the default one, nothing surprising. L{A3MAlignment}
54 is a more special one: the first sequence in the alignment is a master sequence.
55 This alignment is usually used in the context of HHpred. More important is the
56 L{StructureAlignment}, which is an alignment of L{csb.bio.structure.Chain} objects.
57 The residues in every aligned sequence are really the L{csb.bio.structure.Residue}
58 objects taken from those chains.
59 """
60
61 import re
62 import csb.core
63 import csb.io
64
65 from abc import ABCMeta, abstractmethod, abstractproperty
73
79
81 """
82 Enumeration of alignment strategies
83 """
84 Global='global'; Local='local'
85
92
100
107
109 """
110 Unknown sequence alphabet
111 """
112 UNK='X'; GAP='-'; INSERTION='.'
113
141
145
147
148 - def __init__(self, index=None, start=1, end=None):
158
160
161 if self.index is not None:
162 s = 'Position {0.index} is out of range [{0.start}, {0.end}]'
163 else:
164 s = 'Out of range [{0.start}, {0.end}]'
165
166 return s.format(self)
167
170
173
176
179
181
183
184 self._type = None
185 self._rank = rank
186
187 self.type = type
188
189 @property
191 """
192 Residue type - a member of any sequence alphabet
193 @rtype: enum item
194 """
195 return self._type
196 @type.setter
197 - def type(self, type):
201
202 @property
204 """
205 Residue position (1-based)
206 @rtype: int
207 """
208 return self._rank
209
211 return '<{1} [{0.rank}]: {0.type!r}>'.format(self, self.__class__.__name__)
212
215
216 - def __init__(self, column, id, rank, residue):
222
224 return '<{0.__class__.__name__} {0.id} [{0.column}]: {0.residue.type!r}>'.format(self)
225
227
229 self._container = container
230
237
239 return iter(self._container)
240
253
262
265 """
266 Represents a list of L{AbstractSequence}s.
267 """
268
271
286
289 """
290 Base abstract class for all Sequence objects.
291
292 Provides 1-based access to the residues in the sequence via the
293 sequence.residues property. The sequence object itself also behaves like
294 a collection and provides 0-based access to its elements (residues).
295
296 @param id: FASTA ID of this sequence (e.g. accession number)
297 @type id: str
298 @param header: FASTA sequence header
299 @type header: str
300 @param residues: sequence residues
301 @type residues: str or collection of L{ResidueInfo}
302 @param type: a L{SequenceTypes} member (defaults to protein)
303 @type type: L{EnumItem}
304 """
305
306 __metaclass__ = ABCMeta
307
308 DELIMITER = '>'
309
323
325
326 if isinstance(spec, slice):
327 spec = SliceHelper(spec, 0, self.length)
328 return self.subregion(spec.start + 1, spec.stop)
329 else:
330 if not 0 <= spec < self.length:
331 raise IndexError(spec)
332 return self._get(spec + 1)
333
335 for index in range(self.length):
336 yield self[index]
337
338 @abstractmethod
339 - def _add(self, residue):
340 """
341 Append a C{residue} to the sequence.
342
343 This is a hook method invoked internally for each residue during object
344 construction. By implementing this method, sub-classes define how
345 residues are attached to the sequence object.
346 """
347 pass
348
349 @abstractmethod
350 - def _get(self, rank):
351 """
352 Retrieve the sequence residue at the specified position (1-based, positive).
353
354 This is a hook method which defines the actual behavior of the sequence
355 residue indexer.
356
357 @rtype: L{ResidueInfo}
358 @raise SequencePositionError: when the supplied rank is out of range
359 """
360 pass
361
363 """
364 Return a new sequence of the current L{AbstractSequence} sub-class.
365 """
366 return self.__class__(*a, **k)
367
369 """
370 Remove all gaps and insertions from the sequence.
371
372 @return: a new sequence instance, containing no gaps
373 @rtype: L{AbstractSequence}
374 """
375 residues = [r for r in self._residues
376 if r.type not in (self.alphabet.GAP, self.alphabet.INSERTION)]
377
378 return self._factory(self.id, self.header, residues, self.type)
379
381 """
382 Extract a subsequence, defined by [start, end]. The start and end
383 positions are 1-based, inclusive.
384
385 @param start: start position
386 @type start: int
387 @param end: end position
388 @type end: int
389
390 @return: a new sequence
391 @rtype: L{AbstractSequence}
392
393 @raise SequencePositionError: if start/end positions are out of range
394 """
395 positions = range(start, end + 1)
396 return self.extract(positions)
397
399 """
400 Extract a subsequence, defined by a list of 1-based positions.
401
402 @param positions: positions to extract
403 @type positions: tuple of int
404
405 @return: a new sequence
406 @rtype: L{AbstractSequence}
407
408 @raise SequencePositionError: if any position is out of range
409 """
410
411 end = self.length
412 residues = []
413
414 for rank in sorted(set(positions)):
415 if 1 <= rank <= end:
416 residues.append(self._get(rank))
417 else:
418 raise SequencePositionError(rank, 1, end)
419
420 return self._factory(self.id, self.header, residues, self.type)
421
422 - def dump(self, output_file):
433
434 @property
436 """
437 Number of residues
438 @rtype: int
439 """
440 return len(self._residues)
441
442 @property
444 """
445 Sequence identifier
446 @rtype: str
447 """
448 return self._id
449 @id.setter
450 - def id(self, value):
454
455 @property
457 """
458 Sequence description
459 @rtype: str
460 """
461 return self._header
462 @header.setter
469
470 @property
472 """
473 Sequence type - a member of L{SequenceTypes}
474 @rtype: enum item
475 """
476 return self._type
477 @type.setter
478 - def type(self, value):
484
485 @property
487 """
488 The actual sequence
489 @rtype: str
490 """
491 return ''.join([str(r.type) for r in self._residues])
492
493 @property
495 """
496 The sequence alphabet corresponding to the current sequence type
497 @rtype: L{csb.core.enum}
498 """
499 return SequenceAlphabets.get(self._type)
500
501 @property
503 """
504 Rank-based access to the underlying L{residues<csb.bio.sequence.ResidueInfo>}
505 @rtype: L{SequenceIndexer}
506 """
507 return SequenceIndexer(self)
508
511
513 return '<{0.__class__.__name__}: {0.id}, {0.length} residues>'.format(self)
514
517
519 """
520 High-performance sequence object. The actual sequence is stored internally
521 as a string. The indexer acts as a residue factory, which creates a new
522 L{ResidueInfo} instance each time.
523
524 @note: This class was created with parsing large volumes of data in mind. This
525 comes at the expense of degraded performance of the sequence indexer.
526
527 @param id: FASTA ID of this sequence (e.g. accession number)
528 @type id: str
529 @param header: FASTA sequence header
530 @type header: str
531 @param residues: sequence string
532 @type residues: str
533 @param type: a L{SequenceTypes} member (defaults to protein)
534 @type type: L{EnumItem}
535 """
536
538
539 self._id = None
540 self._header = None
541 self._residues = ''
542 self._type = None
543
544 self.id = id
545 self.header = header
546 self.type = type
547
548 self._append(residues)
549
551
552 self._residues += re.sub('([^\w\-\.])+', '', string)
553
554 - def _add(self, char):
556
def _get(self, rank):
    """
    Create a new L{ResidueInfo} for the item stored at position C{rank}.

    The stored item is parsed into a member of the current sequence
    alphabet, so every call builds a fresh residue object.

    @param rank: residue position (1-based)
    @rtype: L{ResidueInfo}
    """
    alphabet = self.alphabet
    symbol = self._residues[rank - 1]   # internal storage is 0-based
    residue_type = csb.core.Enum.parse(alphabet, symbol)

    return ResidueInfo(rank, residue_type)
561
567
575
588
589 @property
591 return self._residues
592
594 """
595 Sequence implementation, which converts the sequence into a list of
596 L{ResidueInfo} objects. See L{AbstractSequence} for details.
597 """
598
599 - def _add(self, residue):
610
611 - def _get(self, rank):
612 return self._residues[rank - 1]
613
614 @staticmethod
624
626 """
627 Sequence view for L{csb.bio.structure.Chain} objects.
628 See L{AbstractSequence} for details.
629 """
630
631 - def _add(self, residue):
637
638 - def _get(self, rank):
639 return self._residues[rank - 1]
640
641 @staticmethod
651
654 """
655 Base wrapper class for L{AbstractSequence} objects.
656 Needs to be sub-classed (does not do anything special on its own).
657
658 @param sequence: adaptee
659 @type sequence: L{AbstractSequence}
660 """
661
668
670 return self._subject[i]
671
673 return iter(self._subject)
674
676 return '<{0.__class__.__name__}: {0.id}, {0.length}>'.format(self)
677
679 return str(self._subject)
680
682 raise NotImplementedError()
683
684 - def _get(self, rank):
685 return self._subject._get(rank)
686
688 return self.__class__(self._subject._factory(*a, **k))
689
691 return self._subject.strip()
692
695
697 return self._subject.extract(positions)
698
699 @property
701 return self._subject.id
702
703 @property
705 return self._subject.length
706
707 @property
709 return self._subject.type
710
711 @property
713 return self._subject.header
714
715 @property
718
719 @property
722
724 """
725 Adapter, which wraps a gapped L{AbstractSequence} object and makes it
726 compatible with the MSA row/entry interface, expected by L{AbstractAlignment}.
727
728 The C{adapter.residues} property operates with an L{UngappedSequenceIndexer},
729 which provides a gap-free view of the underlying sequence.
730
731 The C{adapter.columns} property operates with a standard L{ColumnIndexer},
732 the same indexer which is used to provide the column view in multiple
733 alignments. Adapted sequences therefore act as alignment rows and allow for
734 MSA-column-oriented indexing.
735
736 @param sequence: adaptee
737 @type sequence: L{AbstractSequence}
738 """
739
756
758 if not 0 <= index < self.length:
759 raise IndexError(index)
760 return self._get_column(index + 1)
761
763 for c in sorted(self._fmap):
764 yield self._get_column(c)
765
766 @property
768 """
769 Provides 1-based access to the respective columns in the MSA.
770 @rtype: L{ColumnIndexer}
771 """
772 return ColumnIndexer(self)
773
774 @property
776 """
777 Provides 1-based access to the residues of the unaligned (ungapped)
778 sequence.
779 @rtype: L{UngappedSequenceIndexer}
780 """
781 return UngappedSequenceIndexer(self)
782
786
789
791 """
792 Return the MSA column number corresponding to the specified ungapped
793 sequence C{rank}.
794
795 @param rank: 1-based residue rank
796 @type rank: int
797 @rtype: int
798 """
799 return self._rmap[rank]
800
802 """
803 Return the ungapped sequence rank corresponding to the specified MSA
804 C{column} number.
805
806 @param column: 1-based alignment column number
807 @type column: int
808 @rtype: int
809 """
810 return self._fmap[column]
811
813
814 - def __init__(self, slice, start=0, stop=0):
815
816 s, e, t = slice.start, slice.stop, slice.step
817
818 if s is None:
819 s = start
820 if e is None:
821 e = stop
822 if t is None:
823 t = 1
824
825 for value in [s, e, t]:
826 if value < 0:
827 raise IndexError(value)
828
829 self.start = s
830 self.stop = e
831 self.step = t
832
870
873 """
874 Base class for all alignment objects.
875
876 Provides 1-based access to the alignment.rows and alignment.columns.
877 Alignment rows can also be accessed by sequence ID. In addition, all
878 alignments support 0-based slicing:
879
880 >>> alignment[rows, columns]
881 AbstractAlignment (sub-alignment)
882
883 where
884 - C{rows} can be a slice, tuple of row indexes or tuple of sequence IDs
885 - columns can be a slice or tuple of column indexes
886
887 For example:
888
889 >>> alignment[:, 2:]
890 AbstractAlignment # all rows, columns [3, alignment.length]
891 >>> alignment[(0, 'seqx'), (3, 5)]
892 AbstractAlignment # rows #1 and 'seqx', columns #4 and #6
893
894 @param sequences: alignment entries (must have equal length)
895 @type sequences: list of L{AbstractSequence}s
896 @param strict: if True, raise {DuplicateSequenceError} when a duplicate ID
897 is found (default=True)
898 @type strict: bool
899
900 @note: if C{strict} is False and there are C{sequences} with redundant identifiers,
901 those sequences will be added to the C{rows} collection with :An suffix,
902 where n is a serial number. Therefore, rows['ID'] will return only one sequence,
903 the first sequence with id=ID. All remaining sequences can be retrieved
904 with C{rows['ID:A1']}, {rows['ID:A2']}, etc. However, the sequence objects will
905 remain intact, e.g. {rows['ID:A1'].id} still returns 'ID' and not 'ID:A1'.
906 """
907
908 __metaclass__ = ABCMeta
909
def __init__(self, sequences, strict=True):
    # Set up the empty internal state first, then hand the input
    # sequences over to the _construct() hook.

    # number of columns; unknown (None) at this point
    self._length = None
    # rows table, exposed through the "rows" property
    self._msa = AlignmentRowsTable(self)
    # column view, exposed through the "columns" property
    self._colview = ColumnIndexer(self)
    # lookup table used by the indexer to resolve non-integer row keys
    # (sequence IDs)
    self._map = {}
    # any truthy/falsy value is accepted and normalized to a bool
    self._strict = bool(strict)

    self._construct(sequences)
919
921
922
923
924
925
926 if not isinstance(spec, tuple) or len(spec) not in (1, 2):
927 raise TypeError('Invalid alignment slice expression')
928
929 if len(spec) == 2:
930 rowspec, colspec = spec
931 else:
932 rowspec, colspec = [spec, slice(None)]
933
934
935 if isinstance(rowspec, slice):
936 if isinstance(rowspec.start, csb.core.string) or isinstance(rowspec.stop, csb.core.string):
937 raise TypeError("Invalid row slice: only indexes are supported")
938 rowspec = SliceHelper(rowspec, 0, self.size)
939 rows = range(rowspec.start + 1, rowspec.stop + 1)
940 elif isinstance(rowspec, int):
941 rows = [rowspec + 1]
942 elif csb.core.iterable(rowspec):
943 try:
944 rows = []
945 for r in rowspec:
946 if isinstance(r, int):
947 rows.append(r + 1)
948 else:
949 rows.append(self._map[r])
950 except KeyError as ke:
951 raise KeyError('No such Sequence ID: {0!s}'.format(ke))
952 else:
953 raise TypeError('Unsupported row expression')
954
955
956 if isinstance(colspec, slice):
957 colspec = SliceHelper(colspec, 0, self._length or 0)
958 cols = range(colspec.start + 1, colspec.stop + 1)
959 elif isinstance(colspec, int):
960 cols = [colspec + 1]
961 elif csb.core.iterable(colspec):
962 try:
963 cols = [ c + 1 for c in colspec ]
964 except:
965 raise TypeError('Unsupported column expression')
966 else:
967 raise TypeError('Unsupported column expression')
968
969
970 if len(rows) == 0:
971 raise ValueError("The expression returns zero rows")
972 if len(cols) == 0:
973 raise ValueError("The expression returns zero columns")
974
975
976 return self._extract(rows, cols)
977
978 - def _range(self, slice, start, end):
979
980 s, e, t = slice.start, slice.end, slice.step
981
982 if s is None:
983 s = start
984 if e is None:
985 e = end
986 if t is None:
987 t = 1
988
989 return range(s, e, t)
990
992 for cn in range(1, self.length + 1):
993 yield self._get_column(cn)
994
995 @abstractmethod
997 """
998 Hook method, called internally upon object construction. Subclasses
999 define how the source alignment sequences are handled during alignment
1000 construction.
1001
1002 @param sequences: alignment entries
1003 @type sequences: list of L{AbstractSequence}s
1004 """
1005 pass
1006
1008 """
1009 Hook method, which is used to initialize various alignment properties
1010 (such as length) from the first aligned sequence.
1011 """
1012 if rep_sequence.length == 0:
1013 raise SequenceError("Sequence '{0}' is empty".format(rep_sequence.id))
1014
1015 assert self._length is None
1016 self._length = rep_sequence.length
1017
1018 - def add(self, sequence):
1038
1039 @property
1041 """
1042 Number of columns in the alignment
1043 @rtype: int
1044 """
1045 return self._length or 0
1046
1047 @property
1049 """
1050 Number of rows (sequences) in the alignment
1051 @rtype: int
1052 """
1053 return self._msa.length
1054
1055 @property
1057 """
1058 1-based access to the alignment entries (sequences)
1059 @rtype: L{AlignmentRowsTable}
1060 """
1061 return self._msa
1062
1063 @property
1065 """
1066 1-based access to the alignment columns
1067 @rtype: L{ColumnIndexer}
1068 """
1069 return self._colview
1070
1072 """
1073 Return True if C{column} contains at least one gap.
1074 @param column: column number, 1-based
1075 @type column: int
1076
1077 @rtype: bool
1078 """
1079
1080 for row in self._msa:
1081 if row.columns[column].residue.type == row.alphabet.GAP:
1082 return True
1083
1084 return False
1085
1087 return tuple(row._get_column(column) for row in self.rows)
1088
1090
1091 rows = set(rows)
1092 cols = set(cols)
1093
1094 if not 1 <= min(rows) <= max(rows) <= self.size:
1095 raise IndexError('Row specification out of range')
1096
1097 if not 1 <= min(cols) <= max(cols) <= self.length:
1098 raise IndexError('Column specification out of range')
1099
1100 sequences = []
1101
1102 for rn, row in enumerate(self.rows, start=1):
1103 if rn in rows:
1104 sequences.append(row.extract(cols))
1105
1106 return self.__class__(sequences)
1107
1109 """
1110 Extract a sub-alignment, ranging from C{start} to C{end} columns.
1111
1112 @param start: starting column, 1-based
1113 @type start: int
1114 @param end: ending column, 1-based
1115 @type end: int
1116
1117 @return: a new alignment of the current type
1118 @rtype: L{AbstractAlignment}
1119
1120 @raise ColumnPositionError: if start/end is out of range
1121 """
1122 if not 1 <= start <= end <= self.length:
1123 raise ColumnPositionError(None, 1, self.length)
1124
1125 sequences = []
1126
1127 for row in self.rows:
1128 sequences.append(row.subregion(start, end))
1129
1130 return self.__class__(sequences)
1131
1155
1157 """
1158 Multiple sequence alignment. See L{AbstractAlignment} for details.
1159 """
1160
1165
1166 @staticmethod
1167 - def parse(string, strict=True):
1180
1182 """
1183 Multiple structure alignment. Similar to a L{SequenceAlignment}, but
1184 the alignment holds the actual L{csb.bio.structure.ProteinResidue} objects,
1185 taken from the corresponding source L{csb.bio.structure.Chain}s.
1186
1187 See L{AbstractAlignment} for details.
1188 """
1189
1194
1195 @staticmethod
def parse(string, provider, id_factory=None, strict=True):
    """
    Build a new L{StructureAlignment} from an mFASTA string. The actual work
    is delegated to L{csb.bio.io.fasta.StructureAlignmentFactory}; see that
    class for details.

    @param string: MSA-formatted string
    @type string: str
    @param provider: data source for all structures found in the alignment
    @type provider: L{csb.bio.io.wwpdb.StructureProvider}
    @param id_factory: callable factory, which transforms a sequence ID into
                       a L{csb.bio.io.wwpdb.EntryID} object. By default
                       this is L{csb.bio.io.wwpdb.EntryID.create}.
    @type id_factory: callable
    @param strict: see L{AbstractAlignment}
    @type strict: bool

    @rtype: L{StructureAlignment}
    """
    # imported locally, exactly as in the original implementation
    from csb.bio.io.fasta import StructureAlignmentFactory

    options = dict(id_factory=id_factory, strict=strict)
    return StructureAlignmentFactory(provider, **options).make_alignment(string)
1218
1220 """
1221 A specific type of multiple alignment, which provides some operations
1222 relative to a master sequence (the first entry in the alignment).
1223 """
1224
1225 - def __init__(self, sequences, strict=True):
1232
1234
1235 super(A3MAlignment, self)._initialize(rep_sequence)
1236 self._alphabet = rep_sequence.alphabet
1237
1253
1254 @property
1256 """
1257 The master sequence
1258 @rtype: L{AbstractSequence}
1259 """
1260 return self._master
1261
1263 """
1264 Return True if C{column} contains at least one insertion.
1265
1266 @param column: column number, 1-based
1267 @type column: int
1268 @rtype: bool
1269 """
1270 return column in self._insertions
1271
1285
1288
1289 @property
1291 """
1292 Number of match states (residues in the ungapped master).
1293 @rtype: int
1294 """
1295 return self._matches
1296
1297 @staticmethod
1298 - def parse(string, strict=True):
1311