Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/core/seqgroup.py: 56%
55 statements
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
1"""
2This module provides the SeqGroup class with methods to operate with
3Multiple Sequence Files, including Multiple Sequence Alignments.
5Currently, Fasta, Phylip sequential and Phylip interleaved formats are
6supported.
7"""
9from ..parser.fasta import read_fasta, write_fasta
10from ..parser.paml import read_paml, write_paml
11from ..parser.phylip import read_phylip, write_phylip
14__all__ = ['SeqGroup']
class SeqGroup:
    """Class to store a set of sequences (aligned or not).

    Every sequence is registered under an integer id, and the data is
    spread over four dictionaries:

    - ``id2name``: id -> sequence name
    - ``name2id``: sequence name -> id
    - ``id2comment``: id -> list of comments
    - ``id2seq``: id -> sequence string
    """

    # Translation table deleting the whitespace characters that set_seq()
    # removes from sequences (space, tab, newline, carriage return).
    _SEQ_WHITESPACE = str.maketrans('', '', ' \t\n\r')

    def __init__(self, sequences=None, format='fasta',
                 fix_duplicates=True, **kwargs):
        r"""
        :param sequences: Path to the file containing the sequences or,
            alternatively, the text string containing them. If None, an
            empty group is created and nothing is parsed.
        :param format: Encoding format of sequences. Supported formats
            are: ``fasta``, ``phylip`` (phylip sequential) and
            ``iphylip`` (phylip interleaved). Phylip format forces
            sequence names to a maximum of 10 chars. To avoid this
            effect, you can use the relaxed phylip format:
            ``phylip_relaxed`` and ``iphylip_relaxed``. The ``paml``
            format is also registered. The name is case-insensitive.
        :param fix_duplicates: Forwarded as-is to the reader of the
            chosen format.
        :param kwargs: Extra keyword arguments, kept as the argument
            set of the ``paml`` format (they are passed to its reader
            and to its writer).

        :raises ValueError: If ``sequences`` is given and ``format`` is
            not one of the registered formats.

        Example::

          seqs_str = ('>seq1\n'
                      'AAAAAAAAAAA\n'
                      '>seq2\n'
                      'TTTTTTTTTTTTT\n')
          seqs = SeqGroup(seqs_str, format='fasta')
          print(seqs.get_seq('seq1'))
        """
        # Each entry is [reader, writer, extra keyword arguments].
        self.parsers = {
            'fasta': [read_fasta, write_fasta, {}],
            'phylip': [read_phylip, write_phylip,
                       {'interleaved': False, 'relaxed': False}],
            'iphylip': [read_phylip, write_phylip,
                        {'interleaved': True, 'relaxed': False}],
            'phylip_relaxed': [read_phylip, write_phylip,
                               {'interleaved': False, 'relaxed': True}],
            'iphylip_relaxed': [read_phylip, write_phylip,
                                {'interleaved': True, 'relaxed': True}],
            'paml': [read_paml, write_paml, kwargs],
        }

        self.id2name = {}
        self.name2id = {}
        self.id2comment = {}
        self.id2seq = {}

        if sequences is None:
            return  # nothing to parse: start with an empty group

        format = format.lower()
        if format not in self.parsers:
            raise ValueError(f'Unsupported format: {format}')

        parser = self.parsers[format]
        read, args = parser[0], parser[2]
        # The reader fills this object's dictionaries through ``obj``.
        read(sequences, obj=self, fix_duplicates=fix_duplicates, **args)

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.id2seq)

    def __contains__(self, item):
        """Return True if ``item`` is the name of a stored sequence."""
        return item in self.name2id

    def __str__(self):
        """Return whatever the fasta writer produces for this group."""
        return write_fasta(self)

    def __iter__(self):
        """Iterate over (name, sequence, comments) tuples."""
        return self.iter_entries()

    def __repr__(self):
        return f'SeqGroup ({hex(self.__hash__())})'

    def write(self, format='fasta', outfile=None):
        """Return the text representation of the sequences.

        The actual result is whatever the writer registered for
        ``format`` returns.

        :param format: Format for the output representation
            (case-insensitive).
        :param outfile: If given, the result is written to that file.

        :raises ValueError: If ``format`` is not a registered format.
        """
        format = format.lower()
        if format not in self.parsers:
            raise ValueError('Unsupported format: [%s]' % format)

        parser = self.parsers[format]
        write, args = parser[1], parser[2]
        return write(self, outfile, **args)

    def iter_entries(self):
        """Return an iterator over all sequences in the collection.

        Each item is a tuple with the sequence name, sequence, and
        sequence comments (an empty list if it has no comments).
        """
        for seqid, seq in self.id2seq.items():
            yield self.id2name[seqid], seq, self.id2comment.get(seqid, [])

    def get_seq(self, name):
        """Return the sequence associated to a given entry name.

        :raises KeyError: If there is no entry with that name.
        """
        return self.id2seq[self.name2id[name]]

    def get_entries(self):
        """Return the list of entries currently stored.

        Each entry is a (name, sequence, comments) tuple, in the same
        order as :meth:`iter_entries`.
        """
        return list(self.iter_entries())

    def set_seq(self, name, seq, comments=None):
        """Add or update a sequence.

        :param name: Name of the entry. Surrounding whitespace is
            stripped before it is used.
        :param seq: Sequence string. Spaces, tabs, newlines and
            carriage returns are removed from it.
        :param comments: Comments for the entry. If empty or None, an
            empty list is stored (also when updating an existing entry).
        """
        name = name.strip()
        seq = seq.translate(self._SEQ_WHITESPACE)

        if name in self.name2id:
            seqid = self.name2id[name]  # update: reuse the existing id
        else:
            # New entry: one past the highest id in use (never below 1).
            seqid = max([0, *self.name2id.values()]) + 1

        self.name2id[name] = seqid
        self.id2name[seqid] = name
        self.id2comment[seqid] = comments or []
        self.id2seq[seqid] = seq