Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/core/seqgroup.py: 56%

55 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-03-21 09:19 +0100

1""" 

2This module provides the SeqGroup class with methods to operate with 

3Multiple Sequence Files, including Multiple Sequence Alignments. 

4 

5Currently, Fasta, Phylip sequential and Phylip interleaved formats are 

6supported. 

7""" 

8 

9from ..parser.fasta import read_fasta, write_fasta 

10from ..parser.paml import read_paml, write_paml 

11from ..parser.phylip import read_phylip, write_phylip 

12 

13 

14__all__ = ['SeqGroup'] 

15 

16 

class SeqGroup:
    """Class to store a set of sequences (aligned or not).

    Every entry is identified internally by an integer id. Four
    dictionaries keep the bookkeeping: ``name2id`` and ``id2name`` map
    between entry names and ids, ``id2seq`` holds the sequence of each
    id and ``id2comment`` holds its list of comments.
    """

    # Translation table that deletes the whitespace characters removed
    # from sequences by set_seq(): space, tab, newline, carriage return.
    _WHITESPACE = str.maketrans('', '', ' \t\n\r')

    def __init__(self, sequences=None, format='fasta',
                 fix_duplicates=True, **kwargs):
        r"""
        :param sequences: Path to the file containing the sequences or,
            alternatively, the text string containing them. If None,
            an empty group is created and nothing is parsed.
        :param format: Encoding format of sequences (case-insensitive).
            Supported formats are: ``fasta``, ``phylip`` (phylip
            sequential), ``iphylip`` (phylip interleaved) and ``paml``.
            Phylip format forces sequence names to a maximum of 10
            chars. To avoid this effect, you can use the relaxed phylip
            format: ``phylip_relaxed`` and ``iphylip_relaxed``.
        :param fix_duplicates: Passed as is to the reader of the
            selected format.
        :param kwargs: Extra keyword arguments. They are only used by
            the ``paml`` format, whose reader and writer receive them.
        :raises ValueError: If ``sequences`` is given and ``format`` is
            not one of the supported formats.

        Example::

          seqs_str = ('>seq1\n'
                      'AAAAAAAAAAA\n'
                      '>seq2\n'
                      'TTTTTTTTTTTTT\n')
          seqs = SeqGroup(seqs_str, format='fasta')
          print(seqs.get_seq('seq1'))
        """
        # format name -> [reader, writer, extra keyword arguments]
        self.parsers = {
            'fasta': [read_fasta, write_fasta, {}],
            'phylip': [read_phylip, write_phylip,
                       {'interleaved': False, 'relaxed': False}],
            'iphylip': [read_phylip, write_phylip,
                        {'interleaved': True, 'relaxed': False}],
            'phylip_relaxed': [read_phylip, write_phylip,
                               {'interleaved': False, 'relaxed': True}],
            'iphylip_relaxed': [read_phylip, write_phylip,
                                {'interleaved': True, 'relaxed': True}],
            'paml': [read_paml, write_paml, kwargs],
        }

        self.id2name = {}
        self.name2id = {}
        self.id2comment = {}
        self.id2seq = {}

        if sequences is None:
            return  # empty group: nothing to parse

        format = format.lower()
        if format not in self.parsers:
            raise ValueError(f'Unsupported format: {format}')

        read, _, args = self.parsers[format]
        # The reader fills this object in place (it receives it as obj).
        read(sequences, obj=self, fix_duplicates=fix_duplicates, **args)

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.id2seq)

    def __contains__(self, item):
        """Return True if there is an entry named ``item``."""
        return item in self.name2id

    def __str__(self):
        """Return the result of writing the group with the fasta writer."""
        return write_fasta(self)

    def __iter__(self):
        """Iterate over (name, sequence, comments), as iter_entries()."""
        return self.iter_entries()

    def __repr__(self):
        return 'SeqGroup (%s)' % hex(self.__hash__())

    def write(self, format='fasta', outfile=None):
        """Return the text representation of the sequences.

        :param format: Format for the output representation
            (case-insensitive). Same names as accepted by the
            constructor.
        :param outfile: If given, the result is written to that file.
        :raises ValueError: If ``format`` is not a supported format.
        """
        format = format.lower()
        if format not in self.parsers:
            raise ValueError('Unsupported format: [%s]' % format)

        _, write, args = self.parsers[format]
        return write(self, outfile, **args)

    def iter_entries(self):
        """Return an iterator over all sequences in the collection.

        Each item is a tuple with the sequence name, sequence, and
        sequence comments (an empty list if the entry has none).
        """
        for seqid, seq in self.id2seq.items():
            yield self.id2name[seqid], seq, self.id2comment.get(seqid, [])

    def get_seq(self, name):
        """Return the sequence associated to a given entry name.

        :raises KeyError: If there is no entry with that name.
        """
        return self.id2seq[self.name2id[name]]

    def get_entries(self):
        """Return the list of entries currently stored.

        Each entry is a tuple (name, sequence, comments), in the same
        order as produced by iter_entries().
        """
        return [(self.id2name[seqid], seq, self.id2comment.get(seqid, []))
                for seqid, seq in self.id2seq.items()]

    def set_seq(self, name, seq, comments=None):
        """Add or update a sequence.

        :param name: Name of the entry. Surrounding whitespace is
            stripped.
        :param seq: Sequence string. Spaces, tabs, newlines and carriage
            returns are removed from it before storing.
        :param comments: List of comments for the entry. If omitted (or
            empty), an empty list is stored, replacing any previous
            comments of that entry.
        """
        name = name.strip()
        seq = seq.translate(self._WHITESPACE)

        if name in self.name2id:
            seqid = self.name2id[name]  # update: keep the existing id
        else:
            # New entry: next id after the largest one in use (at least 1).
            # Computed only here, so updates do not scan all the ids.
            seqid = max([0, *self.name2id.values()]) + 1

        self.name2id[name] = seqid
        self.id2name[seqid] = name
        self.id2comment[seqid] = comments or []
        self.id2seq[seqid] = seq