Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/parser/paml.py: 9%

89 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-03-21 09:19 +0100

1import os 

2import string 

3from sys import stderr as STDERR 

4from re import search 

5 

6from ete4.core import seqgroup 

7 

8def read_paml (source, obj=None, header_delimiter="\t", fix_duplicates=True): 

9 """ Reads a collection of sequences econded in PAML format... that is, something between PHYLIP and fasta 

10 

11 3 6 

12 seq1 

13 ATGATG 

14 seq2 

15 ATGATG 

16 seq3 

17 ATGATG 

18 

19 or 

20 

21 3 6 

22 >seq1 

23 ATGATG 

24 >seq2 

25 ATGATG 

26 >seq3 

27 ATGATG 

28 

29 or 

30 

31 >seq1 

32 ATGATG 

33 >seq2 

34 ATGATG 

35 >seq3 

36 ATGATG 

37 

38 """ 

39 

40 if obj is None: 

41 SC = seqgroup.SeqGroup() 

42 else: 

43 SC = obj 

44 

45 names = set([]) 

46 seq_id = -1 

47 

48 # Prepares handle from which read sequences 

49 if os.path.isfile(source): 

50 _source = open(source, "r") 

51 else: 

52 _source = iter(source.split("\n")) 

53 

54 seq_name = None 

55 num_seq = 0 

56 len_seq = 0 

57 in_seq = False 

58 for line in _source: 

59 line = line.strip() 

60 if line.startswith('#') or not line: 

61 continue 

62 if line.startswith('// end of file'): 

63 break 

64 # Reads seq number 

65 elif line.startswith('>') or ((num_seq and len_seq) and not in_seq): 

66 fasta = line.startswith('>') 

67 line = line.replace('>','') 

68 # Checks if previous name had seq 

69 if seq_id>-1 and SC.id2seq[seq_id] == "": 

70 raise Exception("No sequence found for "+seq_name) 

71 

72 seq_id += 1 

73 # Takes header info 

74 seq_header_fields = [_f.strip() for _f in line.split(header_delimiter)] 

75 in_seq = True 

76 if (not fasta) and ' ' in seq_header_fields[0].strip(): 

77 seq_header_fields = seq_header_fields[0].split(' ')[0] 

78 seq_header_fields = [seq_header_fields] 

79 SC.id2seq[seq_id] = line.split(' ')[-1].strip().replace(' ', '') 

80 if len_seq: 

81 if len(SC.id2seq[seq_id]) == len_seq: 

82 in_seq=False 

83 elif len(SC.id2seq[seq_id]) > len_seq: 

84 raise Exception("Error reading sequences: Wrong sequence length.\n"+line) 

85 else: 

86 SC.id2seq[seq_id] = "" 

87 seq_name = seq_header_fields[0] 

88 # Checks for duplicated seq names 

89 if fix_duplicates and seq_name in names: 

90 tag = str(len([k for k in list(SC.name2id.keys()) if k.endswith(seq_name)])) 

91 old_name = seq_name 

92 seq_name = tag+"_"+seq_name 

93 print("Duplicated entry [%s] was renamed to [%s]" %(old_name, seq_name), file=STDERR) 

94 

95 # stores seq_name 

96 SC.id2name[seq_id] = seq_name 

97 SC.name2id[seq_name] = seq_id 

98 SC.id2comment[seq_id] = seq_header_fields[1:] 

99 names.add(seq_name) 

100 else: 

101 if seq_name is None: 

102 if search ('^[0-9]+ *[0-9]+ *[A-Z]*', line): 

103 try: 

104 num_seq, len_seq = line.strip().split() 

105 except ValueError: 

106 num_seq, len_seq, _ = line.strip().split() 

107 num_seq = int(num_seq) 

108 len_seq = int(len_seq) 

109 continue 

110 if line.startswith('\n'): 

111 continue 

112 raise Exception("Error reading sequences: Wrong format.\n"+line) 

113 elif in_seq: 

114 # removes all white spaces in line 

115 s = line.strip().replace(" ","") 

116 

117 # append to seq_string 

118 SC.id2seq[seq_id] += s 

119 if len_seq: 

120 if len(SC.id2seq[seq_id]) == len_seq: 

121 in_seq=False 

122 elif len(SC.id2seq[seq_id]) > len_seq: 

123 raise Exception("Error reading sequences: Wrong sequence length.\n"+line) 

124 

125 if seq_name and SC.id2seq[seq_id] == "": 

126 print(seq_name,"has no sequence", file=STDERR) 

127 return None 

128 

129 # Everything ok 

130 return SC 

131 

132def write_paml(sequences, outfile = None, seqwidth = 80): 

133 """ 

134 Writes a SeqGroup python object using PAML format. 

135 sequences are ordered, because PAML labels tree according to this. 

136 """ 

137 text = ' %d %d\n' % (len (sequences), len (sequences.get_entries()[0][1])) 

138 text += '\n'.join(["%s\n%s" %( "\t".join([name]+comment), _seq2str(seq)) for 

139 name, seq, comment in sorted(sequences)]) 

140 if outfile is not None: 

141 OUT = open(outfile,"w") 

142 OUT.write(text) 

143 OUT.close() 

144 else: 

145 return text 

146 

147def _seq2str(seq, seqwidth = 80): 

148 sequence = "" 

149 for i in range(0,len(seq),seqwidth): 

150 sequence+= seq[i:i+seqwidth] + "\n" 

151 return sequence