Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/parser/paml.py: 9%
89 statements
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
1import os
2import string
3from sys import stderr as STDERR
4from re import search
6from ete4.core import seqgroup
8def read_paml (source, obj=None, header_delimiter="\t", fix_duplicates=True):
9 """ Reads a collection of sequences econded in PAML format... that is, something between PHYLIP and fasta
11 3 6
12 seq1
13 ATGATG
14 seq2
15 ATGATG
16 seq3
17 ATGATG
19 or
21 3 6
22 >seq1
23 ATGATG
24 >seq2
25 ATGATG
26 >seq3
27 ATGATG
29 or
31 >seq1
32 ATGATG
33 >seq2
34 ATGATG
35 >seq3
36 ATGATG
38 """
40 if obj is None:
41 SC = seqgroup.SeqGroup()
42 else:
43 SC = obj
45 names = set([])
46 seq_id = -1
48 # Prepares handle from which read sequences
49 if os.path.isfile(source):
50 _source = open(source, "r")
51 else:
52 _source = iter(source.split("\n"))
54 seq_name = None
55 num_seq = 0
56 len_seq = 0
57 in_seq = False
58 for line in _source:
59 line = line.strip()
60 if line.startswith('#') or not line:
61 continue
62 if line.startswith('// end of file'):
63 break
64 # Reads seq number
65 elif line.startswith('>') or ((num_seq and len_seq) and not in_seq):
66 fasta = line.startswith('>')
67 line = line.replace('>','')
68 # Checks if previous name had seq
69 if seq_id>-1 and SC.id2seq[seq_id] == "":
70 raise Exception("No sequence found for "+seq_name)
72 seq_id += 1
73 # Takes header info
74 seq_header_fields = [_f.strip() for _f in line.split(header_delimiter)]
75 in_seq = True
76 if (not fasta) and ' ' in seq_header_fields[0].strip():
77 seq_header_fields = seq_header_fields[0].split(' ')[0]
78 seq_header_fields = [seq_header_fields]
79 SC.id2seq[seq_id] = line.split(' ')[-1].strip().replace(' ', '')
80 if len_seq:
81 if len(SC.id2seq[seq_id]) == len_seq:
82 in_seq=False
83 elif len(SC.id2seq[seq_id]) > len_seq:
84 raise Exception("Error reading sequences: Wrong sequence length.\n"+line)
85 else:
86 SC.id2seq[seq_id] = ""
87 seq_name = seq_header_fields[0]
88 # Checks for duplicated seq names
89 if fix_duplicates and seq_name in names:
90 tag = str(len([k for k in list(SC.name2id.keys()) if k.endswith(seq_name)]))
91 old_name = seq_name
92 seq_name = tag+"_"+seq_name
93 print("Duplicated entry [%s] was renamed to [%s]" %(old_name, seq_name), file=STDERR)
95 # stores seq_name
96 SC.id2name[seq_id] = seq_name
97 SC.name2id[seq_name] = seq_id
98 SC.id2comment[seq_id] = seq_header_fields[1:]
99 names.add(seq_name)
100 else:
101 if seq_name is None:
102 if search ('^[0-9]+ *[0-9]+ *[A-Z]*', line):
103 try:
104 num_seq, len_seq = line.strip().split()
105 except ValueError:
106 num_seq, len_seq, _ = line.strip().split()
107 num_seq = int(num_seq)
108 len_seq = int(len_seq)
109 continue
110 if line.startswith('\n'):
111 continue
112 raise Exception("Error reading sequences: Wrong format.\n"+line)
113 elif in_seq:
114 # removes all white spaces in line
115 s = line.strip().replace(" ","")
117 # append to seq_string
118 SC.id2seq[seq_id] += s
119 if len_seq:
120 if len(SC.id2seq[seq_id]) == len_seq:
121 in_seq=False
122 elif len(SC.id2seq[seq_id]) > len_seq:
123 raise Exception("Error reading sequences: Wrong sequence length.\n"+line)
125 if seq_name and SC.id2seq[seq_id] == "":
126 print(seq_name,"has no sequence", file=STDERR)
127 return None
129 # Everything ok
130 return SC
132def write_paml(sequences, outfile = None, seqwidth = 80):
133 """
134 Writes a SeqGroup python object using PAML format.
135 sequences are ordered, because PAML labels tree according to this.
136 """
137 text = ' %d %d\n' % (len (sequences), len (sequences.get_entries()[0][1]))
138 text += '\n'.join(["%s\n%s" %( "\t".join([name]+comment), _seq2str(seq)) for
139 name, seq, comment in sorted(sequences)])
140 if outfile is not None:
141 OUT = open(outfile,"w")
142 OUT.write(text)
143 OUT.close()
144 else:
145 return text
147def _seq2str(seq, seqwidth = 80):
148 sequence = ""
149 for i in range(0,len(seq),seqwidth):
150 sequence+= seq[i:i+seqwidth] + "\n"
151 return sequence