1 __VERSION__="ete2-2.0rev96"
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 import os
26 import re
27 from sys import stderr as STDERR
28
# NOTE(review): the `def` line for this body was not present in the reviewed
# text. The function name, parameter order and defaults below are
# reconstructed from the names the body reads -- confirm against callers.
def _unique_entry_name(name, name2id):
    """Return *name*, or a count-prefixed variant if *name* is already used.

    The prefix is the number of registered names that end with *name*,
    followed by "_". A notice is written to STDERR whenever a rename happens.
    """
    if name not in name2id:
        return name
    tag = str(sum(1 for known in name2id if known.endswith(name)))
    # The tag is put in front of the name rather than behind it
    # (presumably so a later cut to 10 characters keeps it -- confirm).
    renamed = tag + "_" + name
    STDERR.write("Duplicated entry [%s] was renamed to [%s]\n"
                 % (name, renamed))
    return renamed


def read_phylip(source, interleaved=True, obj=None):
    """Parse a PHYLIP alignment into a sequence container and return it.

    Arguments:
      source      -- path of an existing file, or the alignment text itself
                     (whichever os.path.isfile() says it is).
      interleaved -- true: after the header, the first `ntax` lines carry a
                     10-character name plus sequence, and every later line
                     is appended to the entries in round-robin order.
                     false: each entry starts with a 10-character name and
                     continues on following lines until it holds `nchar`
                     characters.
      obj         -- container to fill. It must provide the dictionaries
                     `id2name`, `name2id`, `id2seq` and support len(). When
                     None, a new ete2.coretype.SeqGroup is created.

    Blank lines and lines starting with "#" are skipped. The first remaining
    line must hold the two integers `ntax nchar`.

    Raises Exception when the header is missing, a name/sequence line is
    malformed, the number of entries differs from `ntax`, or any sequence
    length differs from `nchar`.
    """
    if obj is None:
        from ete2.coretype import SeqGroup
        SG = SeqGroup()
    else:
        SG = obj

    header_re = re.compile(r"^\s*(\d+)\s+(\d+)")
    entry_re = re.compile(r"^(.{10})(.+)")
    blank_re = re.compile(r"\s")

    handle = None
    if os.path.isfile(source):
        try:
            handle = open(source, "rU")
        except ValueError:
            # Python >= 3.11 rejects the "U" flag; plain text mode already
            # applies universal newlines there.
            handle = open(source)
        lines = handle
    else:
        lines = iter(source.split("\n"))

    nchar, ntax = None, None
    id_counter = 0
    name = None
    try:
        for line in lines:
            line = line.strip("\n")
            if not line or line[0] == "#":
                continue

            # Header: first meaningful line gives the alignment dimensions.
            if not nchar or not ntax:
                m = header_re.match(line)
                if not m:
                    raise Exception(
                        "A first line with the alignment dimension is required")
                ntax = int(m.group(1))
                nchar = int(m.group(2))
                continue

            if not interleaved:
                # A slot without a name means this line opens a new entry.
                if SG.id2name.get(id_counter, None) is None:
                    m = entry_re.match(line)
                    if not m:
                        raise Exception("Wrong phylip sequencial format.")
                    name = _unique_entry_name(m.group(1).strip(), SG.name2id)
                    SG.id2name[id_counter] = name
                    SG.name2id[name] = id_counter
                    SG.id2seq[id_counter] = ""
                    line = m.group(2)
                SG.id2seq[id_counter] += blank_re.sub("", line)
                current_len = len(SG.id2seq[id_counter])
                if current_len == nchar:
                    # Entry complete; the next line starts a new one.
                    id_counter += 1
                    name = None
                elif current_len > nchar:
                    raise Exception(
                        "Unexpected length of sequence [%s] [%s]."
                        % (name, SG.id2seq[id_counter]))
            elif len(SG) < ntax:
                # First interleaved block: name + first chunk of each entry.
                m = entry_re.match(line)
                if not m:
                    raise Exception("Unexpected number of sequences.")
                # Resolve duplicates BEFORE storing, so id2name and name2id
                # hold the same (possibly renamed) identifier.
                name = _unique_entry_name(m.group(1).strip(), SG.name2id)
                SG.id2seq[id_counter] = blank_re.sub("", m.group(2))
                SG.id2name[id_counter] = name
                SG.name2id[name] = id_counter
                id_counter += 1
            else:
                # Later interleaved blocks: lines extend entries in order,
                # wrapping back to the first entry after the last one.
                if id_counter == len(SG):
                    id_counter = 0
                SG.id2seq[id_counter] += blank_re.sub("", line)
                id_counter += 1
    finally:
        if handle is not None:
            handle.close()

    if len(SG) != ntax:
        raise Exception("Unexpected number of sequences.")

    # Every entry must match the length announced in the header.
    for seqid in SG.id2seq.keys():
        if len(SG.id2seq[seqid]) != nchar:
            raise Exception(
                "Unexpected lenght of sequence [%s]" % SG.id2name[seqid])

    return SG
132
# NOTE(review): the `def` line for this body was not present in the reviewed
# text. The function name, parameter order and defaults below are
# reconstructed from the names the body reads -- confirm against callers.
def write_phylip(aln, outfile=None, interleaved=True):
    """Serialise an alignment as PHYLIP text.

    Arguments:
      aln         -- alignment container. It must support len(), provide the
                     dictionaries `id2name` and `id2seq` keyed 0..len(aln)-1
                     (interleaved output), and `iter_entries()` yielding
                     (name, sequence, comments) triples (sequential output).
      outfile     -- path to write to; when None the text is returned.
      interleaved -- true: 60-column blocks, names only in the first block,
                     sequence split into groups of 10 characters.
                     false: one entry after another, wrapped at 60 columns.

    Names longer than 10 characters are cut to their first 10 and a warning
    is written to STDERR. An alignment with no sequences yields just the
    header line " 0 0".

    Returns the PHYLIP text when `outfile` is None, otherwise None.
    Raises Exception when the sequences do not all have the same length.
    """
    width = 60
    name_width = 10
    gap = 3
    # One shared prefix width keeps the name column, the continuation padding
    # and the first-line sequence budget in agreement.
    prefix_width = name_width + gap

    lengths = set(len(seq) for seq in aln.id2seq.values())
    if len(lengths) > 1:
        raise Exception("Phylip format requires sequences of equal lenght.")
    if lengths:
        seqlength = lengths.pop()
    else:
        # Empty alignment: nothing to pop, emit a zero-length header.
        seqlength = 0

    show_name_warning = False
    parts = [" %d %d\n" % (len(aln), seqlength)]
    if interleaved:
        for start in range(0, seqlength, width):
            for seqid in range(len(aln)):
                name = aln.id2name[seqid]
                if len(name) > name_width:
                    name = name[:name_width]
                    show_name_warning = True

                # Names appear only in the first block.
                if start == 0:
                    parts.append("%10s" % name + " " * gap)
                else:
                    parts.append(" " * prefix_width)

                piece = aln.id2seq[seqid][start:start + width]
                parts.append(" ".join([piece[k:k + 10]
                                       for k in range(0, len(piece), 10)]))
                parts.append("\n")
            parts.append("\n")
    else:
        # The first line of an entry shares its 60 columns with the prefix.
        first = width - prefix_width
        for name, seq, comments in aln.iter_entries():
            if len(name) > name_width:
                name = name[:name_width]
                show_name_warning = True
            rest = "\n".join([seq[k:k + width]
                              for k in range(first, len(seq), width)])
            parts.append("%10s" % name + " " * gap + seq[:first] + "\n"
                         + rest + "\n")

    alg_text = "".join(parts)

    if show_name_warning:
        STDERR.write(
            "Warning! Some seqnames were truncated to 10 characters\n")

    if outfile is None:
        return alg_text

    OUT = open(outfile, "w")
    try:
        OUT.write(alg_text)
    finally:
        # Close the handle even if the write fails.
        OUT.close()
180