Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/evol/utils.py: 12%
192 statements
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
1from math import log, exp
2from numpy import floor, pi as PI, sin
4from .. import Tree
def get_rooting(tol, seed_species, agename = False):
    '''
    returns dict of species age for a given TOL and a given seed

    The age of a species is the number of steps one has to climb from the
    seed species towards the root of the TOL before reaching a clade that
    contains that species (1 is the clade directly above the seed).

    **Example:**

    ::

      tol  = "((((((((Drosophila melanogaster,(Drosophila simulans,Drosophila secchellia)),(Drosophila yakuba,Drosophila erecta))[&&NHX:name=melanogaster subgroup],Drosophila ananassae)[&&NHX:name=melanogaster group],(Drosophila pseudoobscura,Drosophila persimilis)[&&NHX:name=obscura group])[&&NHX:name=Sophophora Old World],Drosophila willistoni)[&&NHX:name=subgenus Sophophora],(Drosophila grimshawi,(Drosophila virilis,Drosophila mojavensis))[&&NHX:name=subgenus Drosophila])[&&NHX:name=genus Drosophila],(Anopheles gambiae,Aedes aegypti)[&&NHX:name=Culicidae])[&&NHX:name=Arthropoda],Caenorhabditis elegans)[&&NHX:name=Animalia];"
      seed = "Drosophila melanogaster"
      ROOTING, age2name = get_rooting (tol, seed, True)

      ROOTING == {"Aedes aegypti"           : 7,
                  "Anopheles gambiae"       : 7,
                  "Caenorhabditis elegans"  : 8,
                  "Drosophila ananassae"    : 3,
                  "Drosophila erecta"       : 2,
                  "Drosophila grimshawi"    : 6,
                  "Drosophila melanogaster" : 1,
                  "Drosophila mojavensis"   : 6,
                  "Drosophila persimilis"   : 4,
                  "Drosophila pseudoobscura": 4,
                  "Drosophila secchellia"   : 1,
                  "Drosophila simulans"     : 1,
                  "Drosophila virilis"      : 6,
                  "Drosophila willistoni"   : 5,
                  "Drosophila yakuba"       : 2}

      age2name == {1: "Drosophila melanogaster.Drosophila simulans.Drosophila secchellia",
                   2: "melanogaster subgroup",
                   3: "melanogaster group",
                   4: "Sophophora Old World",
                   5: "subgenus Sophophora",
                   6: "genus Drosophila",
                   7: "Arthropoda",
                   8: "Animalia"}

    :argument tol: tree of life, in any form accepted by the Tree constructor
    :argument seed_species: species name
    :argument False agename: if True, also returns the inverse dictionary

    :returns: ROOTING dictionary with age of each species (and, if agename
       is True, a second dictionary mapping each age to the name of its clade)

    :raises SystemExit: if seed_species is not the name of any node in tol
    '''

    tol = Tree (tol)
    # Take the first node carrying the seed name. Going through iter()/next()
    # works whether search_nodes hands back a list or a lazy iterator.
    node = next (iter (tol.search_nodes (name=seed_species)), None)
    if node is None:
        # `exit` is only injected by the `site` module; raising SystemExit
        # directly is what it does, without depending on that injection.
        raise SystemExit ('ERROR: Seed species not found in tree\n')
    age = 1
    ROOTING = {}
    age2name = {}
    while not node.is_root:
        node = node.up
        leaf_names = list (node.get_leaf_names())
        if agename:
            # unnamed clades are labelled with the names of their leaves
            # NOTE(review): an empty/None name is treated as unnamed too,
            # alongside the historical 'NoName' placeholder -- confirm this
            # matches the Tree class in use.
            if not node.name or node.name == 'NoName':
                nam = '.'.join (leaf_names)
            else:
                nam = node.name
            age2name.setdefault (age, nam)
        for leaf in leaf_names:
            # setdefault keeps the youngest (first seen) age of each species
            ROOTING.setdefault (leaf, age)
        age += 1
    if agename:
        return ROOTING, age2name
    return ROOTING
def translate(sequence):
    '''
    little function to translate DNA to protein...
    from: http://python.genedrift.org/
    TODO : inseqgroup functions?

    The sequence is upper-cased and read codon by codon from its first
    nucleotide, using the standard genetic code. Stop codons are written
    as '.', gap codons ('---') as '-'.

    A codon holding IUPAC ambiguity codes is expanded into every codon it
    may stand for: if all of them encode the same amino-acid that
    amino-acid is used, otherwise 'X'. Codons that cannot be resolved
    (unknown characters, partial gaps, or an incomplete codon at the end
    of the sequence) are translated as 'X'.

    :argument sequence: string

    :returns: translated sequence
    '''
    #dictionary with the genetic code
    # (keys are upper case only: the input is upper-cased before lookup)
    gencode = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'.', 'TAG':'.',
        'TGC':'C', 'TGT':'C', 'TGA':'.', 'TGG':'W',
        '---':'-', 'NNN':'X'
        }
    # IUPAC ambiguity codes and the nucleotides each one stands for
    # (the sequence is translated as given, so these must NOT be complemented)
    ambig = {'R':['A', 'G'], 'Y':['C', 'T'], 'M':['A', 'C'], 'K':['G', 'T'],
             'S':['C', 'G'], 'W':['A', 'T'], 'B':['C', 'G', 'T'],
             'D':['A', 'G', 'T'], 'H':['A', 'C', 'T'], 'V':['A', 'C', 'G'],
             'N':['A', 'C', 'G', 'T']}
    residues = []
    #loop to read DNA sequence in codons, 3 nucleotides at a time
    sequence = sequence.upper()
    for n in range(0, len(sequence), 3):
        codon = sequence[n:n+3]
        aa = gencode.get(codon)
        if aa is None:
            if len(codon) < 3:
                # incomplete trailing codon: nothing sensible to expand
                aa = 'X'
            else:
                # every amino-acid the ambiguous codon could encode;
                # unknown expansions count as 'X'
                candidates = {gencode.get(nt1 + nt2 + nt3, 'X')
                              for nt1 in ambig.get(codon[0], [codon[0]])
                              for nt2 in ambig.get(codon[1], [codon[1]])
                              for nt3 in ambig.get(codon[2], [codon[2]])}
                aa = candidates.pop() if len(candidates) == 1 else 'X'
        residues.append(aa)
    return ''.join(residues)
# reused from pycogent
# Numerical constants shared by the chi-square / gamma routines of this file.
# tolerance within which fix_rounding_error snaps a value back to 0 or 1
ROUND_ERROR = 1e-14
# igam/igamc return 0 instead of calling exp(v) when v < -MAXLOG (underflow)
MAXLOG = 7.09782712893383996843E2
# lgam raises OverflowError for arguments above this value
MAXLGM = 2.556348e305
# 2**52: igamc rescales its continued-fraction terms once |pk| exceeds this
big = 4.503599627370496e15
# 2**-52: the factor igamc multiplies the terms by when rescaling
biginv = 2.22044604925031308085e-16
# 2**-53: relative tolerance at which the igam/igamc iterations stop
MACHEP = 1.11022302462515654042E-16
# log(sqrt(2*pi)), constant term of the large-x formula in lgam
LS2PI = 0.91893853320467274178
# log(pi), used by lgam for arguments below -34
LOGPI = 1.14472988584940017414
def chi_high(x, df):
    """Right-hand tail probability of the chi-square distribution.

    Gives the probability mass lying between x and infinity for a
    chi-square distribution with df degrees of freedom, computed as
    igamc(df/2, x/2).

    x is first passed through fix_rounding_error, then must be >= 0.
    df, the degrees of freedom, must be >= 1 (integers are assumed);
    for a r by c table it is typically (r-1)*(c-1).

    Raises ValueError when x or df is out of range.
    The result lies between 0 and 1. See Cephes docs for details.
    """
    statistic = fix_rounding_error(x)

    if statistic < 0:
        message = "chi_high: x must be >= 0 (got %s)." % statistic
        raise ValueError(message)
    if df < 1:
        message = "chi_high: df must be >= 1 (got %s)." % df
        raise ValueError(message)

    half_df = float(df)/2
    half_statistic = statistic/2
    return igamc(half_df, half_statistic)
def fix_rounding_error(x):
    """Snap a value lying just outside the range 0-1 back onto its edge.

    A value strictly between -ROUND_ERROR and 0 becomes 0, a value
    strictly between 1 and 1+ROUND_ERROR becomes 1; anything else is
    returned untouched.
    """
    barely_negative = -ROUND_ERROR < x < 0
    if barely_negative:
        return 0
    barely_above_one = 1 < x < 1+ROUND_ERROR
    return 1 if barely_above_one else x
def igamc(a,x):
    """Complemented incomplete Gamma integral: see Cephes docs.

    This is the complement of igam: for small x (x < 1 or x < a) it is
    computed as 1 - igam(a, x), otherwise through a continued fraction.

    Returns 1 when x <= 0 or a <= 0, and 0 when the result underflows.
    A NaN argument gives NaN and an infinite x (with a finite a) gives 0,
    instead of never returning.
    """
    if x <= 0 or a <= 0:
        return 1
    # NaN is the only value different from itself. It must be caught here:
    # every comparison below is False for NaN, so the iteration would never
    # reach its stopping condition.
    if a != a or x != x:
        return float('nan')
    # the tail beyond infinity is empty (log(x) - x would be NaN below)
    if x == float('inf') and a != float('inf'):
        return 0
    if x < 1 or x < a:
        return 1 - igam(a, x)
    # log of x**a * exp(-x) / Gamma(a)
    ax = a * log(x) - x - lgam(a)
    if ax < -MAXLOG:  #underflow
        return 0
    ax = exp(ax)
    #continued fraction
    y = 1 - a
    z = x + y + 1
    c = 0
    pkm2 = 1
    qkm2 = x
    pkm1 = x + 1
    qkm1 = z * x
    ans = pkm1/qkm1

    while True:
        c += 1
        y += 1
        z += 2
        yc = y * c
        pk = pkm1 * z - pkm2 * yc
        qk = qkm1 * z - qkm2 * yc
        if qk != 0:
            r = pk/qk
            t = abs((ans-r)/r)  # relative change of the estimate
            ans = r
        else:
            t = 1
        pkm2 = pkm1
        pkm1 = pk
        qkm2 = qkm1
        qkm1 = qk
        # scale all four terms down together so they do not overflow;
        # the ratio pk/qk is unaffected
        if abs(pk) > big:
            pkm2 *= biginv
            pkm1 *= biginv
            qkm2 *= biginv
            qkm1 *= biginv
        # written as "not >" so that a NaN t also ends the loop
        if not t > MACHEP:
            break
    return ans * ax
def lgam(x):
    """Natural log of the gamma function: see Cephes docs for details

    The sign of gamma is not tracked: intermediate products are made
    non-negative before their log is taken.

    Three ranges of x are handled separately:

    - x < -34: computed from lgam(-x) as
      LOGPI - log(-x * sin(PI * z)) - lgam(-x), where z is the distance
      from -x to the nearest integer.
    - -34 <= x < 13: the argument is shifted into the range 2 to 3 while
      the corresponding product is accumulated, then a rational function
      (GB over GC) is evaluated on the shifted argument.
    - x >= 13: (x - 0.5) * log(x) - x + LS2PI, plus a correction in
      powers of 1/x**2 while x <= 1e8.

    Raises OverflowError where the result is infinite (x is zero or a
    negative integer) and for x > MAXLGM.
    """
    if x < -34:
        q = -x
        w = lgam(q)
        p = floor(q)
        # q is an integer, i.e. x is a negative integer
        if p == q:
            raise OverflowError("lgam returned infinity.")
        # z: distance from q to the nearest integer
        z = q - p
        if z > 0.5:
            p += 1
            z = p - q
        z = q * sin(PI * z)
        if z == 0:
            raise OverflowError("lgam returned infinity.")
        z = LOGPI - log(z) - w
        return z
    if x < 13:
        # z accumulates the product linking gamma(x) to gamma(u),
        # where u = x + p is the argument shifted by the integer p
        z = 1
        p = 0
        u = x
        # shift down while u is 3 or more
        while u >= 3:
            p -= 1
            u = x + p
            z *= u
        # shift up while u is below 2
        while u < 2:
            if u == 0:
                raise OverflowError("lgam returned infinity.")
            z /= u
            p += 1
            u = x + p
        if z < 0:
            z = -z
        # the code treats the remaining factor at u == 2 as 1
        if u == 2:
            return log(z)
        # from here on x is the shifted argument minus 2
        p -= 2
        x = x + p
        p = x * polevl(x, GB)/polevl(x,GC)
        return log(z) + p
    if x > MAXLGM:
        raise OverflowError("Too large a value of x in lgam.")
    q = (x - 0.5) * log(x) - x + LS2PI
    # no correction term is added above 1e8
    if x > 1.0e8:
        return q
    p = 1/(x*x)
    if x >= 1000:
        # three-term correction in p = 1/x**2
        q += (( 7.9365079365079365079365e-4 * p
                -2.7777777777777777777778e-3) *p
              + 0.0833333333333333333333) / x
    else:
        q += polevl(p, GA)/x
    return q
def polevl(x, coef):
    """Value at x of the polynomial whose coefficients are given in coef.

    coef lists the coefficients from the highest degree down to the
    constant term (coef[0] = C_N, coef[-1] = C_0), so that the result is
    C_0 + C_1x + C_2x^2 + ... + C_Nx^N. An empty coef gives 0.
    """
    total = 0
    # Horner's scheme: fold one coefficient in per multiplication by x
    for coefficient in coef:
        total = total * x + coefficient
    return total
def igam(a, x):
    """Left tail of incomplete gamma function: see Cephes docs for details

    This is the complement of igamc: for large x (x > 1 and x > a) it is
    computed as 1 - igamc(a, x), otherwise through a power series.

    Returns 0 when x <= 0 or a <= 0, and 0.0 when the result underflows.
    A NaN argument gives NaN and an infinite x (with a finite a) gives 1,
    instead of never returning.
    """
    if x <= 0 or a <= 0:
        return 0
    # NaN is the only value different from itself. It must be caught here:
    # every comparison below is False for NaN, so the series would never
    # reach its stopping condition.
    if a != a or x != x:
        return float('nan')
    # the whole mass lies left of infinity (log(x) - x would be NaN below)
    if x == float('inf') and a != float('inf'):
        return 1
    if x > 1 and x > a:
        return 1 - igamc(a,x)

    #Compute x**a * exp(-x) / Gamma(a)

    ax = a * log(x) - x - lgam(a)
    if ax < -MAXLOG:  #underflow
        return 0.0
    ax = exp(ax)

    #power series
    r = a
    c = 1
    ans = 1
    while True:
        r += 1
        c *= x/r
        ans += c
        # written as "not >" so that a NaN ratio also ends the loop
        if not c/ans > MACHEP:
            break

    return ans * ax / a
#Coefficients for Gamma follow:
# Each list is ordered from the highest-degree coefficient down to the
# constant term, which is the order polevl expects.

# lgam, 13 <= x < 1000: polynomial in 1/x**2 of the correction term
GA = [
    8.11614167470508450300E-4,
    -5.95061904284301438324E-4,
    7.93650340457716943945E-4,
    -2.77777777730099687205E-3,
    8.33333333333331927722E-2,
]

# lgam, x < 13: numerator of the rational function
GB = [
    -1.37825152569120859100E3,
    -3.88016315134637840924E4,
    -3.31612992738871184744E5,
    -1.16237097492762307383E6,
    -1.72173700820839662146E6,
    -8.53555664245765465627E5,
]

# lgam, x < 13: denominator of the rational function
GC = [
    1.00000000000000000000E0,
    -3.51815701436523470549E2,
    -1.70642106651881159223E4,
    -2.20528590553854454839E5,
    -1.13933444367982507207E6,
    -2.53252307177582951285E6,
    -2.01889141433532773231E6,
]

# NOTE(review): GP and GQ are not referenced by the functions visible in
# this part of the file -- presumably kept from the original gamma code.
GP = [
    1.60119522476751861407E-4,
    1.19135147006586384913E-3,
    1.04213797561761569935E-2,
    4.76367800457137231464E-2,
    2.07448227648435975150E-1,
    4.94214826801497100753E-1,
    9.99999999999999996796E-1,
]

GQ = [
    -2.31581873324120129819E-5,
    5.39605580493303397842E-4,
    -4.45641913851797240494E-3,
    1.18139785222060435552E-2,
    3.58236398605498653373E-2,
    -2.34591795718243348568E-1,
    7.14304917030273074085E-2,
    1.00000000000000000320E0,
]