Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/evol/utils.py: 12%

192 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-03-21 09:19 +0100

1from math import log, exp 

2from numpy import floor, pi as PI, sin 

3 

4from .. import Tree 

5 

6 

def get_rooting(tol, seed_species, agename = False):
    '''
    Return the "age" of every species in a tree of life, relative to a seed.

    Starting at the seed leaf, the tree is climbed one ancestor at a time.
    The first ancestor has age 1, the next one age 2, and so on up to the
    root. Each leaf gets the age of the first (lowest) ancestor of the seed
    that contains it.

    **Example:**

    ::

      tol = "((((((((Drosophila melanogaster,(Drosophila simulans,Drosophila secchellia)),(Drosophila yakuba,Drosophila erecta))[&&NHX:name=melanogaster subgroup],Drosophila ananassae)[&&NHX:name=melanogaster group],(Drosophila pseudoobscura,Drosophila persimilis)[&&NHX:name=obscura group])[&&NHX:name=Sophophora Old World],Drosophila willistoni)[&&NHX:name=subgenus Sophophora],(Drosophila grimshawi,(Drosophila virilis,Drosophila mojavensis))[&&NHX:name=subgenus Drosophila])[&&NHX:name=genus Drosophila],(Anopheles gambiae,Aedes aegypti)[&&NHX:name=Culicidae])[&&NHX:name=Arthropoda],Caenorhabditis elegans)[&&NHX:name=Animalia];"
      seed = "Drosophila melanogaster"
      ROOTING, age2name = get_rooting (tol, seed, True)

      ROOTING == {"Aedes aegypti"           : 7,
                  "Anopheles gambiae"       : 7,
                  "Caenorhabditis elegans"  : 8,
                  "Drosophila ananassae"    : 3,
                  "Drosophila erecta"       : 2,
                  "Drosophila grimshawi"    : 6,
                  "Drosophila melanogaster" : 1,
                  "Drosophila mojavensis"   : 6,
                  "Drosophila persimilis"   : 4,
                  "Drosophila pseudoobscura": 4,
                  "Drosophila secchellia"   : 1,
                  "Drosophila simulans"     : 1,
                  "Drosophila virilis"      : 6,
                  "Drosophila willistoni"   : 5,
                  "Drosophila yakuba"       : 2}

      age2name == {1: "Drosophila melanogaster.Drosophila simulans.Drosophila secchellia",
                   2: "melanogaster subgroup",
                   3: "melanogaster group",
                   4: "Sophophora Old World",
                   5: "subgenus Sophophora",
                   6: "genus Drosophila",
                   7: "Arthropoda",
                   8: "Animalia"}

    :argument tol: tree of life, in any form accepted by the Tree constructor
    :argument seed_species: species name (must be the name of a node in tol)
    :argument False agename: if True, also returns the inverse dictionary

    :returns: ROOTING dictionary with age of each species; if agename is
       True, a tuple (ROOTING, age2name) where age2name maps each age to the
       name of the corresponding ancestor

    :raises SystemExit: if seed_species is not found in the tree
    '''
    tol = Tree(tol)

    # Take the first match. Going through iter()/next() works whether
    # search_nodes() returns a list or a lazy iterator.
    node = next(iter(tol.search_nodes(name=seed_species)), None)
    if node is None:
        # Same exception (and message) the exit() builtin would raise, but
        # without depending on the site module having injected exit().
        raise SystemExit('ERROR: Seed species not found in tree\n')

    age = 1
    ROOTING = {}
    age2name = {}
    while not node.is_root:
        node = node.up
        if agename:
            # NOTE(review): assumes unnamed internal nodes carry the
            # placeholder name 'NoName' -- confirm against the Tree class.
            if node.name == 'NoName':
                nam = '.'.join(node.get_leaf_names())
            else:
                nam = node.name
            age2name.setdefault(age, nam)
        for leaf in node.get_leaf_names():
            # setdefault keeps the age of the lowest ancestor containing leaf
            ROOTING.setdefault(leaf, age)
        age += 1

    if agename:
        return ROOTING, age2name
    return ROOTING

74 

75 

# Standard genetic code: DNA codon -> one-letter amino acid ('.' marks a stop).
_GENCODE = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'.', 'TAG':'.',
    'TGC':'C', 'TGT':'C', 'TGA':'.', 'TGG':'W',
    '---':'-', 'NNN':'X',
}

# IUPAC nucleotide ambiguity codes -> the bases each one stands for.
_AMBIG = {
    'R': 'AG', 'Y': 'CT', 'M': 'AC', 'K': 'GT', 'S': 'CG', 'W': 'AT',
    'B': 'CGT', 'D': 'AGT', 'H': 'ACT', 'V': 'ACG',
    'N': 'ACGT',
}


def _translate_ambiguous_codon(codon):
    '''Return the amino acid shared by all expansions of codon, else 'X'.'''
    if len(codon) != 3:
        # incomplete codon at the end of the sequence
        return 'X'
    # every position becomes the collection of bases it may stand for
    choices = [_AMBIG.get(nt, nt) for nt in codon]
    residues = {
        _GENCODE.get(nt1 + nt2 + nt3, 'X')
        for nt1 in choices[0]
        for nt2 in choices[1]
        for nt3 in choices[2]
    }
    if len(residues) == 1:
        return residues.pop()
    return 'X'


def translate(sequence):
    '''
    little function to translate DNA to protein...
    from: http://python.genedrift.org/
    TODO : inseqgroup functions?

    The sequence is upper-cased and read three nucleotides at a time.
    Codons containing IUPAC ambiguity codes are translated when all the
    codons they can stand for encode the same amino acid (e.g. "GAY" -> "D");
    otherwise they become "X". Codons with unknown characters, and an
    incomplete codon at the end of the sequence, also become "X".
    Stop codons are written as "." and the gap codon "---" as "-".

    :argument sequence: string

    :returns: translated sequence
    '''
    sequence = sequence.upper()
    protein = []
    # loop to read DNA sequence in codons, 3 nucleotides at a time
    for start in range(0, len(sequence), 3):
        codon = sequence[start:start + 3]
        aa = _GENCODE.get(codon)
        if aa is None:
            aa = _translate_ambiguous_codon(codon)
        protein.append(aa)
    return ''.join(protein)

139 

140 

# reused from pycogent
# Numeric constants shared by the chi-square / gamma routines in this module.
ROUND_ERROR = 1e-14  # tolerance used by fix_rounding_error()
MAXLOG = 7.09782712893383996843E2  # igam()/igamc() return 0 when the log-scale factor is below -MAXLOG
MAXLGM = 2.556348e305  # lgam() raises OverflowError for arguments above this
big = 4.503599627370496e15  # 2**52; igamc() rescales its recurrence terms once |pk| exceeds it
biginv = 2.22044604925031308085e-16  # 2**-52; the rescaling factor applied in igamc()
MACHEP = 1.11022302462515654042E-16  # 2**-53; convergence threshold in igam()/igamc()
LS2PI = 0.91893853320467274178  # log(sqrt(2*pi))
LOGPI = 1.14472988584940017414  # log(pi)

150 

151 

def chi_high(x, df):
    """Right-hand tail of the chi-square distribution (from x to infinity).

    x is the chi-square value; it must not be negative (a value just below
    zero, within rounding error, is treated as zero).
    df is the number of degrees of freedom and must be at least 1
    (integers are assumed). Typically, df is (r-1)*(c-1) for a r by c table.

    The result ranges from 0 to 1.

    Raises ValueError if x < 0 or df < 1.

    See Cephes docs for details.
    """
    chi2 = fix_rounding_error(x)

    if chi2 < 0:
        raise ValueError("chi_high: x must be >= 0 (got %s)." % chi2)
    if df < 1:
        raise ValueError("chi_high: df must be >= 1 (got %s)." % df)

    half_df = float(df) / 2
    return igamc(half_df, chi2 / 2)

169 

170 

def fix_rounding_error(x):
    """Snap x back into the range 0-1 when it is only just outside of it.

    A value strictly between -ROUND_ERROR and 0 becomes 0, and a value
    strictly between 1 and 1+ROUND_ERROR becomes 1. Anything else is
    returned untouched.
    """
    if -ROUND_ERROR < x < 0:
        # slightly negative: treat as exactly zero
        return 0
    if 1 < x < 1 + ROUND_ERROR:
        # slightly above one: treat as exactly one
        return 1
    return x

182 

183 

def igamc(a,x):
    """Complemented incomplete Gamma integral: see Cephes docs.

    Returns 1 when x <= 0 or a <= 0. When x < 1 or x < a the result is
    obtained as 1 - igam(a, x); otherwise a continued fraction is iterated
    until the relative change between successive estimates is at most MACHEP.
    """
    if x <= 0 or a <= 0:
        return 1
    if x < 1 or x < a:
        return 1 - igam(a, x)

    log_factor = a * log(x) - x - lgam(a)
    if log_factor < -MAXLOG:
        # exp() of this would underflow
        return 0
    factor = exp(log_factor)

    # continued fraction
    y = 1 - a
    z = x + y + 1
    step = 0
    num_prev2 = 1
    den_prev2 = x
    num_prev1 = x + 1
    den_prev1 = z * x
    estimate = num_prev1 / den_prev1

    while True:
        step += 1
        y += 1
        z += 2
        y_step = y * step
        num = num_prev1 * z - num_prev2 * y_step
        den = den_prev1 * z - den_prev2 * y_step
        if den != 0:
            ratio = num / den
            change = abs((estimate - ratio) / ratio)
            estimate = ratio
        else:
            # no new estimate this round; force another iteration
            change = 1
        num_prev2, num_prev1 = num_prev1, num
        den_prev2, den_prev1 = den_prev1, den
        if abs(num) > big:
            # scale all four recurrence terms down by the same factor
            num_prev2 *= biginv
            num_prev1 *= biginv
            den_prev2 *= biginv
            den_prev1 *= biginv
        if change <= MACHEP:
            break
    return estimate * factor

229 

230 

def lgam(x):
    """Natural log of the gamma function: see Cephes docs for details.

    Raises OverflowError where the result would be infinite (poles of the
    gamma function) or when x is larger than MAXLGM.
    """
    if x < -34:
        # Large negative argument: combine lgam(-x) with a sine term.
        mirrored = -x
        lgam_mirrored = lgam(mirrored)
        whole = floor(mirrored)
        if whole == mirrored:
            raise OverflowError("lgam returned infinity.")
        frac = mirrored - whole
        if frac > 0.5:
            whole += 1
            frac = whole - mirrored
        sine_term = mirrored * sin(PI * frac)
        if sine_term == 0:
            raise OverflowError("lgam returned infinity.")
        return LOGPI - log(sine_term) - lgam_mirrored

    if x < 13:
        # Shift the argument into [2, 3), accumulating the factors used to
        # get there, then apply the rational approximation GB/GC.
        factor = 1
        shift = 0
        shifted = x
        while shifted >= 3:
            shift -= 1
            shifted = x + shift
            factor *= shifted
        while shifted < 2:
            if shifted == 0:
                raise OverflowError("lgam returned infinity.")
            factor /= shifted
            shift += 1
            shifted = x + shift
        if factor < 0:
            factor = -factor
        if shifted == 2:
            return log(factor)
        shift -= 2
        reduced = x + shift
        correction = reduced * polevl(reduced, GB)/polevl(reduced, GC)
        return log(factor) + correction

    if x > MAXLGM:
        raise OverflowError("Too large a value of x in lgam.")

    # Large argument: leading term plus a correction in powers of 1/x**2.
    approx = (x - 0.5) * log(x) - x + LS2PI
    if x > 1.0e8:
        return approx
    inv_sq = 1/(x*x)
    if x >= 1000:
        approx += ((   7.9365079365079365079365e-4 * inv_sq
                     -2.7777777777777777777778e-3) *inv_sq
                   + 0.0833333333333333333333) / x
    else:
        approx += polevl(inv_sq, GA)/x
    return approx

283 

284 

def polevl(x, coef):
    """Evaluate the polynomial y = C_0 + C_1x + C_2x^2 + ... + C_Nx^N at x.

    coef holds the coefficients from the highest power down, that is
    coef[0] = C_N and coef[-1] = C_0. An empty coef gives 0.
    """
    total = 0
    # Horner's scheme: fold in one coefficient per step, highest power first
    for coefficient in coef:
        total = total * x + coefficient
    return total

294 

295 

def igam(a, x):
    """Left tail of incomplete gamma function: see Cephes docs for details.

    Returns 0 when x <= 0 or a <= 0. When x > 1 and x > a the result is
    obtained as 1 - igamc(a, x); otherwise a power series is summed until
    the latest term, relative to the running sum, is at most MACHEP.
    """
    if x <= 0 or a <= 0:
        return 0
    if x > 1 and x > a:
        return 1 - igamc(a,x)

    # Compute x**a * exp(-x) / Gamma(a), working on the log scale first
    log_factor = a * log(x) - x - lgam(a)
    if log_factor < -MAXLOG:
        # exp() of this would underflow
        return 0.0
    factor = exp(log_factor)

    # power series
    denom = a
    term = 1
    series = 1
    while True:
        denom += 1
        term *= x/denom
        series += term
        if term/series <= MACHEP:
            break

    return series * factor / a

322 

#Coefficients for Gamma follow:
# Every table is ordered for polevl(): highest-power coefficient first.

# Used by lgam() for 13 <= x < 1000: polevl(1/x**2, GA) / x is added to the
# leading approximation.
GA = [
    8.11614167470508450300E-4,
    -5.95061904284301438324E-4,
    7.93650340457716943945E-4,
    -2.77777777730099687205E-3,
    8.33333333333331927722E-2,
]

# Numerator of the rational approximation used by lgam() for x < 13.
GB = [
    -1.37825152569120859100E3,
    -3.88016315134637840924E4,
    -3.31612992738871184744E5,
    -1.16237097492762307383E6,
    -1.72173700820839662146E6,
    -8.53555664245765465627E5,
]

# Denominator of the rational approximation used by lgam() for x < 13.
GC = [
    1.00000000000000000000E0,
    -3.51815701436523470549E2,
    -1.70642106651881159223E4,
    -2.20528590553854454839E5,
    -1.13933444367982507207E6,
    -2.53252307177582951285E6,
    -2.01889141433532773231E6,
]

# NOTE(review): GP and GQ are not referenced by any function visible in this
# module; presumably kept from the original gamma implementation -- confirm
# before removing.
GP = [
    1.60119522476751861407E-4,
    1.19135147006586384913E-3,
    1.04213797561761569935E-2,
    4.76367800457137231464E-2,
    2.07448227648435975150E-1,
    4.94214826801497100753E-1,
    9.99999999999999996796E-1,
]

GQ = [
    -2.31581873324120129819E-5,
    5.39605580493303397842E-4,
    -4.45641913851797240494E-3,
    1.18139785222060435552E-2,
    3.58236398605498653373E-2,
    -2.34591795718243348568E-1,
    7.14304917030273074085E-2,
    1.00000000000000000320E0,
]

370]