Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/evol/evoltree.py: 20%

225 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-03-21 09:19 +0100

1#!/usr/bin/python3 

2 

3""" 

4This module defines the EvolNode dataytype to manage evolutionary 

5variables and integrate them within phylogenetic trees. It inherits 

6the PhyloTree and add some speciall features to the the node 

7instances. 

8""" 

9from __future__ import absolute_import 

10from ..tools.utils import which 

11from .utils import translate 

12from .model import Model, PARAMS, AVAIL 

13from .. import PhyloTree, SeqGroup 

14from warnings import warn 

15import sys 

16import os 

17 

18__author__ = "Francois-Jose Serra" 

19__email__ = "francois@barrabin.org" 

20__licence__ = "GPLv3" 

21__version__ = "0.0" 

22__references__ = ''' 

23Yang, Z., Nielsen, R., Goldman, N., & Pedersen, A. M. 2000. 

24 Codon-substitution models for heterogeneous selection pressure at amino acid sites. 

25 Genetics 155: 431-49. 

26 Retrieved from http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=1461088&tool=pmcentrez&rendertype=abstract 

27Yang, Z., & Nielsen, R. 2002. 

28 Codon-substitution models for detecting molecular adaptation at individual sites along specific lineages. 

29 Molecular biology and evolution 19: 908-17. 

30 Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/12032247 

31Bielawski, J. P., & Yang, Z. 2004. 

32 A maximum likelihood method for detecting functional divergence at individual codon sites, with application to gene family evolution. 

33 Journal of molecular evolution 59: 121-32. 

34 Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/15383915 

35Zhang, J., Nielsen, R., & Yang, Z. 2005. 

36 Evaluation of an improved branch-site likelihood method for detecting positive selection at the molecular level. 

37 Molecular biology and evolution 22: 2472-9. 

38 Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/16107592 

39Yang, Z. 2007. 

40 PAML 4: phylogenetic analysis by maximum likelihood. 

41 Molecular biology and evolution 24: 1586-91. 

42 Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/17483113 

43''' 

44 

45 

46try: 

47 from scipy.stats import chi2 

48 def chi_high(x, y): return 1 - chi2.cdf(x, y) 

49except ImportError: 

50 from .utils import chi_high 

51 

52try: 

53 from ..treeview import TreeStyle 

54except ImportError: 

55 TREEVIEW = False 

56else: 

57 TREEVIEW = True 

58 

59__all__ = ["EvolNode", "EvolTree"] 

60 

61 

62def _parse_species(name): 

63 return name[:3] # just to return specie name from fasta description 

64 

65 

66class EvolNode(PhyloTree): 

67 """Re-implementation of the standart Tree instance. It adds 

68 attributes and methods to work with phylogentic trees. 

69 

70 :param newick: Path to tree in newick format, can also be a string 

71 :param alignment: Path to alignment, can also be a string. 

72 :param fasta alg_format: Alignment format. 

73 :param sp_naming_function: Function to infer species name. 

74 :param format: Type of newick format 

75 :param binpath: Path to binaries, in case codeml or SLR are not in global path. 

76 """ 

77 

78 def __init__(self, newick=None, alignment=None, alg_format="fasta", 

79 sp_naming_function=_parse_species, parser=0, 

80 binpath='', **kwargs): 

81 ''' 

82 freebranch: path to find codeml output of freebranch model. 

83 ''' 

84 # _update names? 

85 self.workdir = '/tmp/ete3-tmp/' 

86 if not binpath: 

87 ete3_path = which("ete3") 

88 binpath = os.path.split(ete3_path)[0] 

89 

90 self.execpath = binpath 

91 self._models = {} 

92 self.__gui_mark_mode = False 

93 

94 PhyloTree.__init__(self, newick=newick, parser=parser, 

95 sp_naming_function=sp_naming_function, **kwargs) 

96 

97 if newick: 

98 self._label_as_paml() 

99 # initialize node marks 

100 self.mark_tree([]) 

101 

102 def _set_mark_mode(self, val): 

103 self.__gui_mark_mode = val 

104 

105 def _is_mark_mode(self): 

106 return self.__gui_mark_mode 

107 

108 def _label_internal_nodes(self, nid=None): 

109 """ 

110 nid needs to be a list in order to keep count through recursivity 

111 """ 

112 for node in self.get_children(): 

113 if node.is_leaf: 

114 continue 

115 nid[0] += 1 

116 node.add_prop('node_id', nid[0]) 

117 node._label_internal_nodes(nid) 

118 

119 def _label_as_paml(self): 

120 ''' 

121 to label tree as paml, nearly walking man over the tree algorithm 

122 WARNING: sorted names in same order that sequence 

123 WARNING: depends on tree topology conformation, not the same after a swap 

124 activates the function get_descendants_by_pamlid 

125 ''' 

126 nid = 1 

127 # check we do not have dupplicated names in tree 

128 if (len(self)) != len(set(self.leaf_names())): 

129 duplis = [n for n in self.leaf_names( 

130 ) if self.leaf_names().count(n) > 1] 

131 raise Exception('EvolTree require unique names for leaves', duplis) 

132 # put ids 

133 for leaf in sorted(self, key=lambda x: x.name): 

134 leaf.add_prop('node_id', nid) 

135 nid += 1 

136 self.add_prop('node_id', nid) 

137 self._label_internal_nodes([nid]) 

138 

139 def get_descendant_by_node_id(self, idname): 

140 ''' 

141 returns node list corresponding to a given idname 

142 ''' 

143 try: 

144 for n in self.descendants(): 

145 if n.props.get('node_id') == idname: 

146 return n 

147 if self.props.get('node_id') == idname: 

148 return self 

149 except AttributeError: 

150 warn('Should be first labelled as paml ' + 

151 '(automatically done when alignemnt is loaded)') 

152 

153 def _write_algn(self, fullpath): 

154 """ 

155 to write algn in paml format 

156 """ 

157 seq_group = SeqGroup() 

158 for n in self: 

159 seq_group.id2seq [n.props.get('node_id')] = n.props.get('nt_sequence') 

160 seq_group.id2name [n.props.get('node_id')] = n.name 

161 seq_group.name2id [n.name ] = n.props.get('node_id') 

162 seq_group.write(outfile=fullpath, format='paml') 

163 

164 def run_model(self, model_name, ctrl_string='', keep=True, **kwargs): 

165 ''' 

166 To compute evolutionnary models. e.g.: b_free_lala.vs.lele, will launch one free branch model, and store 

167 it in "WORK_DIR/b_free_lala.vs.lele" directory 

168 

169 WARNING: this functionality needs to create a working directory in "rep" 

170 

171 WARNING: you need to have codeml and/or SLR in your path 

172 

173 The models available are: 

174 

175 =========== ============================= ================== 

176 Model name Description Model kind 

177 =========== ============================= ==================\n%s 

178 =========== ============================= ==================\n 

179 

180 **Note that M1 and M2 models are making reference to the new versions 

181 of these models, with continuous omega rates (namely M1a and M2a in the 

182 PAML user guide).** 

183 

184 :argument model_name: a string like "model-name[.some-secondary-name]" (e.g.: "fb.my_first_try", or just "fb") 

185 * model-name is compulsory, is the name of the model (see table above for the full list) 

186 * the second part is accessory, it is to avoid over-writing models with the same name. 

187 :argument ctrl_string: list of parameters that can be used as control file. 

188 :argument True keep: links the model to the tree (equivalen of running `EVOL_TREE.link_to_evol_model(MODEL_NAME)`) 

189 :argument kwargs: extra parameters should be one of: %s. 

190 ''' 

191 from subprocess import Popen, PIPE 

192 model_obj = Model(model_name, self, **kwargs) 

193 fullpath = os.path.join(self.workdir, model_obj.name) 

194 os.system("mkdir -p %s" % fullpath) 

195 # write tree file 

196 self._write_algn(fullpath + '/algn') 

197 if model_obj.properties['exec'] == 'Slr': 

198 self.write(outfile=fullpath+'/tree', format=(11)) 

199 else: 

200 self.write(outfile=fullpath+'/tree', 

201 format=(10 if model_obj.properties['allow_mark'] else 9)) 

202 # write algn file 

203 # MODEL MODEL MDE 

204 if ctrl_string == '': 

205 ctrl_string = model_obj.get_ctrl_string(fullpath+'/tmp.ctl') 

206 else: 

207 open(fullpath+'/tmp.ctl', 'w').write(ctrl_string) 

208 hlddir = os.getcwd() 

209 os.chdir(fullpath) 

210 bin_ = os.path.join(self.execpath, model_obj.properties['exec']) 

211 try: 

212 proc = Popen([bin_, 'tmp.ctl'], stdout=PIPE, 

213 stdin=PIPE, stderr=PIPE) 

214 except OSError: 

215 raise Exception(('ERROR: {} not installed, ' + 

216 'or wrong path to binary\n').format(bin_)) 

217 # send \n via stdin in case codeml/slr asks something (note on py3, stdin needs bytes) 

218 run, err = proc.communicate(b'\n') 

219 run = run.decode(sys.stdout.encoding) 

220 

221 os.chdir(hlddir) 

222 if err: 

223 warn("ERROR: inside codeml!!\n" + err) 

224 return 1 

225 if keep: 

226 setattr(model_obj, 'run', run) 

227 self.link_to_evol_model(os.path.join(fullpath, 'out'), model_obj) 

228 sep = '\n' 

229 run_model.__doc__ = run_model.__doc__ % \ 

230 (sep.join([' %-8s %-27s %-15s ' % 

231 ('%s' % (x), AVAIL[x]['evol'], AVAIL[x]['typ']) for x in sorted(sorted(AVAIL.keys()), key=lambda x: 

232 AVAIL[x]['typ'], 

233 reverse=True)]), 

234 ', '.join(list(PARAMS.keys()))) 

235 

236 # def test_codon_model(self): 

237 # for c_frq in range(4): 

238 # self.run_model('M0.model_test-'+str(c_frq), CodonFreq=c_frq) 

239 # if self.get_most_likely('M0.model_test-1', 'M0.model_test-0') > 0.05: 

240 # 

241 # self.get_most_likely('M0.model_test-2', 'M0.model_test-0') 

242 # self.get_most_likely('M0.model_test-3', 'M0.model_test-0') 

243 # self.get_most_likely('M0.model_test-2', 'M0.model_test-1') 

244 # self.get_most_likely('M0.model_test-3', 'M0.model_test-1') 

245 # self.get_most_likely('M0.model_test-3', 'M0.model_test-2') 

246 

247 def link_to_alignment(self, alignment, alg_format="paml", 

248 nucleotides=True, **kwargs): 

249 ''' 

250 same function as for phyloTree, but translate sequences if nucleotides 

251 nucleotidic sequence is kept under node.get('nt_sequence') 

252 

253 :argument alignment: path to alignment or string 

254 :argument alg_format: one of fasta phylip or paml 

255 :argument True alignment: set to False in case we want to keep it untranslated 

256 

257 ''' 

258 super(EvolTree, self).link_to_alignment(alignment, 

259 alg_format=alg_format, **kwargs) 

260 check_len = 0 

261 for leaf in self.leaves(): 

262 seq_len = len(str(leaf.props.get('sequence'))) 

263 if check_len and check_len != seq_len: 

264 warn('WARNING: sequences with different lengths found!') 

265 check_len = seq_len 

266 leaf.add_prop('nt_sequence', str(leaf.props.get('sequence'))) 

267 if nucleotides: 

268 leaf.add_prop('sequence', translate(leaf.props.get('nt_sequence'))) 

269 

270 def show(self, layout=None, tree_style=None, histfaces=None): 

271 ''' 

272 call super show of PhyloTree 

273 histface should be a list of models to be displayes as histfaces 

274 

275 :argument layout: a layout function 

276 :argument None tree_style: tree_style object 

277 :argument Nonehistface: an histogram face function. This is only to plot selective pressure among sites 

278 

279 ''' 

280 if TREEVIEW: 

281 if not tree_style: 

282 ts = TreeStyle() 

283 else: 

284 ts = tree_style 

285 if histfaces: 

286 for hist in histfaces: 

287 try: 

288 mdl = self.get_evol_model(hist) 

289 except AttributeError: 

290 warn('model %s not computed' % (hist)) 

291 if not 'histface' in mdl.properties: 

292 if len(histfaces) > 1 and histfaces.index(hist) != 0: 

293 mdl.set_histface(up=False) 

294 else: 

295 mdl.set_histface() 

296 if mdl.properties['histface'].up: 

297 ts.aligned_header.add_face( 

298 mdl.properties['histface'], 1) 

299 else: 

300 ts.aligned_foot.add_face( 

301 mdl.properties['histface'], 1) 

302 super(EvolTree, self).show(layout=layout, tree_style=ts) 

303 else: 

304 raise ValueError("Treeview module is disabled") 

305 

306 def render(self, file_name, layout=None, w=None, h=None, 

307 tree_style=None, header=None, histfaces=None): 

308 """Add up and down faces and render the tree representation to a file. 

309 

310 :param histface: A histogram face function. This is only to 

311 plot selective pressure among sites. 

312 """ 

313 if not TREEVIEW: 

314 raise ValueError("Treeview module is disabled") 

315 

316 ts = tree_style or TreeStyle() 

317 

318 for hist in (histfaces or []): 

319 try: 

320 mdl = self.get_evol_model(hist) 

321 except AttributeError: 

322 warn('model %s not computed' % (hist)) 

323 

324 if not 'histface' in mdl.properties: 

325 if len(histfaces) > 1 and histfaces.index(hist) != 0: 

326 mdl.set_histface(up=False) 

327 else: 

328 mdl.set_histface() 

329 

330 if mdl.properties['histface'].up: 

331 ts.aligned_header.add_face(mdl.properties['histface'], 1) 

332 else: 

333 ts.aligned_foot.add_face(mdl.properties['histface'], 1) 

334 

335 return super().render(file_name, layout=layout, tree_style=ts, w=w, h=h) 

336 

337 def mark_tree(self, node_ids, verbose=False, **kargs): 

338 ''' 

339 function to mark branches on tree in order that paml could interpret it. 

340 takes a "marks" argument that should be a list of #1,#1,#2 

341 e.g.: 

342 :: 

343 

344 t=Tree.mark_tree([2,3], marks=["#1","#2"]) 

345 

346 :argument node_ids: list of node ids (have a look to node.props.get('node_id')) 

347 :argument False verbose: warn if marks do not correspond to codeml standard 

348 :argument kwargs: mainly for the marks key-word which needs a list of marks (marks=['#1', '#2']) 

349 

350 ''' 

351 from re import match 

352 node_ids = list(map(int, node_ids)) 

353 if 'marks' in kargs: 

354 marks = list(kargs['marks']) 

355 else: 

356 marks = ['#1']*len(node_ids) 

357 for node in self.traverse(): 

358 if not node.props.get('node_id'): 

359 continue 

360 if node.props.get('node_id') in node_ids: 

361 if ('.' in marks[node_ids.index(node.props.get('node_id'))] or 

362 match('#[0-9]+', 

363 marks[node_ids.index(node.props.get('node_id'))]) is None) and verbose: 

364 warn('WARNING: marks should be "#" sign directly ' + 

365 'followed by integer\n' + self.mark_tree.__doc__) 

366 node.add_prop('mark', ' '+marks[node_ids.index(node.props.get('node_id'))]) 

367 elif node.props.get('mark') == None: 

368 node.add_prop('mark', '') 

369 

370 def link_to_evol_model(self, path, model): 

371 ''' 

372 link EvolTree to evolutionary model 

373 * free-branch model ("fb") will append evol values to tree 

374 * Site models (M0, M1, M2, M7, M8) will give evol values by site 

375 and likelihood 

376 

377 :argument path: path to outfile containing model computation result 

378 :argument model: either the name of a model, or a Model object (usually empty) 

379 

380 ''' 

381 if isinstance(model, str): 

382 model = Model(model, self, path) 

383 else: 

384 model._load(path) 

385 # new entry in _models dict 

386 while model.name in self._models: 

387 model.name = model.name.split('__')[0] + str( 

388 (int(model.name.split('__')[1]) + 1) 

389 if '__' in model.name else 0) 

390 self._models[model.name] = model 

391 if not os.path.isfile(path): 

392 warn("ERROR: not a file: " + path) 

393 return 1 

394 if len(self._models) == 1 and model.properties['exec'] == 'codeml': 

395 self.change_dist_to_evol('bL', model, fill=True) 

396 

397 def get_evol_model(self, modelname): 

398 ''' 

399 returns one precomputed model 

400 

401 :argument modelname: string of the name of a model object stored 

402 :returns: Model object 

403 ''' 

404 try: 

405 return self._models[modelname] 

406 except KeyError: 

407 Exception("ERROR: Model %s not found." % (modelname)) 

408 

409 def write(self, properties=None, outfile=None, format=10): 

410 """ 

411 Inherits from Tree but adds the tenth format, that allows to display marks for CodeML. 

412 TODO: internal writting format need to be something like 0 

413 """ 

414 from re import sub 

415 if int(format) == 11: 

416 nwk = ' %s 1\n' % (len(self)) 

417 nwk += sub(r'\[&&NHX:mark=([ #0-9.]*)\]', r'\1', \ 

418 write_newick(self, properties=['mark'],format=9)) 

419 elif int(format)==10: 

420 nwk = sub(r'\[&&NHX:mark=([ #0-9.]*)\]', r'\1', \ 

421 write_newick(self, properties=['mark'],format=9)) 

422 else: 

423 nwk = write_newick(self, properties=properties,format=format) 

424 if outfile is not None: 

425 open(outfile, "w").write(nwk) 

426 return nwk 

427 else: 

428 return nwk 

429 #write.__doc__ += super(PhyloTree, PhyloTree()).write.__doc__.replace( 

430 # 'argument format', 'argument 10 format') 

431 

432 def get_most_likely(self, altn, null): 

433 """Return pvalue of LRT between alternative model and null model. 

434 

435 Usual comparison are: 

436 

437 .. table:: 

438 

439 =========== ======= =========================================== 

440 Alternative Null Test 

441 =========== ======= =========================================== 

442 M2 M1 PS on sites (M2 prone to miss some sites) 

443 (Yang 2000). 

444 M3 M0 test of variability among sites 

445 M8 M7 PS on sites 

446 (Yang 2000) 

447 M8 M8a RX on sites?? think so.... 

448 bsA bsA1 PS on sites on specific branch 

449 (Zhang 2005) 

450 bsA M1 RX on sites on specific branch 

451 (Zhang 2005) 

452 bsC M1 different omegas on clades branches sites 

453 ref: Yang Nielsen 2002 

454 bsD M3 different omegas on clades branches sites 

455 (Yang Nielsen 2002, Bielawski 2004) 

456 b_free b_neut foreground branch not neutral (w != 1) 

457 - RX if P<0.05 (means that w on frg=1) 

458 - PS if P>0.05 and wfrg>1 

459 - CN if P>0.05 and wfrg>1 

460 (Yang Nielsen 2002) 

461 b_free M0 different ratio on branches 

462 (Yang Nielsen 2002) 

463 =========== ======= =========================================== 

464 

465 Note that M1 and M2 models are making reference to the new 

466 versions of these models, with continuous omega rates (namely 

467 M1a and M2a in the PAML user guide). 

468 

469 :param altn: model with higher number of parameters (np). 

470 :param null: model with lower number of parameters (np). 

471 """ 

472 altn = self.get_evol_model(altn) 

473 null = self.get_evol_model(null) 

474 if null.np > altn.np: 

475 warn("first model should be the alternative, change the order") 

476 return 1.0 

477 try: 

478 if hasattr(altn, 'lnL') and hasattr(null, 'lnL'): 

479 if null.lnL - altn.lnL < 0: 

480 return chi_high(2 * abs(altn.lnL - null.lnL), 

481 float(altn.np - null.np)) 

482 else: 

483 warn("\nWARNING: Likelihood of the alternative model is " 

484 "smaller than null's (%f - %f = %f)" % ( 

485 null.lnL, altn.lnL, null.lnL - altn.lnL) + 

486 "\nLarge differences (> 0.1) may indicate mistaken " 

487 "assigantion of null and alternative models") 

488 return 1 

489 except KeyError: 

490 warn("at least one of %s or %s, was not calculated" % (altn.name, 

491 null.name)) 

492 exit(self.get_most_likely.__doc__) 

493 

494 def change_dist_to_evol(self, evol, model, fill=False): 

495 

496 """Change dist/branch length of tree to a given evolutionary variable. 

497 

498 :param evol: Evolutionary variable (dN, dS, w, bL, by default bL). 

499 :param model: Model obj from which to retrieve evolutionary variables. 

500 :param fill: In addition to changing the dist parameter, annotate nodes 

501 with all evolutionary variables (add to their props: dN, w...). 

502 """ 

503 # branch-site outfiles do not give specific branch info 

504 if not model.branches: 

505 return 

506 for node in self.descendants(): 

507 if not evol in model.branches[node.props.get('node_id')]: 

508 continue 

509 node.dist = model.branches[node.props.get('node_id')][evol] 

510 if fill: 

511 for e in ['dN', 'dS', 'w', 'bL']: 

512 node.add_prop(e, model.branches [node.props.get('node_id')][e]) 

513 

514 

515# cosmetic alias 

516EvolTree = EvolNode