Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/evol/evoltree.py: 20%
225 statements
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
1#!/usr/bin/python3
3"""
4This module defines the EvolNode dataytype to manage evolutionary
5variables and integrate them within phylogenetic trees. It inherits
6the PhyloTree and add some speciall features to the the node
7instances.
8"""
9from __future__ import absolute_import
10from ..tools.utils import which
11from .utils import translate
12from .model import Model, PARAMS, AVAIL
13from .. import PhyloTree, SeqGroup
14from warnings import warn
15import sys
16import os
18__author__ = "Francois-Jose Serra"
19__email__ = "francois@barrabin.org"
20__licence__ = "GPLv3"
21__version__ = "0.0"
22__references__ = '''
23Yang, Z., Nielsen, R., Goldman, N., & Pedersen, A. M. 2000.
24 Codon-substitution models for heterogeneous selection pressure at amino acid sites.
25 Genetics 155: 431-49.
26 Retrieved from http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=1461088&tool=pmcentrez&rendertype=abstract
27Yang, Z., & Nielsen, R. 2002.
28 Codon-substitution models for detecting molecular adaptation at individual sites along specific lineages.
29 Molecular biology and evolution 19: 908-17.
30 Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/12032247
31Bielawski, J. P., & Yang, Z. 2004.
32 A maximum likelihood method for detecting functional divergence at individual codon sites, with application to gene family evolution.
33 Journal of molecular evolution 59: 121-32.
34 Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/15383915
35Zhang, J., Nielsen, R., & Yang, Z. 2005.
36 Evaluation of an improved branch-site likelihood method for detecting positive selection at the molecular level.
37 Molecular biology and evolution 22: 2472-9.
38 Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/16107592
39Yang, Z. 2007.
40 PAML 4: phylogenetic analysis by maximum likelihood.
41 Molecular biology and evolution 24: 1586-91.
42 Retrieved from http://www.ncbi.nlm.nih.gov/pubmed/17483113
43'''
46try:
47 from scipy.stats import chi2
48 def chi_high(x, y): return 1 - chi2.cdf(x, y)
49except ImportError:
50 from .utils import chi_high
52try:
53 from ..treeview import TreeStyle
54except ImportError:
55 TREEVIEW = False
56else:
57 TREEVIEW = True
59__all__ = ["EvolNode", "EvolTree"]
62def _parse_species(name):
63 return name[:3] # just to return specie name from fasta description
66class EvolNode(PhyloTree):
67 """Re-implementation of the standart Tree instance. It adds
68 attributes and methods to work with phylogentic trees.
70 :param newick: Path to tree in newick format, can also be a string
71 :param alignment: Path to alignment, can also be a string.
72 :param fasta alg_format: Alignment format.
73 :param sp_naming_function: Function to infer species name.
74 :param format: Type of newick format
75 :param binpath: Path to binaries, in case codeml or SLR are not in global path.
76 """
78 def __init__(self, newick=None, alignment=None, alg_format="fasta",
79 sp_naming_function=_parse_species, parser=0,
80 binpath='', **kwargs):
81 '''
82 freebranch: path to find codeml output of freebranch model.
83 '''
84 # _update names?
85 self.workdir = '/tmp/ete3-tmp/'
86 if not binpath:
87 ete3_path = which("ete3")
88 binpath = os.path.split(ete3_path)[0]
90 self.execpath = binpath
91 self._models = {}
92 self.__gui_mark_mode = False
94 PhyloTree.__init__(self, newick=newick, parser=parser,
95 sp_naming_function=sp_naming_function, **kwargs)
97 if newick:
98 self._label_as_paml()
99 # initialize node marks
100 self.mark_tree([])
102 def _set_mark_mode(self, val):
103 self.__gui_mark_mode = val
105 def _is_mark_mode(self):
106 return self.__gui_mark_mode
108 def _label_internal_nodes(self, nid=None):
109 """
110 nid needs to be a list in order to keep count through recursivity
111 """
112 for node in self.get_children():
113 if node.is_leaf:
114 continue
115 nid[0] += 1
116 node.add_prop('node_id', nid[0])
117 node._label_internal_nodes(nid)
119 def _label_as_paml(self):
120 '''
121 to label tree as paml, nearly walking man over the tree algorithm
122 WARNING: sorted names in same order that sequence
123 WARNING: depends on tree topology conformation, not the same after a swap
124 activates the function get_descendants_by_pamlid
125 '''
126 nid = 1
127 # check we do not have dupplicated names in tree
128 if (len(self)) != len(set(self.leaf_names())):
129 duplis = [n for n in self.leaf_names(
130 ) if self.leaf_names().count(n) > 1]
131 raise Exception('EvolTree require unique names for leaves', duplis)
132 # put ids
133 for leaf in sorted(self, key=lambda x: x.name):
134 leaf.add_prop('node_id', nid)
135 nid += 1
136 self.add_prop('node_id', nid)
137 self._label_internal_nodes([nid])
139 def get_descendant_by_node_id(self, idname):
140 '''
141 returns node list corresponding to a given idname
142 '''
143 try:
144 for n in self.descendants():
145 if n.props.get('node_id') == idname:
146 return n
147 if self.props.get('node_id') == idname:
148 return self
149 except AttributeError:
150 warn('Should be first labelled as paml ' +
151 '(automatically done when alignemnt is loaded)')
153 def _write_algn(self, fullpath):
154 """
155 to write algn in paml format
156 """
157 seq_group = SeqGroup()
158 for n in self:
159 seq_group.id2seq [n.props.get('node_id')] = n.props.get('nt_sequence')
160 seq_group.id2name [n.props.get('node_id')] = n.name
161 seq_group.name2id [n.name ] = n.props.get('node_id')
162 seq_group.write(outfile=fullpath, format='paml')
164 def run_model(self, model_name, ctrl_string='', keep=True, **kwargs):
165 '''
166 To compute evolutionnary models. e.g.: b_free_lala.vs.lele, will launch one free branch model, and store
167 it in "WORK_DIR/b_free_lala.vs.lele" directory
169 WARNING: this functionality needs to create a working directory in "rep"
171 WARNING: you need to have codeml and/or SLR in your path
173 The models available are:
175 =========== ============================= ==================
176 Model name Description Model kind
177 =========== ============================= ==================\n%s
178 =========== ============================= ==================\n
180 **Note that M1 and M2 models are making reference to the new versions
181 of these models, with continuous omega rates (namely M1a and M2a in the
182 PAML user guide).**
184 :argument model_name: a string like "model-name[.some-secondary-name]" (e.g.: "fb.my_first_try", or just "fb")
185 * model-name is compulsory, is the name of the model (see table above for the full list)
186 * the second part is accessory, it is to avoid over-writing models with the same name.
187 :argument ctrl_string: list of parameters that can be used as control file.
188 :argument True keep: links the model to the tree (equivalen of running `EVOL_TREE.link_to_evol_model(MODEL_NAME)`)
189 :argument kwargs: extra parameters should be one of: %s.
190 '''
191 from subprocess import Popen, PIPE
192 model_obj = Model(model_name, self, **kwargs)
193 fullpath = os.path.join(self.workdir, model_obj.name)
194 os.system("mkdir -p %s" % fullpath)
195 # write tree file
196 self._write_algn(fullpath + '/algn')
197 if model_obj.properties['exec'] == 'Slr':
198 self.write(outfile=fullpath+'/tree', format=(11))
199 else:
200 self.write(outfile=fullpath+'/tree',
201 format=(10 if model_obj.properties['allow_mark'] else 9))
202 # write algn file
203 # MODEL MODEL MDE
204 if ctrl_string == '':
205 ctrl_string = model_obj.get_ctrl_string(fullpath+'/tmp.ctl')
206 else:
207 open(fullpath+'/tmp.ctl', 'w').write(ctrl_string)
208 hlddir = os.getcwd()
209 os.chdir(fullpath)
210 bin_ = os.path.join(self.execpath, model_obj.properties['exec'])
211 try:
212 proc = Popen([bin_, 'tmp.ctl'], stdout=PIPE,
213 stdin=PIPE, stderr=PIPE)
214 except OSError:
215 raise Exception(('ERROR: {} not installed, ' +
216 'or wrong path to binary\n').format(bin_))
217 # send \n via stdin in case codeml/slr asks something (note on py3, stdin needs bytes)
218 run, err = proc.communicate(b'\n')
219 run = run.decode(sys.stdout.encoding)
221 os.chdir(hlddir)
222 if err:
223 warn("ERROR: inside codeml!!\n" + err)
224 return 1
225 if keep:
226 setattr(model_obj, 'run', run)
227 self.link_to_evol_model(os.path.join(fullpath, 'out'), model_obj)
228 sep = '\n'
229 run_model.__doc__ = run_model.__doc__ % \
230 (sep.join([' %-8s %-27s %-15s ' %
231 ('%s' % (x), AVAIL[x]['evol'], AVAIL[x]['typ']) for x in sorted(sorted(AVAIL.keys()), key=lambda x:
232 AVAIL[x]['typ'],
233 reverse=True)]),
234 ', '.join(list(PARAMS.keys())))
236 # def test_codon_model(self):
237 # for c_frq in range(4):
238 # self.run_model('M0.model_test-'+str(c_frq), CodonFreq=c_frq)
239 # if self.get_most_likely('M0.model_test-1', 'M0.model_test-0') > 0.05:
240 #
241 # self.get_most_likely('M0.model_test-2', 'M0.model_test-0')
242 # self.get_most_likely('M0.model_test-3', 'M0.model_test-0')
243 # self.get_most_likely('M0.model_test-2', 'M0.model_test-1')
244 # self.get_most_likely('M0.model_test-3', 'M0.model_test-1')
245 # self.get_most_likely('M0.model_test-3', 'M0.model_test-2')
247 def link_to_alignment(self, alignment, alg_format="paml",
248 nucleotides=True, **kwargs):
249 '''
250 same function as for phyloTree, but translate sequences if nucleotides
251 nucleotidic sequence is kept under node.get('nt_sequence')
253 :argument alignment: path to alignment or string
254 :argument alg_format: one of fasta phylip or paml
255 :argument True alignment: set to False in case we want to keep it untranslated
257 '''
258 super(EvolTree, self).link_to_alignment(alignment,
259 alg_format=alg_format, **kwargs)
260 check_len = 0
261 for leaf in self.leaves():
262 seq_len = len(str(leaf.props.get('sequence')))
263 if check_len and check_len != seq_len:
264 warn('WARNING: sequences with different lengths found!')
265 check_len = seq_len
266 leaf.add_prop('nt_sequence', str(leaf.props.get('sequence')))
267 if nucleotides:
268 leaf.add_prop('sequence', translate(leaf.props.get('nt_sequence')))
270 def show(self, layout=None, tree_style=None, histfaces=None):
271 '''
272 call super show of PhyloTree
273 histface should be a list of models to be displayes as histfaces
275 :argument layout: a layout function
276 :argument None tree_style: tree_style object
277 :argument Nonehistface: an histogram face function. This is only to plot selective pressure among sites
279 '''
280 if TREEVIEW:
281 if not tree_style:
282 ts = TreeStyle()
283 else:
284 ts = tree_style
285 if histfaces:
286 for hist in histfaces:
287 try:
288 mdl = self.get_evol_model(hist)
289 except AttributeError:
290 warn('model %s not computed' % (hist))
291 if not 'histface' in mdl.properties:
292 if len(histfaces) > 1 and histfaces.index(hist) != 0:
293 mdl.set_histface(up=False)
294 else:
295 mdl.set_histface()
296 if mdl.properties['histface'].up:
297 ts.aligned_header.add_face(
298 mdl.properties['histface'], 1)
299 else:
300 ts.aligned_foot.add_face(
301 mdl.properties['histface'], 1)
302 super(EvolTree, self).show(layout=layout, tree_style=ts)
303 else:
304 raise ValueError("Treeview module is disabled")
306 def render(self, file_name, layout=None, w=None, h=None,
307 tree_style=None, header=None, histfaces=None):
308 """Add up and down faces and render the tree representation to a file.
310 :param histface: A histogram face function. This is only to
311 plot selective pressure among sites.
312 """
313 if not TREEVIEW:
314 raise ValueError("Treeview module is disabled")
316 ts = tree_style or TreeStyle()
318 for hist in (histfaces or []):
319 try:
320 mdl = self.get_evol_model(hist)
321 except AttributeError:
322 warn('model %s not computed' % (hist))
324 if not 'histface' in mdl.properties:
325 if len(histfaces) > 1 and histfaces.index(hist) != 0:
326 mdl.set_histface(up=False)
327 else:
328 mdl.set_histface()
330 if mdl.properties['histface'].up:
331 ts.aligned_header.add_face(mdl.properties['histface'], 1)
332 else:
333 ts.aligned_foot.add_face(mdl.properties['histface'], 1)
335 return super().render(file_name, layout=layout, tree_style=ts, w=w, h=h)
337 def mark_tree(self, node_ids, verbose=False, **kargs):
338 '''
339 function to mark branches on tree in order that paml could interpret it.
340 takes a "marks" argument that should be a list of #1,#1,#2
341 e.g.:
342 ::
344 t=Tree.mark_tree([2,3], marks=["#1","#2"])
346 :argument node_ids: list of node ids (have a look to node.props.get('node_id'))
347 :argument False verbose: warn if marks do not correspond to codeml standard
348 :argument kwargs: mainly for the marks key-word which needs a list of marks (marks=['#1', '#2'])
350 '''
351 from re import match
352 node_ids = list(map(int, node_ids))
353 if 'marks' in kargs:
354 marks = list(kargs['marks'])
355 else:
356 marks = ['#1']*len(node_ids)
357 for node in self.traverse():
358 if not node.props.get('node_id'):
359 continue
360 if node.props.get('node_id') in node_ids:
361 if ('.' in marks[node_ids.index(node.props.get('node_id'))] or
362 match('#[0-9]+',
363 marks[node_ids.index(node.props.get('node_id'))]) is None) and verbose:
364 warn('WARNING: marks should be "#" sign directly ' +
365 'followed by integer\n' + self.mark_tree.__doc__)
366 node.add_prop('mark', ' '+marks[node_ids.index(node.props.get('node_id'))])
367 elif node.props.get('mark') == None:
368 node.add_prop('mark', '')
370 def link_to_evol_model(self, path, model):
371 '''
372 link EvolTree to evolutionary model
373 * free-branch model ("fb") will append evol values to tree
374 * Site models (M0, M1, M2, M7, M8) will give evol values by site
375 and likelihood
377 :argument path: path to outfile containing model computation result
378 :argument model: either the name of a model, or a Model object (usually empty)
380 '''
381 if isinstance(model, str):
382 model = Model(model, self, path)
383 else:
384 model._load(path)
385 # new entry in _models dict
386 while model.name in self._models:
387 model.name = model.name.split('__')[0] + str(
388 (int(model.name.split('__')[1]) + 1)
389 if '__' in model.name else 0)
390 self._models[model.name] = model
391 if not os.path.isfile(path):
392 warn("ERROR: not a file: " + path)
393 return 1
394 if len(self._models) == 1 and model.properties['exec'] == 'codeml':
395 self.change_dist_to_evol('bL', model, fill=True)
397 def get_evol_model(self, modelname):
398 '''
399 returns one precomputed model
401 :argument modelname: string of the name of a model object stored
402 :returns: Model object
403 '''
404 try:
405 return self._models[modelname]
406 except KeyError:
407 Exception("ERROR: Model %s not found." % (modelname))
409 def write(self, properties=None, outfile=None, format=10):
410 """
411 Inherits from Tree but adds the tenth format, that allows to display marks for CodeML.
412 TODO: internal writting format need to be something like 0
413 """
414 from re import sub
415 if int(format) == 11:
416 nwk = ' %s 1\n' % (len(self))
417 nwk += sub(r'\[&&NHX:mark=([ #0-9.]*)\]', r'\1', \
418 write_newick(self, properties=['mark'],format=9))
419 elif int(format)==10:
420 nwk = sub(r'\[&&NHX:mark=([ #0-9.]*)\]', r'\1', \
421 write_newick(self, properties=['mark'],format=9))
422 else:
423 nwk = write_newick(self, properties=properties,format=format)
424 if outfile is not None:
425 open(outfile, "w").write(nwk)
426 return nwk
427 else:
428 return nwk
429 #write.__doc__ += super(PhyloTree, PhyloTree()).write.__doc__.replace(
430 # 'argument format', 'argument 10 format')
432 def get_most_likely(self, altn, null):
433 """Return pvalue of LRT between alternative model and null model.
435 Usual comparison are:
437 .. table::
439 =========== ======= ===========================================
440 Alternative Null Test
441 =========== ======= ===========================================
442 M2 M1 PS on sites (M2 prone to miss some sites)
443 (Yang 2000).
444 M3 M0 test of variability among sites
445 M8 M7 PS on sites
446 (Yang 2000)
447 M8 M8a RX on sites?? think so....
448 bsA bsA1 PS on sites on specific branch
449 (Zhang 2005)
450 bsA M1 RX on sites on specific branch
451 (Zhang 2005)
452 bsC M1 different omegas on clades branches sites
453 ref: Yang Nielsen 2002
454 bsD M3 different omegas on clades branches sites
455 (Yang Nielsen 2002, Bielawski 2004)
456 b_free b_neut foreground branch not neutral (w != 1)
457 - RX if P<0.05 (means that w on frg=1)
458 - PS if P>0.05 and wfrg>1
459 - CN if P>0.05 and wfrg>1
460 (Yang Nielsen 2002)
461 b_free M0 different ratio on branches
462 (Yang Nielsen 2002)
463 =========== ======= ===========================================
465 Note that M1 and M2 models are making reference to the new
466 versions of these models, with continuous omega rates (namely
467 M1a and M2a in the PAML user guide).
469 :param altn: model with higher number of parameters (np).
470 :param null: model with lower number of parameters (np).
471 """
472 altn = self.get_evol_model(altn)
473 null = self.get_evol_model(null)
474 if null.np > altn.np:
475 warn("first model should be the alternative, change the order")
476 return 1.0
477 try:
478 if hasattr(altn, 'lnL') and hasattr(null, 'lnL'):
479 if null.lnL - altn.lnL < 0:
480 return chi_high(2 * abs(altn.lnL - null.lnL),
481 float(altn.np - null.np))
482 else:
483 warn("\nWARNING: Likelihood of the alternative model is "
484 "smaller than null's (%f - %f = %f)" % (
485 null.lnL, altn.lnL, null.lnL - altn.lnL) +
486 "\nLarge differences (> 0.1) may indicate mistaken "
487 "assigantion of null and alternative models")
488 return 1
489 except KeyError:
490 warn("at least one of %s or %s, was not calculated" % (altn.name,
491 null.name))
492 exit(self.get_most_likely.__doc__)
494 def change_dist_to_evol(self, evol, model, fill=False):
496 """Change dist/branch length of tree to a given evolutionary variable.
498 :param evol: Evolutionary variable (dN, dS, w, bL, by default bL).
499 :param model: Model obj from which to retrieve evolutionary variables.
500 :param fill: In addition to changing the dist parameter, annotate nodes
501 with all evolutionary variables (add to their props: dN, w...).
502 """
503 # branch-site outfiles do not give specific branch info
504 if not model.branches:
505 return
506 for node in self.descendants():
507 if not evol in model.branches[node.props.get('node_id')]:
508 continue
509 node.dist = model.branches[node.props.get('node_id')][evol]
510 if fill:
511 for e in ['dN', 'dS', 'w', 'bL']:
512 node.add_prop(e, model.branches [node.props.get('node_id')][e])
515# cosmetic alias
516EvolTree = EvolNode