Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/phylo/phylotree.py: 16%

360 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-08-07 10:27 +0200

1""" 

2This module defines the PhyloTree class to manage phylogenetic trees. 

3It inherits from Tree and adds some special features to the node 

4instances. 

5""" 

6 

7import sys 

8import re 

9import warnings 

10import itertools 

11from collections import defaultdict 

12from ete4 import Tree, SeqGroup, NCBITaxa, GTDBTaxa 

13from .reconciliation import get_reconciled_tree 

14from . import spoverlap 

15 

16__all__ = ["PhyloTree"] 

17 

18 

def is_dup(n):
    """Return True if node *n* carries the property ``evoltype == "D"``."""
    evoltype = n.props.get("evoltype")
    return evoltype == "D"

21 

def get_subtrees(tree, full_copy=False, properties=None, newick_only=False):
    """Count and lazily enumerate the species trees within a gene tree.

    The counting is done eagerly by :func:`calc_subtrees`; the trees
    themselves come from the generator :func:`_get_subtrees`, so nothing
    is built until the iterator is first consumed. This lets callers
    inspect the totals and discard cases that would produce an
    astronomic number of species trees before paying for them.

    :param tree: Gene tree whose duplication nodes are marked with the
        property ``evoltype == "D"``.
    :param full_copy: Forwarded unchanged to :func:`_get_subtrees`.
    :param properties: Properties to map onto the produced trees.
    :param newick_only: If True, the iterator yields newick strings
        instead of tree objects.
    :returns: Tuple ``(number of trees, number of duplications, iterator)``.
    """
    n_trees, n_dups = calc_subtrees(tree)
    lazy_trees = _get_subtrees(tree, full_copy, properties, newick_only)
    return n_trees, n_dups, lazy_trees

40 

def _get_subtrees(tree, full_copy=False, properties=None, newick_only=False):
    """Yield every species tree contained in the gene tree *tree*.

    Nodes are numbered in postorder and each species tree is first
    represented as nested tuples of those numbers, whose ``str()`` is
    already a newick-like string. That string is then either rewritten
    with the original node names (``newick_only``) or parsed into a
    ``PhyloTree`` whose leaves receive the requested properties.

    :param tree: Gene tree with duplications marked as ``evoltype == "D"``.
    :param full_copy: Accepted for interface compatibility; not used here.
    :param properties: Properties to map from the original nodes;
        ``"name"`` is always included.
    :param newick_only: If True, yield newick strings with NHX comments
        instead of tree objects.
    """
    # First, precalculate all the species trees in tuple (newick) format
    n2nid = {}
    nid2node = {}
    n2subtrees = defaultdict(list)
    for nid, n in enumerate(tree.traverse("postorder")):
        n2nid[n] = nid
        nid2node[nid] = n
        if n.children:
            if is_dup(n):
                # A duplication offers the alternatives of each child
                subtrees = []
                for ch in n.children:
                    subtrees.extend(n2subtrees[n2nid[ch]])
            else:
                # A speciation combines every alternative of both children
                subtrees = tuple(itertools.product(
                    n2subtrees[n2nid[n.children[0]]],
                    n2subtrees[n2nid[n.children[1]]]))
        else:
            subtrees = (nid,)

        n2subtrees[nid] = subtrees
        # Children are no longer needed; drop them to keep memory low
        for ch in n.children:
            del n2subtrees[n2nid[ch]]

    sp_trees = n2subtrees[n2nid[tree]]

    # Second, yield a tree per iteration in newick or ETE format.
    # Sorted so the NHX annotation order is the same on every run.
    properties = sorted(set(properties or ()) | {"name"})

    def _nodereplacer(match):
        pre, b, post = match.groups()
        node = nid2node[int(b)]
        annotations = ':'.join("%s=%s" % (p, node.props.get(p))
                               for p in properties if node.props.get(p))
        return ''.join([pre or '', node.name,
                        "[&&NHX:", annotations, "]", post or ''])

    if newick_only:
        id_match = re.compile(r"([^0-9])?(\d+)([^0-9])?")
        for nw in sp_trees:
            yield id_match.sub(_nodereplacer, str(nw) + ";")
    else:
        for nw in sp_trees:
            # The subtrees were generated as tuples, so their str
            # representation is actually a newick
            t = PhyloTree(str(nw) + ";")
            # Map properties from original tree
            for leaf in t.leaves():
                source = nid2node[int(leaf.name)]
                for p in properties:
                    # Attribute access covers computed values (e.g. the
                    # species property); values stored only in props
                    # are picked up by the fallback instead of raising.
                    leaf.add_prop(p, getattr(source, p, source.props.get(p)))
            yield t

102 

def calc_subtrees(tree):
    """Return the number of species trees and duplications for the given tree.

    The ones that the TreeKO algorithm would produce. Counts are
    accumulated bottom-up: a leaf counts 1, a duplication node adds the
    counts of its children, and any other internal node multiplies the
    counts of its first two children.

    :returns: Tuple ``(number of species trees, number of duplications)``.
    """
    counts = {}
    n_dups = 0
    for node in tree.traverse("postorder"):
        kids = node.children
        if not kids:
            total = 1
        elif is_dup(node):
            n_dups += 1
            total = sum(counts[ch] for ch in kids)
        else:
            total = counts[kids[0]] * counts[kids[1]]
        counts[node] = total
    return counts[tree], n_dups

123 

def iter_sptrees(sptrees, nid2node, properties=None, newick_only=False):
    """Load and map the species trees returned by get_subtrees.

    :param sptrees: Iterable of species trees encoded as nested tuples
        of node ids (or a bare id for a single-leaf tree).
    :param nid2node: Dictionary translating node ids into the original
        nodes.
    :param properties: Properties to map from the original nodes;
        ``"name"`` is always included.
    :param newick_only: If True, yield newick strings with NHX comments
        instead of tree objects.
    """
    # Sorted so the NHX annotation order is the same on every run
    properties = sorted(set(properties or ()) | {"name"})

    def _nodereplacer(match):
        pre, b, post = match.groups()
        node = nid2node[int(b)]
        # NHX separates its key=value pairs with ':'
        annotations = ':'.join("%s=%s" % (p, node.props.get(p))
                               for p in properties if node.props.get(p))
        return ''.join([pre or '', node.name,
                        "[&&NHX:", annotations, "]", post or ''])

    if newick_only:
        # Surrounding characters are optional so that an id at the very
        # start of the string (single-leaf tree) is replaced too
        id_match = re.compile(r"([^0-9])?(\d+)([^0-9])?")
        for nw in sptrees:
            yield id_match.sub(_nodereplacer, str(nw) + ";")
    else:
        for nw in sptrees:
            # The subtrees were generated as tuples, so their str
            # representation is actually a newick
            t = PhyloTree(str(nw) + ";")
            # Map properties from original tree
            for leaf in t.leaves():
                source = nid2node[int(leaf.name)]
                for p in properties:
                    # Fall back to props for values that are not
                    # exposed as attributes of the node
                    leaf.add_prop(p, getattr(source, p, source.props.get(p)))
            yield t

157 

def _get_subtrees_recursive(node, full_copy=True):
    """Return the list of species trees found under *node* (recursive).

    Duplication nodes (``evoltype == "D"``) are split: each child of a
    duplication contributes its own alternatives. For any other node,
    the topmost duplications below it are detached, and every
    combination of their alternatives is temporarily re-attached to
    take a snapshot of the resulting tree.

    :param node: Root of the (sub)tree to process. It is modified in
        place (duplication nodes are detached).
    :param full_copy: If True, snapshots are made with ``node.copy()``;
        otherwise with ``node.write(...)``.
    """
    def _snapshot():
        # Capture `node` as it stands now, hiding its parent while copying
        if full_copy:
            parent = node.up
            node.up = None
            clone = node.copy()
            node.up = parent
            return clone
        return node.write(format=9, properties=["name", "evoltype"])

    if is_dup(node):
        collected = []
        for child in node.children:
            collected.extend(_get_subtrees_recursive(child, full_copy=full_copy))
        return collected

    # Topmost duplication nodes under the current node
    dups = [n for n in node.leaves(is_leaf_fn=is_dup) if is_dup(n)]
    if not dups:
        return [_snapshot()]

    # Detach the inner duplication nodes, remembering their anchor point
    alternatives = []
    for dup in dups:
        anchor = dup.up  # the real node to attach sibling subtrees to
        dup.detach()

        # All species trees on each side of the duplication, each one
        # pointing to its anchor
        options = []
        for child in dup.children:
            for subt in _get_subtrees_recursive(child, full_copy=full_copy):
                if not full_copy:
                    subt = node.__class__(subt)
                subt.up = anchor
                options.append(subt)
        alternatives.append(options)

    # Generate all combinations of subtrees in sibling duplications
    sp_trees = []
    for comb in itertools.product(*alternatives):
        # Attach each subtree to its anchor and snapshot the result
        for subt in comb:
            if subt.up:
                subt.up.children.append(subt)
            else:
                sp_trees.append(subt)
        sp_trees.append(_snapshot())
        # Clear current node
        for subt in comb:
            subt.up.children.pop(-1)
    return sp_trees

227 

def get_subparts(n):
    """Split the tree rooted at *n* by its duplication nodes.

    The tree is modified in place: duplication nodes (and their
    children) are detached, and nodes left with a single child, or
    childless nodes lacking the ``_leaf`` marker, are removed.

    :returns: List with the resulting subtrees.
    """
    if is_dup(n):
        parts = []
        for child in n.get_children():
            child.detach()
            parts.extend(get_subparts(child))
        return parts

    # Topmost duplications below n; each is processed on its own later
    pending = [d for d in n.leaves(is_leaf_fn=is_dup) if is_dup(d)]
    for d in pending:
        d.detach()

    freaks = [d for d in n.descendants()
              if len(d.children) == 1
              or (not hasattr(d, "_leaf") and not d.children)]
    for d in freaks:
        d.delete(prevent_nondicotomic=True)

    # Clean node structure to prevent nodes with only one child
    while len(n.children) == 1:
        n = n.children[0]
        n.detach()

    parts = []
    # Keep n unless it ended up as a childless node that is not a true leaf
    if n.children or hasattr(n, "_leaf"):
        parts.append(n)

    for d in pending:
        parts.extend(get_subparts(d))

    return parts

262 

263 

264class PhyloTree(Tree): 

265 """ 

266 Class to store a phylogenetic tree. 

267 

268 Extends the standard :class:`Tree` instance by adding 

269 specific properties and methods to work with phylogenetic trees. 

270 """ 

271 

def __init__(self, newick=None, children=None, alignment=None,
             alg_format="fasta", sp_naming_function=None, parser=None):
    """Create the tree, optionally linking an alignment and a species namer.

    :param newick: If not None, initializes the tree from a newick,
        which can be a string or file object containing it.
    :param children: If not None, the children to add to this node.
    :param alignment: File containing a multiple sequence alignment.
    :param alg_format: "fasta", "phylip" or "iphylip" (interleaved).
    :param sp_naming_function: Function that gets a node name and
        returns the species name (see
        :func:`PhyloTree.set_species_naming_function`). Only applied
        when `newick` is given.
    :param parser: Parser to read the newick.
    """
    super().__init__(data=newick, children=children, parser=parser)

    # Only runs after the whole tree has been read, because the
    # 'alignment' argument is not passed to the PhyloTree constructor
    # during parsing.
    if alignment:
        self.link_to_alignment(alignment, alg_format)

    if newick:
        self.set_species_naming_function(sp_naming_function)

298 

@property
def species(self):
    """Species of this node.

    If the node has a ``_speciesFunction`` property, the species is
    computed by calling it with the node name and, if that call fails,
    with the node itself. Otherwise the ``species`` property is
    returned (None if absent). A warning is issued when both are set.
    """
    naming_fn = self.props.get('_speciesFunction')
    if not naming_fn:
        return self.props.get('species')

    if 'species' in self.props:
        warnings.warn('Ambiguous species: both species and _speciesFunction '
                      'defined. You can remove "species" from this node.')
    try:
        return naming_fn(self.name)
    except Exception:
        # The function may expect the node rather than its name.
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return naming_fn(self)


@species.setter
def species(self, value):
    """Set the ``species`` property; refused if a naming function is present."""
    assert self.props.get('_speciesFunction') is None, \
        ('Species naming function present, cannot set species manually. '
         'Maybe call set_species_naming_function() first?')
    self.props['species'] = value

318 

319 def __repr__(self): 

320 return "PhyloTree '%s' (%s)" % (self.name, hex(self.__hash__())) 

321 

def write(self, outfile=None, props=(), parser=None,
          format_root_node=False, is_leaf_fn=None):
    """Write the tree by delegating to the parent class ``write``.

    :param props: Properties to write. If None, all the properties
        found in any node of the tree are used (sorted), except those
        whose name starts with an underscore.

    The remaining arguments are forwarded unchanged.
    """
    if props is None:
        public_props = set()
        for node in self.traverse():
            for p in node.props:
                if not p.startswith('_'):
                    public_props.add(p)
        props = sorted(public_props)
    return super().write(outfile, props, parser, format_root_node, is_leaf_fn)

328 

def set_species_naming_function(self, fn):
    """Set the function used to get the species from the node's name.

    The function is stored in the ``_speciesFunction`` property of
    every node of the tree. Passing None removes that property instead.

    :param fn: Function that takes a nodename and returns the species name.

    Example of a parsing function::

      def parse_sp_name(node_name):
          return node_name.split("_")[1]
      tree.set_species_naming_function(parse_sp_name)
    """
    if fn is None:
        for node in self.traverse():
            node.props.pop('_speciesFunction', None)
    else:
        for node in self.traverse():
            node.props['_speciesFunction'] = fn

345 

def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
    """Store in each node the sequence that the alignment has for its name.

    The sequence is saved in the ``sequence`` property. Nodes whose
    name is not in the alignment are left untouched; if any of them is
    a leaf, a warning with their count is printed to stderr. Misses on
    internal nodes are ignored silently.

    :param alignment: A ``SeqGroup`` instance, or anything accepted by
        the ``SeqGroup`` constructor.
    :param alg_format: Alignment format, used only when `alignment`
        is not already a ``SeqGroup``.
    :param kwargs: Extra arguments for the ``SeqGroup`` constructor.
    """
    # isinstance (not type equality) so SeqGroup subclasses are used as-is
    if isinstance(alignment, SeqGroup):
        alg = alignment
    else:
        alg = SeqGroup(alignment, format=alg_format, **kwargs)

    missing_leaves = []
    for n in self.traverse():
        try:
            n.add_prop("sequence", alg.get_seq(n.name))
        except KeyError:
            if n.is_leaf:
                missing_leaves.append(n.name)

    if missing_leaves:
        print("Warnning: [%d] terminal nodes could not be found in the alignment." %
              len(missing_leaves), file=sys.stderr)

370 

def get_species(self):
    """Return the set of species covered by the leaves under this node."""
    return {leaf.species for leaf in self.leaves()}

374 

def iter_species(self):
    """Yield each species grouped by this node once, in leaf order."""
    seen = set()
    for leaf in self.leaves():
        if leaf.species in seen:
            continue
        seen.add(leaf.species)
        yield leaf.species

382 

def get_age(self, species2age):
    """Return the largest age among the species under this node.

    Implements the phylostratigrafic method described in:

    Huerta-Cepas, J., & Gabaldon, T. (2011). Assigning duplication events to
    relative temporal scales in genome-wide studies. Bioinformatics, 27(1),
    38-45.

    :param species2age: Dictionary translating species into their age.
    """
    ages = [species2age[sp] for sp in self.get_species()]
    return max(ages)

392 

def reconcile(self, species_tree):
    """Return the result of reconciling this tree with *species_tree*.

    Delegates to ``get_reconciled_tree``, which is documented to give
    the reconciled topology and the list of evolutionary events
    inferred from the reconciliation.
    """
    inherited_events = []
    return get_reconciled_tree(self, species_tree, inherited_events)

398 

def get_my_evol_events(self, sos_thr=0.0):
    """Return list of duplication and speciation events involving this node.

    Delegates to ``spoverlap.get_evol_events_from_leaf``. The algorithm
    is documented to scan all nodes from this one to the root, taking a
    node as a duplication when a species overlap is found between its
    child lineages. The method is described in more detail in:

    :Citation:
      *The Human Phylome*. T. Genome Biol. 2007;8(6):R109.

    :param sos_thr: Species overlap threshold, forwarded unchanged.
    """
    events = spoverlap.get_evol_events_from_leaf(self, sos_thr=sos_thr)
    return events

414 

def get_descendant_evol_events(self, sos_thr=0.0):
    """Return list of duplication and speciation events after this node.

    Delegates to ``spoverlap.get_evol_events_from_root``. Nodes are
    documented to be taken as duplications when a species overlap is
    found between their child lineages. The method is described in more
    detail in:

    "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
    T. Genome Biol. 2007;8(6):R109.

    :param sos_thr: Species overlap threshold, forwarded unchanged.
    """
    events = spoverlap.get_evol_events_from_root(self, sos_thr=sos_thr)
    return events

425 

def get_farthest_oldest_leaf(self, species2age, is_leaf_fn=None):
    """Return the farthest oldest leaf to the current one.

    All the leaves of the whole tree (from its root) are scanned. The
    one with the largest age wins; among leaves of the same age, the
    one at the largest distance from this node. If no leaf has an age
    above 0, or a distance above 0 at age 0, this node is returned.

    :param species2age: Dictionary with the age estimation for all
        species.
    :param None is_leaf_fn: A pointer to a function that receives
        a node instance as unique argument and returns True or
        False. It can be used to dynamically collapse nodes, so
        they are seen as leaves.
    """
    outgroup_node = self
    outgroup_age = 0
    outgroup_dist = 0

    for leaf in self.root.leaves(is_leaf_fn=is_leaf_fn):
        age = leaf.get_age(species2age)
        if age < outgroup_age:
            continue
        dist = leaf.get_distance(self, leaf)
        if age > outgroup_age or dist > outgroup_dist:
            # Keep the same age that was compared (the maximum over the
            # species of the leaf). Taking an arbitrary species could
            # record a lower age for collapsed nodes with several
            # species, letting a younger leaf replace them later.
            outgroup_node = leaf
            outgroup_age = age
            outgroup_dist = dist
    return outgroup_node

454 

def get_farthest_oldest_node(self, species2age):
    """Return the farthest oldest node (leaf or internal).

    Works like get_farthest_oldest_leaf(), but any node grouping
    sequences from exactly one species is treated as a leaf, so such
    groups are collapsed.
    """
    def groups_single_species(node):
        return len(node.get_species()) == 1

    return self.get_farthest_oldest_leaf(species2age,
                                         is_leaf_fn=groups_single_species)

466 

def get_age_balanced_outgroup(self, species2age):
    """Return the descendant that best balances the tree by age.

    .. versionadded:: 2.2

    For every descendant, the age span (max minus min) of the species
    under it is compared with the age span of the species outside it.
    The descendant with the smallest difference is returned; ties go
    to the one with more leaves, and then to the one farther from this
    node. Species are obtained by calling the ``_speciesFunction``
    property of this node on each leaf name.

    :param species2age: A dictionary translating from species into a
        topological age.

    .. warning: This is currently an experimental method!!
    """
    def _age_span(species_list):
        ages = [species2age[sp] for sp in species_list]
        youngest = min(ages)
        oldest = max(ages)
        return oldest - youngest

    all_seqs = set(self.leaf_names())
    best_balance = max(species2age.values())
    outgroup_node = self
    outgroup_size = 0

    for candidate in self.descendants():
        inner_seqs = set(candidate.leaf_names())
        size = len(inner_seqs)

        to_species = self.props.get('_speciesFunction')
        inner_species = [to_species(s) for s in inner_seqs]
        outer_species = [to_species(s) for s in all_seqs - inner_seqs]

        inner_span = _age_span(inner_species)
        outer_span = _age_span(outer_species)
        age_inbalance = abs(outer_span - inner_span)

        if age_inbalance < best_balance:
            update = True
        elif age_inbalance > best_balance:
            update = False
        elif size != outgroup_size:
            # Same balance: prefer the larger group
            update = size > outgroup_size
        else:
            # Same balance and size: prefer the more distant node
            dist = self.get_distance(self, candidate)
            outgroup_dist = self.get_distance(self, outgroup_node)
            update = dist > outgroup_dist

        if update:
            best_balance = age_inbalance
            outgroup_node = candidate
            outgroup_size = size

    return outgroup_node

525 

def get_speciation_trees(self, map_properties=None, autodetect_duplications=True,
                         newick_only=False, prop='species'):
    """Return number of species trees, of duplications, and an iterator.

    Calculates all possible species trees contained within a
    duplicated gene family tree as described in `Treeko
    <http://treeko.cgenomics.org>`_ (see `Marcet and Gabaldon,
    2011 <http://www.ncbi.nlm.nih.gov/pubmed/21335609>`_ ).

    :param map_properties: List of properties that should be
        mapped from the original gene family tree to each species
        tree subtree.
    :param autodetect_duplications: If True, nodes are marked in place
        as duplications (``evoltype='D'``) when they cover more than
        one value of `prop` and that number differs from the sum over
        their children. If False, duplication nodes within the
        original tree are expected to contain the property
        "evoltype='D'".
    :param newick_only: If True, the iterator yields newick strings
        instead of trees.
    :param prop: Property used to group leaves when autodetecting
        duplications.
    """
    if autodetect_duplications:
        n2content = self.get_cached_content()
        n2species = self.get_cached_content(prop)
        for node in n2content:
            sp_subtotal = sum(len(n2species[ch]) for ch in node.children)
            n_species = len(n2species[node])
            if n_species > 1 and n_species != sp_subtotal:
                node.props['evoltype'] = 'D'

    return get_subtrees(self, properties=map_properties, newick_only=newick_only)

556 

def __get_speciation_trees_recursive(self, autodetect_duplications=True):
    """Return the species trees computed with the recursive algorithm.

    NOTE: This function is experimental and for testing.

    Works on a copy of the tree. True leaves are flagged with a
    ``_leaf`` attribute before calling ``_get_subtrees_recursive``.

    :param autodetect_duplications: If True, nodes of the copy are
        marked as duplications (``evoltype='D'``) when they cover more
        than one species and that number differs from the sum over
        their children. The name used to be read without being
        defined anywhere, so every call failed with NameError; it is
        now a parameter with a default.
    :returns: Tuple ``(number of subtrees, 0, list of subtrees)``.
    """
    t = self.copy()
    if autodetect_duplications:
        n2content = t.get_cached_content()
        n2species = t.get_cached_content('species')
        for node in n2content:
            sp_subtotal = sum(len(n2species[ch]) for ch in node.children)
            n_species = len(n2species[node])
            if n_species > 1 and n_species != sp_subtotal:
                node.props['evoltype'] = 'D'
            elif node.is_leaf:
                node._leaf = True
    else:
        for node in t.leaves():
            node._leaf = True
    subtrees = _get_subtrees_recursive(t)
    return len(subtrees), 0, subtrees

579 

def split_by_dups(self, autodetect_duplications=True):
    """Return the list of subtrees when splitting by its duplication nodes.

    The work is done on a copy of the tree (a "deepcopy" one if the
    default copy fails). True leaves are flagged with a ``_leaf``
    attribute before calling ``get_subparts``.

    :param True autodetect_duplications: If True, nodes are marked as
        duplications (``evoltype='D'``) when they cover more than one
        species and that number differs from the sum over their
        children. If False, duplication nodes within the original
        tree are expected to contain the property "evoltype=D".
    """
    try:
        t = self.copy()
    except Exception:
        t = self.copy("deepcopy")

    if autodetect_duplications:
        n2content = t.get_cached_content()
        n2species = t.get_cached_content('species')
        for node in n2content:
            sp_subtotal = sum(len(n2species[ch]) for ch in node.children)
            n_species = len(n2species[node])
            if n_species > 1 and n_species != sp_subtotal:
                node.props['evoltype'] = 'D'
            elif node.is_leaf:
                node._leaf = True
    else:
        for node in t.leaves():
            node._leaf = True

    return get_subparts(t)

613 

def collapse_lineage_specific_expansions(self, species=None, return_copy=True):
    """Replace each lineage specific expansion with a single tip.

    An expansion is a node covering exactly one species but more than
    one leaf. It is replaced by one of its leaves (the first one found
    in its cached leaf content).

    :param None species: If supplied, only expansions matching the
        species criteria will be pruned. When None, all expansions
        within the tree will be processed. It must be a set
        (preferred), list or tuple; otherwise TypeError is raised.
    :param return_copy: If True, work on a "deepcopy" of the tree;
        otherwise modify this tree in place.
    :returns: The pruned tree, or the representative leaf alone if
        the root itself is an expansion.
    """
    if species:
        if isinstance(species, (list, tuple)):
            species = set(species)
        elif not isinstance(species, (set, frozenset)):
            raise TypeError("species argument should be a set (preferred), list or tuple")

    prunned = self.copy("deepcopy") if return_copy else self
    n2sp = prunned.get_cached_content('species')
    n2leaves = prunned.get_cached_content()

    def is_expansion(n):
        return (len(n2sp[n]) == 1 and len(n2leaves[n]) > 1
                and (species is None or species & n2sp[n]))

    for n in prunned.leaves(is_leaf_fn=is_expansion):
        repre = next(iter(n2leaves[n]))
        repre.detach()
        if n is prunned:
            return repre
        n.up.add_child(repre)
        n.detach()

    return prunned

643 

644 

def annotate_ncbi_taxa(self, taxid_attr='species', tax2name=None, tax2track=None,
                       tax2rank=None, dbfile=None, ignore_unclassified=False):
    """Add NCBI taxonomy annotation to all descendant nodes.

    Creates a ``NCBITaxa`` object and delegates to its
    ``annotate_tree`` method. According to the original documentation
    of this method (not verifiable from this file), leaf nodes are
    expected to hold a valid taxid number in `taxid_attr`, and all
    nodes get annotated with:

    `spname`: scientific species name as encoded in the NCBI taxonomy database

    `named_lineage`: the NCBI lineage track using scientific names

    `taxid`: NCBI taxid number

    `lineage`: same as named_lineage but using taxid codes.

    For internal nodes, NCBI information is said to refer to the first
    common lineage of the grouped species.

    :param taxid_attr: Name of the property used to access the taxid
        number associated to each node ('species' by default).
    :param None tax2name: A dictionary where keys are taxid numbers
        and values are their translation into NCBI scientific name.
        Optional; allows to avoid database queries when annotating
        many trees containing the same set of taxids.
    :param None tax2track: Same, with values being the NCBI lineage
        tracks (taxids).
    :param None tax2rank: Same, with values being the NCBI rank name.
    :param None dbfile: If provided, the file used as a local copy of
        the NCBI taxonomy database.
    :param ignore_unclassified: Forwarded unchanged to
        ``annotate_tree``.

    :returns: Whatever ``annotate_tree`` returns; documented as
        tax2name, tax2lineage and tax2rank dictionaries.
    """
    taxonomy = NCBITaxa(dbfile=dbfile)
    options = dict(taxid_attr=taxid_attr,
                   tax2name=tax2name,
                   tax2track=tax2track,
                   tax2rank=tax2rank,
                   ignore_unclassified=ignore_unclassified)
    return taxonomy.annotate_tree(self, **options)

698 

def annotate_gtdb_taxa(self, taxid_attr='species', tax2name=None, tax2track=None,
                       tax2rank=None, dbfile=None, ignore_unclassified=False):
    """Add GTDB taxonomy annotation by delegating to ``GTDBTaxa.annotate_tree``.

    :param taxid_attr: Name of the property holding the taxid of each node.
    :param tax2name: Optional precomputed dictionary, forwarded unchanged.
    :param tax2track: Optional precomputed dictionary, forwarded unchanged.
    :param tax2rank: Optional precomputed dictionary, forwarded unchanged.
    :param dbfile: Database file passed to the ``GTDBTaxa`` constructor.
    :param ignore_unclassified: Forwarded unchanged.
    :returns: Whatever ``annotate_tree`` returns.
    """
    taxonomy = GTDBTaxa(dbfile=dbfile)
    options = dict(taxid_attr=taxid_attr,
                   tax2name=tax2name,
                   tax2track=tax2track,
                   tax2rank=tax2rank,
                   ignore_unclassified=ignore_unclassified)
    return taxonomy.annotate_tree(self, **options)

702 

def ncbi_compare(self, autodetect_duplications=True, cached_content=None):
    """Run ``NCBITaxa.get_broken_branches`` on this tree or its species trees.

    If some species appears in more than one leaf, the tree is first
    decomposed with get_speciation_trees() (mapping the "taxid"
    property); otherwise the tree itself is the only target.

    :param autodetect_duplications: Forwarded to get_speciation_trees().
    :param cached_content: Precomputed result of get_cached_content();
        computed here when missing or empty.

    NOTE(review): in the visible code the results of
    get_broken_branches() are discarded and nothing is returned.
    """
    if not cached_content:
        cached_content = self.get_cached_content()
    own_leaves = cached_content[self]
    distinct_species = {leaf.props.get('species') for leaf in own_leaves}

    if len(distinct_species) == len(own_leaves):
        target_trees = [self]
    else:
        _, _, target_trees = self.get_speciation_trees(
            autodetect_duplications=autodetect_duplications,
            map_properties=["taxid"])

    ncbi = NCBITaxa()
    for t in target_trees:
        ncbi.get_broken_branches(t, cached_content)