Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/ncbi_taxonomy/ncbiquery.py: 29%
508 statements
« prev ^ index » next coverage.py v7.2.7, created at 2024-08-07 10:27 +0200
« prev ^ index » next coverage.py v7.2.7, created at 2024-08-07 10:27 +0200
1#!/usr/bin/env python3
3import sys
4import os
5import pickle
6from collections import defaultdict, Counter
7import requests
8from hashlib import md5
10import sqlite3
11import math
12import tarfile
13import warnings
15from ete4 import ETE_DATA_HOME, update_ete_data
__all__ = ["NCBITaxa", "is_taxadb_up_to_date"]

# Schema version expected in the sqlite "stats" table; bumped whenever the
# database layout changes (checked by is_taxadb_up_to_date()).
DB_VERSION = 2

# Default locations of the sqlite taxonomy database and the raw NCBI dump.
DEFAULT_TAXADB = ETE_DATA_HOME + '/taxa.sqlite'
DEFAULT_TAXDUMP = ETE_DATA_HOME + '/taxdump.tar.gz'
def is_taxadb_up_to_date(dbfile=DEFAULT_TAXADB):
    """Return True if a valid and up-to-date taxa.sqlite database exists.

    If `dbfile` is not specified, DEFAULT_TAXADB is assumed.

    :param dbfile: Path to the sqlite database to check.
    """
    # Note: sqlite3.connect() creates an empty file if dbfile is missing;
    # the query below then fails with OperationalError and we report False.
    db = sqlite3.connect(dbfile)

    try:
        version = db.execute('SELECT version FROM stats;').fetchone()[0]
    except (sqlite3.OperationalError, ValueError, IndexError, TypeError):
        version = None  # missing table, empty result, or malformed row
    finally:
        # Close the connection even if an unexpected exception propagates
        # (the original only closed it on the normal path).
        db.close()

    return version == DB_VERSION
class NCBITaxa:
    """
    A local transparent connector to the NCBI taxonomy database.
    """

    def __init__(self, dbfile=None, taxdump_file=None,
                 memory=False, update=True):
        """Open and keep a connection to the NCBI taxonomy database.

        If it is not present in the system, it will download the
        database from the NCBI site first, and convert it to ete's
        format.

        :param dbfile: Path to the sqlite database (DEFAULT_TAXADB if None).
        :param taxdump_file: Alternative local taxdump.tar.gz to build from.
        :param memory: If True, copy the whole database into memory.
        :param update: If True, rebuild an outdated database automatically.
        """
        self.dbfile = dbfile or DEFAULT_TAXADB

        if taxdump_file:
            self.update_taxonomy_database(taxdump_file)

        # NOTE(review): a missing database is auto-built only when dbfile
        # was not explicitly set to DEFAULT_TAXADB (dbfile=None also takes
        # this branch); passing DEFAULT_TAXADB explicitly while the file is
        # missing falls through to the ValueError below -- confirm intended.
        if dbfile != DEFAULT_TAXADB and not os.path.exists(self.dbfile):
            print('NCBI database not present yet (first time used?)',
                  file=sys.stderr)
            self.update_taxonomy_database(taxdump_file)

        if not os.path.exists(self.dbfile):
            raise ValueError("Cannot open taxonomy database: %s" % self.dbfile)

        self.db = None
        self._connect()

        if not is_taxadb_up_to_date(self.dbfile) and update:
            print('NCBI database format is outdated. Upgrading',
                  file=sys.stderr)
            self.update_taxonomy_database(taxdump_file)

        if memory:
            # Copy the on-disk database into a faster in-memory one.
            filedb = self.db
            self.db = sqlite3.connect(':memory:')
            filedb.backup(self.db)

    def update_taxonomy_database(self, taxdump_file=None):
        """Update the ncbi taxonomy database.

        It does it by downloading and parsing the latest
        taxdump.tar.gz file from the NCBI site.

        :param taxdump_file: Alternative location of the
            taxdump.tax.gz file.
        """
        update_db(self.dbfile, taxdump_file)

    def _connect(self):
        # (Re)open the sqlite connection to the database file.
        self.db = sqlite3.connect(self.dbfile)

    def _translate_merged(self, all_taxids):
        """Resolve obsolete taxids using the "merged" table.

        Return (taxids, conversion) where taxids is the input set with
        obsolete ids replaced by their current ones, and conversion maps
        each old taxid to its new one.
        """
        conv_all_taxids = set((list(map(int, all_taxids))))

        cmd = ('SELECT taxid_old, taxid_new '
               'FROM merged WHERE taxid_old IN (%s)' %
               ','.join(map(str, all_taxids)))

        result = self.db.execute(cmd)

        conversion = {}
        for old, new in result.fetchall():
            conv_all_taxids.discard(int(old))
            conv_all_taxids.add(int(new))
            conversion[int(old)] = int(new)

        return conv_all_taxids, conversion

    def get_fuzzy_name_translation(self, name, sim=0.9):
        """Return taxid, species name and match score from the NCBI database.

        The results are for the best match for name in the NCBI
        database of taxa names, with a word similarity >= `sim`.

        :param name: Species name (does not need to be exact).
        :param 0.9 sim: Min word similarity to report a match (from 0 to 1).
        """
        import sqlite3.dbapi2 as dbapi2

        # A dedicated connection is needed to load the Levenshtein extension.
        _db = dbapi2.connect(self.dbfile)
        _db.enable_load_extension(True)
        module_path = os.path.split(os.path.realpath(__file__))[0]
        _db.execute("SELECT load_extension('%s/%s')" %
                    (module_path, "SQLite-Levenshtein/levenshtein.sqlext"))

        print("Trying fuzzy search for %s" % name)
        # Max number of edits allowed for the requested similarity.
        maxdiffs = math.ceil(len(name) * (1-sim))
        # Pass the name as a bound parameter instead of interpolating it
        # into the SQL text (avoids breakage/injection on quoted names).
        cmd = ('SELECT taxid, spname, LEVENSHTEIN(spname, ?) AS sim '
               f'FROM species WHERE sim <= {maxdiffs} ORDER BY sim LIMIT 1;')

        taxid, spname, score = None, None, len(name)
        result = _db.execute(cmd, (name,))
        try:
            taxid, spname, score = result.fetchone()
        except TypeError:
            # No hit among scientific names; retry against synonyms.
            cmd = ('SELECT taxid, spname, LEVENSHTEIN(spname, ?) AS sim '
                   f'FROM synonym WHERE sim <= {maxdiffs} ORDER BY sim LIMIT 1;')
            result = _db.execute(cmd, (name,))
            try:
                taxid, spname, score = result.fetchone()
            except TypeError:  # was a bare except; only fetchone()->None applies
                pass
            else:
                taxid = int(taxid)
        else:
            taxid = int(taxid)

        norm_score = 1 - (float(score) / len(name))
        if taxid:
            print(f'FOUND! {spname} taxid:{taxid} score:{score} ({norm_score})')

        return taxid, spname, norm_score

    def get_rank(self, taxids):
        """Return dict with NCBI taxonomy ranks for each list of taxids."""
        all_ids = set(taxids)
        all_ids.discard(None)
        all_ids.discard("")

        query = ','.join('"%s"' % v for v in all_ids)
        cmd = 'SELECT taxid, rank FROM species WHERE taxid IN (%s);' % query
        result = self.db.execute(cmd)

        id2rank = {}
        for tax, rank in result.fetchall():  # loop var was misnamed "spname"
            id2rank[tax] = rank

        return id2rank

    def get_lineage_translator(self, taxids):
        """Return dict with lineage tracks corresponding to the given taxids.

        The lineage tracks are a hierarchically sorted list of parent taxids.
        """
        all_ids = set(taxids)
        all_ids.discard(None)
        all_ids.discard("")

        query = ','.join('"%s"' % v for v in all_ids)
        cmd = 'SELECT taxid, track FROM species WHERE taxid IN (%s);' % query
        result = self.db.execute(cmd)

        id2lineages = {}
        for tax, track in result.fetchall():
            # Tracks are stored leaf-to-root; reverse to get root-to-leaf.
            id2lineages[tax] = list(map(int, reversed(track.split(','))))

        return id2lineages

    def get_lineage(self, taxid):
        """Return lineage track corresponding to the given taxid.

        The lineage track is a hierarchically sorted list of parent taxids.
        """
        if not taxid:
            return None

        taxid = int(taxid)
        result = self.db.execute(f'SELECT track FROM species WHERE taxid={taxid}')
        raw_track = result.fetchone()
        if not raw_track:
            # Perhaps it is an obsolete taxid; try the merged table.
            _, merged_conversion = self._translate_merged([taxid])
            if taxid in merged_conversion:
                result = self.db.execute(
                    'SELECT track FROM species WHERE taxid=%s' %
                    merged_conversion[taxid])
                raw_track = result.fetchone()

            if not raw_track:
                raise ValueError(f'Could not find taxid: {taxid}')
            else:
                warnings.warn('taxid %s was translated into %s' %
                              (taxid, merged_conversion[taxid]))

        track = list(map(int, raw_track[0].split(',')))
        return list(reversed(track))

    def get_common_names(self, taxids):
        """Return dict of common names for the taxids that have one."""
        query = ','.join('"%s"' % v for v in taxids)
        cmd = 'SELECT taxid, common FROM species WHERE taxid IN (%s);' % query
        result = self.db.execute(cmd)

        id2name = {}
        for tax, common_name in result.fetchall():
            if common_name:  # skip taxids with an empty common name
                id2name[tax] = common_name

        return id2name

    def get_taxid_translator(self, taxids, try_synonyms=True):
        """Return dict with the scientific names corresponding to the taxids."""
        # Filter out empty entries *before* converting to int (the original
        # discarded None/"" after map(int, ...), which would already have
        # raised on such values).
        all_ids = set(int(t) for t in taxids if t not in (None, ''))

        query = ','.join('"%s"' % v for v in all_ids)
        cmd = 'SELECT taxid, spname FROM species WHERE taxid IN (%s);' % query
        result = self.db.execute(cmd)

        id2name = {}
        for tax, spname in result.fetchall():
            id2name[tax] = spname

        # Any taxid without translation? Let's try in the merged table.
        if len(all_ids) != len(id2name) and try_synonyms:
            not_found_taxids = all_ids - set(id2name.keys())
            taxids, old2new = self._translate_merged(not_found_taxids)
            new2old = {v: k for k,v in old2new.items()}

            if old2new:
                query = ','.join('"%s"' % v for v in new2old)
                cmd = 'SELECT taxid, spname FROM species WHERE taxid IN (%s);' % query
                result = self.db.execute(cmd)
                for tax, spname in result.fetchall():
                    # Report the name under the taxid the caller asked for.
                    id2name[new2old[tax]] = spname

        return id2name

    def get_name_translator(self, names):
        """Return dict with taxids corresponding to the given scientific names.

        Exact name match is required for translation.
        """
        name2id = {}
        # Map lowercased names back to the exact spelling the caller used.
        name2origname = {}
        for n in names:
            name2origname[n.lower()] = n

        names = set(name2origname.keys())

        query = ','.join('"%s"' % n for n in name2origname.keys())
        # (A duplicate, unused `cmd` assignment was removed here.)
        result = self.db.execute('SELECT spname, taxid FROM species WHERE spname IN (%s)' % query)
        for sp, taxid in result.fetchall():
            oname = name2origname[sp.lower()]
            name2id.setdefault(oname, []).append(taxid)
        # Names still unmatched may be synonyms; try the synonym table.
        missing = names - set([n.lower() for n in name2id.keys()])
        if missing:
            query = ','.join('"%s"' % n for n in missing)
            result = self.db.execute('SELECT spname, taxid FROM synonym '
                                     'WHERE spname IN (%s)' % query)
            for sp, taxid in result.fetchall():
                oname = name2origname[sp.lower()]
                name2id.setdefault(oname, []).append(taxid)
        return name2id

    def translate_to_names(self, taxids):
        """Return list of scientific names corresponding to taxids."""
        id2name = self.get_taxid_translator(taxids)
        names = []
        for sp in taxids:
            # Fall back to the raw taxid when there is no translation.
            names.append(id2name.get(sp, sp))
        return names

    def get_descendant_taxa(self, parent, intermediate_nodes=False,
                            rank_limit=None, collapse_subspecies=False,
                            return_tree=False):
        """Return list of descendant taxids of the given parent.

        Parent can be given as taxid or scientific species name.

        If intermediate_nodes=True, the list will also have the internal nodes.
        """
        try:
            taxid = int(parent)
        except ValueError:
            try:
                taxid = self.get_name_translator([parent])[parent][0]
            except KeyError:
                raise ValueError('%s not found!' %parent)

        # checks if taxid is a deprecated one, and converts into the right one.
        _, conversion = self._translate_merged([taxid]) #try to find taxid in synonyms table
        if conversion:
            taxid = conversion[taxid]

        # The pickled pre/post-order traversal is generated by update_db().
        with open(self.dbfile+".traverse.pkl", "rb") as CACHED_TRAVERSE:
            prepostorder = pickle.load(CACHED_TRAVERSE)
        descendants = {}
        found = 0
        for tid in prepostorder:
            if tid == taxid:
                found += 1  # 1st hit = enter subtree, 2nd hit = leave it
            elif found == 1:
                descendants[tid] = descendants.get(tid, 0) + 1
            elif found == 2:
                break

        if not found:
            raise ValueError("taxid not found:%s" %taxid)
        elif found == 1:
            return [taxid]  # taxid is itself a leaf; no descendants

        if rank_limit or collapse_subspecies or return_tree:
            tree = self.get_topology(list(descendants.keys()), intermediate_nodes=intermediate_nodes, collapse_subspecies=collapse_subspecies, rank_limit=rank_limit)
            if return_tree:
                return tree
            elif intermediate_nodes:
                return list(map(int, [n.name for n in tree.get_descendants()]))
            else:
                return list(map(int, [n.name for n in tree]))
        elif intermediate_nodes:
            return [tid for tid, count in descendants.items()]
        else:
            # Taxids seen exactly once in the traversal are leaves.
            return [tid for tid, count in descendants.items() if count == 1]

    def get_topology(self, taxids, intermediate_nodes=False, rank_limit=None,
                     collapse_subspecies=False, annotate=True):
        """Return the minimal pruned NCBI taxonomy tree containing taxids.

        :param False intermediate_nodes: If True, single child nodes
            representing the complete lineage of leaf nodes are kept.
            Otherwise, the tree is pruned to contain the first common
            ancestor of each group.
        :param None rank_limit: If valid NCBI rank name is provided,
            the tree is pruned at that given level. For instance, use
            rank="species" to get rid of sub-species or strain leaf
            nodes.
        :param False collapse_subspecies: If True, any item under the
            species rank will be collapsed into the species upper
            node.
        """
        from .. import PhyloTree
        taxids, merged_conversion = self._translate_merged(taxids)
        if len(taxids) == 1:
            # Single taxid: rebuild its full subtree from the cached traversal.
            root_taxid = int(list(taxids)[0])
            with open(self.dbfile+".traverse.pkl", "rb") as CACHED_TRAVERSE:
                prepostorder = pickle.load(CACHED_TRAVERSE)
            descendants = {}
            found = 0
            nodes = {}
            hit = 0
            visited = set()
            start = prepostorder.index(root_taxid)
            try:
                end = prepostorder.index(root_taxid, start+1)
                subtree = prepostorder[start:end+1]
            except ValueError:
                # If root taxid is not found in postorder, must be a tip node
                subtree = [root_taxid]
            # Taxids appearing once in the pre/post-order slice are leaves.
            leaves = set(v for v, count in Counter(subtree).items() if count == 1)
            nodes[root_taxid] = PhyloTree({'name': str(root_taxid)})
            current_parent = nodes[root_taxid]
            for tid in subtree:
                if tid in visited:
                    current_parent = nodes[tid].up  # closing a subtree
                else:
                    visited.add(tid)
                    nodes[tid] = PhyloTree({'name': str(tid)})
                    current_parent.add_child(nodes[tid])
                    if tid not in leaves:
                        current_parent = nodes[tid]
            root = nodes[root_taxid]
        else:
            # Several taxids: build the tree by merging their lineage tracks.
            taxids = set(map(int, taxids))
            sp2track = {}
            elem2node = {}
            id2lineage = self.get_lineage_translator(taxids)
            all_taxids = set()
            for lineage in id2lineage.values():
                all_taxids.update(lineage)
            id2rank = self.get_rank(all_taxids)
            for sp in taxids:
                track = []
                lineage = id2lineage[sp]

                for elem in lineage:
                    if elem not in elem2node:
                        node = elem2node.setdefault(elem, PhyloTree())
                        node.name = str(elem)
                        node.taxid = elem
                        node.add_prop("rank", str(id2rank.get(int(elem), "no rank")))
                    else:
                        node = elem2node[elem]
                    track.append(node)
                sp2track[sp] = track
            # generate parent child relationships
            for sp, track in sp2track.items():
                parent = None
                for elem in track:
                    if parent and elem not in parent.children:
                        parent.add_child(elem)
                    if rank_limit and elem.props.get('rank') == rank_limit:
                        break
                    parent = elem
            root = elem2node[1]  # taxid 1 is the taxonomy root

        #remove onechild-nodes
        if not intermediate_nodes:
            for n in root.descendants():
                if len(n.children) == 1 and int(n.name) not in taxids:
                    n.delete(prevent_nondicotomic=False)

        if len(root.children) == 1:
            tree = root.children[0].detach()
        else:
            tree = root

        if collapse_subspecies:
            to_detach = []
            for node in tree.traverse():
                if node.props.get('rank') == "species":
                    to_detach.extend(node.children)
            for n in to_detach:
                n.detach()

        if annotate:
            self.annotate_tree(tree)

        return tree

    def annotate_tree(self, t, taxid_attr="name", tax2name=None,
                      tax2track=None, tax2rank=None, ignore_unclassified=False):
        """Annotate a tree containing taxids as leaf names.

        The annotation adds the properties: 'taxid', 'sci_name',
        'lineage', 'named_lineage' and 'rank'.

        :param t: a Tree (or Tree derived) instance.
        :param name taxid_attr: Allows to set a custom node attribute
            containing the taxid number associated to each node (i.e.
            species in PhyloTree instances).
        :param tax2name,tax2track,tax2rank: Use these arguments to
            provide pre-calculated dictionaries providing translation
            from taxid number and names,track lineages and ranks.
        """
        taxids = set()
        for n in t.traverse():
            try:
                tid = int(getattr(n, taxid_attr, n.props.get(taxid_attr)))
            except (ValueError, AttributeError, TypeError):
                pass  # node has no usable taxid; skip it
            else:
                taxids.add(tid)
        # Resolve any taxid that NCBI has merged into a different one.
        # (A dead `merged_conversion = {}` assignment was removed here.)
        taxids, merged_conversion = self._translate_merged(taxids)

        # Refresh the provided dictionaries if any taxid is missing in them.
        if not tax2name or taxids - set(map(int, list(tax2name.keys()))):
            tax2name = self.get_taxid_translator(taxids)
        if not tax2track or taxids - set(map(int, list(tax2track.keys()))):
            tax2track = self.get_lineage_translator(taxids)

        all_taxid_codes = set([_tax for _lin in list(tax2track.values()) for _tax in _lin])
        extra_tax2name = self.get_taxid_translator(list(all_taxid_codes - set(tax2name.keys())))
        tax2name.update(extra_tax2name)

        tax2common_name = self.get_common_names(tax2name.keys())

        if not tax2rank:
            tax2rank = self.get_rank(list(tax2name.keys()))

        n2leaves = t.get_cached_content()

        for n in t.traverse('postorder'):
            try:
                node_taxid = int(getattr(n, taxid_attr, n.props.get(taxid_attr)))
            except (ValueError, AttributeError, TypeError):
                node_taxid = None

            n.add_prop('taxid', node_taxid)
            if node_taxid:
                if node_taxid in merged_conversion:
                    node_taxid = merged_conversion[node_taxid]
                n.add_props(sci_name = tax2name.get(node_taxid, getattr(n, taxid_attr, n.props.get(taxid_attr, ''))),
                            common_name = tax2common_name.get(node_taxid, ''),
                            lineage = tax2track.get(node_taxid, []),
                            rank = tax2rank.get(node_taxid, 'Unknown'),
                            named_lineage = [tax2name.get(tax, str(tax)) for tax in tax2track.get(node_taxid, [])])
            elif n.is_leaf:
                # Leaf without a valid taxid: annotate with placeholders.
                n.add_props(sci_name = getattr(n, taxid_attr, n.props.get(taxid_attr, 'NA')),
                            common_name = '',
                            lineage = [],
                            rank = 'Unknown',
                            named_lineage = [])
            else:
                # Internal node: infer its taxid from the leaves below it.
                if ignore_unclassified:
                    vectors = [lf.props.get('lineage') for lf in n2leaves[n] if lf.props.get('lineage')]
                else:
                    vectors = [lf.props.get('lineage') for lf in n2leaves[n]]
                lineage = self._common_lineage(vectors)
                ancestor = lineage[-1]
                n.add_props(sci_name = tax2name.get(ancestor, str(ancestor)),
                            common_name = tax2common_name.get(ancestor, ''),
                            taxid = ancestor,
                            lineage = lineage,
                            rank = tax2rank.get(ancestor, 'Unknown'),
                            named_lineage = [tax2name.get(tax, str(tax)) for tax in lineage])

        return tax2name, tax2track, tax2rank

    def _common_lineage(self, vectors):
        """Return the lineage common to all the given lineage vectors.

        Taxids present in every vector are kept, sorted by their
        earliest position; [""] is returned if there is none.
        """
        occurrence = defaultdict(int)
        pos = defaultdict(set)
        for v in vectors:
            for i, taxid in enumerate(v):
                occurrence[taxid] += 1
                pos[taxid].add(i)

        common = [taxid for taxid, ocu in occurrence.items() if ocu == len(vectors)]
        if not common:
            return [""]
        else:
            sorted_lineage = sorted(common, key=lambda x: min(pos[x]))
            return sorted_lineage

    def get_broken_branches(self, t, taxa_lineages, n2content=None):
        """Returns a list of NCBI lineage names that are not monophyletic in the
        provided tree, as well as the list of affected branches and their size.

        CURRENTLY EXPERIMENTAL
        """
        if not n2content:
            n2content = t.get_cached_content()

        # Map each taxid to the set of leaves whose lineage contains it.
        tax2node = defaultdict(set)

        unknown = set()
        for leaf in t.iter_leaves():
            if leaf.sci_name.lower() != "unknown":
                lineage = taxa_lineages[leaf.taxid]
                for index, tax in enumerate(lineage):
                    tax2node[tax].add(leaf)
            else:
                unknown.add(leaf)

        broken_branches = defaultdict(set)
        broken_clades = set()
        for tax, leaves in tax2node.items():
            if len(leaves) > 1:
                common = t.get_common_ancestor(leaves)
            else:
                common = list(leaves)[0]
            # A taxon is broken if its ancestor holds extra (known) leaves.
            if (leaves ^ set(n2content[common])) - unknown:
                broken_branches[common].add(tax)
                broken_clades.add(tax)

        broken_clade_sizes = [len(tax2node[tax]) for tax in broken_clades]

        return broken_branches, broken_clades, broken_clade_sizes
def load_ncbi_tree_from_dump(tar):
    """Build an ete Tree with the whole NCBI taxonomy from a taxdump tar.

    :param tar: An open tarfile containing names.dmp and nodes.dmp.
    :return: Tuple (tree, synonyms) with the taxonomy tree and the set
        of (taxid, name) synonym pairs.
    """
    from .. import Tree
    parent2child = {}
    name2node = {}
    node2taxname = {}
    synonyms = set()
    node2common = {}
    print("Loading node names...")
    # Name classes that count as synonyms (hoisted out of the loop; the
    # unused name2rank dict was also removed).
    synonym_types = {"synonym", "equivalent name", "genbank equivalent name",
                     "anamorph", "genbank synonym", "genbank anamorph",
                     "teleomorph"}
    unique_nocase_synonyms = set()
    for line in tar.extractfile("names.dmp"):
        line = str(line.decode())
        fields = [_f.strip() for _f in line.split("|")]
        nodename = fields[0]
        name_type = fields[3].lower()
        taxname = fields[1]

        # Clean up tax names so we make sure the don't include quotes. See https://github.com/etetoolkit/ete/issues/469
        taxname = taxname.rstrip('"').lstrip('"')

        if name_type == "scientific name":
            node2taxname[nodename] = taxname
        if name_type == "genbank common name":
            node2common[nodename] = taxname
        elif name_type in synonym_types:
            # Keep track synonyms, but ignore duplicate case-insensitive names. See https://github.com/etetoolkit/ete/issues/469
            synonym_key = (nodename, taxname.lower())
            if synonym_key not in unique_nocase_synonyms:
                unique_nocase_synonyms.add(synonym_key)
                synonyms.add((nodename, taxname))

    print(len(node2taxname), "names loaded.")
    print(len(synonyms), "synonyms loaded.")

    print("Loading nodes...")
    for line in tar.extractfile("nodes.dmp"):
        line = str(line.decode())
        fields = line.split("|")
        nodename = fields[0].strip()
        parentname = fields[1].strip()
        n = Tree()
        n.name = nodename
        n.add_prop('taxname', node2taxname[nodename])
        if nodename in node2common:
            # BUG FIX: store the actual common name; the original stored
            # the scientific name (node2taxname) under 'common_name'.
            n.add_prop('common_name', node2common[nodename])
        n.add_prop('rank', fields[2].strip())
        parent2child[nodename] = parentname
        name2node[nodename] = n
    print(len(name2node), "nodes loaded.")

    print("Linking nodes...")
    for node in name2node:
        if node == "1":
            t = name2node[node]  # taxid 1 is the root of the taxonomy
        else:
            parent = parent2child[node]
            parent_node = name2node[parent]
            parent_node.add_child(name2node[node])
    print("Tree is loaded.")
    return t, synonyms
def generate_table(t):
    """Write one row per tree node into a "taxa.tab" file in the cwd.

    Each row holds: taxid, parent taxid, scientific name, common name,
    rank and the comma-joined lineage track up to the root.
    """
    with open("taxa.tab", "w") as out:
        for j, node in enumerate(t.traverse()):
            if j % 1000 == 0:
                print("\r", j, "generating entries...", end=' ')

            # Collect the lineage by walking up to the root.
            lineage = []
            ancestor = node
            while ancestor:
                lineage.append(ancestor.name)
                ancestor = ancestor.up

            parent_name = node.up.name if node.up else ""

            fields = [node.name,
                      parent_name,
                      node.props.get('taxname'),
                      node.props.get("common_name", ''),
                      node.props.get("rank"),
                      ','.join(lineage)]
            print('\t'.join(fields), file=out)
def update_db(dbfile, targz_file=None):
    """Rebuild the sqlite taxonomy database from a taxdump.tar.gz file.

    :param dbfile: Path where the sqlite database will be written.
    :param targz_file: Local taxdump.tar.gz to use; if None, the latest
        one is downloaded from the NCBI site first.
    """
    basepath = os.path.split(dbfile)[0]
    if basepath and not os.path.exists(basepath):
        os.makedirs(basepath)

    if not targz_file:
        update_local_taxdump(DEFAULT_TAXDUMP)
        targz_file = DEFAULT_TAXDUMP

    # Use context managers so the tar and pickle files are always closed
    # (the original leaked both handles).
    with tarfile.open(targz_file, 'r') as tar:
        t, synonyms = load_ncbi_tree_from_dump(tar)

        # Cache the pre/post-order traversal used by get_descendant_taxa().
        prepostorder = [int(node.name) for post, node in t.iter_prepostorder()]
        with open(dbfile+'.traverse.pkl', "wb") as fout:
            pickle.dump(prepostorder, fout, 2)

        print("Updating database: %s ..." %dbfile)
        generate_table(t)

        with open("syn.tab", "w") as SYN:
            SYN.write('\n'.join(["%s\t%s" %(v[0],v[1]) for v in synonyms]))

        with open("merged.tab", "w") as merged:
            for line in tar.extractfile("merged.dmp"):
                line = str(line.decode())
                out_line = '\t'.join([_f.strip() for _f in line.split('|')[:2]])
                merged.write(out_line+'\n')

    upload_data(dbfile)

    # Remove the intermediate files portably (was os.system("rm ...")).
    for fname in ("syn.tab", "merged.tab", "taxa.tab"):
        try:
            os.remove(fname)
        except FileNotFoundError:
            pass
def update_local_taxdump(fname=DEFAULT_TAXDUMP):
    """Update contents of file fname with taxdump.tar.gz from the NCBI site."""
    url = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'

    def fetch(u):
        # Fail loudly on HTTP errors; the original would silently write
        # an error page as the taxdump (or break the md5 comparison).
        response = requests.get(u)
        response.raise_for_status()
        return response

    if not os.path.exists(fname):
        print(f'Downloading {fname} from {url} ...')
        with open(fname, 'wb') as f:
            f.write(fetch(url).content)
    else:
        # Compare our checksum with the remote one to see if a fresh
        # download is needed.
        with open(fname, 'rb') as f:
            md5_local = md5(f.read()).hexdigest()
        md5_remote = fetch(url + '.md5').text.split()[0]

        if md5_local != md5_remote:
            print(f'Updating {fname} from {url} ...')
            with open(fname, 'wb') as f:
                f.write(fetch(url).content)
        else:
            print(f'File {fname} is already up-to-date with {url} .')
def upload_data(dbfile):
    """Create the sqlite schema at dbfile and load the generated .tab files.

    Expects syn.tab, merged.tab and taxa.tab in the current directory
    (written by update_db() / generate_table()).
    """
    print()
    print('Uploading to', dbfile)
    basepath = os.path.split(dbfile)[0]
    if basepath and not os.path.exists(basepath):
        os.mkdir(basepath)

    db = sqlite3.connect(dbfile)

    create_cmd = """
    DROP TABLE IF EXISTS stats;
    DROP TABLE IF EXISTS species;
    DROP TABLE IF EXISTS synonym;
    DROP TABLE IF EXISTS merged;
    CREATE TABLE stats (version INT PRIMARY KEY);
    CREATE TABLE species (taxid INT PRIMARY KEY, parent INT, spname VARCHAR(50) COLLATE NOCASE, common VARCHAR(50) COLLATE NOCASE, rank VARCHAR(50), track TEXT);
    CREATE TABLE synonym (taxid INT,spname VARCHAR(50) COLLATE NOCASE, PRIMARY KEY (spname, taxid));
    CREATE TABLE merged (taxid_old INT, taxid_new INT);
    CREATE INDEX spname1 ON species (spname COLLATE NOCASE);
    CREATE INDEX spname2 ON synonym (spname COLLATE NOCASE);
    """
    for cmd in create_cmd.split(';'):
        db.execute(cmd)
    print()

    db.execute("INSERT INTO stats (version) VALUES (%d);" %DB_VERSION)
    db.commit()

    # Load each .tab file inside a `with` block so the handles are closed
    # (the original leaked all three file objects).
    with open("syn.tab") as fin:
        for i, line in enumerate(fin):
            if i % 5000 == 0 :
                print('\rInserting synonyms: %6d' % i, end=' ', file=sys.stderr)
                sys.stderr.flush()
            taxid, spname = line.strip('\n').split('\t')
            db.execute("INSERT INTO synonym (taxid, spname) VALUES (?, ?);",
                       (taxid, spname))
    print()
    db.commit()

    with open("merged.tab") as fin:
        for i, line in enumerate(fin):
            if i % 5000 == 0 :
                print('\rInserting taxid merges: %6d' % i, end=' ', file=sys.stderr)
                sys.stderr.flush()
            taxid_old, taxid_new = line.strip('\n').split('\t')
            db.execute("INSERT INTO merged (taxid_old, taxid_new) VALUES (?, ?);",
                       (taxid_old, taxid_new))
    print()
    db.commit()

    with open("taxa.tab") as fin:
        for i, line in enumerate(fin):
            if i % 5000 == 0 :
                print('\rInserting taxids: %6d' % i, end=' ', file=sys.stderr)
                sys.stderr.flush()
            taxid, parentid, spname, common, rank, lineage = line.strip('\n').split('\t')
            db.execute('INSERT INTO species (taxid, parent, spname, common, rank, track) '
                       'VALUES (?, ?, ?, ?, ?, ?);',
                       (taxid, parentid, spname, common, rank, lineage))
    print()
    db.commit()
    db.close()  # release the connection once everything is committed
if __name__ == "__main__":
    # Small usage demo / smoke test. Requires a local NCBI taxonomy
    # database (it will be downloaded on first use).
    ncbi = NCBITaxa()

    a = ncbi.get_descendant_taxa("hominidae")
    print(a)
    print(ncbi.get_common_names(a))
    print(ncbi.get_topology(a))
    b = ncbi.get_descendant_taxa("homo", intermediate_nodes=True, collapse_subspecies=True)
    print(ncbi.get_taxid_translator(b))
    print(ncbi.get_common_names(b))
    #ncbi.update_taxonomy_database()