Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/ncbi_taxonomy/ncbiquery.py: 29%

508 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-08-07 10:27 +0200

1#!/usr/bin/env python3 

2 

3import sys 

4import os 

5import pickle 

6from collections import defaultdict, Counter 

7import requests 

8from hashlib import md5 

9 

10import sqlite3 

11import math 

12import tarfile 

13import warnings 

14 

15from ete4 import ETE_DATA_HOME, update_ete_data 

16 

17 

18__all__ = ["NCBITaxa", "is_taxadb_up_to_date"] 

19 

# Version stamp stored in the sqlite "stats" table; bumped whenever the
# database schema/format changes so outdated files can be detected.
DB_VERSION = 2
# Default locations (inside ete's data directory) for the sqlite taxonomy
# database and for the raw NCBI taxdump archive it is built from.
DEFAULT_TAXADB = ETE_DATA_HOME + '/taxa.sqlite'
DEFAULT_TAXDUMP = ETE_DATA_HOME + '/taxdump.tar.gz'

23 

24 

25def is_taxadb_up_to_date(dbfile=DEFAULT_TAXADB): 

26 """Return True if a valid and up-to-date taxa.sqlite database exists. 

27 

28 If `dbfile` is not specified, DEFAULT_TAXADB is assumed. 

29 """ 

30 db = sqlite3.connect(dbfile) 

31 

32 try: 

33 version = db.execute('SELECT version FROM stats;').fetchone()[0] 

34 except (sqlite3.OperationalError, ValueError, IndexError, TypeError): 

35 version = None 

36 

37 db.close() 

38 

39 return version == DB_VERSION 

40 

41 

class NCBITaxa:
    """
    A local transparent connector to the NCBI taxonomy database.
    """

    def __init__(self, dbfile=None, taxdump_file=None,
                 memory=False, update=True):
        """Open and keep a connection to the NCBI taxonomy database.

        If it is not present in the system, it will download the
        database from the NCBI site first, and convert it to ete's
        format.

        :param dbfile: Path to the sqlite database (DEFAULT_TAXADB if None).
        :param taxdump_file: Optional local taxdump.tar.gz to build from.
        :param memory: If True, work on an in-memory copy of the database.
        :param update: If True, rebuild the db when its format is outdated.
        """
        self.dbfile = dbfile or DEFAULT_TAXADB

        # An explicitly given taxdump file always triggers a (re)build.
        if taxdump_file:
            self.update_taxonomy_database(taxdump_file)

        # NOTE(review): this auto-builds only when `dbfile` differs from
        # DEFAULT_TAXADB; with the default (dbfile=None) the comparison is
        # still True, but passing dbfile=DEFAULT_TAXADB explicitly skips the
        # download and fails below -- confirm this is intended.
        if dbfile != DEFAULT_TAXADB and not os.path.exists(self.dbfile):
            print('NCBI database not present yet (first time used?)',
                  file=sys.stderr)
            self.update_taxonomy_database(taxdump_file)

        if not os.path.exists(self.dbfile):
            raise ValueError("Cannot open taxonomy database: %s" % self.dbfile)

        self.db = None  # sqlite3 connection, set by _connect()
        self._connect()

        # Rebuild when the stored version does not match DB_VERSION.
        if not is_taxadb_up_to_date(self.dbfile) and update:
            print('NCBI database format is outdated. Upgrading',
                  file=sys.stderr)
            self.update_taxonomy_database(taxdump_file)

        if memory:
            # Copy the file-based database into a faster in-memory one.
            filedb = self.db
            self.db = sqlite3.connect(':memory:')
            filedb.backup(self.db)

80 

    def update_taxonomy_database(self, taxdump_file=None):
        """Update the ncbi taxonomy database.

        It does it by downloading and parsing the latest
        taxdump.tar.gz file from the NCBI site.

        :param taxdump_file: Alternative location of the
            taxdump.tax.gz file.
        """
        update_db(self.dbfile, taxdump_file)

    def _connect(self):
        # (Re)open the sqlite connection to the taxonomy database file.
        self.db = sqlite3.connect(self.dbfile)

94 

95 def _translate_merged(self, all_taxids): 

96 conv_all_taxids = set((list(map(int, all_taxids)))) 

97 

98 cmd = ('SELECT taxid_old, taxid_new ' 

99 'FROM merged WHERE taxid_old IN (%s)' % 

100 ','.join(map(str, all_taxids))) 

101 

102 result = self.db.execute(cmd) 

103 

104 conversion = {} 

105 for old, new in result.fetchall(): 

106 conv_all_taxids.discard(int(old)) 

107 conv_all_taxids.add(int(new)) 

108 conversion[int(old)] = int(new) 

109 

110 return conv_all_taxids, conversion 

111 

    def get_fuzzy_name_translation(self, name, sim=0.9):
        """Return taxid, species name and match score from the NCBI database.

        The results are for the best match for name in the NCBI
        database of taxa names, with a word similarity >= `sim`.

        :param name: Species name (does not need to be exact).
        :param 0.9 sim: Min word similarity to report a match (from 0 to 1).
        """
        import sqlite3.dbapi2 as dbapi2

        # A dedicated connection is used so the Levenshtein extension can
        # be loaded without affecting the main connection.
        _db = dbapi2.connect(self.dbfile)
        _db.enable_load_extension(True)
        module_path = os.path.split(os.path.realpath(__file__))[0]
        _db.execute("SELECT load_extension('%s/%s')" %
                    (module_path, "SQLite-Levenshtein/levenshtein.sqlext"))

        print("Trying fuzzy search for %s" % name)
        # Maximum number of edits allowed for the requested similarity.
        maxdiffs = math.ceil(len(name) * (1-sim))
        # NOTE(review): `name` is interpolated directly into the SQL; a name
        # containing a double quote breaks (or injects into) the query.
        cmd = (f'SELECT taxid, spname, LEVENSHTEIN(spname, "{name}") AS sim '
               f'FROM species WHERE sim <= {maxdiffs} ORDER BY sim LIMIT 1;')

        # Defaults: no match, worst possible score (every character differs).
        taxid, spname, score = None, None, len(name)
        result = _db.execute(cmd)
        try:
            taxid, spname, score = result.fetchone()
        except TypeError:
            # fetchone() returned None: no hit in species, try synonyms.
            cmd = (
                f'SELECT taxid, spname, LEVENSHTEIN(spname, "{name}") AS sim '
                f'FROM synonym WHERE sim <= {maxdiffs} ORDER BY sim LIMIT 1;')
            result = _db.execute(cmd)
            try:
                taxid, spname, score = result.fetchone()
            except:  # NOTE(review): bare except silently hides any failure here
                pass
            else:
                taxid = int(taxid)
        else:
            taxid = int(taxid)

        # Normalize the edit distance into a 0..1 similarity score.
        norm_score = 1 - (float(score) / len(name))
        if taxid:
            print(f'FOUND! {spname} taxid:{taxid} score:{score} ({norm_score})')

        return taxid, spname, norm_score

157 

158 def get_rank(self, taxids): 

159 """Return dict with NCBI taxonomy ranks for each list of taxids.""" 

160 all_ids = set(taxids) 

161 all_ids.discard(None) 

162 all_ids.discard("") 

163 

164 query = ','.join('"%s"' % v for v in all_ids) 

165 cmd = 'SELECT taxid, rank FROM species WHERE taxid IN (%s);' % query 

166 result = self.db.execute(cmd) 

167 

168 id2rank = {} 

169 for tax, spname in result.fetchall(): 

170 id2rank[tax] = spname 

171 

172 return id2rank 

173 

174 def get_lineage_translator(self, taxids): 

175 """Return dict with lineage tracks corresponding to the given taxids. 

176 

177 The lineage tracks are a hierarchically sorted list of parent taxids. 

178 """ 

179 all_ids = set(taxids) 

180 all_ids.discard(None) 

181 all_ids.discard("") 

182 

183 query = ','.join('"%s"' % v for v in all_ids) 

184 cmd = 'SELECT taxid, track FROM species WHERE taxid IN (%s);' % query 

185 result = self.db.execute(cmd) 

186 

187 id2lineages = {} 

188 for tax, track in result.fetchall(): 

189 id2lineages[tax] = list(map(int, reversed(track.split(',')))) 

190 

191 return id2lineages 

192 

193 def get_lineage(self, taxid): 

194 """Return lineage track corresponding to the given taxid. 

195 

196 The lineage track is a hierarchically sorted list of parent taxids. 

197 """ 

198 if not taxid: 

199 return None 

200 

201 taxid = int(taxid) 

202 result = self.db.execute(f'SELECT track FROM species WHERE taxid={taxid}') 

203 raw_track = result.fetchone() 

204 if not raw_track: 

205 #perhaps is an obsolete taxid 

206 _, merged_conversion = self._translate_merged([taxid]) 

207 if taxid in merged_conversion: 

208 result = self.db.execute( 

209 'SELECT track FROM species WHERE taxid=%s' % 

210 merged_conversion[taxid]) 

211 raw_track = result.fetchone() 

212 

213 if not raw_track: 

214 raise ValueError(f'Could not find taxid: {taxid}') 

215 else: 

216 warnings.warn('taxid %s was translated into %s' % 

217 (taxid, merged_conversion[taxid])) 

218 

219 track = list(map(int, raw_track[0].split(','))) 

220 return list(reversed(track)) 

221 

222 def get_common_names(self, taxids): 

223 query = ','.join('"%s"' % v for v in taxids) 

224 cmd = 'SELECT taxid, common FROM species WHERE taxid IN (%s);' % query 

225 result = self.db.execute(cmd) 

226 

227 id2name = {} 

228 for tax, common_name in result.fetchall(): 

229 if common_name: 

230 id2name[tax] = common_name 

231 

232 return id2name 

233 

234 def get_taxid_translator(self, taxids, try_synonyms=True): 

235 """Return dict with the scientific names corresponding to the taxids.""" 

236 all_ids = set(map(int, taxids)) 

237 all_ids.discard(None) 

238 all_ids.discard("") 

239 

240 query = ','.join('"%s"' % v for v in all_ids) 

241 cmd = 'SELECT taxid, spname FROM species WHERE taxid IN (%s);' % query 

242 result = self.db.execute(cmd) 

243 

244 id2name = {} 

245 for tax, spname in result.fetchall(): 

246 id2name[tax] = spname 

247 

248 # Any taxid without translation? Let's try in the merged table. 

249 if len(all_ids) != len(id2name) and try_synonyms: 

250 not_found_taxids = all_ids - set(id2name.keys()) 

251 taxids, old2new = self._translate_merged(not_found_taxids) 

252 new2old = {v: k for k,v in old2new.items()} 

253 

254 if old2new: 

255 query = ','.join('"%s"' % v for v in new2old) 

256 cmd = 'SELECT taxid, spname FROM species WHERE taxid IN (%s);' % query 

257 result = self.db.execute(cmd) 

258 for tax, spname in result.fetchall(): 

259 id2name[new2old[tax]] = spname 

260 

261 return id2name 

262 

263 def get_name_translator(self, names): 

264 """Return dict with taxids corresponding to the given scientific names. 

265 

266 Exact name match is required for translation. 

267 """ 

268 name2id = {} 

269 #name2realname = {} 

270 name2origname = {} 

271 for n in names: 

272 name2origname[n.lower()] = n 

273 

274 names = set(name2origname.keys()) 

275 

276 query = ','.join('"%s"' % n for n in name2origname.keys()) 

277 cmd = 'SELECT spname, taxid FROM species WHERE spname IN (%s)' % query 

278 result = self.db.execute('SELECT spname, taxid FROM species WHERE spname IN (%s)' % query) 

279 for sp, taxid in result.fetchall(): 

280 oname = name2origname[sp.lower()] 

281 name2id.setdefault(oname, []).append(taxid) 

282 #name2realname[oname] = sp 

283 missing = names - set([n.lower() for n in name2id.keys()]) 

284 if missing: 

285 query = ','.join('"%s"' % n for n in missing) 

286 result = self.db.execute('SELECT spname, taxid FROM synonym ' 

287 'WHERE spname IN (%s)' % query) 

288 for sp, taxid in result.fetchall(): 

289 oname = name2origname[sp.lower()] 

290 name2id.setdefault(oname, []).append(taxid) 

291 #name2realname[oname] = sp 

292 return name2id 

293 

294 def translate_to_names(self, taxids): 

295 """Return list of scientific names corresponding to taxids.""" 

296 id2name = self.get_taxid_translator(taxids) 

297 names = [] 

298 for sp in taxids: 

299 names.append(id2name.get(sp, sp)) 

300 return names 

301 

    def get_descendant_taxa(self, parent, intermediate_nodes=False,
                            rank_limit=None, collapse_subspecies=False,
                            return_tree=False):
        """Return list of descendant taxids of the given parent.

        Parent can be given as taxid or scientific species name.

        If intermediate_nodes=True, the list will also have the internal nodes.

        :param rank_limit: If given, prune results at that NCBI rank.
        :param collapse_subspecies: If True, collapse nodes under the
            species rank into the species node.
        :param return_tree: If True, return the pruned PhyloTree instead
            of the list of taxids.
        """
        try:
            taxid = int(parent)
        except ValueError:
            # Not a number: interpret the parent as a scientific name.
            try:
                taxid = self.get_name_translator([parent])[parent][0]
            except KeyError:
                raise ValueError('%s not found!' %parent)

        # checks if taxid is a deprecated one, and converts into the right one.
        _, conversion = self._translate_merged([taxid])  #try to find taxid in synonyms table
        if conversion:
            taxid = conversion[taxid]

        # The ".traverse.pkl" cache (written by update_db()) holds the whole
        # taxonomy as a combined pre+post-order taxid list: internal nodes
        # appear twice (entering and leaving), leaves once.
        with open(self.dbfile+".traverse.pkl", "rb") as CACHED_TRAVERSE:
            prepostorder = pickle.load(CACHED_TRAVERSE)
        descendants = {}
        found = 0
        for tid in prepostorder:
            if tid == taxid:
                found += 1
            elif found == 1:
                # Between the two occurrences of taxid: a descendant.
                descendants[tid] = descendants.get(tid, 0) + 1
            elif found == 2:
                break

        if not found:
            raise ValueError("taxid not found:%s" %taxid)
        elif found == 1:
            # Seen only once in the traversal: taxid is a leaf.
            return [taxid]

        if rank_limit or collapse_subspecies or return_tree:
            tree = self.get_topology(list(descendants.keys()), intermediate_nodes=intermediate_nodes, collapse_subspecies=collapse_subspecies, rank_limit=rank_limit)
            if return_tree:
                return tree
            elif intermediate_nodes:
                return list(map(int, [n.name for n in tree.get_descendants()]))
            else:
                return list(map(int, [n.name for n in tree]))

        elif intermediate_nodes:
            return [tid for tid, count in descendants.items()]
        else:
            # count == 1 means the taxid appeared once, i.e. it is a leaf.
            return [tid for tid, count in descendants.items() if count == 1]

354 

    def get_topology(self, taxids, intermediate_nodes=False, rank_limit=None,
                     collapse_subspecies=False, annotate=True):
        """Return the minimal pruned NCBI taxonomy tree containing taxids.

        :param False intermediate_nodes: If True, single child nodes
            representing the complete lineage of leaf nodes are kept.
            Otherwise, the tree is pruned to contain the first common
            ancestor of each group.
        :param None rank_limit: If valid NCBI rank name is provided,
            the tree is pruned at that given level. For instance, use
            rank="species" to get rid of sub-species or strain leaf
            nodes.
        :param False collapse_subspecies: If True, any item under the
            species rank will be collapsed into the species upper
            node.
        :param True annotate: If True, annotate the resulting tree with
            annotate_tree().
        """
        from .. import PhyloTree
        # Replace deprecated taxids with their current equivalents.
        taxids, merged_conversion = self._translate_merged(taxids)
        if len(taxids) == 1:
            # Single taxid: rebuild its complete subtree from the cached
            # pre+post-order traversal written by update_db().
            root_taxid = int(list(taxids)[0])
            with open(self.dbfile+".traverse.pkl", "rb") as CACHED_TRAVERSE:
                prepostorder = pickle.load(CACHED_TRAVERSE)
            descendants = {}
            found = 0
            nodes = {}
            hit = 0
            visited = set()
            start = prepostorder.index(root_taxid)
            try:
                end = prepostorder.index(root_taxid, start+1)
                subtree = prepostorder[start:end+1]
            except ValueError:
                # If root taxid is not found in postorder, must be a tip node
                subtree = [root_taxid]
            # Taxids appearing once in the slice are leaves; internal nodes
            # appear twice (pre- and post-order visits).
            leaves = set(v for v, count in Counter(subtree).items() if count == 1)
            nodes[root_taxid] = PhyloTree({'name': str(root_taxid)})
            current_parent = nodes[root_taxid]
            for tid in subtree:
                if tid in visited:
                    # Second occurrence of an internal node: climb back up.
                    current_parent = nodes[tid].up
                else:
                    visited.add(tid)
                    nodes[tid] = PhyloTree({'name': str(tid)})
                    current_parent.add_child(nodes[tid])
                    if tid not in leaves:
                        current_parent = nodes[tid]
            root = nodes[root_taxid]
        else:
            # Several taxids: build the union of their lineage tracks.
            taxids = set(map(int, taxids))
            sp2track = {}
            elem2node = {}
            id2lineage = self.get_lineage_translator(taxids)
            all_taxids = set()
            for lineage in id2lineage.values():
                all_taxids.update(lineage)
            id2rank = self.get_rank(all_taxids)
            # Create one node per taxid appearing in any lineage.
            for sp in taxids:
                track = []
                lineage = id2lineage[sp]

                for elem in lineage:
                    if elem not in elem2node:
                        node = elem2node.setdefault(elem, PhyloTree())
                        node.name = str(elem)
                        node.taxid = elem
                        node.add_prop("rank", str(id2rank.get(int(elem), "no rank")))
                    else:
                        node = elem2node[elem]
                    track.append(node)
                sp2track[sp] = track
            # generate parent child relationships
            for sp, track in sp2track.items():
                parent = None
                for elem in track:
                    if parent and elem not in parent.children:
                        parent.add_child(elem)
                    # Stop descending once the rank limit is reached.
                    if rank_limit and elem.props.get('rank') == rank_limit:
                        break
                    parent = elem
            root = elem2node[1]  # taxid 1 is the root of the NCBI taxonomy

        #remove onechild-nodes
        if not intermediate_nodes:
            for n in root.descendants():
                if len(n.children) == 1 and int(n.name) not in taxids:
                    n.delete(prevent_nondicotomic=False)

        if len(root.children) == 1:
            tree = root.children[0].detach()
        else:
            tree = root

        if collapse_subspecies:
            # Detach everything hanging below a "species" node.
            to_detach = []
            for node in tree.traverse():
                if node.props.get('rank') == "species":
                    to_detach.extend(node.children)
            for n in to_detach:
                n.detach()

        if annotate:
            self.annotate_tree(tree)

        return tree

459 

    def annotate_tree(self, t, taxid_attr="name", tax2name=None,
                      tax2track=None, tax2rank=None, ignore_unclassified=False):
        """Annotate a tree containing taxids as leaf names.

        The annotation adds the properties: 'taxid', 'sci_name',
        'lineage', 'named_lineage' and 'rank'.

        :param t: a Tree (or Tree derived) instance.
        :param name taxid_attr: Allows to set a custom node attribute
            containing the taxid number associated to each node (i.e.
            species in PhyloTree instances).
        :param tax2name,tax2track,tax2rank: Use these arguments to
            provide pre-calculated dictionaries providing translation
            from taxid number and names,track lineages and ranks.
        :param ignore_unclassified: If True, leaves without a lineage are
            skipped when inferring internal-node lineages.
        :return: The (possibly rebuilt) tax2name, tax2track, tax2rank dicts.
        """
        # Collect every taxid present in the tree.
        taxids = set()
        for n in t.traverse():
            try:
                tid = int(getattr(n, taxid_attr, n.props.get(taxid_attr)))
            except (ValueError, AttributeError, TypeError):
                pass  # node without a (numeric) taxid
            else:
                taxids.add(tid)
        merged_conversion = {}

        # Replace deprecated taxids, remembering the old -> new mapping.
        taxids, merged_conversion = self._translate_merged(taxids)

        # (Re)build the translation dicts if not given or incomplete.
        if not tax2name or taxids - set(map(int, list(tax2name.keys()))):
            tax2name = self.get_taxid_translator(taxids)
        if not tax2track or taxids - set(map(int, list(tax2track.keys()))):
            tax2track = self.get_lineage_translator(taxids)

        # Also translate every taxid that appears inside the lineage tracks.
        all_taxid_codes = set([_tax for _lin in list(tax2track.values()) for _tax in _lin])
        extra_tax2name = self.get_taxid_translator(list(all_taxid_codes - set(tax2name.keys())))
        tax2name.update(extra_tax2name)
        tax2common_name = self.get_common_names(tax2name.keys())

        if not tax2rank:
            tax2rank = self.get_rank(list(tax2name.keys()))

        n2leaves = t.get_cached_content()

        # Postorder: leaves are annotated before their ancestors, so
        # internal nodes can use their leaves' lineages.
        for n in t.traverse('postorder'):
            try:
                node_taxid = int(getattr(n, taxid_attr, n.props.get(taxid_attr)))
            except (ValueError, AttributeError, TypeError):
                node_taxid = None

            n.add_prop('taxid', node_taxid)
            if node_taxid:
                if node_taxid in merged_conversion:
                    node_taxid = merged_conversion[node_taxid]
                n.add_props(sci_name = tax2name.get(node_taxid, getattr(n, taxid_attr, n.props.get(taxid_attr, ''))),
                            common_name = tax2common_name.get(node_taxid, ''),
                            lineage = tax2track.get(node_taxid, []),
                            rank = tax2rank.get(node_taxid, 'Unknown'),
                            named_lineage = [tax2name.get(tax, str(tax)) for tax in tax2track.get(node_taxid, [])])
            elif n.is_leaf:
                # Leaf without a usable taxid: annotate with placeholders.
                n.add_props(sci_name = getattr(n, taxid_attr, n.props.get(taxid_attr, 'NA')),
                            common_name = '',
                            lineage = [],
                            rank = 'Unknown',
                            named_lineage = [])
            else:
                # Internal node: its taxon is the deepest one common to
                # the lineages of all its leaves.
                if ignore_unclassified:
                    vectors = [lf.props.get('lineage') for lf in n2leaves[n] if lf.props.get('lineage')]
                else:
                    vectors = [lf.props.get('lineage') for lf in n2leaves[n]]
                lineage = self._common_lineage(vectors)
                ancestor = lineage[-1]
                n.add_props(sci_name = tax2name.get(ancestor, str(ancestor)),
                            common_name = tax2common_name.get(ancestor, ''),
                            taxid = ancestor,
                            lineage = lineage,
                            rank = tax2rank.get(ancestor, 'Unknown'),
                            named_lineage = [tax2name.get(tax, str(tax)) for tax in lineage])

        return tax2name, tax2track, tax2rank

538 

539 def _common_lineage(self, vectors): 

540 occurrence = defaultdict(int) 

541 pos = defaultdict(set) 

542 for v in vectors: 

543 for i, taxid in enumerate(v): 

544 occurrence[taxid] += 1 

545 pos[taxid].add(i) 

546 

547 common = [taxid for taxid, ocu in occurrence.items() if ocu == len(vectors)] 

548 if not common: 

549 return [""] 

550 else: 

551 sorted_lineage = sorted(common, key=lambda x: min(pos[x])) 

552 return sorted_lineage 

553 

    def get_broken_branches(self, t, taxa_lineages, n2content=None):
        """Returns a list of NCBI lineage names that are not monophyletic in the
        provided tree, as well as the list of affected branches and their size.

        CURRENTLY EXPERIMENTAL

        :param t: Tree whose leaves carry `sci_name` and `taxid` attributes
            (as set by annotate_tree()).
        :param taxa_lineages: Dict {taxid: lineage list} for the leaf taxids.
        :param n2content: Optional precomputed {node: leaves} cache.
        """
        if not n2content:
            n2content = t.get_cached_content()

        # Map every taxon appearing in any leaf lineage to its leaves.
        tax2node = defaultdict(set)

        unknown = set()
        for leaf in t.iter_leaves():
            if leaf.sci_name.lower() != "unknown":
                lineage = taxa_lineages[leaf.taxid]
                for index, tax in enumerate(lineage):
                    tax2node[tax].add(leaf)
            else:
                # Unclassified leaves are excluded from the monophyly test.
                unknown.add(leaf)

        broken_branches = defaultdict(set)
        broken_clades = set()
        for tax, leaves in tax2node.items():
            if len(leaves) > 1:
                common = t.get_common_ancestor(leaves)
            else:
                common = list(leaves)[0]
            # A taxon is broken when its common ancestor also contains
            # known leaves outside the taxon (symmetric difference).
            if (leaves ^ set(n2content[common])) - unknown:
                broken_branches[common].add(tax)
                broken_clades.add(tax)

        broken_clade_sizes = [len(tax2node[tax]) for tax in broken_clades]

        return broken_branches, broken_clades, broken_clade_sizes

588 

589 

def load_ncbi_tree_from_dump(tar):
    """Build the full NCBI taxonomy tree from an open taxdump tar archive.

    :param tar: Open tarfile containing "names.dmp" and "nodes.dmp".
    :return: Tuple (root Tree node, set of (taxid, synonym name) pairs).
    """
    from .. import Tree
    parent2child = {}
    name2node = {}
    node2taxname = {}
    synonyms = set()
    node2common = {}
    print("Loading node names...")
    unique_nocase_synonyms = set()
    for line in tar.extractfile("names.dmp"):
        line = str(line.decode())
        fields = [_f.strip() for _f in line.split("|")]
        nodename = fields[0]
        name_type = fields[3].lower()
        taxname = fields[1]

        # Clean up tax names so we make sure the don't include quotes. See https://github.com/etetoolkit/ete/issues/469
        taxname = taxname.rstrip('"').lstrip('"')

        if name_type == "scientific name":
            node2taxname[nodename] = taxname
        if name_type == "genbank common name":
            node2common[nodename] = taxname
        elif name_type in set(["synonym", "equivalent name", "genbank equivalent name",
                               "anamorph", "genbank synonym", "genbank anamorph", "teleomorph"]):

            # Keep track synonyms, but ignore duplicate case-insensitive names. See https://github.com/etetoolkit/ete/issues/469
            synonym_key = (nodename, taxname.lower())
            if synonym_key not in unique_nocase_synonyms:
                unique_nocase_synonyms.add(synonym_key)
                synonyms.add((nodename, taxname))

    print(len(node2taxname), "names loaded.")
    print(len(synonyms), "synonyms loaded.")

    print("Loading nodes...")
    for line in tar.extractfile("nodes.dmp"):
        line = str(line.decode())
        fields = line.split("|")
        nodename = fields[0].strip()
        parentname = fields[1].strip()
        n = Tree()
        n.name = nodename
        n.add_prop('taxname', node2taxname[nodename])
        if nodename in node2common:
            # Fix: store the actual common name; the original stored the
            # scientific name (node2taxname) here by mistake.
            n.add_prop('common_name', node2common[nodename])
        n.add_prop('rank', fields[2].strip())
        parent2child[nodename] = parentname
        name2node[nodename] = n
    print(len(name2node), "nodes loaded.")

    print("Linking nodes...")
    for node in name2node:
        if node == "1":
            t = name2node[node]  # taxid 1 is the root of the taxonomy
        else:
            parent = parent2child[node]
            parent_node = name2node[parent]
            parent_node.add_child(name2node[node])
    print("Tree is loaded.")
    return t, synonyms

653 

def generate_table(t):
    """Dump the taxonomy tree `t` to the tab-separated file "taxa.tab".

    Each row holds: taxid, parent taxid, scientific name, common name,
    rank, and the comma-separated track from the node up to the root.
    """
    with open("taxa.tab", "w") as out:
        for j, node in enumerate(t.traverse()):
            if j % 1000 == 0:
                print("\r",j,"generating entries...", end=' ')

            # Collect the lineage track from this node up to the root.
            track = []
            ancestor = node
            while ancestor:
                track.append(ancestor.name)
                ancestor = ancestor.up

            parent_name = node.up.name if node.up else ""

            fields = [node.name, parent_name, node.props.get('taxname'),
                      node.props.get("common_name", ''), node.props.get("rank"),
                      ','.join(track)]
            print('\t'.join(fields), file=out)

672 

def update_db(dbfile, targz_file=None):
    """(Re)build the sqlite taxonomy database from a taxdump archive.

    :param dbfile: Path of the sqlite database to create/update.
    :param targz_file: Optional local taxdump.tar.gz; if not given, the
        default archive is downloaded/refreshed from the NCBI site.
    """
    basepath = os.path.split(dbfile)[0]
    if basepath and not os.path.exists(basepath):
        os.makedirs(basepath)

    if not targz_file:
        update_local_taxdump(DEFAULT_TAXDUMP)
        targz_file = DEFAULT_TAXDUMP

    # Fix: close the tar archive when done (the original leaked it).
    with tarfile.open(targz_file, 'r') as tar:
        t, synonyms = load_ncbi_tree_from_dump(tar)

        # Cache the pre+post-order traversal used by get_descendant_taxa()
        # and get_topology(). (Fix: close the pickle file handle.)
        prepostorder = [int(node.name) for post, node in t.iter_prepostorder()]
        with open(dbfile + '.traverse.pkl', 'wb') as fout:
            pickle.dump(prepostorder, fout, 2)

        print("Updating database: %s ..." %dbfile)
        generate_table(t)

        with open("syn.tab", "w") as SYN:
            SYN.write('\n'.join(["%s\t%s" %(v[0],v[1]) for v in synonyms]))

        with open("merged.tab", "w") as merged:
            for line in tar.extractfile("merged.dmp"):
                line = str(line.decode())
                out_line = '\t'.join([_f.strip() for _f in line.split('|')[:2]])
                merged.write(out_line+'\n')

    upload_data(dbfile)

    # Remove the intermediate files (portable replacement for `rm`).
    for tmpfile in ("syn.tab", "merged.tab", "taxa.tab"):
        os.remove(tmpfile)

702 

703 

def update_local_taxdump(fname=DEFAULT_TAXDUMP):
    """Update contents of file fname with taxdump.tar.gz from the NCBI site.

    Downloads the archive if fname does not exist; otherwise downloads it
    only when the remote md5 checksum differs from the local one.
    """
    url = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'

    if not os.path.exists(fname):
        print(f'Downloading {fname} from {url} ...')
        with open(fname, 'wb') as f:
            f.write(requests.get(url).content)
    else:
        # Compare checksums to avoid a useless multi-MB download.
        # (Fix: close the local file handle via "with".)
        with open(fname, 'rb') as f:
            md5_local = md5(f.read()).hexdigest()
        md5_remote = requests.get(url + '.md5').text.split()[0]

        if md5_local != md5_remote:
            print(f'Updating {fname} from {url} ...')
            with open(fname, 'wb') as f:
                f.write(requests.get(url).content)
        else:
            print(f'File {fname} is already up-to-date with {url} .')

722 

723 

def upload_data(dbfile):
    """Create the sqlite database and load the generated .tab files into it.

    Reads "syn.tab", "merged.tab" and "taxa.tab" from the current
    directory (as written by update_db() / generate_table()).
    """
    print()
    print('Uploading to', dbfile)
    basepath = os.path.split(dbfile)[0]
    if basepath and not os.path.exists(basepath):
        # Fix: makedirs handles nested paths (os.mkdir did not), and is
        # consistent with update_db().
        os.makedirs(basepath)

    db = sqlite3.connect(dbfile)

    create_cmd = """
    DROP TABLE IF EXISTS stats;
    DROP TABLE IF EXISTS species;
    DROP TABLE IF EXISTS synonym;
    DROP TABLE IF EXISTS merged;
    CREATE TABLE stats (version INT PRIMARY KEY);
    CREATE TABLE species (taxid INT PRIMARY KEY, parent INT, spname VARCHAR(50) COLLATE NOCASE, common VARCHAR(50) COLLATE NOCASE, rank VARCHAR(50), track TEXT);
    CREATE TABLE synonym (taxid INT,spname VARCHAR(50) COLLATE NOCASE, PRIMARY KEY (spname, taxid));
    CREATE TABLE merged (taxid_old INT, taxid_new INT);
    CREATE INDEX spname1 ON species (spname COLLATE NOCASE);
    CREATE INDEX spname2 ON synonym (spname COLLATE NOCASE);
    """
    for cmd in create_cmd.split(';'):
        db.execute(cmd)
    print()

    db.execute("INSERT INTO stats (version) VALUES (%d);" %DB_VERSION)
    db.commit()

    # Fix: the three .tab files are now opened with "with" so their
    # handles are always released.
    with open("syn.tab") as fin:
        for i, line in enumerate(fin):
            if i % 5000 == 0:
                print('\rInserting synonyms: %6d' % i, end=' ', file=sys.stderr)
                sys.stderr.flush()
            taxid, spname = line.strip('\n').split('\t')
            db.execute("INSERT INTO synonym (taxid, spname) VALUES (?, ?);",
                       (taxid, spname))
    print()
    db.commit()

    with open("merged.tab") as fin:
        for i, line in enumerate(fin):
            if i % 5000 == 0:
                print('\rInserting taxid merges: %6d' % i, end=' ', file=sys.stderr)
                sys.stderr.flush()
            taxid_old, taxid_new = line.strip('\n').split('\t')
            db.execute("INSERT INTO merged (taxid_old, taxid_new) VALUES (?, ?);",
                       (taxid_old, taxid_new))
    print()
    db.commit()

    with open("taxa.tab") as fin:
        for i, line in enumerate(fin):
            if i % 5000 == 0:
                print('\rInserting taxids: %6d' % i, end=' ', file=sys.stderr)
                sys.stderr.flush()
            taxid, parentid, spname, common, rank, lineage = line.strip('\n').split('\t')
            db.execute('INSERT INTO species (taxid, parent, spname, common, rank, track) '
                       'VALUES (?, ?, ?, ?, ?, ?);',
                       (taxid, parentid, spname, common, rank, lineage))
    print()
    db.commit()

    db.close()  # fix: release the database handle

780 

781 

782if __name__ == "__main__": 

783 ncbi = NCBITaxa() 

784 

785 a = ncbi.get_descendant_taxa("hominidae") 

786 print(a) 

787 print(ncbi.get_common_names(a)) 

788 print(ncbi.get_topology(a)) 

789 b = ncbi.get_descendant_taxa("homo", intermediate_nodes=True, collapse_subspecies=True) 

790 print(ncbi.get_taxid_translator(b)) 

791 

792 print(ncbi.get_common_names(b)) 

793 #ncbi.update_taxonomy_database()