multitax
# Package entry point: publishes the installed version and the taxonomy classes.
import importlib.metadata

# Version string read from the metadata of the installed distribution.
__version__ = importlib.metadata.version(__name__)

# Public names of the package.
__all__ = (
    "CustomTx",
    "DummyTx",
    "GreengenesTx",
    "GtdbTx",
    "NcbiTx",
    "OttTx",
    "SilvaTx",
)

from .customtx import CustomTx
from .dummytx import DummyTx
from .greengenestx import GreengenesTx
from .gtdbtx import GtdbTx
from .ncbitx import NcbiTx
from .otttx import OttTx
from .silvatx import SilvaTx
class CustomTx(MultiTax):
    """Taxonomy parsed from user-provided delimited text files."""

    # Columns that must always be present / columns that may be requested.
    _required_cols = ["node", "parent"]
    _possible_cols = ["node", "parent", "rank", "name"]

    def __init__(self, cols: list = None, sep: str = "\t", **kwargs):
        """
        CustomTx()

        Parameters:
        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name". Defaults to ["node", "parent", "rank", "name"].
        * **sep** *[str]*: Separator of fields
        * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`

        Example:

            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
        """
        # A fresh list per call: a list literal in the signature would be
        # shared between all instances.
        if cols is None:
            cols = ["node", "parent", "rank", "name"]

        # Both attributes are set before calling the parent constructor
        # because _parse() reads them.
        self._cols = self._parse_cols(cols)
        self._sep = sep
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """Warn that no translation is implemented and return an empty dict."""
        warnings.warn(
            "Translation between taxonomies ["
            + self.__class__.__name__
            + ","
            + target_tax.__class__.__name__
            + "] not yet implemented."
        )
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse every file handle in `fhs` using the configured columns.

        Returns a tuple (nodes, ranks, names) of dicts keyed by node:
        node -> parent, node -> rank, node -> name. `ranks` and `names`
        stay empty when the respective column was not requested.
        """
        nodes = {}
        ranks = {}
        names = {}

        node_col = self._cols["node"]
        parent_col = self._cols["parent"]
        # None when the optional column was not requested
        name_col = self._cols.get("name")
        rank_col = self._cols.get("rank")

        for fh in fhs.values():
            for line in fh:
                # Handles may yield bytes (e.g. compressed input)
                if not isinstance(line, str):
                    line = line.decode()

                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no fields and would fail on column lookup.
                if not line.strip():
                    continue

                fields = line.rstrip().split(self._sep)
                node = fields[node_col]
                nodes[node] = fields[parent_col]
                if name_col is not None:
                    names[node] = fields[name_col]
                if rank_col is not None:
                    ranks[node] = fields[rank_col]

        return nodes, ranks, names

    def _parse_cols(self, cols):
        """
        Normalize `cols` into a {field: column index} dict and validate it.

        A list (or tuple) is indexed by position; a dict is used as given.
        Raises ValueError if a required column is missing or an unknown
        column is requested.
        """
        if isinstance(cols, (list, tuple)):
            cols = {c: i for i, c in enumerate(cols)}

        for rc in self._required_cols:
            if rc not in cols:
                raise ValueError(f"{rc} is a required column")

        for c in cols:
            if c not in self._possible_cols:
                raise ValueError(
                    f"{c} is not a valid column: " + ",".join(self._possible_cols)
                )

        return cols
def __init__(self, cols: list = None, sep: str = "\t", **kwargs):
    """
    CustomTx()

    Parameters:
    * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name". Defaults to ["node", "parent", "rank", "name"].
    * **sep** *[str]*: Separator of fields
    * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`

    Example:

        tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
        tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
    """
    # A fresh list per call: a list literal in the signature would be
    # shared between all instances.
    if cols is None:
        cols = ["node", "parent", "rank", "name"]

    self._cols = self._parse_cols(cols)
    self._sep = sep
    super().__init__(**kwargs)
CustomTx()
Parameters:
- cols [list, dict]: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
- sep [str]: Separator of fields
- **kwargs defined at
multitax.multitax.MultiTax
Example:
tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class DummyTx(MultiTax):
    # Adds no parsing or translation logic of its own; everything is
    # delegated to MultiTax.

    def __init__(self, **kwargs):
        """
        DummyTx() - Dummy empty taxonomy

        Parameters:

        * \\*\\*kwargs defined at `multitax.multitax.MultiTax`
        """
        super().__init__(**kwargs)

    def __repr__(self):
        # Shared textual representation used by the taxonomy classes
        return format_repr(inst=self)
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class GreengenesTx(MultiTax):
    """Greengenes taxonomy built from the lineage column of a taxonomy tsv file."""

    _default_version = "2024.09"
    _supported_versions = ["2022.10", "2024.09"]
    _default_urls = {
        "2024.09": "https://ftp.microbio.me/greengenes_release/2024.09/2024.09.taxonomy.id.tsv.gz",
        "2022.10": "https://ftp.microbio.me/greengenes_release/2022.10/2022.10.taxonomy.id.tsv.gz",
    }

    # (prefix, rank) by position in the lineage, from domain to species
    _rank_codes = [
        ("d__", "domain"),
        ("p__", "phylum"),
        ("c__", "class"),
        ("o__", "order"),
        ("f__", "family"),
        ("g__", "genus"),
        ("s__", "species"),
    ]

    def __init__(self, **kwargs):
        # forwards.tsv
        self._forwards = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(
        self,
        target_tax,
        representatives: bool = False,
        file: str = None,
        url: str = None,
    ):
        """Warn that no translation is implemented and return an empty dict."""
        warnings.warn(
            "Translation between taxonomies ["
            + self.__class__.__name__
            + ","
            + target_tax.__class__.__name__
            + "] not yet implemented."
        )
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Build the tree from the lineage strings found in the second column.

        Returns a tuple (nodes, ranks, names) of dicts keyed by the prefixed
        taxon (e.g. "c__Deinococci"): node -> parent, node -> rank,
        node -> name without prefix.
        """
        nodes = {}
        ranks = {}
        names = {}

        # Unique lineages in first-seen order (a dict instead of a set keeps
        # the resulting node order reproducible between runs)
        lineages = {}
        for fh in fhs.values():
            for line in fh:
                # Handles may yield bytes (e.g. compressed input)
                if not isinstance(line, str):
                    line = line.decode()
                fields = line.rstrip().split("\t")

                # skip header
                if fields[0] == "Feature ID":
                    continue

                # skip blank lines and lines without a lineage column
                if len(fields) < 2:
                    continue

                lineages[fields[1]] = None

        for lineage in lineages:
            last_taxid = None
            lin = lineage.split("; ")
            # Walk from the most specific rank up to the domain
            for i in reversed(range(len(lin))):
                # assert rank
                assert lin[i][:3] == self._rank_codes[i][0]

                name = lin[i][3:]
                if not name:
                    continue  # empty entry "s__"

                # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                taxid = lin[i]
                rank = self._rank_codes[i][1]

                if taxid not in nodes:
                    names[taxid] = name
                    ranks[taxid] = rank
                # The previously visited (more specific) taxon is a child
                # of the current one
                if last_taxid:
                    nodes[last_taxid] = taxid
                last_taxid = taxid

            # Attach the top-most named taxon to the root. A lineage made
            # only of empty entries has none, and must not add a None key.
            if last_taxid is not None:
                nodes[last_taxid] = self._default_root_node

        return nodes, ranks, names
def __init__(self, **kwargs):
    # Intentionally no docstring here, so the constructor documentation of
    # the parent class keeps being displayed for this class.
    # forwards.tsv
    self._forwards = dict()
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class GtdbTx(MultiTax):
    """GTDB taxonomy, with translation to NCBI and conversion between GTDB versions."""

    _default_version = "232"
    _supported_versions = [
        "80",
        "83",
        "86.2",
        "89",
        "95",
        "202",
        "207",
        "214.1",
        "220",
        "226",
        "232",
    ]

    _url_prefix = "https://data.gtdb.ecogenomic.org/releases/"
    _default_urls = {
        "80": [f"{_url_prefix}release80/80.0/bac_taxonomy_r80.tsv"],
        "83": [f"{_url_prefix}release83/83.0/bac_taxonomy_r83.tsv"],
        "86.2": [
            f"{_url_prefix}release86/86.2/ar122_taxonomy_r86.2.tsv",
            f"{_url_prefix}release86/86.2/bac120_taxonomy_r86.2.tsv",
        ],
        "89": [
            f"{_url_prefix}release89/89.0/ar122_taxonomy_r89.tsv",
            f"{_url_prefix}release89/89.0/bac120_taxonomy_r89.tsv",
        ],
        "95": [
            f"{_url_prefix}release95/95.0/ar122_taxonomy_r95.tsv.gz",
            f"{_url_prefix}release95/95.0/bac120_taxonomy_r95.tsv.gz",
        ],
        "202": [
            f"{_url_prefix}release202/202.0/ar122_taxonomy_r202.tsv.gz",
            f"{_url_prefix}release202/202.0/bac120_taxonomy_r202.tsv.gz",
        ],
        "207": [
            f"{_url_prefix}release207/207.0/ar53_taxonomy_r207.tsv.gz",
            f"{_url_prefix}release207/207.0/bac120_taxonomy_r207.tsv.gz",
        ],
        "214.1": [
            f"{_url_prefix}release214/214.1/ar53_taxonomy_r214.tsv.gz",
            f"{_url_prefix}release214/214.1/bac120_taxonomy_r214.tsv.gz",
        ],
        "220": [
            f"{_url_prefix}release220/220.0/ar53_taxonomy_r220.tsv.gz",
            f"{_url_prefix}release220/220.0/bac120_taxonomy_r220.tsv.gz",
        ],
        "226": [
            f"{_url_prefix}release226/226.0/ar53_taxonomy_r226.tsv.gz",
            f"{_url_prefix}release226/226.0/bac120_taxonomy_r226.tsv.gz",
        ],
        "232": [
            f"{_url_prefix}release232/232.0/ar53_taxonomy_r232.tsv.gz",
            f"{_url_prefix}release232/232.0/bac120_taxonomy_r232.tsv.gz",
        ],
    }

    # (prefix, rank) by position in the lineage, from domain to species
    _rank_codes = [
        ("d__", "domain"),
        ("p__", "phylum"),
        ("c__", "class"),
        ("o__", "order"),
        ("f__", "family"),
        ("g__", "genus"),
        ("s__", "species"),
    ]

    def __init__(self, **kwargs):
        # Conversion tables, filled by build_conversion():
        #   _convert_to:   {version: {accession: lineage}}
        #   _convert_from: {taxon of the loaded version: [accessions]}
        self._convert_to = {}
        self._convert_from = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(
        self,
        target_tax,
        representatives: bool = False,
        file: str = None,
        url: str = None,
    ):
        """
        Build {GTDB node: [NCBI nodes]} from the GTDB per-genome data.

        Only implemented for an NcbiTx target; any other target triggers a
        warning and an empty dict. With `representatives=True`, entries
        flagged "f" (not representative) are ignored.
        """
        translated_nodes: dict[str, list] = {}
        if target_tax.__class__.__name__ != "NcbiTx":
            warnings.warn(
                "Translation between taxonomies ["
                + self.__class__.__name__
                + ","
                + target_tax.__class__.__name__
                + "] not yet implemented."
            )
            return translated_nodes

        for _, rep, gtdb_lin, ncbi_txid in download_parse_data_gtdb(
            version=self.version, file=file, url=url
        ):
            # skip not representatives if requested
            if representatives and rep == "f":
                continue

            # Build NCBI and GTDB lineage from leaf based on standard ranks
            ncbi_leaf_node = target_tax.latest(ncbi_txid)
            gtdb_leaf_node = gtdb_lin.split(";")[-1]

            if (
                ncbi_leaf_node == target_tax.undefined_node
                or gtdb_leaf_node == self.undefined_node
            ):
                continue
            ncbi_nodes = target_tax.lineage(
                ncbi_leaf_node,
                ranks=target_tax._standard_ranks,
            )
            if not ncbi_nodes:
                continue
            gtdb_nodes = self.lineage(
                gtdb_leaf_node,
                ranks=self._standard_ranks,
            )

            # Match ranks
            for i, gtdb_n in enumerate(gtdb_nodes, 1):
                if gtdb_n == self.undefined_node:
                    continue

                # Get closest available node for translation in the lineage
                # up to current rank. Compared by value (not identity), as
                # in the other undefined-node checks.
                defined = [
                    node
                    for node in reversed(ncbi_nodes[:i])
                    if node != target_tax.undefined_node
                ]
                # No defined target node up to this rank: nothing to link
                if not defined:
                    continue
                translated_nodes.setdefault(gtdb_n, []).append(defined[0])

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Build the tree from two-column lines (identifier, lineage).

        Returns a tuple (nodes, ranks, names) of dicts keyed by the prefixed
        taxon (e.g. "c__Deinococci"): node -> parent, node -> rank,
        node -> name without prefix.
        """
        nodes = {}
        ranks = {}
        names = {}
        for fh in fhs.values():
            for line in fh:
                # Handles may yield bytes (e.g. compressed input)
                if not isinstance(line, str):
                    line = line.decode()
                _, lineage = line.rstrip().split("\t")
                lin = lineage.split(";")
                for i in reversed(range(len(lin))):
                    # assert rank
                    assert lin[i][:3] == self._rank_codes[i][0]
                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                    taxid = lin[i]
                    name = lin[i][3:]
                    # empty entry "s__"
                    if not name:
                        continue
                    rank = self._rank_codes[i][1]
                    if i == 0:
                        parent_taxid = self._default_root_node
                    else:
                        parent_taxid = lin[i - 1]
                    # First occurrence wins
                    if taxid not in nodes:
                        nodes[taxid] = parent_taxid
                        names[taxid] = name
                        ranks[taxid] = rank

        return nodes, ranks, names

    def _lookup_version_taxa(self, node, version: str):
        """
        Return the set of taxa assigned in `version` to the accessions
        recorded for `node`, restricted to taxa sharing the first character
        of `node` (its rank prefix).
        """
        res = set()
        for acc in self._convert_from.get(node, ""):
            for tx in self._convert_to[version].get(acc, "").split(";"):
                # Return only rank of requested node
                if tx.startswith(node[:1]):
                    res.add(tx)
        return res

    def build_conversion(
        self,
        version: str,
        representatives: bool = False,
        files: tuple[str, str] = ("", ""),
        urls: tuple[str, str] = ("", ""),
    ):
        """
        Download and build conversion table against another version.
        Optionally build conversion based only on representative genomes of current version.
        Optional function, conversion tables are automatically downloaded
        and built on first .convert() call.
        """
        if version not in self._supported_versions:
            raise ValueError(
                f"Version [{version}] not supported for conversion: {', '.join(self._supported_versions)}"
            )

        if not self._convert_from:
            # Collect the accessions of the representative entries for each taxa in the current version
            tx_accs = {}
            for acc, rep, lin, _ in download_parse_data_gtdb(
                version=self.version, file=files[0], url=urls[0]
            ):
                # Skip not representatives if requested
                if representatives and rep == "f":
                    continue

                for tx in lin.split(";"):
                    tx_accs.setdefault(tx, []).append(acc)

            # Assign only at the end, in case of download/parse errors
            self._convert_from = tx_accs

        if version not in self._convert_to:
            # Collect the lineage for each accession
            acc_lin = {}
            for acc, _, lin, _ in download_parse_data_gtdb(
                version=version, file=files[1], url=urls[1]
            ):
                acc_lin[acc] = lin
            # Assign only at the end, in case of download/parse errors
            self._convert_to[version] = acc_lin

    def convert(self, node: str, version: str) -> set[str]:
        """
        Converts a taxonomic node from the loaded tax to another version.

        It uses a genomic centric strategy: locates the GTDB species representative(s)
        of the requested `node` lineage and converts to the assigned lineage(s) of that
        genome in the requested `version`.

        It may return multiple nodes for ranks above species,
        since multiple representatives can be split into more taxa.
        It may return an empty set if node is not found in the current version
        or if related representative is no longer available in the requested version.

        Example:

            from multitax import GtdbTx
            tax = GtdbTx(version="95")

            # Species - always one-to-one
            tax.convert('s__Giesbergeria metamorpha', version="226")
            {'s__Simplicispira metamorpha'}

            # Other ranks - may be one-to-many
            tax.convert('g__UBA6715', version="226")
            {'g__Aquirufa', 'g__Sandaracinomonas'}
        """

        if version not in self._supported_versions:
            raise ValueError(
                f"Version [{version}] not supported: {', '.join(self._supported_versions)}"
            )

        # Build the tables on first use
        if not self._convert_from or version not in self._convert_to:
            self.build_conversion(version=version)

        return self._lookup_version_taxa(node, version)
def __init__(self, **kwargs):
    # Intentionally no docstring here, so the constructor documentation of
    # the parent class keeps being displayed for this class.
    # Both conversion tables start empty and are created before the parent
    # constructor runs.
    self._convert_to, self._convert_from = {}, {}
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def build_conversion(
    self,
    version: str,
    representatives: bool = False,
    files: tuple[str, str] = ("", ""),
    urls: tuple[str, str] = ("", ""),
):
    """
    Download and build the conversion tables against another version.

    Optionally restricts the table of the current version to representative
    genomes. Calling it is optional: the tables are downloaded and built
    automatically on the first .convert() call.

    `files` and `urls` are pairs: index 0 is used for the current version,
    index 1 for the requested `version`.
    Raises ValueError if `version` is not a supported version.
    """
    supported = self._supported_versions
    if version not in supported:
        raise ValueError(
            f"Version [{version}] not supported for conversion: {', '.join(supported)}"
        )

    source_file, target_file = files[0], files[1]
    source_url, target_url = urls[0], urls[1]

    # Table of the current version: taxon -> accessions (built only once)
    if not self._convert_from:
        accessions_by_taxon = {}
        records = download_parse_data_gtdb(
            version=self.version, file=source_file, url=source_url
        )
        for accession, rep, lineage, _ in records:
            # Entries flagged "f" are not representatives
            if representatives and rep == "f":
                continue
            for taxon in lineage.split(";"):
                accessions_by_taxon.setdefault(taxon, []).append(accession)

        # Stored only once complete, in case of download/parse errors
        self._convert_from = accessions_by_taxon

    # Table of the requested version: accession -> lineage (once per version)
    if version not in self._convert_to:
        lineage_by_accession = {}
        records = download_parse_data_gtdb(
            version=version, file=target_file, url=target_url
        )
        for accession, _, lineage, _ in records:
            lineage_by_accession[accession] = lineage

        # Stored only once complete, in case of download/parse errors
        self._convert_to[version] = lineage_by_accession
Download and build conversion table against another version. Optionally build conversion based only on representative genomes of current version. Optional function, conversion tables are automatically downloaded and built on first .convert() call.
def convert(self, node: str, version: str) -> set[str]:
    """
    Converts a taxonomic node from the loaded tax to another version.

    The strategy is genome centric: the GTDB species representative(s) of
    the requested `node` lineage are located and the lineage(s) assigned to
    those genomes in the requested `version` are returned.

    Ranks above species may return several nodes, since their
    representatives can be split into more taxa.
    An empty set may be returned if the node is not found in the current
    version or if the related representative is no longer available in the
    requested version.

    Raises ValueError if `version` is not a supported version.

    Example:

        from multitax import GtdbTx
        tax = GtdbTx(version="95")

        # Species - always one-to-one
        tax.convert('s__Giesbergeria metamorpha', version="226")
        {'s__Simplicispira metamorpha'}

        # Other ranks - may be one-to-many
        tax.convert('g__UBA6715', version="226")
        {'g__Aquirufa', 'g__Sandaracinomonas'}
    """
    supported = self._supported_versions
    if version not in supported:
        raise ValueError(f"Version [{version}] not supported: {', '.join(supported)}")

    # Conversion tables are created on demand
    tables_ready = bool(self._convert_from) and version in self._convert_to
    if not tables_ready:
        self.build_conversion(version=version)

    return self._lookup_version_taxa(node, version)
Converts a taxonomic node from the loaded tax to another version.
It uses a genomic centric strategy: locates the GTDB species representative(s)
of the requested node lineage and converts to the assigned lineage(s) of that
genome in the requested version.
It may return multiple nodes for ranks above species, since multiple representatives can be split into more taxa. It may return an empty set if node is not found in the current version or if related representative is no longer available in the requested version.
Example:
from multitax import GtdbTx
tax = GtdbTx(version="95")
# Species - always one-to-one
tax.convert('s__Giesbergeria metamorpha', version="226")
{'s__Simplicispira metamorpha'}
# Other ranks - may be one-to-many
tax.convert('g__UBA6715', version="226")
{'g__Aquirufa', 'g__Sandaracinomonas'}
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class NcbiTx(MultiTax):
    """NCBI taxonomy parsed from taxdump.tar.gz or from individual .dmp files."""

    _default_version = "current"
    _supported_versions = ["current"]
    _default_urls = {"current": "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"}

    def __init__(self, **kwargs):
        # _merged: old taxid -> new taxid (merged.dmp)
        # _extended_name_nodes: non-scientific name -> [nodes]
        self._merged = {}
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(
        self,
        target_tax,
        representatives: bool = False,
        file: str = None,
        url: str = None,
    ):
        """
        Build {NCBI node: [GTDB nodes]} from the GTDB per-genome data.

        Only implemented for a GtdbTx target; any other target triggers a
        warning and an empty dict. With `representatives=True`, entries
        flagged "f" (not representative) are ignored.
        """
        translated_nodes: dict[str, list] = {}
        if target_tax.__class__.__name__ != "GtdbTx":
            warnings.warn(
                "Translation between taxonomies ["
                + self.__class__.__name__
                + ","
                + target_tax.__class__.__name__
                + "] not yet implemented."
            )
            return translated_nodes

        for _, rep, gtdb_lin, ncbi_txid in download_parse_data_gtdb(
            version=target_tax.version, file=file, url=url
        ):
            # skip not representatives if requested
            if representatives and rep == "f":
                continue

            # Build NCBI and GTDB lineage from leaf based on standard ranks
            ncbi_leaf_node = self.latest(ncbi_txid)
            gtdb_leaf_node = gtdb_lin.split(";")[-1]
            if (
                ncbi_leaf_node == self.undefined_node
                or gtdb_leaf_node == target_tax.undefined_node
            ):
                continue
            gtdb_nodes = target_tax.lineage(
                gtdb_leaf_node,
                ranks=target_tax._standard_ranks,
            )
            if not gtdb_nodes:
                continue
            ncbi_nodes = self.lineage(
                ncbi_leaf_node,
                ranks=self._standard_ranks,
            )

            # Additionally add connection from NCBI leaf to GTDB species
            # since the NCBI taxid can be of a strain
            translated_nodes.setdefault(ncbi_leaf_node, []).append(gtdb_leaf_node)

            # Match ranks
            for i, ncbi_n in enumerate(ncbi_nodes, 1):
                if ncbi_n == self.undefined_node:
                    continue

                # Get closest available node for translation in the lineage
                # up to current rank. Compared by value (not identity), as
                # in the other undefined-node checks.
                defined = [
                    node
                    for node in reversed(gtdb_nodes[:i])
                    if node != target_tax.undefined_node
                ]
                # No defined target node up to this rank: nothing to link
                if not defined:
                    continue
                translated_nodes.setdefault(ncbi_n, []).append(defined[0])

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Parse either a single taxdump .tar.gz or individual files given in
        the order nodes.dmp, [names.dmp], [merged.dmp].

        Returns a tuple (nodes, ranks, names); merged entries are stored in
        `self._merged`. Requires `extended_names` in `kwargs` whenever names
        are parsed.
        """
        fhs_list = list(fhs.values())
        # One element tar.gz -> taxdump.tar.gz
        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tar.gz"):
            nodes, ranks, names, self._merged = self._parse_taxdump(
                fhs_list[0], extended_names=kwargs["extended_names"]
            )
        else:
            # nodes.dmp
            nodes, ranks = self._parse_nodes(fhs_list[0])

            # [names.dmp]
            if len(fhs) >= 2:
                names = self._parse_names(
                    fhs_list[1], extended_names=kwargs["extended_names"]
                )
            else:
                names = {}

            # [merged.dmp]
            if len(fhs) == 3:
                self._merged = self._parse_merged(fhs_list[2])
        return nodes, ranks, names

    def _parse_merged(self, fh):
        """Return {old taxid: new taxid} from the lines of merged.dmp."""
        merged = {}
        for line in fh:
            # Handles may yield bytes (e.g. members of the tar archive)
            if not isinstance(line, str):
                line = line.decode()
            old_taxid, _, new_taxid, _ = line.split("\t", 3)
            merged[old_taxid] = new_taxid
        return merged

    def _parse_names(self, fh, extended_names):
        """
        Return {node: scientific name} from the lines of names.dmp.

        With `extended_names`, names of every other class are collected in
        `self._extended_name_nodes` as {name: [nodes]}.
        """
        names = {}
        for line in fh:
            if not isinstance(line, str):
                line = line.decode()
            node, name, _, name_class = line.split("\t|\t")
            # The last field still carries the line terminator "\t|\n"
            if name_class.replace("\t|\n", "") == "scientific name":
                names[node] = name
            elif extended_names:
                self._extended_name_nodes.setdefault(name, []).append(node)

        return names

    def _parse_nodes(self, fh):
        """Return ({taxid: parent taxid}, {taxid: rank}) from the lines of nodes.dmp."""
        nodes = {}
        ranks = {}
        for line in fh:
            if not isinstance(line, str):
                line = line.decode()
            taxid, parent_taxid, rank, _ = line.split("\t|\t", 3)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
        return nodes, ranks

    def _parse_taxdump(self, fh_taxdump, extended_names):
        """Parse nodes.dmp, names.dmp and merged.dmp out of an opened taxdump archive."""
        with fh_taxdump.extractfile("nodes.dmp") as fh_nodes:
            nodes, ranks = self._parse_nodes(fh_nodes)
        with fh_taxdump.extractfile("names.dmp") as fh_names:
            names = self._parse_names(fh_names, extended_names=extended_names)
        with fh_taxdump.extractfile("merged.dmp") as fh_merged:
            merged = self._parse_merged(fh_merged)
        return nodes, ranks, names, merged

    def latest(self, node: str):
        # Falls back to the merged.dmp entry when the parent class reports
        # the node as undefined.
        n = super().latest(node)
        if n == self.undefined_node:
            n = self.merged(node)
        return n

    def merged(self, node: str):
        """
        Returns relative entry from the merged.dmp file of a given node.
        """
        return self._merged.get(node, self.undefined_node)

    def search_name(
        self,
        text: str,
        rank: str = None,
        exact: bool = True,
        force_extended: bool = False,
    ):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search names defined as "scientific name" on names.dmp

        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        n = super().search_name(text, rank=rank, exact=exact)
        if n and not force_extended:
            return n

        if exact:
            ret = self._exact_name(text, self._extended_name_nodes)
        else:
            ret = self._partial_name(text, self._extended_name_nodes)

        # Only return nodes of chosen rank
        if rank:
            ret = filter_function(ret, self.rank, rank)

        # Merge both searches without duplicates
        return list(set(n + ret))

    def stats(self, **kwargs):
        # Extends the parent statistics with the sizes of the NCBI-specific
        # tables, when they are not empty.
        s = super().stats(**kwargs)
        if self._merged:
            s["merged"] = len(self._merged)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
def __init__(self, **kwargs):
    # Intentionally no docstring here, so the constructor documentation of
    # the parent class keeps being displayed for this class.
    # Both tables start empty and are created before the parent constructor
    # runs.
    self._merged, self._extended_name_nodes = {}, {}
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def latest(self, node: str):
    # Ask the parent class first; when it reports the node as undefined,
    # fall back to the merged.dmp lookup.
    current = super().latest(node)
    return self.merged(node) if current == self.undefined_node else current
Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv)
def merged(self, node: str):
    """
    Returns the merged.dmp entry recorded for a given node, or the
    undefined node value when there is none.
    """
    return self._merged.get(node, self.undefined_node)
Returns relative entry from the merged.dmp file of a given node.
def search_name(
    self,
    text: str,
    rank: str = None,
    exact: bool = True,
    force_extended: bool = False,
):
    """
    Search node by exact or partial name.

    Default order (can be skipped with **force_extended=True**):

    1) Search names defined as "scientific name" on names.dmp

    2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))

    Parameters:
    * **text** *[str]*: Text to search.
    * **rank** *[str]*: Filter results by rank.
    * **exact** *[bool]*: Exact or partial name search (both case sensitive).
    * **force_extended** *[bool]*: Search for text in all categories at once.

    Returns: list of matching nodes
    """
    primary = super().search_name(text, rank=rank, exact=exact)
    if primary and not force_extended:
        return primary

    # Second pass over the extended names
    lookup = self._exact_name if exact else self._partial_name
    extended = lookup(text, self._extended_name_nodes)

    # Only return nodes of chosen rank
    if rank:
        extended = filter_function(extended, self.rank, rank)

    # Merge both searches without duplicates
    return list(set(primary + extended))
Search node by exact or partial name.
Default order (can be skipped with force_extended=True):
1) Search names defined as "scientific name" on names.dmp
2) If nothing was found, search text in all other categories (must be activated with NcbiTx(extended_names=True))
Parameters:
- text [str]: Text to search.
- rank [str]: Filter results by rank.
- exact [bool]: Exact or partial name search (both case sensitive).
- force_extended [bool]: Search for text in all categories at once.
Returns: list of matching nodes
def stats(self, **kwargs):
    # Extends the parent statistics with the sizes of the NCBI-specific
    # tables; empty tables are left out.
    summary = super().stats(**kwargs)
    optional_tables = (
        ("merged", self._merged),
        ("extended_names", self._extended_name_nodes),
    )
    for key, table in optional_tables:
        if table:
            summary[key] = len(table)
    return summary
Returns a dict with general numbers of the taxonomic tree
Example:
from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()
pprint(tax.stats())
{'leaves': 30238,
'names': 42739,
'nodes': 42739,
'ranked_leaves': Counter({'species': 30238}),
'ranked_nodes': Counter({'species': 30238,
'genus': 8778,
'family': 2323,
'order': 930,
'class': 337,
'phylum': 131,
'domain': 1,
'root': 1}),
'ranks': 42739}
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- translate
- write
class OttTx(MultiTax):
    _default_version = "3.7.3"
    _supported_versions = ["3.6", "3.7.3"]
    _default_urls = {
        "3.6": "https://files.opentreeoflife.org/ott/ott3.6/ott3.6.tgz",
        "3.7.3": "https://files.opentreeoflife.org/ott/ott3.7.3/ott3.7.3.tgz",
    }
    _default_root_node = "805080"

    def __init__(self, **kwargs):
        # old taxid -> replacement taxid, filled from forwards.tsv
        self._forwards = {}
        # synonym name -> list of taxids, filled from synonyms.tsv
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(
        self,
        target_tax,
        representatives: bool = False,
        file: str = None,
        url: str = None,
    ):
        """
        Placeholder: no translation from OTT is implemented.

        Emits a warning naming both taxonomy classes and returns an empty dict.
        """
        warnings.warn(
            "Translation between taxonomies ["
            + self.__class__.__name__
            + ","
            + target_tax.__class__.__name__
            + "] not yet implemented."
        )
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the given file handles into (nodes, ranks, names).

        **fhs** maps a source name to an opened handle. Either a single
        ".tgz" archive is given, or up to three handles in the order
        taxonomy.tsv, [forwards.tsv], [synonyms.tsv]. Synonyms are only
        parsed when **extended_names** is passed as a true value.
        """
        # A missing option simply means "no extended names"
        extended_names = kwargs.get("extended_names", False)
        fhs_list = list(fhs.values())

        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tgz"):
            return self._parse_ott(fhs_list[0], extended_names=extended_names)

        # taxonomy.tsv
        nodes, ranks, names = self._parse_taxonomy(fhs_list[0])
        # [forwards.tsv]
        if len(fhs_list) >= 2:
            self._forwards = self._parse_forwards(fhs_list[1])
        # [synonyms.tsv]
        if len(fhs_list) == 3 and extended_names:
            self._extended_name_nodes = self._parse_synonyms(fhs_list[2])

        return nodes, ranks, names

    def _parse_forwards(self, fh):
        """
        Parse forwards.tsv (one header line, then "old<TAB>new" per line)
        into a dict {old_taxid: new_taxid}.
        """
        forwards = {}
        # skip first line header (an empty file yields an empty dict)
        next(fh, None)
        for line in fh:
            # handles opened in binary mode yield bytes
            if isinstance(line, bytes):
                line = line.decode()
            old_taxid, new_taxid = line.rstrip().split("\t")
            forwards[old_taxid] = new_taxid
        return forwards

    def _parse_ott(self, fh_taxdump, extended_names):
        """
        Parse an opened OTT archive (an object offering getnames() and
        extractfile(), presumably a tarfile.TarFile).

        taxonomy.tsv is required; forwards.tsv and synonyms.tsv are used
        when present, the latter only if **extended_names** is true.

        Raises ValueError if the archive contains no taxonomy.tsv.
        """
        # Get files inside folder by name (last match wins)
        members = {"taxonomy.tsv": None, "forwards.tsv": None, "synonyms.tsv": None}
        for entry in fh_taxdump.getnames():
            for suffix in members:
                if entry.endswith(suffix):
                    members[suffix] = entry

        if members["taxonomy.tsv"] is None:
            raise ValueError("taxonomy.tsv not found in the OTT archive")

        with fh_taxdump.extractfile(members["taxonomy.tsv"]) as fh_nodes:
            nodes, ranks, names = self._parse_taxonomy(fh_nodes)

        # Optional files, mirroring the behaviour for separately given files
        if members["forwards.tsv"] is not None:
            with fh_taxdump.extractfile(members["forwards.tsv"]) as fh_forwards:
                self._forwards = self._parse_forwards(fh_forwards)
        if extended_names and members["synonyms.tsv"] is not None:
            with fh_taxdump.extractfile(members["synonyms.tsv"]) as fh_synonyms:
                self._extended_name_nodes = self._parse_synonyms(fh_synonyms)

        return nodes, ranks, names

    def _parse_synonyms(self, fh):
        """
        Parse synonyms.tsv (one header line, fields separated by "\\t|\\t",
        first field name, second field taxid) into {name: [taxid, ...]}.
        """
        synonyms = {}
        # skip first line header (an empty file yields an empty dict)
        next(fh, None)
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            name, taxid, _ = line.split("\t|\t", 2)
            synonyms.setdefault(name, []).append(taxid)
        return synonyms

    def _parse_taxonomy(self, fh):
        """
        Parse taxonomy.tsv (one header line, fields separated by "\\t|\\t" in
        the order taxid, parent taxid, name, rank, ...).

        Returns three dicts keyed by taxid: parent, rank and name.
        """
        nodes = {}
        ranks = {}
        names = {}
        # skip first line header (an empty file yields empty dicts)
        next(fh, None)
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            taxid, parent_taxid, name, rank, _ = line.split("\t|\t", 4)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
            names[taxid] = name
        return nodes, ranks, names

    def forwards(self, node: str):
        """
        Returns the forwards.tsv entry of a given node (the node it was
        forwarded to), or the undefined node if it has no entry.
        """
        return self._forwards.get(node, self.undefined_node)

    def latest(self, node: str):
        n = super().latest(node)
        # Not resolved by the base class: try the forwards.tsv table
        if n == self.undefined_node:
            n = self.forwards(node)
        return n

    def search_name(
        self,
        text: str,
        rank: str = None,
        exact: bool = True,
        force_extended: bool = False,
    ):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search default names defined on "taxonomy.tsv"

        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes without duplicates (matches on default names first, followed by matches on synonyms)
        """
        found = super().search_name(text, rank=rank, exact=exact)
        if found and not force_extended:
            return found

        lookup = self._exact_name if exact else self._partial_name
        extended = lookup(text, self._extended_name_nodes)

        # Only return nodes of chosen rank
        if rank:
            extended = filter_function(extended, self.rank, rank)

        # dict.fromkeys() removes duplicates like set() would, but keeps the
        # order of first appearance, so the result is the same on every run
        return list(dict.fromkeys(found + extended))

    def stats(self, **kwargs):
        s = super().stats(**kwargs)
        # Extra counters are only reported when the tables are not empty
        if self._forwards:
            s["forwards"] = len(self._forwards)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
def __init__(self, **kwargs):
    # No docstring here, so the documentation of the MultiTax constructor
    # keeps being shown for this method.
    # Both lookup tables start out empty and are set before the base
    # constructor runs (presumably parsing is triggered from there and
    # fills them -- confirm in MultiTax).
    self._extended_name_nodes = {}  # names from synonyms.tsv
    self._forwards = {}  # entries from forwards.tsv
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def forwards(self, node: str):
    """
    Returns the forwards.tsv entry of a given node (the node it was
    forwarded to), or the undefined node if it has no entry.
    """
    # Single dictionary lookup with the undefined node as fallback
    return self._forwards.get(node, self.undefined_node)
Returns the forwards.tsv entry of a given node (the node it was forwarded to), or the undefined node if it has no entry.
def latest(self, node: str):
    # No docstring here, so the documentation of the base-class latest()
    # keeps being shown for this method.
    resolved = super().latest(node)
    # Only when the base class yields the undefined node is the
    # forwards.tsv table consulted.
    return self.forwards(node) if resolved == self.undefined_node else resolved
Returns the latest/updated version of a given node. If the node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv).
def search_name(
    self,
    text: str,
    rank: str = None,
    exact: bool = True,
    force_extended: bool = False,
):
    """
    Search node by exact or partial name.

    Default order (can be skipped with **force_extended=True**):

    1) Search default names defined on "taxonomy.tsv"

    2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))

    Parameters:
    * **text** *[str]*: Text to search.
    * **rank** *[str]*: Filter results by rank.
    * **exact** *[bool]*: Exact or partial name search (both case sensitive).
    * **force_extended** *[bool]*: Search for text in all categories at once.

    Returns: list of matching nodes without duplicates (matches on default names first, followed by matches on synonyms)
    """
    found = super().search_name(text, rank=rank, exact=exact)
    if found and not force_extended:
        return found

    # Fall back to (or, with force_extended, add) the synonyms
    lookup = self._exact_name if exact else self._partial_name
    extended = lookup(text, self._extended_name_nodes)

    # Only return nodes of chosen rank
    if rank:
        extended = filter_function(extended, self.rank, rank)

    # dict.fromkeys() removes duplicates like set() would, but keeps the
    # order of first appearance, so the result is the same on every run
    return list(dict.fromkeys(found + extended))
Search node by exact or partial name.
Default order (can be skipped with force_extended=True):
1) Search default names defined on "taxonomy.tsv"
2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(extended_names=True))
Parameters:
- text [str]: Text to search.
- rank [str]: Filter results by rank.
- exact [bool]: Exact or partial name search (both case sensitive).
- force_extended [bool]: Search for text in all categories at once.
Returns: list of matching nodes
def stats(self, **kwargs):
    # No docstring here, so the documentation of the base-class stats()
    # keeps being shown for this method.
    summary = super().stats(**kwargs)
    # Extra counters are only reported when the matching table is not empty
    optional_tables = (
        ("forwards", self._forwards),
        ("extended_names", self._extended_name_nodes),
    )
    for key, table in optional_tables:
        if table:
            summary[key] = len(table)
    return summary
Returns a dict with general numbers of the taxonomic tree
Example:
from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()
pprint(tax.stats())
{'leaves': 30238,
'names': 42739,
'nodes': 42739,
'ranked_leaves': Counter({'species': 30238}),
'ranked_nodes': Counter({'species': 30238,
'genus': 8778,
'family': 2323,
'order': 930,
'class': 337,
'phylum': 131,
'domain': 1,
'root': 1}),
'ranks': 42739}
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- translate
- write
class SilvaTx(MultiTax):
    _default_version = "ssu_138.2"
    _supported_versions = ["lsu_138.2", "ssu_138.2"]
    _default_urls = {
        "ssu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.2.txt.gz",
        "lsu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.2.txt.gz",
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(
        self,
        target_tax,
        representatives: bool = False,
        file: str = None,
        url: str = None,
    ):
        """
        Placeholder: no translation from SILVA is implemented.

        Emits a warning naming both taxonomy classes and returns an empty dict.
        """
        warnings.warn(
            "Translation between taxonomies ["
            + self.__class__.__name__
            + ","
            + target_tax.__class__.__name__
            + "] not yet implemented."
        )
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the given file handles into (nodes, ranks, names).

        Every line holds tab-separated fields: a name lineage whose elements
        are separated (and terminated) by ";", the taxid, the rank and at
        least one further field, which is ignored. The tree is rebuilt from
        the lineages: the parent of a node is the node whose lineage is the
        same with the last element removed; single-element lineages are
        attached to the default root node.

        Raises KeyError if the parent lineage of an entry is not in the input.
        """
        ranks = {}
        names = {}

        # full name lineage (without trailing ";") -> taxid
        lin = {}
        for fh in fhs.values():
            for line in fh:
                # handles opened in binary mode yield bytes
                if isinstance(line, bytes):
                    line = line.decode()
                name_lineage, taxid, rank, _ = line.split("\t", 3)
                # Remove last char ";"
                lineage = name_lineage[:-1]
                # Save lineage to build tree
                lin[lineage] = taxid
                names[taxid] = lineage.split(";")[-1]
                ranks[taxid] = rank

        # Build parent node connection. Every ancestor lineage is itself a
        # key of lin, so linking each entry to its direct parent once covers
        # the whole tree without walking up to the root again and again.
        nodes = {}
        for lineage, taxid in lin.items():
            parent_lineage, has_parent, _ = lineage.rpartition(";")
            if has_parent:
                parent_taxid = lin[parent_lineage]
            else:
                # Connect top-level node to root
                parent_taxid = self._default_root_node
            # keep the first parent seen, should a taxid ever be repeated
            nodes.setdefault(taxid, parent_taxid)

        return nodes, ranks, names
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write