multitax
1import importlib.metadata 2 3__version__ = importlib.metadata.version(__name__) 4 5__all__ = ( 6 "CustomTx", 7 "DummyTx", 8 "GreengenesTx", 9 "GtdbTx", 10 "NcbiTx", 11 "OttTx", 12 "SilvaTx", 13) 14 15from .customtx import CustomTx 16from .dummytx import DummyTx 17from .greengenestx import GreengenesTx 18from .gtdbtx import GtdbTx 19from .ncbitx import NcbiTx 20from .otttx import OttTx 21from .silvatx import SilvaTx
class CustomTx(MultiTax):
    """
    Taxonomy read from a user-supplied delimited text file.

    Each line provides at least a node and its parent; rank and name columns
    are optional. Column positions are configured through `cols`.
    """

    _required_cols = ["node", "parent"]
    _possible_cols = ["node", "parent", "rank", "name"]

    def __init__(self, cols: list = None, sep: str = "\t", **kwargs):
        """
        CustomTx()

        Parameters:
        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name". Default: ["node", "parent", "rank", "name"]
        * **sep** *[str]*: Separator of fields
        * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`

        Example:

            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
        """
        # A fresh list per call: a list literal used as the default value would
        # be one object shared by every call of the constructor.
        if cols is None:
            cols = ["node", "parent", "rank", "name"]

        # Both attributes are set before delegating to the base constructor.
        # NOTE(review): presumably the base constructor triggers _parse(),
        # which reads them - confirm against MultiTax.__init__.
        self._cols = self._parse_cols(cols)
        self._sep = sep
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """
        Translation from a custom taxonomy is not implemented.

        Emits a warning naming both taxonomy classes and returns an empty dict.
        """
        warnings.warn(
            "Translation between taxonomies ["
            + self.__class__.__name__
            + ","
            + target_tax.__class__.__name__
            + "] not yet implemented."
        )
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse every open file handle in `fhs` (iterated with `.items()`).

        Lines may be `str` or `bytes`; bytes are decoded first. Blank lines
        are skipped. Returns three dicts keyed by node: (nodes -> parent,
        ranks, names). `ranks` / `names` stay empty when the respective
        column is not configured.
        """
        nodes = {}
        ranks = {}
        names = {}
        has_name = "name" in self._cols
        has_rank = "rank" in self._cols
        for source, fh in fhs.items():
            for line in fh:
                if isinstance(line, bytes):
                    line = line.decode()
                line = line.rstrip()
                # A blank (e.g. trailing) line has no fields to index;
                # skip it instead of failing with IndexError.
                if not line:
                    continue
                fields = line.split(self._sep)

                node = fields[self._cols["node"]]
                nodes[node] = fields[self._cols["parent"]]
                if has_name:
                    names[node] = fields[self._cols["name"]]
                if has_rank:
                    ranks[node] = fields[self._cols["rank"]]

        return nodes, ranks, names

    def _parse_cols(self, cols):
        """
        Normalise `cols` into a {field: column index} dict and validate it.

        A list or tuple is converted using the position of each field; any
        other value is validated and returned unchanged.

        Raises:
            ValueError: if a required field is missing or an unknown field
            is given.
        """
        # Tuples are accepted as well: without the conversion they would pass
        # the checks below but fail later when indexed by field name.
        if isinstance(cols, (list, tuple)):
            cols = {c: i for i, c in enumerate(cols)}

        for rc in self._required_cols:
            if rc not in cols:
                raise ValueError(rc + " is a required column")

        for c in cols:
            if c not in self._possible_cols:
                raise ValueError(
                    c + " is not a valid column: " + ",".join(self._possible_cols)
                )

        return cols
def __init__(self, cols: list = None, sep: str = "\t", **kwargs):
    """
    CustomTx()

    Parameters:
    * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name". Default: ["node", "parent", "rank", "name"]
    * **sep** *[str]*: Separator of fields
    * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`

    Example:

        tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
        tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
    """
    # A fresh list per call: a list literal used as the default value would
    # be one object shared by every call of the constructor.
    if cols is None:
        cols = ["node", "parent", "rank", "name"]

    # Set before delegating to the base constructor.
    # NOTE(review): presumably the base constructor triggers _parse(), which
    # reads both attributes - confirm against MultiTax.__init__.
    self._cols = self._parse_cols(cols)
    self._sep = sep
    super().__init__(**kwargs)
CustomTx()
Parameters:
- cols [list, dict]: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
- sep [str]: Separator of fields
- **kwargs defined at
multitax.multitax.MultiTax
Example:
tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class DummyTx(MultiTax):
    """Taxonomy class that adds no parsing or translation logic of its own."""

    def __init__(self, **kwargs):
        """
        DummyTx() - Dummy empty taxonomy

        Parameters:

        * \\*\\*kwargs defined at `multitax.multitax.MultiTax`
        """
        # All keyword arguments are forwarded unchanged to the base constructor.
        super().__init__(**kwargs)

    def __repr__(self):
        # Representation is delegated to the shared format_repr helper.
        return format_repr(inst=self)
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class GreengenesTx(MultiTax):
    """
    Greengenes taxonomy built from a tab-separated "feature id / lineage" table.

    Nodes are identified by their rank-prefixed label (e.g. "c__Deinococci").
    """

    _default_version = "2024.09"
    _supported_versions = ["2022.10", "2024.09"]
    _default_urls = {
        "2024.09": "https://ftp.microbio.me/greengenes_release/2024.09/2024.09.taxonomy.id.tsv.gz",
        "2022.10": "https://ftp.microbio.me/greengenes_release/2022.10/2022.10.taxonomy.id.tsv.gz",
    }

    # (prefix, rank name) for each lineage position, from highest to lowest rank.
    _rank_codes = [
        ("d__", "domain"),
        ("p__", "phylum"),
        ("c__", "class"),
        ("o__", "order"),
        ("f__", "family"),
        ("g__", "genus"),
        ("s__", "species"),
    ]

    def __init__(self, **kwargs):
        # forwards.tsv
        # NOTE(review): _forwards is not read anywhere in this class;
        # presumably consumed by the base class - verify.
        self._forwards = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """
        Translation from Greengenes is not implemented.

        Emits a warning naming both taxonomy classes and returns an empty dict.
        """
        warnings.warn(
            "Translation between taxonomies ["
            + self.__class__.__name__
            + ","
            + target_tax.__class__.__name__
            + "] not yet implemented."
        )
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse lineage tables into (nodes -> parent, ranks, names) dicts.

        The second tab-separated column of every non-header line is taken as
        a "; "-separated lineage. Each distinct lineage is walked from its
        lowest to its highest rank, linking every named entry to the next
        named entry above it; the top-most named entry is linked to
        `self._default_root_node`. Entries with an empty name (e.g. "s__")
        are ignored.
        """
        nodes = {}
        ranks = {}
        names = {}

        # Collect distinct lineages first so each one is processed only once.
        lineages = set()
        for source, fh in fhs.items():
            for line in fh:
                if isinstance(line, bytes):
                    line = line.decode()
                fields = line.rstrip().split("\t")

                # skip header
                if fields[0] == "Feature ID":
                    continue

                lineages.add(fields[1])

        for lineage in lineages:
            child = None
            entries = lineage.split("; ")
            for i in reversed(range(len(entries))):
                entry = entries[i]
                prefix, rank = self._rank_codes[i]
                # Position in the lineage must match the expected rank prefix.
                assert entry[:3] == prefix, (
                    f"Unexpected rank prefix in [{entry}], expected [{prefix}]"
                )

                name = entry[3:]
                if not name:
                    continue  # empty entry "s__"

                # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                if entry not in nodes:
                    names[entry] = name
                    ranks[entry] = rank
                if child:
                    nodes[child] = entry
                child = entry

            # A lineage without any named entry leaves `child` unset; linking
            # it would create a bogus node keyed by None.
            if child is not None:
                nodes[child] = self._default_root_node

        return nodes, ranks, names
def __init__(self, **kwargs):
    # forwards.tsv
    # NOTE(review): no method of GreengenesTx shown here reads _forwards;
    # presumably it is consumed by the base class - verify.
    self._forwards = {}
    # Instance state is created before delegating; all keyword arguments are
    # forwarded unchanged to the MultiTax constructor.
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class GtdbTx(MultiTax):
    """
    GTDB taxonomy built from the lineage tables of a GTDB release.

    Nodes are identified by their rank-prefixed label (e.g. "c__Deinococci").
    Besides parsing, the class can translate nodes to NCBI and convert nodes
    between GTDB versions (see `convert`).
    """

    _default_version = "226"
    _supported_versions = [
        "80",
        "83",
        "86.2",
        "89",
        "95",
        "202",
        "207",
        "214.1",
        "220",
        "226",
    ]

    _url_prefix = "https://data.gtdb.ecogenomic.org/releases/"
    _default_urls = {
        "80": [f"{_url_prefix}release80/80.0/bac_taxonomy_r80.tsv"],
        "83": [f"{_url_prefix}release83/83.0/bac_taxonomy_r83.tsv"],
        "86.2": [
            f"{_url_prefix}release86/86.2/ar122_taxonomy_r86.2.tsv",
            f"{_url_prefix}release86/86.2/bac120_taxonomy_r86.2.tsv",
        ],
        "89": [
            f"{_url_prefix}release89/89.0/ar122_taxonomy_r89.tsv",
            f"{_url_prefix}release89/89.0/bac120_taxonomy_r89.tsv",
        ],
        "95": [
            f"{_url_prefix}release95/95.0/ar122_taxonomy_r95.tsv.gz",
            f"{_url_prefix}release95/95.0/bac120_taxonomy_r95.tsv.gz",
        ],
        "202": [
            f"{_url_prefix}release202/202.0/ar122_taxonomy_r202.tsv.gz",
            f"{_url_prefix}release202/202.0/bac120_taxonomy_r202.tsv.gz",
        ],
        "207": [
            f"{_url_prefix}release207/207.0/ar53_taxonomy_r207.tsv.gz",
            f"{_url_prefix}release207/207.0/bac120_taxonomy_r207.tsv.gz",
        ],
        "214.1": [
            f"{_url_prefix}release214/214.1/ar53_taxonomy_r214.tsv.gz",
            f"{_url_prefix}release214/214.1/bac120_taxonomy_r214.tsv.gz",
        ],
        "220": [
            f"{_url_prefix}release220/220.0/ar53_taxonomy_r220.tsv.gz",
            f"{_url_prefix}release220/220.0/bac120_taxonomy_r220.tsv.gz",
        ],
        "226": [
            f"{_url_prefix}release226/226.0/ar53_taxonomy_r226.tsv.gz",
            f"{_url_prefix}release226/226.0/bac120_taxonomy_r226.tsv.gz",
        ],
    }

    # (prefix, rank name) for each lineage position, from highest to lowest rank.
    _rank_codes = [
        ("d__", "domain"),
        ("p__", "phylum"),
        ("c__", "class"),
        ("o__", "order"),
        ("f__", "family"),
        ("g__", "genus"),
        ("s__", "species"),
    ]

    def __init__(self, **kwargs):
        # Conversion tables filled by build_conversion():
        #   _convert_to:   {version: {accession: lineage string}}
        #   _convert_from: {taxon: [accessions of representative entries]}
        # Created before delegating to the base constructor.
        self._convert_to = {}
        self._convert_from = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """
        Build a {GTDB node: set of NCBI nodes} translation table.

        Only `NcbiTx` targets are supported; for any other target a warning
        is emitted and an empty dict is returned. The table is read from
        `file` if given, otherwise downloaded from `url` (defaulting to the
        multitax repository file for `self.version`). Lineages of both
        taxonomies are resolved for the seven ranks in `_rank_codes` and
        matched position by position.
        """
        translated_nodes = {}
        if target_tax.__class__.__name__ != "NcbiTx":
            warnings.warn(
                "Translation between taxonomies ["
                + self.__class__.__name__
                + ","
                + target_tax.__class__.__name__
                + "] not yet implemented."
            )
            return translated_nodes

        if file:
            fhs = open_files([file])
        else:
            if not url:
                url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{self.version}_acc_rep_lin_ncbi.tsv.gz"
            fhs = download_files(urls=[url], retry_attempts=3)

        accession_col = 0
        gtdb_taxonomy_col = 2
        ncbi_taxid_col = 3
        # domain ... species, shared by both lineage lookups
        rank_names = [rank for _, rank in self._rank_codes]

        # try/finally so the handles are released even if a line fails to parse.
        try:
            for source, fh in fhs.items():
                for line in fh:
                    if isinstance(line, bytes):
                        line = line.decode()
                    fields = line.rstrip().split("\t")

                    # skip header
                    if fields[accession_col] == "accession":
                        continue

                    ncbi_leaf_node = target_tax.latest(fields[ncbi_taxid_col])
                    if ncbi_leaf_node == target_tax.undefined_node:
                        continue
                    ncbi_nodes = target_tax.lineage(ncbi_leaf_node, ranks=rank_names)

                    # Build GTDB lineage from leaf (species on given lineage)
                    # to accommodate possible changes in the loaded tax
                    gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1]
                    if gtdb_leaf_node == self.undefined_node:
                        continue
                    gtdb_nodes = self.lineage(gtdb_leaf_node, ranks=rank_names)

                    # Match ranks
                    for i, gtdb_n in enumerate(gtdb_nodes):
                        if (
                            ncbi_nodes[i] != target_tax.undefined_node
                            and gtdb_n != self.undefined_node
                        ):
                            translated_nodes.setdefault(gtdb_n, set()).add(
                                ncbi_nodes[i]
                            )
        finally:
            close_files(fhs)

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Parse GTDB taxonomy tables into (nodes -> parent, ranks, names) dicts.

        Every line must hold exactly two tab-separated columns; the second is
        a ";"-separated lineage. Entries with an empty name (e.g. "s__") are
        skipped, the first lineage position is attached to
        `self._default_root_node`, and the first parent seen for a node wins.
        """
        nodes = {}
        ranks = {}
        names = {}
        for source, fh in fhs.items():
            for line in fh:
                if isinstance(line, bytes):
                    line = line.decode()
                _, lineage = line.rstrip().split("\t")
                lin = lineage.split(";")
                for i in reversed(range(len(lin))):
                    taxid = lin[i]
                    prefix, rank = self._rank_codes[i]
                    # Position in the lineage must match the expected rank prefix.
                    assert taxid[:3] == prefix, (
                        f"Unexpected rank prefix in [{taxid}], expected [{prefix}]"
                    )
                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                    name = taxid[3:]
                    # empty entry "s__"
                    if not name:
                        continue
                    parent_taxid = self._default_root_node if i == 0 else lin[i - 1]
                    if taxid not in nodes:
                        nodes[taxid] = parent_taxid
                        names[taxid] = name
                        ranks[taxid] = rank

        return nodes, ranks, names

    def _lookup_version_taxa(self, node, version: str):
        """
        Return the set of taxa in `version` reached from `node`.

        For every accession recorded for `node` in `_convert_from`, the
        lineage stored in `_convert_to[version]` is scanned and only entries
        starting with the same first character as `node` are kept.
        """
        res = set()
        for acc in self._convert_from.get(node, ""):
            for tx in self._convert_to[version].get(acc, "").split(";"):
                # Return only rank of requested node
                if tx.startswith(node[:1]):
                    res.add(tx)
        return res

    def _download_parse_version_taxa(self, version, file, url):
        """
        Yield the tab-separated fields of every line of a conversion table.

        The table is read from `file` if given, otherwise downloaded from
        `url` (defaulting to the multitax repository file for `version`).
        The file handles are closed once the generator finishes or is
        discarded.
        """
        if file:
            fhs = open_files(files=[file])
        else:
            if not url:
                url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{version}_acc_rep_lin_ncbi.tsv.gz"
            fhs = download_files(urls=[url], retry_attempts=3)

        # finally also runs when the consumer stops early or raises, so the
        # handles are never left open.
        try:
            for fh in fhs.values():
                for line in fh:
                    if isinstance(line, bytes):
                        line = line.decode()
                    yield line.rstrip().split("\t")
        finally:
            close_files(fhs)

    def build_conversion(
        self,
        version: str,
        files: tuple[str, str] = ("", ""),
        urls: tuple[str, str] = ("", ""),
    ):
        """
        Download and build conversion table against another version.
        Optional function, conversion tables are automatically downloaded
        and built on first .convert() call.

        Parameters:
        * **version** *[str]*: Target version, must be one of the supported versions.
        * **files** *[tuple]*: Local conversion files for (current version, target version).
        * **urls** *[tuple]*: Urls of conversion files for (current version, target version).
        """
        if version not in self._supported_versions:
            raise ValueError(
                f"Version [{version}] not supported for conversion: {', '.join(self._supported_versions)}"
            )

        if not self._convert_from:
            # Collect the accessions of the representative entries for each taxa in the current version
            tx_accs = {}
            for acc, rep, lin, _ in self._download_parse_version_taxa(
                version=self.version, file=files[0], url=urls[0]
            ):
                # skip header (same check as in _build_translation)
                if acc == "accession":
                    continue
                if rep == "t":
                    for tx in lin.split(";"):
                        tx_accs.setdefault(tx, []).append(acc)
            # Assign only at the end, in case of download/parse errors
            self._convert_from = tx_accs

        if version not in self._convert_to:
            # Collect the lineage for each accession
            acc_lin = {}
            for acc, _, lin, _ in self._download_parse_version_taxa(
                version=version, file=files[1], url=urls[1]
            ):
                # skip header, otherwise it is stored as a bogus accession
                if acc == "accession":
                    continue
                acc_lin[acc] = lin
            # Assign only at the end, in case of download/parse errors
            self._convert_to[version] = acc_lin

    def convert(self, node: str, version: str) -> set[str]:
        """
        Converts a taxonomic node from current version to another.
        It uses a genomic centric strategy, based on the taxa of the representative
        genome among versions.
        It may return multiple nodes for ranks above species,
        since multiple representatives can be split into more taxa.
        It may return an empty set if node is not found in the current version
        or if related representative is no longer available in the requested version.

        Example:

            from multitax import GtdbTx
            tax = GtdbTx(version="95")

            # Species - always one-to-one
            tax.convert('s__Giesbergeria metamorpha', version="226")
            {'s__Simplicispira metamorpha'}

            # Other ranks - may be one-to-many
            tax.convert('g__UBA6715', version="226")
            {'g__Aquirufa', 'g__Sandaracinomonas'}
        """

        if version not in self._supported_versions:
            raise ValueError(
                f"Version [{version}] not supported: {', '.join(self._supported_versions)}"
            )

        # Tables are built lazily on first use.
        if not self._convert_from or version not in self._convert_to:
            self.build_conversion(version=version)

        return self._lookup_version_taxa(node, version)
def __init__(self, **kwargs):
    # Conversion tables, filled by build_conversion():
    #   _convert_to:   {version: {accession: lineage string}}
    #   _convert_from: {taxon: [accessions of representative entries]}
    self._convert_to = {}
    self._convert_from = {}
    # Instance state is created before delegating; all keyword arguments are
    # forwarded unchanged to the MultiTax constructor.
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def build_conversion(
    self,
    version: str,
    files: tuple[str, str] = ("", ""),
    urls: tuple[str, str] = ("", ""),
):
    """
    Download and build conversion table against another version.
    Optional function, conversion tables are automatically downloaded
    and built on first .convert() call.

    Parameters:
    * **version** *[str]*: Target version, must be one of the supported versions.
    * **files** *[tuple]*: Local conversion files for (current version, target version).
    * **urls** *[tuple]*: Urls of conversion files for (current version, target version).
    """
    if version not in self._supported_versions:
        raise ValueError(
            f"Version [{version}] not supported for conversion: {', '.join(self._supported_versions)}"
        )

    if not self._convert_from:
        # Collect the accessions of the representative entries for each taxa in the current version
        tx_accs = {}
        for acc, rep, lin, _ in self._download_parse_version_taxa(
            version=self.version, file=files[0], url=urls[0]
        ):
            # skip header (same check as in _build_translation)
            if acc == "accession":
                continue
            if rep == "t":
                for tx in lin.split(";"):
                    tx_accs.setdefault(tx, []).append(acc)
        # Assign only at the end, in case of download/parse errors
        self._convert_from = tx_accs

    if version not in self._convert_to:
        # Collect the lineage for each accession
        acc_lin = {}
        for acc, _, lin, _ in self._download_parse_version_taxa(
            version=version, file=files[1], url=urls[1]
        ):
            # skip header, otherwise it is stored as a bogus accession
            if acc == "accession":
                continue
            acc_lin[acc] = lin
        # Assign only at the end, in case of download/parse errors
        self._convert_to[version] = acc_lin
Download and build conversion table against another version. Optional function, conversion tables are automatically downloaded and built on first .convert() call.
def convert(self, node: str, version: str) -> set[str]:
    """
    Converts a taxonomic node from the current version to another.
    It uses a genome-centric strategy, based on the taxa of the representative
    genomes across versions.
    It may return multiple nodes for ranks above species,
    since multiple representatives can be split into more taxa.
    It may return an empty set if node is not found in the current version
    or if related representative is no longer available in the requested version.

    Example:

        from multitax import GtdbTx
        tax = GtdbTx(version="95")

        # Species - always one-to-one
        tax.convert('s__Giesbergeria metamorpha', version="226")
        {'s__Simplicispira metamorpha'}

        # Other ranks - may be one-to-many
        tax.convert('g__UBA6715', version="226")
        {'g__Aquirufa', 'g__Sandaracinomonas'}
    """
    supported = self._supported_versions
    if version not in supported:
        raise ValueError(
            f"Version [{version}] not supported: {', '.join(supported)}"
        )

    # Conversion tables are built lazily, the first time they are needed.
    tables_ready = self._convert_from and version in self._convert_to
    if not tables_ready:
        self.build_conversion(version=version)

    return self._lookup_version_taxa(node, version)
Converts a taxonomic node from the current version to another. It uses a genome-centric strategy, based on the taxa of the representative genomes across versions. It may return multiple nodes for ranks above species, since multiple representatives can be split into more taxa. It may return an empty set if the node is not found in the current version or if the related representative is no longer available in the requested version.
Example:
from multitax import GtdbTx
tax = GtdbTx(version="95")
# Species - always one-to-one
tax.convert('s__Giesbergeria metamorpha', version="226")
{'s__Simplicispira metamorpha'}
# Other ranks - may be one-to-many
tax.convert('g__UBA6715', version="226")
{'g__Aquirufa', 'g__Sandaracinomonas'}
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class NcbiTx(MultiTax):
    """
    NCBI taxonomy parsed from taxdump.tar.gz or from separate dump files
    (nodes.dmp, optionally followed by names.dmp and merged.dmp).
    """

    _default_version = "current"
    _supported_versions = ["current"]
    _default_urls = {"current": "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"}

    def __init__(self, **kwargs):
        # _merged:              {old taxid: new taxid}, from merged.dmp
        # _extended_name_nodes: {name: [nodes]} for names that are not
        #                       "scientific name" (only with extended_names)
        # Created before delegating to the base constructor.
        self._merged = {}
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """
        Build a {NCBI node: set of GTDB nodes} translation table.

        Only `GtdbTx` targets are supported; for any other target a warning
        is emitted and an empty dict is returned. The table is read from
        `file` if given, otherwise downloaded from `url` (defaulting to the
        multitax repository file for `target_tax.version`). Besides the
        rank-by-rank matches, the NCBI leaf node itself is linked to the GTDB
        leaf of the same row.
        """
        translated_nodes = {}
        if target_tax.__class__.__name__ != "GtdbTx":
            warnings.warn(
                "Translation between taxonomies ["
                + self.__class__.__name__
                + ","
                + target_tax.__class__.__name__
                + "] not yet implemented."
            )
            return translated_nodes

        if file:
            fhs = open_files([file])
        else:
            if not url:
                url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{target_tax.version}_acc_rep_lin_ncbi.tsv.gz"
            fhs = download_files(urls=[url], retry_attempts=3)

        accession_col = 0
        gtdb_taxonomy_col = 2
        ncbi_taxid_col = 3
        rank_names = [
            "domain",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ]

        # try/finally so the handles are released even if a line fails to parse.
        try:
            for source, fh in fhs.items():
                for line in fh:
                    if isinstance(line, bytes):
                        line = line.decode()
                    fields = line.rstrip().split("\t")

                    # skip header
                    if fields[accession_col] == "accession":
                        continue

                    # Build GTDB lineage from leaf (species on given lineage)
                    # to accommodate possible changes in the loaded tax
                    gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1]
                    if gtdb_leaf_node == target_tax.undefined_node:
                        continue
                    gtdb_nodes = target_tax.lineage(gtdb_leaf_node, ranks=rank_names)

                    # Build NCBI lineage from leaf
                    ncbi_leaf_node = self.latest(fields[ncbi_taxid_col])
                    if ncbi_leaf_node == self.undefined_node:
                        continue
                    # Additional add connection from leaf to species on GTDB
                    # that could represent strain, etc on NCBI tax
                    translated_nodes.setdefault(ncbi_leaf_node, set()).add(
                        gtdb_leaf_node
                    )
                    ncbi_nodes = self.lineage(ncbi_leaf_node, ranks=rank_names)

                    # Match ranks
                    for i, ncbi_n in enumerate(ncbi_nodes):
                        if (
                            gtdb_nodes[i] != target_tax.undefined_node
                            and ncbi_n != self.undefined_node
                        ):
                            translated_nodes.setdefault(ncbi_n, set()).add(
                                gtdb_nodes[i]
                            )
        finally:
            close_files(fhs)

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Parse the given file handles into (nodes -> parent, ranks, names).

        A single source ending in ".tar.gz" is handled as a taxdump archive.
        Otherwise the handles are taken positionally: nodes.dmp, then
        names.dmp if at least two are given, then merged.dmp if exactly
        three are given. Requires `extended_names` in `kwargs` whenever
        names are parsed.
        """
        fhs_list = list(fhs.values())
        # One element tar.gz -> taxdump.tar.gz
        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tar.gz"):
            nodes, ranks, names, self._merged = self._parse_taxdump(
                fhs_list[0], extended_names=kwargs["extended_names"]
            )
        else:
            # nodes.dmp
            nodes, ranks = self._parse_nodes(fhs_list[0])

            # [names.dmp]
            if len(fhs) >= 2:
                names = self._parse_names(
                    fhs_list[1], extended_names=kwargs["extended_names"]
                )
            else:
                names = {}

            # [merged.dmp]
            if len(fhs) == 3:
                self._merged = self._parse_merged(fhs_list[2])
        return nodes, ranks, names

    def _parse_merged(self, fh):
        """Return {old taxid: new taxid} from the lines of a merged.dmp handle."""
        merged = {}
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            old_taxid, _, new_taxid, _ = line.split("\t", 3)
            merged[old_taxid] = new_taxid
        return merged

    def _parse_names(self, fh, extended_names):
        """
        Return {node: scientific name} from the lines of a names.dmp handle.

        With `extended_names`, every other name is additionally recorded in
        `self._extended_name_nodes` as {name: [nodes]}.
        """
        names = {}
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            node, name, _, name_class = line.split("\t|\t")
            # The last field still carries the line terminator "\t|\n".
            if name_class.replace("\t|\n", "") == "scientific name":
                names[node] = name
            elif extended_names:
                self._extended_name_nodes.setdefault(name, []).append(node)

        return names

    def _parse_nodes(self, fh):
        """Return ({taxid: parent taxid}, {taxid: rank}) from a nodes.dmp handle."""
        nodes = {}
        ranks = {}
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            taxid, parent_taxid, rank, _ = line.split("\t|\t", 3)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
        return nodes, ranks

    def _parse_taxdump(self, fh_taxdump, extended_names):
        """
        Parse nodes.dmp, names.dmp and merged.dmp out of an open archive.

        Returns (nodes, ranks, names, merged).
        """
        with fh_taxdump.extractfile("nodes.dmp") as fh_nodes:
            nodes, ranks = self._parse_nodes(fh_nodes)
        with fh_taxdump.extractfile("names.dmp") as fh_names:
            names = self._parse_names(fh_names, extended_names=extended_names)
        with fh_taxdump.extractfile("merged.dmp") as fh_merged:
            merged = self._parse_merged(fh_merged)
        return nodes, ranks, names, merged

    def latest(self, node: str):
        # Fall back to the merged.dmp table when the base lookup does not
        # resolve the node.
        n = super().latest(node)
        if n == self.undefined_node:
            n = self.merged(node)
        return n

    def merged(self, node: str):
        """
        Returns relative entry from the merged.dmp file of a given node.
        """
        return self._merged.get(node, self.undefined_node)

    def search_name(
        self,
        text: str,
        rank: str = None,
        exact: bool = True,
        force_extended: bool = False,
    ):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search names defined as "scientific name" on names.dmp

        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        n = super().search_name(text, rank=rank, exact=exact)
        if n and not force_extended:
            return n

        if exact:
            ret = self._exact_name(text, self._extended_name_nodes)
        else:
            ret = self._partial_name(text, self._extended_name_nodes)

        # Only return nodes of chosen rank
        if rank:
            ret = filter_function(ret, self.rank, rank)

        # Merge both result sets without duplicates.
        return list(set(n + ret))

    def stats(self, **kwargs):
        # Extend the base statistics with the sizes of the NCBI-specific
        # tables, only when they hold entries.
        s = super().stats(**kwargs)
        if self._merged:
            s["merged"] = len(self._merged)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
def __init__(self, **kwargs):
    # {old taxid: new taxid}, filled from merged.dmp by _parse()
    self._merged = {}
    # {name: [nodes]} for names other than "scientific name", filled by
    # _parse_names() when extended_names is set
    self._extended_name_nodes = {}
    # Instance state is created before delegating; all keyword arguments are
    # forwarded unchanged to the MultiTax constructor.
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def latest(self, node: str):
    # Ask the base class first; only when it reports the node as undefined
    # is the merged.dmp table consulted.
    resolved = super().latest(node)
    if resolved != self.undefined_node:
        return resolved
    return self.merged(node)
Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv)
def merged(self, node: str):
    """
    Returns the entry recorded for the given node in the merged.dmp file,
    or the undefined node when there is none.
    """
    return self._merged.get(node, self.undefined_node)
Returns relative entry from the merged.dmp file of a given node.
def search_name(
    self,
    text: str,
    rank: str = None,
    exact: bool = True,
    force_extended: bool = False,
):
    """
    Search node by exact or partial name.

    Default order (can be skipped with **force_extended=True**):

    1) Search names defined as "scientific name" on names.dmp

    2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))

    Parameters:
    * **text** *[str]*: Text to search.
    * **rank** *[str]*: Filter results by rank.
    * **exact** *[bool]*: Exact or partial name search (both case sensitive).
    * **force_extended** *[bool]*: Search for text in all categories at once.

    Returns: list of matching nodes
    """
    primary = super().search_name(text, rank=rank, exact=exact)
    if primary and not force_extended:
        return primary

    # Same lookup strategy as requested, applied to the extended names table.
    lookup = self._exact_name if exact else self._partial_name
    extended = lookup(text, self._extended_name_nodes)

    # Only return nodes of chosen rank
    if rank:
        extended = filter_function(extended, self.rank, rank)

    # Merge both result sets without duplicates.
    return list(set(primary + extended))
Search node by exact or partial name.
Default order (can be skipped with force_extended=True):
1) Search names defined as "scientific name" on names.dmp
2) If nothing was found, search text in all other categories (must be activated with NcbiTx(extended_names=True))
Parameters:
- text [str]: Text to search.
- rank [str]: Filter results by rank.
- exact [bool]: Exact or partial name search (both case sensitive).
- force_extended [bool]: Search for text in all categories at once.
Returns: list of matching nodes
def stats(self, **kwargs):
    # Start from the base statistics and add the sizes of the NCBI-specific
    # tables, each only when it holds entries.
    summary = super().stats(**kwargs)
    extra_tables = (
        ("merged", self._merged),
        ("extended_names", self._extended_name_nodes),
    )
    for key, table in extra_tables:
        if table:
            summary[key] = len(table)
    return summary
Returns a dict with general numbers of the taxonomic tree
Example:
from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()
pprint(tax.stats())
{'leaves': 30238,
'names': 42739,
'nodes': 42739,
'ranked_leaves': Counter({'species': 30238}),
'ranked_nodes': Counter({'species': 30238,
'genus': 8778,
'family': 2323,
'order': 930,
'class': 337,
'phylum': 131,
'domain': 1,
'root': 1}),
'ranks': 42739}
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- translate
- write
class OttTx(MultiTax):
    """
    Open Tree of Life Taxonomy (OTT).

    Input is either a single ".tgz" archive containing taxonomy.tsv,
    forwards.tsv and synonyms.tsv, or loose files given in that order
    (taxonomy.tsv [, forwards.tsv [, synonyms.tsv]]).
    """

    _default_version = "3.7.3"
    _supported_versions = ["3.6", "3.7.3"]
    _default_urls = {
        "3.6": "https://files.opentreeoflife.org/ott/ott3.6/ott3.6.tgz",
        "3.7.3": "https://files.opentreeoflife.org/ott/ott3.7.3/ott3.7.3.tgz",
    }
    _default_root_node = "805080"

    def __init__(self, **kwargs):
        # Both tables must exist before the parent constructor runs, since
        # the parsing methods below fill them.
        self._forwards = {}
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """Warn that no translation is implemented and return an empty dict."""
        warnings.warn(
            "Translation between taxonomies ["
            + self.__class__.__name__
            + ","
            + target_tax.__class__.__name__
            + "] not yet implemented."
        )
        return {}

    @staticmethod
    def _data_lines(fh):
        """
        Yield the data lines of *fh* as str.

        Skips the first (header) line and blank lines. Handles that yield
        bytes (e.g. members extracted from a tar archive) are decoded here
        once, instead of relying on a TypeError raised for every line.
        """
        # Default of None avoids StopIteration on an empty file.
        next(fh, None)
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            if line.strip():
                yield line

    def _parse(self, fhs, **kwargs):
        """
        Parse the given file handles.

        Parameters:
        * **fhs** *[dict]*: {source: file handle}. A single source ending in ".tgz" is read as an archive; otherwise handles are read positionally.
        * **\\*\\*kwargs**: only "extended_names" is read (defaults to False when absent).

        Returns: tuple (nodes, ranks, names)
        """
        extended_names = kwargs.get("extended_names", False)
        fhs_list = list(fhs.values())
        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tgz"):
            nodes, ranks, names = self._parse_ott(
                fhs_list[0], extended_names=extended_names
            )
        else:
            # taxonomy.tsv
            nodes, ranks, names = self._parse_taxonomy(fhs_list[0])
            # [forwards.tsv]
            if len(fhs) >= 2:
                self._forwards = self._parse_forwards(fhs_list[1])
            # [synonyms.tsv]
            if len(fhs) == 3 and extended_names:
                self._extended_name_nodes = self._parse_synonyms(fhs_list[2])

        return nodes, ranks, names

    def _parse_forwards(self, fh):
        """Return {old_taxid: new_taxid} from a tab-separated forwards.tsv handle."""
        forwards = {}
        for line in self._data_lines(fh):
            old_taxid, new_taxid = line.rstrip().split("\t")
            forwards[old_taxid] = new_taxid
        return forwards

    def _parse_ott(self, fh_taxdump, extended_names):
        """
        Parse an opened OTT archive.

        Parameters:
        * **fh_taxdump**: archive object providing getnames() and extractfile().
        * **extended_names** *[bool]*: Also parse synonyms.tsv.

        Returns: tuple (nodes, ranks, names)

        Raises: ValueError if a required member is missing from the archive.
        """
        # Locate the members by file name; they may sit inside a sub-folder.
        tax = fwr = syn = None
        for member in fh_taxdump.getnames():
            if member.endswith("taxonomy.tsv"):
                tax = member
            if member.endswith("forwards.tsv"):
                fwr = member
            if member.endswith("synonyms.tsv"):
                syn = member

        # Fail with a clear message instead of an UnboundLocalError later on.
        if tax is None:
            raise ValueError("taxonomy.tsv not found in the OTT archive")
        if fwr is None:
            raise ValueError("forwards.tsv not found in the OTT archive")
        if extended_names and syn is None:
            raise ValueError("synonyms.tsv not found in the OTT archive")

        with fh_taxdump.extractfile(tax) as fh_nodes:
            nodes, ranks, names = self._parse_taxonomy(fh_nodes)
        with fh_taxdump.extractfile(fwr) as fh_forwards:
            self._forwards = self._parse_forwards(fh_forwards)
        if extended_names:
            with fh_taxdump.extractfile(syn) as fh_synonyms:
                self._extended_name_nodes = self._parse_synonyms(fh_synonyms)
        return nodes, ranks, names

    def _parse_synonyms(self, fh):
        """Return {name: [taxid, ...]} from a synonyms.tsv handle ("\\t|\\t" separated)."""
        synonyms = {}
        for line in self._data_lines(fh):
            name, taxid, _ = line.split("\t|\t", 2)
            synonyms.setdefault(name, []).append(taxid)
        return synonyms

    def _parse_taxonomy(self, fh):
        """Return (nodes, ranks, names) from a taxonomy.tsv handle ("\\t|\\t" separated)."""
        nodes = {}
        ranks = {}
        names = {}
        for line in self._data_lines(fh):
            taxid, parent_taxid, name, rank, _ = line.split("\t|\t", 4)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
            names[taxid] = name
        return nodes, ranks, names

    def forwards(self, node: str):
        """
        Returns the corresponding entry from the forwards.tsv file for a given node
        (the undefined node if there is none).
        """
        return self._forwards.get(node, self.undefined_node)

    def latest(self, node: str):
        # Fall back to the forwards table when the parent lookup does not
        # resolve the node.
        n = super().latest(node)
        if n == self.undefined_node:
            n = self.forwards(node)
        return n

    def search_name(
        self,
        text: str,
        rank: str = None,
        exact: bool = True,
        force_extended: bool = False,
    ):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search default names defined on "taxonomy.tsv"

        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        n = super().search_name(text, rank=rank, exact=exact)
        if n and not force_extended:
            return n

        if exact:
            ret = self._exact_name(text, self._extended_name_nodes)
        else:
            ret = self._partial_name(text, self._extended_name_nodes)

        # Only return nodes of chosen rank
        if rank:
            ret = filter_function(ret, self.rank, rank)

        return list(set(n + ret))

    def stats(self, **kwargs):
        # Add the sizes of the OTT-specific tables, when they are populated.
        s = super().stats(**kwargs)
        if self._forwards:
            s["forwards"] = len(self._forwards)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
def __init__(self, **kwargs):
    # Create the OTT-specific tables as empty dicts before handing over to
    # the parent constructor, which receives every keyword argument.
    self._extended_name_nodes = dict()
    self._forwards = dict()
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def forwards(self, node: str):
    """
    Returns the forwards.tsv entry stored for the given node, or the
    undefined node when the node has no entry.
    """
    try:
        return self._forwards[node]
    except KeyError:
        return self.undefined_node
Returns the corresponding entry from the forwards.tsv file for a given node.
def latest(self, node: str):
    # Ask the parent class first; only when it answers with the undefined
    # node is the forwards table consulted.
    resolved = super().latest(node)
    if resolved != self.undefined_node:
        return resolved
    return self.forwards(node)
Returns the latest/updated version of a given node. If the node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv).
def search_name(
    self,
    text: str,
    rank: str = None,
    exact: bool = True,
    force_extended: bool = False,
):
    """
    Look up nodes by their name, either exactly or by partial match.

    Search order (bypass it with **force_extended=True**):

    1) Default names defined on "taxonomy.tsv"

    2) Only if step 1 found nothing: all other names defined on "synonyms.tsv" (they must have been loaded with OttTx(**extended_names=True**))

    Parameters:
    * **text** *[str]*: Text to search.
    * **rank** *[str]*: Keep only results of this rank.
    * **exact** *[bool]*: Exact or partial name search (both case sensitive).
    * **force_extended** *[bool]*: Search default and extended names together.

    Returns: list of matching nodes
    """
    primary = super().search_name(text, rank=rank, exact=exact)
    # A hit on the default names is final unless the caller asked for everything.
    if primary and not force_extended:
        return primary

    lookup = self._exact_name if exact else self._partial_name
    extended = lookup(text, self._extended_name_nodes)

    # The parent search already applied the rank filter to its own results.
    if rank:
        extended = filter_function(extended, self.rank, rank)

    # Merge both result sets without duplicates.
    return list(set(primary + extended))
Search node by exact or partial name.
Default order (can be skipped with force_extended=True):
1) Search default names defined on "taxonomy.tsv"
2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(extended_names=True))
Parameters:
- text [str]: Text to search.
- rank [str]: Filter results by rank.
- exact [bool]: Exact or partial name search (both case sensitive).
- force_extended [bool]: Search for text in all categories at once.
Returns: list of matching nodes
def stats(self, **kwargs):
    # Start from the parent statistics and append the size of each optional
    # table, but only when that table actually holds entries.
    summary = super().stats(**kwargs)
    optional_tables = (
        ("forwards", self._forwards),
        ("extended_names", self._extended_name_nodes),
    )
    for label, table in optional_tables:
        if table:
            summary[label] = len(table)
    return summary
Returns a dict with general numbers of the taxonomic tree
Example:
from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()
pprint(tax.stats())
{'leaves': 30238,
'names': 42739,
'nodes': 42739,
'ranked_leaves': Counter({'species': 30238}),
'ranked_nodes': Counter({'species': 30238,
'genus': 8778,
'family': 2323,
'order': 930,
'class': 337,
'phylum': 131,
'domain': 1,
'root': 1}),
'ranks': 42739}
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- translate
- write
class SilvaTx(MultiTax):
    """
    SILVA taxonomy, parsed from tab-separated files whose first three
    columns are the name lineage (";"-separated, with a trailing ";"),
    the taxid and the rank.
    """

    _default_version = "ssu_138.2"
    _supported_versions = ["lsu_138.2", "ssu_138.2"]
    _default_urls = {
        "ssu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.2.txt.gz",
        "lsu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.2.txt.gz",
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """Warn that no translation is implemented and return an empty dict."""
        warnings.warn(
            "Translation between taxonomies ["
            + self.__class__.__name__
            + ","
            + target_tax.__class__.__name__
            + "] not yet implemented."
        )
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the given file handles.

        Parameters:
        * **fhs** *[dict]*: {source: file handle}; every handle is read, yielding str or bytes lines.
        * **\\*\\*kwargs**: not used by this parser.

        Returns: tuple (nodes, ranks, names)
        """
        nodes = {}
        ranks = {}
        names = {}

        # Full name lineage -> taxid, used afterwards to resolve parents.
        lin = {}
        for fh in fhs.values():
            for line in fh:
                # Decode explicitly rather than relying on a TypeError
                # raised for every line of a binary handle.
                if isinstance(line, bytes):
                    line = line.decode()
                # Ignore blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                name_lineage, taxid, rank, _ = line.split("\t", 3)
                # Remove last char ";"
                lineage = name_lineage[:-1]
                name = lineage.split(";")[-1]
                # Save lineage to build tree
                lin[lineage] = taxid
                names[taxid] = name
                ranks[taxid] = rank

        # Build parent node connection by walking each lineage upwards.
        for lineage, taxid in lin.items():
            t = taxid
            lsplit = lineage.split(";")[:-1]
            while lsplit:
                parent_taxid = lin[";".join(lsplit)]
                # Keep the first parent assigned to a node.
                if t not in nodes:
                    nodes[t] = parent_taxid
                t = parent_taxid
                lsplit.pop()  # move one level up
            # Connect last node to root.
            # NOTE(review): _default_root_node is not defined in this class;
            # presumably inherited from MultiTax - confirm.
            if t not in nodes:
                nodes[t] = self._default_root_node

        return nodes, ranks, names
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write