multitax
1import importlib.metadata 2 3__version__ = importlib.metadata.version(__name__) 4 5__all__ = ( 6 "CustomTx", 7 "DummyTx", 8 "GreengenesTx", 9 "GtdbTx", 10 "NcbiTx", 11 "OttTx", 12 "SilvaTx", 13) 14 15from .customtx import CustomTx 16from .dummytx import DummyTx 17from .greengenestx import GreengenesTx 18from .gtdbtx import GtdbTx 19from .ncbitx import NcbiTx 20from .otttx import OttTx 21from .silvatx import SilvaTx
class CustomTx(MultiTax):
    # Columns that must always be present in the `cols` definition.
    _required_cols = ["node", "parent"]
    # Every column name accepted in the `cols` definition.
    _possible_cols = ["node", "parent", "rank", "name"]

    def __init__(self, cols: list = None, sep: str = "\t", **kwargs):
        """
        CustomTx()

        Parameters:
        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name". Default: ["node", "parent", "rank", "name"]
        * **sep** *[str]*: Separator of fields
        * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`

        Example:

            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
        """
        # None stands in for the default layout, so no mutable list object
        # is shared between calls through the signature.
        if cols is None:
            cols = ["node", "parent", "rank", "name"]

        # Column layout and separator are stored before the parent
        # constructor is invoked, exactly as in the original ordering.
        self._cols = self._parse_cols(cols)
        self._sep = sep
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """Warn that no translation exists for custom taxonomies and return an empty dict."""
        warnings.warn(
            "Translation between taxonomies ["
            + self.__class__.__name__
            + ","
            + target_tax.__class__.__name__
            + "] not yet implemented."
        )
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse every open file handle in `fhs` (a {source: handle} dict).

        Returns a tuple of three dicts: (node -> parent, node -> rank, node -> name).
        Rank and name entries are only filled when the column was requested.
        """
        nodes = {}
        ranks = {}
        names = {}
        has_name = "name" in self._cols
        has_rank = "rank" in self._cols

        for fh in fhs.values():
            for line in fh:
                # Handles may yield bytes (e.g. compressed input) or str
                if isinstance(line, bytes):
                    line = line.decode()

                stripped = line.rstrip()
                # Blank lines carry no node and used to fail with IndexError
                if not stripped:
                    continue
                fields = stripped.split(self._sep)

                # Required columns: a missing one still raises IndexError
                node = fields[self._cols["node"]]
                nodes[node] = fields[self._cols["parent"]]

                # Optional columns: rstrip() above removes trailing separators
                # when the last fields are empty, so a missing trailing field
                # is read as an empty value instead of raising IndexError.
                if has_name:
                    name_col = self._cols["name"]
                    names[node] = fields[name_col] if name_col < len(fields) else ""
                if has_rank:
                    rank_col = self._cols["rank"]
                    ranks[node] = fields[rank_col] if rank_col < len(fields) else ""

        return nodes, ranks, names

    def _parse_cols(self, cols):
        """
        Validate `cols` and return it as a {field: column index} dict.

        A list or tuple is converted using the position of each field;
        a dict is validated and returned as given.

        Raises:
            ValueError: if a required column is missing or an unknown
                column name is used.
        """
        # Tuples are converted as well; previously they passed validation
        # but failed later when indexed by field name.
        if isinstance(cols, (list, tuple)):
            cols = {c: i for i, c in enumerate(cols)}

        for rc in self._required_cols:
            if rc not in cols:
                raise ValueError(rc + " is a required column")

        for c in cols:
            if c not in self._possible_cols:
                raise ValueError(
                    c + " is not a valid column: " + ",".join(self._possible_cols)
                )

        return cols
def __init__(self, cols: list = None, sep: str = "\t", **kwargs):
    """
    CustomTx()

    Parameters:
    * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name". Default: ["node", "parent", "rank", "name"]
    * **sep** *[str]*: Separator of fields
    * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`

    Example:

        tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
        tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
    """
    # None stands in for the default layout, so no mutable list object
    # is shared between calls through the signature.
    if cols is None:
        cols = ["node", "parent", "rank", "name"]

    # Column layout and separator are stored before the parent
    # constructor is invoked, exactly as in the original ordering.
    self._cols = self._parse_cols(cols)
    self._sep = sep
    super().__init__(**kwargs)
CustomTx()
Parameters:
- cols [list, dict]: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
- sep [str]: Separator of fields
- **kwargs defined at
multitax.multitax.MultiTax
Example:
tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class DummyTx(MultiTax):
    def __init__(self, **kwargs):
        """
        DummyTx() - Dummy empty taxonomy

        Parameters:

        * \\*\\*kwargs defined at `multitax.multitax.MultiTax`
        """
        # Nothing specific to set up: every option is handled by the parent
        super().__init__(**kwargs)

    def __repr__(self):
        # Representation is produced by the shared formatting helper
        representation = format_repr(inst=self)
        return representation
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class GreengenesTx(MultiTax):
    _default_version = "2024.09"
    _supported_versions = ["2022.10", "2024.09"]
    _default_urls = {
        "2024.09": "https://ftp.microbio.me/greengenes_release/2024.09/2024.09.taxonomy.id.tsv.gz",
        "2022.10": "https://ftp.microbio.me/greengenes_release/2022.10/2022.10.taxonomy.id.tsv.gz",
    }

    # (prefix, rank) pairs in lineage order, from highest to lowest rank
    _rank_codes = [
        ("d__", "domain"),
        ("p__", "phylum"),
        ("c__", "class"),
        ("o__", "order"),
        ("f__", "family"),
        ("g__", "genus"),
        ("s__", "species"),
    ]

    def __init__(self, **kwargs):
        # forwards.tsv
        self._forwards = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """Warn that no translation exists for this taxonomy and return an empty dict."""
        warnings.warn(
            "Translation between taxonomies ["
            + self.__class__.__name__
            + ","
            + target_tax.__class__.__name__
            + "] not yet implemented."
        )
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the taxonomy files in `fhs` (a {source: handle} dict).

        The second tab-separated field of each line is read as a lineage
        with entries separated by "; ". Returns a tuple of three dicts:
        (node -> parent, node -> rank, node -> name).

        Raises:
            ValueError: if a lineage has more entries than known ranks or
                an entry does not start with the prefix of its position.
        """
        nodes = {}
        ranks = {}
        names = {}

        # Collect unique lineages first, many lines share the same one
        lineages = set()
        for fh in fhs.values():
            for line in fh:
                # Handles may yield bytes (e.g. compressed input) or str
                if isinstance(line, bytes):
                    line = line.decode()
                fields = line.rstrip().split("\t")

                # skip header
                if fields[0] == "Feature ID":
                    continue

                lineages.add(fields[1])

        for lineage in lineages:
            last_taxid = None
            lin = lineage.split("; ")
            if len(lin) > len(self._rank_codes):
                raise ValueError(
                    f"Lineage [{lineage}] has more entries than supported ranks"
                )

            # Walk from the lowest to the highest rank, linking child -> parent
            for i in reversed(range(len(lin))):
                prefix, rank = self._rank_codes[i]
                # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                taxid = lin[i]

                # Explicit check instead of `assert`, which is removed
                # when Python runs with -O
                if not taxid.startswith(prefix):
                    raise ValueError(
                        f"Unexpected rank prefix in lineage entry [{taxid}], expected [{prefix}]"
                    )

                name = taxid[3:]
                if not name:
                    continue  # empty entry "s__"

                if taxid not in nodes:
                    names[taxid] = name
                    ranks[taxid] = rank
                if last_taxid:
                    nodes[last_taxid] = taxid
                last_taxid = taxid

            # Highest defined entry hangs below the root. A lineage whose
            # entries are all empty defines no node and must not add a
            # None key to the tree.
            if last_taxid is not None:
                nodes[last_taxid] = self._default_root_node

        return nodes, ranks, names
def __init__(self, **kwargs):
    # Empty table reserved for forwards.tsv entries; it is created before
    # the parent constructor is called with all user options.
    self._forwards = dict()
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class GtdbTx(MultiTax):
    _default_version = "232"
    _supported_versions = [
        "80",
        "83",
        "86.2",
        "89",
        "95",
        "202",
        "207",
        "214.1",
        "220",
        "226",
        "232",
    ]

    _url_prefix = "https://data.gtdb.ecogenomic.org/releases/"
    _default_urls = {
        "80": [f"{_url_prefix}release80/80.0/bac_taxonomy_r80.tsv"],
        "83": [f"{_url_prefix}release83/83.0/bac_taxonomy_r83.tsv"],
        "86.2": [
            f"{_url_prefix}release86/86.2/ar122_taxonomy_r86.2.tsv",
            f"{_url_prefix}release86/86.2/bac120_taxonomy_r86.2.tsv",
        ],
        "89": [
            f"{_url_prefix}release89/89.0/ar122_taxonomy_r89.tsv",
            f"{_url_prefix}release89/89.0/bac120_taxonomy_r89.tsv",
        ],
        "95": [
            f"{_url_prefix}release95/95.0/ar122_taxonomy_r95.tsv.gz",
            f"{_url_prefix}release95/95.0/bac120_taxonomy_r95.tsv.gz",
        ],
        "202": [
            f"{_url_prefix}release202/202.0/ar122_taxonomy_r202.tsv.gz",
            f"{_url_prefix}release202/202.0/bac120_taxonomy_r202.tsv.gz",
        ],
        "207": [
            f"{_url_prefix}release207/207.0/ar53_taxonomy_r207.tsv.gz",
            f"{_url_prefix}release207/207.0/bac120_taxonomy_r207.tsv.gz",
        ],
        "214.1": [
            f"{_url_prefix}release214/214.1/ar53_taxonomy_r214.tsv.gz",
            f"{_url_prefix}release214/214.1/bac120_taxonomy_r214.tsv.gz",
        ],
        "220": [
            f"{_url_prefix}release220/220.0/ar53_taxonomy_r220.tsv.gz",
            f"{_url_prefix}release220/220.0/bac120_taxonomy_r220.tsv.gz",
        ],
        "226": [
            f"{_url_prefix}release226/226.0/ar53_taxonomy_r226.tsv.gz",
            f"{_url_prefix}release226/226.0/bac120_taxonomy_r226.tsv.gz",
        ],
        "232": [
            f"{_url_prefix}release232/232.0/ar53_taxonomy_r232.tsv.gz",
            f"{_url_prefix}release232/232.0/bac120_taxonomy_r232.tsv.gz",
        ],
    }

    # (prefix, rank) pairs in lineage order, from highest to lowest rank
    _rank_codes = [
        ("d__", "domain"),
        ("p__", "phylum"),
        ("c__", "class"),
        ("o__", "order"),
        ("f__", "family"),
        ("g__", "genus"),
        ("s__", "species"),
    ]

    def __init__(self, **kwargs):
        # Conversion tables, filled by build_conversion():
        # _convert_to: {version: {accession: lineage}}
        # _convert_from: {taxon: [representative accessions]}
        self._convert_to = {}
        self._convert_from = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """
        Build a {gtdb node: set of target nodes} translation table.

        Only NcbiTx targets are supported; any other target triggers a
        warning and an empty dict is returned. The table is read from
        `file` when given, otherwise downloaded from `url` (or from the
        default location for the loaded version).
        """
        translated_nodes = {}
        if target_tax.__class__.__name__ != "NcbiTx":
            warnings.warn(
                "Translation between taxonomies ["
                + self.__class__.__name__
                + ","
                + target_tax.__class__.__name__
                + "] not yet implemented."
            )
            return translated_nodes

        if file:
            fhs = open_files([file])
        else:
            if not url:
                url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{self.version}_acc_rep_lin_ncbi.tsv.gz"
            fhs = download_files(urls=[url], retry_attempts=3)

        accession_col = 0
        gtdb_taxonomy_col = 2
        ncbi_taxid_col = 3
        # Ranks matched between both taxonomies, taken from _rank_codes
        ranks = [rank for _, rank in self._rank_codes]

        # try/finally guarantees the handles are closed even when parsing
        # or a lineage lookup raises.
        try:
            for fh in fhs.values():
                for line in fh:
                    # Handles may yield bytes (e.g. compressed input) or str
                    if isinstance(line, bytes):
                        line = line.decode()
                    fields = line.rstrip().split("\t")

                    # skip header
                    if fields[accession_col] == "accession":
                        continue

                    ncbi_leaf_node = target_tax.latest(fields[ncbi_taxid_col])
                    if ncbi_leaf_node == target_tax.undefined_node:
                        continue
                    ncbi_nodes = target_tax.lineage(ncbi_leaf_node, ranks=list(ranks))

                    # Build GTDB lineage from leaf (species on given lineage)
                    # to accommodate possible changes in the loaded tax
                    gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1]
                    if gtdb_leaf_node == self.undefined_node:
                        continue
                    gtdb_nodes = self.lineage(gtdb_leaf_node, ranks=list(ranks))

                    # Match ranks
                    for i, gtdb_n in enumerate(gtdb_nodes):
                        if (
                            ncbi_nodes[i] != target_tax.undefined_node
                            and gtdb_n != self.undefined_node
                        ):
                            translated_nodes.setdefault(gtdb_n, set()).add(
                                ncbi_nodes[i]
                            )
        finally:
            close_files(fhs)

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Parse the taxonomy files in `fhs` (a {source: handle} dict).

        Every line must hold exactly two tab-separated fields; the second
        is a lineage with entries separated by ";". Returns a tuple of
        three dicts: (node -> parent, node -> rank, node -> name).

        Raises:
            ValueError: if a line does not have two fields, a lineage has
                more entries than known ranks, or an entry does not start
                with the prefix of its position.
        """
        nodes = {}
        ranks = {}
        names = {}
        for fh in fhs.values():
            for line in fh:
                # Handles may yield bytes (e.g. compressed input) or str
                if isinstance(line, bytes):
                    line = line.decode()
                _, lineage = line.rstrip().split("\t")

                lin = lineage.split(";")
                if len(lin) > len(self._rank_codes):
                    raise ValueError(
                        f"Lineage [{lineage}] has more entries than supported ranks"
                    )

                for i in reversed(range(len(lin))):
                    prefix, rank = self._rank_codes[i]
                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                    taxid = lin[i]

                    # Explicit check instead of `assert`, which is removed
                    # when Python runs with -O
                    if not taxid.startswith(prefix):
                        raise ValueError(
                            f"Unexpected rank prefix in lineage entry [{taxid}], expected [{prefix}]"
                        )

                    name = taxid[3:]
                    # empty entry "s__"
                    if not name:
                        continue

                    if i == 0:
                        parent_taxid = self._default_root_node
                    else:
                        parent_taxid = lin[i - 1]

                    # First occurrence of a node defines its parent
                    if taxid not in nodes:
                        nodes[taxid] = parent_taxid
                        names[taxid] = name
                        ranks[taxid] = rank

        return nodes, ranks, names

    def _lookup_version_taxa(self, node, version: str):
        """
        Return the set of taxa in `version` reached through the
        representative accessions of `node`.

        Requires the conversion table for `version` to be already built.
        """
        res = set()
        for acc in self._convert_from.get(node, ""):
            for tx in self._convert_to[version].get(acc, "").split(";"):
                # Return only rank of requested node
                if tx.startswith(node[:1]):
                    res.add(tx)
        return res

    def _download_parse_version_taxa(self, version, file, url):
        """
        Yield the tab-separated fields of every line of a conversion table.

        The table is read from `file` when given, otherwise downloaded
        from `url` (or from the default location for `version`).
        """
        if file:
            fhs = open_files(files=[file])
        else:
            if not url:
                url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{version}_acc_rep_lin_ncbi.tsv.gz"
            fhs = download_files(urls=[url], retry_attempts=3)

        # The handles were never closed before; release them once the
        # generator is exhausted, closed or fails.
        try:
            for fh in fhs.values():
                for line in fh:
                    # Handles may yield bytes (e.g. compressed input) or str
                    if isinstance(line, bytes):
                        line = line.decode()
                    yield line.rstrip().split("\t")
        finally:
            close_files(fhs)

    def build_conversion(
        self,
        version: str,
        files: tuple[str, str] = ("", ""),
        urls: tuple[str, str] = ("", ""),
    ):
        """
        Download and build conversion table against another version.
        Optional function, conversion tables are automatically downloaded
        and built on first .convert() call.
        """
        if version not in self._supported_versions:
            raise ValueError(
                f"Version [{version}] not supported for conversion: {', '.join(self._supported_versions)}"
            )

        if not self._convert_from:
            # Collect the accessions of the representative entries for each taxa in the current version
            tx_accs = {}
            for acc, rep, lin, _ in self._download_parse_version_taxa(
                version=self.version, file=files[0], url=urls[0]
            ):
                if rep == "t":
                    for tx in lin.split(";"):
                        tx_accs.setdefault(tx, []).append(acc)
            # Assign only at the end, in case of download/parse errors
            self._convert_from = tx_accs

        if version not in self._convert_to:
            # Collect the lineage for each accession
            acc_lin = {}
            for acc, _, lin, _ in self._download_parse_version_taxa(
                version=version, file=files[1], url=urls[1]
            ):
                acc_lin[acc] = lin
            # Assign only at the end, in case of download/parse errors
            self._convert_to[version] = acc_lin

    def convert(self, node: str, version: str) -> set[str]:
        """
        Converts a taxonomic node from current version to another.
        It uses a genomic centric strategy, based on the taxa of the representative
        genome among versions.
        It may return multiple nodes for ranks above species,
        since multiple representatives can be split into more taxa.
        It may return an empty set if node is not found in the current version
        or if related representative is no longer available in the requested version.

        Example:

            from multitax import GtdbTx
            tax = GtdbTx(version="95")

            # Species - always one-to-one
            tax.convert('s__Giesbergeria metamorpha', version="226")
            {'s__Simplicispira metamorpha'}

            # Other ranks - may be one-to-many
            tax.convert('g__UBA6715', version="226")
            {'g__Aquirufa', 'g__Sandaracinomonas'}
        """

        if version not in self._supported_versions:
            raise ValueError(
                f"Version [{version}] not supported: {', '.join(self._supported_versions)}"
            )

        # Tables are built lazily on the first conversion to a version
        if not self._convert_from or version not in self._convert_to:
            self.build_conversion(version=version)

        return self._lookup_version_taxa(node, version)
def __init__(self, **kwargs):
    # Both conversion tables start empty and are created before the
    # parent constructor is called with all user options.
    self._convert_to, self._convert_from = {}, {}
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def build_conversion(
    self,
    version: str,
    files: tuple[str, str] = ("", ""),
    urls: tuple[str, str] = ("", ""),
):
    """
    Download and build the conversion table against another version.
    Calling it is optional: the tables are downloaded and built
    automatically on the first .convert() call.

    The first element of `files`/`urls` refers to the current version,
    the second one to the requested `version`.
    """
    supported = self._supported_versions
    if version not in supported:
        raise ValueError(
            f"Version [{version}] not supported for conversion: {', '.join(supported)}"
        )

    if not self._convert_from:
        # Representative accessions of every taxon in the current version
        representatives = {}
        current_records = self._download_parse_version_taxa(
            version=self.version, file=files[0], url=urls[0]
        )
        for accession, is_rep, lineage, _ in current_records:
            if is_rep != "t":
                continue
            for taxon in lineage.split(";"):
                representatives.setdefault(taxon, []).append(accession)
        # Stored only after a complete pass, in case of download/parse errors
        self._convert_from = representatives

    if version not in self._convert_to:
        # Lineage of every accession in the requested version
        target_records = self._download_parse_version_taxa(
            version=version, file=files[1], url=urls[1]
        )
        lineage_by_accession = {}
        for accession, _, lineage, _ in target_records:
            lineage_by_accession[accession] = lineage
        # Stored only after a complete pass, in case of download/parse errors
        self._convert_to[version] = lineage_by_accession
Download and build conversion table against another version. Optional function, conversion tables are automatically downloaded and built on first .convert() call.
def convert(self, node: str, version: str) -> set[str]:
    """
    Convert a taxonomic node from the current version to another one.

    The strategy is genome centric: it follows the taxa assigned to the
    representative genomes in both versions.
    Ranks above species may return several nodes, since their
    representatives can be split into more taxa.
    An empty set is returned if the node is not found in the current
    version or if the related representative is no longer available in
    the requested version.

    Example:

        from multitax import GtdbTx
        tax = GtdbTx(version="95")

        # Species - always one-to-one
        tax.convert('s__Giesbergeria metamorpha', version="226")
        {'s__Simplicispira metamorpha'}

        # Other ranks - may be one-to-many
        tax.convert('g__UBA6715', version="226")
        {'g__Aquirufa', 'g__Sandaracinomonas'}
    """
    supported = self._supported_versions
    if version not in supported:
        raise ValueError(f"Version [{version}] not supported: {', '.join(supported)}")

    # Build the tables on demand when either side is still missing
    tables_ready = bool(self._convert_from) and version in self._convert_to
    if not tables_ready:
        self.build_conversion(version=version)

    return self._lookup_version_taxa(node, version)
Converts a taxonomic node from current version to another. It uses a genomic centric strategy, based on the taxa of the representative genome among versions. It may return multiple nodes for ranks above species, since multiple representatives can be split into more taxa. It may return an empty set if node is not found in the current version or if related representative is no longer available in the requested version.
Example:
from multitax import GtdbTx
tax = GtdbTx(version="95")
# Species - always one-to-one
tax.convert('s__Giesbergeria metamorpha', version="226")
{'s__Simplicispira metamorpha'}
# Other ranks - may be one-to-many
tax.convert('g__UBA6715', version="226")
{'g__Aquirufa', 'g__Sandaracinomonas'}
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write
class NcbiTx(MultiTax):
    _default_version = "current"
    _supported_versions = ["current"]
    _default_urls = {"current": "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"}

    def __init__(self, **kwargs):
        # _merged: {old taxid: new taxid} from merged.dmp
        # _extended_name_nodes: {non-scientific name: [nodes]} from names.dmp
        self._merged = {}
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """
        Build a {ncbi node: set of target nodes} translation table.

        Only GtdbTx targets are supported; any other target triggers a
        warning and an empty dict is returned. The table is read from
        `file` when given, otherwise downloaded from `url` (or from the
        default location for the version of the target taxonomy).
        """
        translated_nodes = {}
        if target_tax.__class__.__name__ != "GtdbTx":
            warnings.warn(
                "Translation between taxonomies ["
                + self.__class__.__name__
                + ","
                + target_tax.__class__.__name__
                + "] not yet implemented."
            )
            return translated_nodes

        if file:
            fhs = open_files([file])
        else:
            if not url:
                url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{target_tax.version}_acc_rep_lin_ncbi.tsv.gz"
            fhs = download_files(urls=[url], retry_attempts=3)

        accession_col = 0
        gtdb_taxonomy_col = 2
        ncbi_taxid_col = 3
        # Ranks matched between both taxonomies
        ranks = ("domain", "phylum", "class", "order", "family", "genus", "species")

        # try/finally guarantees the handles are closed even when parsing
        # or a lineage lookup raises.
        try:
            for fh in fhs.values():
                for line in fh:
                    # Handles may yield bytes (e.g. compressed input) or str
                    if isinstance(line, bytes):
                        line = line.decode()
                    fields = line.rstrip().split("\t")

                    # skip header
                    if fields[accession_col] == "accession":
                        continue

                    # Build GTDB lineage from leaf (species on given lineage)
                    # to accommodate possible changes in the loaded tax
                    gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1]
                    if gtdb_leaf_node == target_tax.undefined_node:
                        continue
                    gtdb_nodes = target_tax.lineage(gtdb_leaf_node, ranks=list(ranks))

                    # Build NCBI lineage from leaf
                    ncbi_leaf_node = self.latest(fields[ncbi_taxid_col])
                    if ncbi_leaf_node == self.undefined_node:
                        continue
                    # Additional add connection from leaf to species on GTDB
                    # that could represent strain, etc on NCBI tax
                    translated_nodes.setdefault(ncbi_leaf_node, set()).add(
                        gtdb_leaf_node
                    )
                    ncbi_nodes = self.lineage(ncbi_leaf_node, ranks=list(ranks))

                    # Match ranks
                    for i, ncbi_n in enumerate(ncbi_nodes):
                        if (
                            gtdb_nodes[i] != target_tax.undefined_node
                            and ncbi_n != self.undefined_node
                        ):
                            translated_nodes.setdefault(ncbi_n, set()).add(
                                gtdb_nodes[i]
                            )
        finally:
            close_files(fhs)

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Parse either a single taxdump.tar.gz or separate .dmp files.

        With separate files the order is: nodes.dmp, [names.dmp],
        [merged.dmp]. Returns (node -> parent, node -> rank, node -> name).
        """
        fhs_list = list(fhs.values())
        # One element tar.gz -> taxdump.tar.gz
        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tar.gz"):
            nodes, ranks, names, self._merged = self._parse_taxdump(
                fhs_list[0], extended_names=kwargs["extended_names"]
            )
        else:
            # nodes.dmp
            nodes, ranks = self._parse_nodes(fhs_list[0])

            # [names.dmp]
            if len(fhs) >= 2:
                names = self._parse_names(
                    fhs_list[1], extended_names=kwargs["extended_names"]
                )
            else:
                names = {}

            # [merged.dmp]
            if len(fhs) == 3:
                self._merged = self._parse_merged(fhs_list[2])
        return nodes, ranks, names

    def _parse_merged(self, fh):
        """Return {old taxid: new taxid} read from a merged.dmp handle."""
        merged = {}
        for line in fh:
            # Handles may yield bytes (e.g. members of a tar file) or str
            if isinstance(line, bytes):
                line = line.decode()
            old_taxid, _, new_taxid, _ = line.split("\t", 3)
            merged[old_taxid] = new_taxid
        return merged

    def _parse_names(self, fh, extended_names):
        """
        Return {node: scientific name} read from a names.dmp handle.

        When `extended_names` is set, names of every other class are
        collected into self._extended_name_nodes as {name: [nodes]}.
        """
        names = {}
        for line in fh:
            # Handles may yield bytes (e.g. members of a tar file) or str
            if isinstance(line, bytes):
                line = line.decode()
            node, name, _, name_class = line.split("\t|\t")
            # Last field still carries the line terminator "\t|\n"
            if name_class.replace("\t|\n", "") == "scientific name":
                names[node] = name
            elif extended_names:
                self._extended_name_nodes.setdefault(name, []).append(node)

        return names

    def _parse_nodes(self, fh):
        """Return ({node: parent}, {node: rank}) read from a nodes.dmp handle."""
        nodes = {}
        ranks = {}
        for line in fh:
            # Handles may yield bytes (e.g. members of a tar file) or str
            if isinstance(line, bytes):
                line = line.decode()
            taxid, parent_taxid, rank, _ = line.split("\t|\t", 3)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
        return nodes, ranks

    def _parse_taxdump(self, fh_taxdump, extended_names):
        """Parse nodes.dmp, names.dmp and merged.dmp out of an open taxdump archive."""
        with fh_taxdump.extractfile("nodes.dmp") as fh_nodes:
            nodes, ranks = self._parse_nodes(fh_nodes)
        with fh_taxdump.extractfile("names.dmp") as fh_names:
            names = self._parse_names(fh_names, extended_names=extended_names)
        with fh_taxdump.extractfile("merged.dmp") as fh_merged:
            merged = self._parse_merged(fh_merged)
        return nodes, ranks, names, merged

    def latest(self, node: str):
        # Fall back to merged.dmp only when the parent class reports the
        # node as undefined.
        n = super().latest(node)
        if n == self.undefined_node:
            n = self.merged(node)
        return n

    def merged(self, node: str):
        """
        Returns relative entry from the merged.dmp file of a given node.
        """
        return self._merged.get(node, self.undefined_node)

    def search_name(
        self,
        text: str,
        rank: str = None,
        exact: bool = True,
        force_extended: bool = False,
    ):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search names defined as "scientific name" on names.dmp

        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        n = super().search_name(text, rank=rank, exact=exact)
        if n and not force_extended:
            return n

        if exact:
            ret = self._exact_name(text, self._extended_name_nodes)
        else:
            ret = self._partial_name(text, self._extended_name_nodes)

        # Only return nodes of chosen rank
        if rank:
            ret = filter_function(ret, self.rank, rank)

        # Merge both result lists without duplicates
        return list(set(n + ret))

    def stats(self, **kwargs):
        # Extend the general numbers with NCBI specific counts, only when
        # the related tables are filled.
        s = super().stats(**kwargs)
        if self._merged:
            s["merged"] = len(self._merged)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
def __init__(self, **kwargs):
    # Both lookup tables start empty and are created before the parent
    # constructor is called with all user options.
    self._merged = dict()
    self._extended_name_nodes = dict()
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def latest(self, node: str):
    # Ask the parent class first; only an undefined answer is looked up
    # in the merged entries.
    current = super().latest(node)
    return self.merged(node) if current == self.undefined_node else current
Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv)
def merged(self, node: str):
    """
    Return the entry of the merged.dmp file related to a given node,
    or the undefined node when there is none.
    """
    try:
        return self._merged[node]
    except KeyError:
        return self.undefined_node
Returns relative entry from the merged.dmp file of a given node.
def search_name(
    self,
    text: str,
    rank: str = None,
    exact: bool = True,
    force_extended: bool = False,
):
    """
    Search node by exact or partial name.

    Default order (can be skipped with **force_extended=True**):

    1) Search names defined as "scientific name" on names.dmp

    2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))

    Parameters:
    * **text** *[str]*: Text to search.
    * **rank** *[str]*: Filter results by rank.
    * **exact** *[bool]*: Exact or partial name search (both case sensitive).
    * **force_extended** *[bool]*: Search for text in all categories at once.

    Returns: list of matching nodes
    """
    primary = super().search_name(text, rank=rank, exact=exact)
    # Scientific names are enough unless the extended search is forced
    if primary and not force_extended:
        return primary

    matcher = self._exact_name if exact else self._partial_name
    extended = matcher(text, self._extended_name_nodes)

    # Only return nodes of chosen rank
    if rank:
        extended = filter_function(extended, self.rank, rank)

    # Merge both result lists without duplicates
    return list(set(primary + extended))
Search node by exact or partial name.
Default order (can be skipped with force_extended=True):
1) Search names defined as "scientific name" on names.dmp
2) If nothing was found, search text in all other categories (must be activated with NcbiTx(extended_names=True))
Parameters:
- text [str]: Text to search.
- rank [str]: Filter results by rank.
- exact [bool]: Exact or partial name search (both case sensitive).
- force_extended [bool]: Search for text in all categories at once.
Returns: list of matching nodes
def stats(self, **kwargs):
    # Start from the general numbers of the parent class and append the
    # NCBI specific counts, each one only when its table is filled.
    summary = super().stats(**kwargs)
    optional_tables = (
        ("merged", self._merged),
        ("extended_names", self._extended_name_nodes),
    )
    for key, table in optional_tables:
        if table:
            summary[key] = len(table)
    return summary
Returns a dict with general numbers of the taxonomic tree
Example:
from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()
pprint(tax.stats())
{'leaves': 30238,
'names': 42739,
'nodes': 42739,
'ranked_leaves': Counter({'species': 30238}),
'ranked_nodes': Counter({'species': 30238,
'genus': 8778,
'family': 2323,
'order': 930,
'class': 337,
'phylum': 131,
'domain': 1,
'root': 1}),
'ranks': 42739}
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- translate
- write
class OttTx(MultiTax):
    # Taxonomy built from the Open Tree of Life Taxonomy (OTT) files:
    # taxonomy.tsv (nodes, names, ranks), forwards.tsv (replaced ids) and
    # synonyms.tsv (extended names).
    _default_version = "3.7.3"
    _supported_versions = ["3.6", "3.7.3"]
    _default_urls = {
        "3.6": "https://files.opentreeoflife.org/ott/ott3.6/ott3.6.tgz",
        "3.7.3": "https://files.opentreeoflife.org/ott/ott3.7.3/ott3.7.3.tgz",
    }
    _default_root_node = "805080"

    def __init__(self, **kwargs):
        # These containers are (re)assigned by _parse(); they are created
        # before delegating to the base constructor so they always exist.
        # NOTE(review): presumably the base constructor is what triggers
        # _parse() - confirm against MultiTax.
        self._forwards = {}
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """
        Translation from OTT to other taxonomies is not available:
        emits a warning and returns an empty dict.
        """
        warnings.warn(
            "Translation between taxonomies ["
            + self.__class__.__name__
            + ","
            + target_tax.__class__.__name__
            + "] not yet implemented."
        )
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the given file handles and return (nodes, ranks, names).

        Accepted inputs (**fhs** maps source name to an open handle):

        * a single source ending in ".tgz": handle is treated as an archive
          and parsed with `_parse_ott`
        * otherwise, in order: taxonomy.tsv, [forwards.tsv], [synonyms.tsv]

        synonyms.tsv is only parsed when **extended_names** is set.
        """
        # .get(): a missing option means "do not parse extended names"
        # instead of failing with KeyError.
        extended_names = kwargs.get("extended_names", False)

        fhs_list = list(fhs.values())
        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tgz"):
            nodes, ranks, names = self._parse_ott(
                fhs_list[0], extended_names=extended_names
            )
        else:
            # taxonomy.tsv
            nodes, ranks, names = self._parse_taxonomy(fhs_list[0])
            # [forwards.tsv]
            if len(fhs) >= 2:
                self._forwards = self._parse_forwards(fhs_list[1])
            # [synonyms.tsv]
            if len(fhs) == 3 and extended_names:
                self._extended_name_nodes = self._parse_synonyms(fhs_list[2])

        return nodes, ranks, names

    def _parse_forwards(self, fh):
        """
        Parse forwards.tsv into a dict {old_taxid: new_taxid}.

        The first line is a header and is skipped. Lines can be str or bytes.
        """
        forwards = {}
        # skip first line header (default avoids StopIteration on empty input)
        next(fh, None)
        for line in fh:
            try:
                old_taxid, new_taxid = line.rstrip().split("\t")
            except TypeError:
                # bytes line: splitting with a str separator fails, decode first
                old_taxid, new_taxid = line.decode().rstrip().split("\t")
            forwards[old_taxid] = new_taxid
        return forwards

    def _parse_ott(self, fh_taxdump, extended_names):
        """
        Parse an OTT archive and return (nodes, ranks, names).

        Members are located by file name suffix. taxonomy.tsv is required;
        forwards.tsv and synonyms.tsv are optional and a warning is emitted
        when one of them is needed but not present.

        Raises ValueError if the archive has no taxonomy.tsv.
        """
        # Get files inside folder by name (last match wins). Start from None
        # so that a missing member is detected explicitly instead of
        # surfacing later as an UnboundLocalError.
        tax = fwr = syn = None
        for e in fh_taxdump.getnames():
            if e.endswith("taxonomy.tsv"):
                tax = e
            elif e.endswith("forwards.tsv"):
                fwr = e
            elif e.endswith("synonyms.tsv"):
                syn = e

        if tax is None:
            raise ValueError("taxonomy.tsv not found in the OTT archive")

        with fh_taxdump.extractfile(tax) as fh_nodes:
            nodes, ranks, names = self._parse_taxonomy(fh_nodes)

        if fwr is None:
            warnings.warn(
                "forwards.tsv not found in the OTT archive, forwards not loaded."
            )
        else:
            with fh_taxdump.extractfile(fwr) as fh_forwards:
                self._forwards = self._parse_forwards(fh_forwards)

        if extended_names:
            if syn is None:
                warnings.warn(
                    "synonyms.tsv not found in the OTT archive, extended names not loaded."
                )
            else:
                with fh_taxdump.extractfile(syn) as fh_synonyms:
                    self._extended_name_nodes = self._parse_synonyms(fh_synonyms)

        return nodes, ranks, names

    def _parse_synonyms(self, fh):
        """
        Parse synonyms.tsv into a dict {name: [taxid, ...]}.

        The first line is a header and is skipped. Lines can be str or bytes.
        """
        synonyms = {}
        # skip first line header (default avoids StopIteration on empty input)
        next(fh, None)
        for line in fh:
            try:
                name, taxid, _ = line.split("\t|\t", 2)
            except TypeError:
                # bytes line: decode first
                name, taxid, _ = line.decode().split("\t|\t", 2)
            # One name can point to several nodes
            synonyms.setdefault(name, []).append(taxid)

        return synonyms

    def _parse_taxonomy(self, fh):
        """
        Parse taxonomy.tsv and return (nodes, ranks, names):
        {taxid: parent_taxid}, {taxid: rank} and {taxid: name}.

        The first line is a header and is skipped. Lines can be str or bytes.
        """
        nodes = {}
        ranks = {}
        names = {}
        # skip first line header (default avoids StopIteration on empty input)
        next(fh, None)
        for line in fh:
            try:
                taxid, parent_taxid, name, rank, _ = line.split("\t|\t", 4)
            except TypeError:
                # bytes line: decode first
                taxid, parent_taxid, name, rank, _ = line.decode().split("\t|\t", 4)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
            names[taxid] = name
        return nodes, ranks, names

    def forwards(self, node: str):
        """
        Returns relative entry from the forwards.tsv file of a given node.
        """
        return self._forwards.get(node, self.undefined_node)

    def latest(self, node: str):
        # Ask the base implementation first; only when it reports the node as
        # undefined, look for a replacement id in forwards.tsv.
        n = super().latest(node)
        if n == self.undefined_node:
            n = self.forwards(node)
        return n

    def search_name(
        self,
        text: str,
        rank: str = None,
        exact: bool = True,
        force_extended: bool = False,
    ):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search default names defined on "taxonomy.tsv"

        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        n = super().search_name(text, rank=rank, exact=exact)
        if n and not force_extended:
            return n

        if exact:
            ret = self._exact_name(text, self._extended_name_nodes)
        else:
            ret = self._partial_name(text, self._extended_name_nodes)

        # Only return nodes of chosen rank
        if rank:
            ret = filter_function(ret, self.rank, rank)

        # set() removes nodes found by both searches
        return list(set(n + ret))

    def stats(self, **kwargs):
        # Base statistics plus OTT-specific counters, which are only reported
        # when the corresponding data was loaded.
        s = super().stats(**kwargs)
        if self._forwards:
            s["forwards"] = len(self._forwards)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
def __init__(self, **kwargs):
    # OTT-specific containers: synonyms (extended names) and replaced ids
    # (forwards). They are created before delegating to the base constructor
    # so they exist by the time parsing assigns them.
    # NOTE(review): presumably the base constructor triggers _parse() - confirm.
    self._extended_name_nodes = {}
    self._forwards = {}
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def forwards(self, node: str):
    """
    Returns relative entry from the forwards.tsv file of a given node.

    When the node has no entry, the instance's undefined_node is returned.
    """
    # Single lookup with a fallback instead of a membership test + indexing
    return self._forwards.get(node, self.undefined_node)
Returns the corresponding entry from the forwards.tsv file for a given node.
def latest(self, node: str):
    # The base implementation is asked first; its answer is kept unless it
    # reports the node as undefined.
    current = super().latest(node)
    if current != self.undefined_node:
        return current
    # Fall back to the replacement id recorded in forwards.tsv (which is
    # itself undefined_node when there is no entry).
    return self.forwards(node)
Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv).
def search_name(
    self,
    text: str,
    rank: str = None,
    exact: bool = True,
    force_extended: bool = False,
):
    """
    Search node by exact or partial name.

    Default order (can be skipped with **force_extended=True**):

    1) Search default names defined on "taxonomy.tsv"

    2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))

    Parameters:
    * **text** *[str]*: Text to search.
    * **rank** *[str]*: Filter results by rank.
    * **exact** *[bool]*: Exact or partial name search (both case sensitive).
    * **force_extended** *[bool]*: Search for text in all categories at once.

    Returns: list of matching nodes
    """
    primary_hits = super().search_name(text, rank=rank, exact=exact)

    # Default names matched and the extended search was not forced: done.
    if primary_hits and not force_extended:
        return primary_hits

    # Pick the matching strategy once, then run it on the synonyms.
    matcher = self._exact_name if exact else self._partial_name
    synonym_hits = matcher(text, self._extended_name_nodes)

    # Only keep nodes of the chosen rank
    if rank:
        synonym_hits = filter_function(synonym_hits, self.rank, rank)

    # set() removes nodes reported by both searches
    return list(set(primary_hits + synonym_hits))
Search node by exact or partial name.
Default order (can be skipped with force_extended=True):
1) Search default names defined on "taxonomy.tsv"
2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(extended_names=True))
Parameters:
- text [str]: Text to search.
- rank [str]: Filter results by rank.
- exact [bool]: Exact or partial name search (both case sensitive).
- force_extended [bool]: Search for text in all categories at once.
Returns: list of matching nodes
def stats(self, **kwargs):
    # Start from the base-class statistics and append the OTT-specific
    # counters. A counter is only reported when its container is non-empty.
    summary = super().stats(**kwargs)
    optional_counters = (
        ("forwards", self._forwards),
        ("extended_names", self._extended_name_nodes),
    )
    for key, container in optional_counters:
        if container:
            summary[key] = len(container)
    return summary
Returns a dict with general numbers of the taxonomic tree
Example:
from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()
pprint(tax.stats())
{'leaves': 30238,
'names': 42739,
'nodes': 42739,
'ranked_leaves': Counter({'species': 30238}),
'ranked_nodes': Counter({'species': 30238,
'genus': 8778,
'family': 2323,
'order': 930,
'class': 337,
'phylum': 131,
'domain': 1,
'root': 1}),
'ranks': 42739}
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- translate
- write
class SilvaTx(MultiTax):
    # Taxonomy built from the SILVA taxonomy export files (tax_slv_*.txt),
    # where every line carries a full name lineage, a taxid and a rank.
    _default_version = "ssu_138.2"
    _supported_versions = ["lsu_138.2", "ssu_138.2"]
    _default_urls = {
        "ssu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.2.txt.gz",
        "lsu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.2.txt.gz",
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __repr__(self):
        return format_repr(inst=self)

    def _build_translation(self, target_tax, file: str = None, url: str = None):
        """
        Translation from SILVA to other taxonomies is not available:
        emits a warning and returns an empty dict.
        """
        source_name = self.__class__.__name__
        target_name = target_tax.__class__.__name__
        warnings.warn(
            f"Translation between taxonomies [{source_name},{target_name}] not yet implemented."
        )
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the given file handles and return (nodes, ranks, names):
        {taxid: parent_taxid}, {taxid: rank} and {taxid: name}.

        Each line is split on tabs into name lineage, taxid and rank (the
        remaining columns are ignored). Lines can be str or bytes. The tree
        is reconstructed from the name lineages, since the file carries no
        explicit parent ids.
        """
        nodes, ranks, names = {}, {}, {}

        # lineage ("a;b;c") -> taxid, used to resolve parents afterwards
        taxid_by_lineage = {}
        for fh in fhs.values():
            for raw in fh:
                try:
                    name_lineage, taxid, rank, _ = raw.split("\t", 3)
                except TypeError:
                    # bytes line: decode first
                    name_lineage, taxid, rank, _ = raw.decode().split("\t", 3)
                # Remove last char ";"
                lineage = name_lineage[:-1]
                taxid_by_lineage[lineage] = taxid
                # The node's own name is the last element of its lineage
                names[taxid] = lineage.split(";")[-1]
                ranks[taxid] = rank

        # Build parent node connection by walking each lineage upwards
        for lineage, taxid in taxid_by_lineage.items():
            current = taxid
            ancestors = lineage.split(";")[:-1]
            while ancestors:
                parent_taxid = taxid_by_lineage[";".join(ancestors)]
                # First connection wins; never overwrite an existing parent
                nodes.setdefault(current, parent_taxid)
                current = parent_taxid
                ancestors.pop()  # move one level up
            # Connect last (top-level) node to root
            if current not in nodes:
                nodes[current] = self._default_root_node

        return nodes, ranks, names
Main constructor of MultiTax and sub-classes
Parameters:
- version [str]: Version to download/parse or custom version name (with files/urls).
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
- empty [bool]: Create an empty instance.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
Inherited Members
- multitax.multitax.MultiTax
- datetime
- version
- undefined_node
- undefined_name
- undefined_rank
- sources
- add
- build_lca
- build_lineages
- build_translation
- children
- check_consistency
- clear_lca
- clear_lineages
- closest_parent
- filter
- from_customtx
- latest
- leaves
- lca
- lineage
- name
- name_lineage
- nodes_rank
- parent
- parent_rank
- prune
- rank
- rank_lineage
- remove
- search_name
- stats
- translate
- write