multitax

 1import importlib.metadata
 2
 3__version__ = importlib.metadata.version(__name__)
 4
 5__all__ = (
 6    "CustomTx",
 7    "DummyTx",
 8    "GreengenesTx",
 9    "GtdbTx",
10    "NcbiTx",
11    "OttTx",
12    "SilvaTx",
13)
14
15from .customtx import CustomTx
16from .dummytx import DummyTx
17from .greengenestx import GreengenesTx
18from .gtdbtx import GtdbTx
19from .ncbitx import NcbiTx
20from .otttx import OttTx
21from .silvatx import SilvaTx
class CustomTx(multitax.multitax.MultiTax):
 7class CustomTx(MultiTax):
 8    _required_cols = ["node", "parent"]
 9    _possible_cols = ["node", "parent", "rank", "name"]
10
11    def __init__(
12        self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs
13    ):
14        """
15        CustomTx()
16
17        Parameters:
18        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
19        * **sep** *[str]*: Separator of fields
20        * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`
21
22        Example:
23
24            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
25            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
26        """
27
28        self._cols = self._parse_cols(cols)
29        self._sep = sep
30        super().__init__(**kwargs)
31
32    def __repr__(self):
33        return format_repr(inst=self)
34
35    def _build_translation(self, target_tax, file: str = None, url: str = None):
36        warnings.warn(
37            "Translation between taxonomies ["
38            + self.__class__.__name__
39            + ","
40            + target_tax.__class__.__name__
41            + "] not yet implemented."
42        )
43        return {}
44
45    def _parse(self, fhs, **kwargs):
46        nodes = {}
47        ranks = {}
48        names = {}
49        for source, fh in fhs.items():
50            for line in fh:
51                try:
52                    fields = line.rstrip().split(self._sep)
53                except TypeError:
54                    fields = line.decode().rstrip().split(self._sep)
55
56                node = fields[self._cols["node"]]
57                nodes[node] = fields[self._cols["parent"]]
58                if "name" in self._cols:
59                    names[node] = fields[self._cols["name"]]
60                if "rank" in self._cols:
61                    ranks[node] = fields[self._cols["rank"]]
62
63        return nodes, ranks, names
64
65    def _parse_cols(self, cols):
66        if isinstance(cols, list):
67            cols = {c: i for i, c in enumerate(cols)}
68
69        for rc in self._required_cols:
70            if rc not in cols:
71                raise ValueError(rc + " is a required column")
72
73        for c in cols:
74            if c not in self._possible_cols:
75                raise ValueError(
76                    c + " is not a valid column: " + ",".join(self._possible_cols)
77                )
78
79        return cols
CustomTx( cols: list = ['node', 'parent', 'rank', 'name'], sep: str = '\t', **kwargs)
11    def __init__(
12        self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs
13    ):
14        """
15        CustomTx()
16
17        Parameters:
18        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
19        * **sep** *[str]*: Separator of fields
20        * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`
21
22        Example:
23
24            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
25            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
26        """
27
28        self._cols = self._parse_cols(cols)
29        self._sep = sep
30        super().__init__(**kwargs)

CustomTx()

Parameters:

  • cols [list, dict]: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
  • sep [str]: Separator of fields
  • **kwargs defined at multitax.multitax.MultiTax

Example:

tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
class DummyTx(multitax.multitax.MultiTax):
 6class DummyTx(MultiTax):
 7    def __init__(self, **kwargs):
 8        """
 9        DummyTx() - Dummy empty taxonomy
10
11        Parameters:
12
13        * \\*\\*kwargs defined at `multitax.multitax.MultiTax`
14        """
15        super().__init__(**kwargs)
16
17    def __repr__(self):
18        return format_repr(inst=self)
DummyTx(**kwargs)
 7    def __init__(self, **kwargs):
 8        """
 9        DummyTx() - Dummy empty taxonomy
10
11        Parameters:
12
13        * \\*\\*kwargs defined at `multitax.multitax.MultiTax`
14        """
15        super().__init__(**kwargs)

DummyTx() - Dummy empty taxonomy

Parameters:

  • **kwargs defined at multitax.multitax.MultiTax

class GreengenesTx(multitax.multitax.MultiTax):
 7class GreengenesTx(MultiTax):
 8    _default_version = "2024.09"
 9    _supported_versions = ["2022.10", "2024.09"]
10    _default_urls = {
11        "2024.09": "https://ftp.microbio.me/greengenes_release/2024.09/2024.09.taxonomy.id.tsv.gz",
12        "2022.10": "https://ftp.microbio.me/greengenes_release/2022.10/2022.10.taxonomy.id.tsv.gz",
13    }
14
15    _rank_codes = [
16        ("d__", "domain"),
17        ("p__", "phylum"),
18        ("c__", "class"),
19        ("o__", "order"),
20        ("f__", "family"),
21        ("g__", "genus"),
22        ("s__", "species"),
23    ]
24
25    def __init__(self, **kwargs):
26        # forwards.tsv
27        self._forwards = {}
28        super().__init__(**kwargs)
29
30    def __repr__(self):
31        return format_repr(inst=self)
32
33    def _build_translation(
34        self,
35        target_tax,
36        representatives: bool = False,
37        file: str = None,
38        url: str = None,
39    ):
40        warnings.warn(
41            "Translation between taxonomies ["
42            + self.__class__.__name__
43            + ","
44            + target_tax.__class__.__name__
45            + "] not yet implemented."
46        )
47        return {}
48
49    def _parse(self, fhs, **kwargs):
50        nodes = {}
51        ranks = {}
52        names = {}
53
54        lineages = set()
55        for source, fh in fhs.items():
56            for line in fh:
57                try:
58                    fields = line.rstrip().split("\t")
59                except TypeError:
60                    fields = line.decode().rstrip().split("\t")
61
62                # skip header
63                if fields[0] == "Feature ID":
64                    continue
65
66                lineages.add(fields[1])
67
68        for lineage in lineages:
69            last_taxid = None
70            lin = lineage.split("; ")
71            for i in range(len(lin))[::-1]:
72                # assert rank
73                assert lin[i][:3] == self._rank_codes[i][0]
74
75                name = lin[i][3:]
76                if not name:
77                    continue  # empty entry "s__"
78
79                # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
80                taxid = lin[i]
81                rank = self._rank_codes[i][1]
82
83                if taxid not in nodes:
84                    names[taxid] = name
85                    ranks[taxid] = rank
86                if last_taxid:
87                    nodes[last_taxid] = taxid
88                last_taxid = taxid
89            nodes[last_taxid] = self._default_root_node
90
91        return nodes, ranks, names
GreengenesTx(**kwargs)
25    def __init__(self, **kwargs):
26        # forwards.tsv
27        self._forwards = {}
28        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
class GtdbTx(multitax.multitax.MultiTax):
 10class GtdbTx(MultiTax):
 11    _default_version = "232"
 12    _supported_versions = [
 13        "80",
 14        "83",
 15        "86.2",
 16        "89",
 17        "95",
 18        "202",
 19        "207",
 20        "214.1",
 21        "220",
 22        "226",
 23        "232",
 24    ]
 25
 26    _url_prefix = "https://data.gtdb.ecogenomic.org/releases/"
 27    _default_urls = {
 28        "80": [f"{_url_prefix}release80/80.0/bac_taxonomy_r80.tsv"],
 29        "83": [f"{_url_prefix}release83/83.0/bac_taxonomy_r83.tsv"],
 30        "86.2": [
 31            f"{_url_prefix}release86/86.2/ar122_taxonomy_r86.2.tsv",
 32            f"{_url_prefix}release86/86.2/bac120_taxonomy_r86.2.tsv",
 33        ],
 34        "89": [
 35            f"{_url_prefix}release89/89.0/ar122_taxonomy_r89.tsv",
 36            f"{_url_prefix}release89/89.0/bac120_taxonomy_r89.tsv",
 37        ],
 38        "95": [
 39            f"{_url_prefix}release95/95.0/ar122_taxonomy_r95.tsv.gz",
 40            f"{_url_prefix}release95/95.0/bac120_taxonomy_r95.tsv.gz",
 41        ],
 42        "202": [
 43            f"{_url_prefix}release202/202.0/ar122_taxonomy_r202.tsv.gz",
 44            f"{_url_prefix}release202/202.0/bac120_taxonomy_r202.tsv.gz",
 45        ],
 46        "207": [
 47            f"{_url_prefix}release207/207.0/ar53_taxonomy_r207.tsv.gz",
 48            f"{_url_prefix}release207/207.0/bac120_taxonomy_r207.tsv.gz",
 49        ],
 50        "214.1": [
 51            f"{_url_prefix}release214/214.1/ar53_taxonomy_r214.tsv.gz",
 52            f"{_url_prefix}release214/214.1/bac120_taxonomy_r214.tsv.gz",
 53        ],
 54        "220": [
 55            f"{_url_prefix}release220/220.0/ar53_taxonomy_r220.tsv.gz",
 56            f"{_url_prefix}release220/220.0/bac120_taxonomy_r220.tsv.gz",
 57        ],
 58        "226": [
 59            f"{_url_prefix}release226/226.0/ar53_taxonomy_r226.tsv.gz",
 60            f"{_url_prefix}release226/226.0/bac120_taxonomy_r226.tsv.gz",
 61        ],
 62        "232": [
 63            f"{_url_prefix}release232/232.0/ar53_taxonomy_r232.tsv.gz",
 64            f"{_url_prefix}release232/232.0/bac120_taxonomy_r232.tsv.gz",
 65        ],
 66    }
 67
 68    _rank_codes = [
 69        ("d__", "domain"),
 70        ("p__", "phylum"),
 71        ("c__", "class"),
 72        ("o__", "order"),
 73        ("f__", "family"),
 74        ("g__", "genus"),
 75        ("s__", "species"),
 76    ]
 77
 78    def __init__(self, **kwargs):
 79        self._convert_to = {}
 80        self._convert_from = {}
 81        super().__init__(**kwargs)
 82
 83    def __repr__(self):
 84        return format_repr(inst=self)
 85
 86    def _build_translation(
 87        self,
 88        target_tax,
 89        representatives: bool = False,
 90        file: str = None,
 91        url: str = None,
 92    ):
 93        translated_nodes: dict[str, list] = {}
 94        if target_tax.__class__.__name__ == "NcbiTx":
 95            for acc, rep, gtdb_lin, ncbi_txid in download_parse_data_gtdb(
 96                version=self.version, file=file, url=url
 97            ):
 98                # skip not representatives if requested
 99                if representatives and rep == "f":
100                    continue
101
102                # Build NCBI and GTDB lineage from leaf based on standard ranks
103                ncbi_leaf_node = target_tax.latest(ncbi_txid)
104                gtdb_leaf_node = gtdb_lin.split(";")[-1]
105
106                if (
107                    ncbi_leaf_node == target_tax.undefined_node
108                    or gtdb_leaf_node == self.undefined_node
109                ):
110                    continue
111                ncbi_nodes = target_tax.lineage(
112                    ncbi_leaf_node,
113                    ranks=target_tax._standard_ranks,
114                )
115                if not ncbi_nodes:
116                    continue
117                gtdb_nodes = self.lineage(
118                    gtdb_leaf_node,
119                    ranks=self._standard_ranks,
120                )
121
122                # Match ranks
123                for i, gtdb_n in enumerate(gtdb_nodes, 1):
124                    if gtdb_n == self.undefined_node:
125                        continue
126
127                    # Get closes available node for translation in the lineage up to current rank
128                    translated_node = next(
129                        node
130                        for node in reversed(ncbi_nodes[:i])
131                        if node is not target_tax.undefined_node
132                    )
133                    if gtdb_n not in translated_nodes:
134                        translated_nodes[gtdb_n] = []
135                    translated_nodes[gtdb_n].append(translated_node)
136        else:
137            warnings.warn(
138                "Translation between taxonomies ["
139                + self.__class__.__name__
140                + ","
141                + target_tax.__class__.__name__
142                + "] not yet implemented."
143            )
144
145        return translated_nodes
146
147    def _parse(self, fhs, **kwargs):
148        nodes = {}
149        ranks = {}
150        names = {}
151        for source, fh in fhs.items():
152            for line in fh:
153                try:
154                    _, lineage = line.rstrip().split("\t")
155                except TypeError:
156                    _, lineage = line.decode().rstrip().split("\t")
157                lin = lineage.split(";")
158                for i in range(len(lin))[::-1]:
159                    # assert rank
160                    assert lin[i][:3] == self._rank_codes[i][0]
161                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
162                    taxid = lin[i]
163                    name = lin[i][3:]
164                    # empty entry "s__"
165                    if not name:
166                        continue
167                    rank = self._rank_codes[i][1]
168                    if i == 0:
169                        parent_taxid = self._default_root_node
170                    else:
171                        parent_taxid = lin[i - 1]
172                    if taxid not in nodes:
173                        nodes[taxid] = parent_taxid
174                        names[taxid] = name
175                        ranks[taxid] = rank
176
177        return nodes, ranks, names
178
179    def _lookup_version_taxa(self, node, version: str):
180        res = set()
181        for acc in self._convert_from.get(node, ""):
182            for tx in self._convert_to[version].get(acc, "").split(";"):
183                # Return only rank of requested node
184                if tx.startswith(node[:1]):
185                    res.add(tx)
186        return res
187
188    def build_conversion(
189        self,
190        version: str,
191        representatives: bool = False,
192        files: tuple[str, str] = ("", ""),
193        urls: tuple[str, str] = ("", ""),
194    ):
195        """
196        Download and build conversion table against another version.
197        Optionally build conversion based only on representative genomes of current version.
198        Optional function, conversion tables are automatically downloaded
199        and built on first .convert() call.
200        """
201        if version not in self._supported_versions:
202            raise ValueError(
203                f"Version [{version}] not supported for conversion: {', '.join(self._supported_versions)}"
204            )
205
206        if not self._convert_from:
207            # Collect the accessions of the representative entries for each taxa in the current version
208            tx_accs = {}
209            for acc, rep, lin, _ in download_parse_data_gtdb(
210                version=self.version, file=files[0], url=urls[0]
211            ):
212                # Skip not representatives if requested
213                if representatives and rep == "f":
214                    continue
215
216                for tx in lin.split(";"):
217                    if tx not in tx_accs:
218                        tx_accs[tx] = []
219                    tx_accs[tx].append(acc)
220
221            # Assign only at the end, in case of download/parse errors
222            self._convert_from = tx_accs
223
224        if version not in self._convert_to:
225            # Collect the lineage for each accession
226            acc_lin = {}
227            for acc, _, lin, _ in download_parse_data_gtdb(
228                version=version, file=files[1], url=urls[1]
229            ):
230                acc_lin[acc] = lin
231            # Assign only at the end, in case of download/parse errors
232            self._convert_to[version] = acc_lin
233
234    def convert(self, node: str, version: str) -> set[str]:
235        """
236        Converts a taxonomic node from the loaded tax to another version.
237
238        It uses a genomic centric strategy: locates the GTDB species representative(s)
239        of the requested `node` lineage and converts to the assigned lineage(s) of that
240        genome in the requested `version`.
241
242        It may return multiple nodes for ranks above species,
243        since multiple representatives can be split into more taxa.
244        It may return an empty set if node is not found in the current version
245        or if related representative is no longer available in the requested version.
246
247        Example:
248
249            from multitax import GtdbTx
250            tax = GtdbTx(version="95")
251
252            # Species - always one-to-one
253            tax.convert('s__Giesbergeria metamorpha', version="226")
254            {'s__Simplicispira metamorpha'}
255
256            # Other ranks - may be one-to-many
257            tax.convert('g__UBA6715', version="226")
258            {'g__Aquirufa', 'g__Sandaracinomonas'}
259        """
260
261        if version not in self._supported_versions:
262            raise ValueError(
263                f"Version [{version}] not supported: {', '.join(self._supported_versions)}"
264            )
265
266        if not self._convert_from or version not in self._convert_to:
267            self.build_conversion(version=version)
268
269        return self._lookup_version_taxa(node, version)
GtdbTx(**kwargs)
78    def __init__(self, **kwargs):
79        self._convert_to = {}
80        self._convert_from = {}
81        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def build_conversion( self, version: str, representatives: bool = False, files: tuple[str, str] = ('', ''), urls: tuple[str, str] = ('', '')):
188    def build_conversion(
189        self,
190        version: str,
191        representatives: bool = False,
192        files: tuple[str, str] = ("", ""),
193        urls: tuple[str, str] = ("", ""),
194    ):
195        """
196        Download and build conversion table against another version.
197        Optionally build conversion based only on representative genomes of current version.
198        Optional function, conversion tables are automatically downloaded
199        and built on first .convert() call.
200        """
201        if version not in self._supported_versions:
202            raise ValueError(
203                f"Version [{version}] not supported for conversion: {', '.join(self._supported_versions)}"
204            )
205
206        if not self._convert_from:
207            # Collect the accessions of the representative entries for each taxa in the current version
208            tx_accs = {}
209            for acc, rep, lin, _ in download_parse_data_gtdb(
210                version=self.version, file=files[0], url=urls[0]
211            ):
212                # Skip not representatives if requested
213                if representatives and rep == "f":
214                    continue
215
216                for tx in lin.split(";"):
217                    if tx not in tx_accs:
218                        tx_accs[tx] = []
219                    tx_accs[tx].append(acc)
220
221            # Assign only at the end, in case of download/parse errors
222            self._convert_from = tx_accs
223
224        if version not in self._convert_to:
225            # Collect the lineage for each accession
226            acc_lin = {}
227            for acc, _, lin, _ in download_parse_data_gtdb(
228                version=version, file=files[1], url=urls[1]
229            ):
230                acc_lin[acc] = lin
231            # Assign only at the end, in case of download/parse errors
232            self._convert_to[version] = acc_lin

Download and build conversion table against another version. Optionally build conversion based only on representative genomes of current version. Optional function, conversion tables are automatically downloaded and built on first .convert() call.

def convert(self, node: str, version: str) -> set[str]:
234    def convert(self, node: str, version: str) -> set[str]:
235        """
236        Converts a taxonomic node from the loaded tax to another version.
237
238        It uses a genomic centric strategy: locates the GTDB species representative(s)
239        of the requested `node` lineage and converts to the assigned lineage(s) of that
240        genome in the requested `version`.
241
242        It may return multiple nodes for ranks above species,
243        since multiple representatives can be split into more taxa.
244        It may return an empty set if node is not found in the current version
245        or if related representative is no longer available in the requested version.
246
247        Example:
248
249            from multitax import GtdbTx
250            tax = GtdbTx(version="95")
251
252            # Species - always one-to-one
253            tax.convert('s__Giesbergeria metamorpha', version="226")
254            {'s__Simplicispira metamorpha'}
255
256            # Other ranks - may be one-to-many
257            tax.convert('g__UBA6715', version="226")
258            {'g__Aquirufa', 'g__Sandaracinomonas'}
259        """
260
261        if version not in self._supported_versions:
262            raise ValueError(
263                f"Version [{version}] not supported: {', '.join(self._supported_versions)}"
264            )
265
266        if not self._convert_from or version not in self._convert_to:
267            self.build_conversion(version=version)
268
269        return self._lookup_version_taxa(node, version)

Converts a taxonomic node from the loaded tax to another version.

It uses a genomic centric strategy: locates the GTDB species representative(s) of the requested node lineage and converts to the assigned lineage(s) of that genome in the requested version.

It may return multiple nodes for ranks above species, since multiple representatives can be split into more taxa. It may return an empty set if node is not found in the current version or if related representative is no longer available in the requested version.

Example:

from multitax import GtdbTx
tax = GtdbTx(version="95")

# Species - always one-to-one
tax.convert('s__Giesbergeria metamorpha', version="226")
{'s__Simplicispira metamorpha'}

# Other ranks - may be one-to-many
tax.convert('g__UBA6715', version="226")
{'g__Aquirufa', 'g__Sandaracinomonas'}
class NcbiTx(multitax.multitax.MultiTax):
  7class NcbiTx(MultiTax):
  8    _default_version = "current"
  9    _supported_versions = ["current"]
 10    _default_urls = {"current": "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"}
 11
 12    def __init__(self, **kwargs):
 13        self._merged = {}
 14        self._extended_name_nodes = {}
 15        super().__init__(**kwargs)
 16
 17    def __repr__(self):
 18        return format_repr(inst=self)
 19
 20    def _build_translation(
 21        self,
 22        target_tax,
 23        representatives: bool = False,
 24        file: str = None,
 25        url: str = None,
 26    ):
 27        translated_nodes: dict[str, list] = {}
 28        if target_tax.__class__.__name__ == "GtdbTx":
 29            for acc, rep, gtdb_lin, ncbi_txid in download_parse_data_gtdb(
 30                version=target_tax.version, file=file, url=url
 31            ):
 32                # skip not representatives if requested
 33                if representatives and rep == "f":
 34                    continue
 35
 36                # Build NCBI and GTDB lineage from leaf based on standard ranks
 37                ncbi_leaf_node = self.latest(ncbi_txid)
 38                gtdb_leaf_node = gtdb_lin.split(";")[-1]
 39                if (
 40                    ncbi_leaf_node == self.undefined_node
 41                    or gtdb_leaf_node == target_tax.undefined_node
 42                ):
 43                    continue
 44                gtdb_nodes = target_tax.lineage(
 45                    gtdb_leaf_node,
 46                    ranks=target_tax._standard_ranks,
 47                )
 48                if not gtdb_nodes:
 49                    continue
 50                ncbi_nodes = self.lineage(
 51                    ncbi_leaf_node,
 52                    ranks=self._standard_ranks,
 53                )
 54
 55                # Additionaly add connection from NCBI leaf to GTDB species
 56                # since the NCBI taxid can be of a strain
 57                if ncbi_leaf_node not in translated_nodes:
 58                    translated_nodes[ncbi_leaf_node] = []
 59                translated_nodes[ncbi_leaf_node].append(gtdb_leaf_node)
 60
 61                # Match ranks
 62                for i, ncbi_n in enumerate(ncbi_nodes, 1):
 63                    if ncbi_n == self.undefined_node:
 64                        continue
 65
 66                    # Get closes available node for translation in the lineage up to current rank
 67                    translated_node = next(
 68                        node
 69                        for node in reversed(gtdb_nodes[:i])
 70                        if node is not target_tax.undefined_node
 71                    )
 72
 73                    if ncbi_n not in translated_nodes:
 74                        translated_nodes[ncbi_n] = []
 75                    translated_nodes[ncbi_n].append(translated_node)
 76
 77        else:
 78            warnings.warn(
 79                "Translation between taxonomies ["
 80                + self.__class__.__name__
 81                + ","
 82                + target_tax.__class__.__name__
 83                + "] not yet implemented."
 84            )
 85
 86        return translated_nodes
 87
 88    def _parse(self, fhs, **kwargs):
 89        fhs_list = list(fhs.values())
 90        # One element tar.gz -> taxdump.tar.gz
 91        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tar.gz"):
 92            nodes, ranks, names, self._merged = self._parse_taxdump(
 93                fhs_list[0], extended_names=kwargs["extended_names"]
 94            )
 95        else:
 96            # nodes.dmp
 97            nodes, ranks = self._parse_nodes(fhs_list[0])
 98
 99            # [names.dmp]
100            if len(fhs) >= 2:
101                names = self._parse_names(
102                    fhs_list[1], extended_names=kwargs["extended_names"]
103                )
104            else:
105                names = {}
106
107            # [merged.dmp]
108            if len(fhs) == 3:
109                self._merged = self._parse_merged(fhs_list[2])
110        return nodes, ranks, names
111
112    def _parse_merged(self, fh):
113        merged = {}
114        for line in fh:
115            try:
116                old_taxid, _, new_taxid, _ = line.split("\t", 3)
117            except TypeError:
118                old_taxid, _, new_taxid, _ = line.decode().split("\t", 3)
119            merged[old_taxid] = new_taxid
120        return merged
121
122    def _parse_names(self, fh, extended_names):
123        names = {}
124        for line in fh:
125            try:
126                node, name, _, name_class = line.split("\t|\t")
127            except TypeError:
128                node, name, _, name_class = line.decode().split("\t|\t")
129            if name_class.replace("\t|\n", "") == "scientific name":
130                names[node] = name
131            elif extended_names:
132                if name not in self._extended_name_nodes:
133                    self._extended_name_nodes[name] = []
134                self._extended_name_nodes[name].append(node)
135
136        return names
137
138    def _parse_nodes(self, fh):
139        nodes = {}
140        ranks = {}
141        for line in fh:
142            try:
143                taxid, parent_taxid, rank, _ = line.split("\t|\t", 3)
144            except TypeError:
145                taxid, parent_taxid, rank, _ = line.decode().split("\t|\t", 3)
146            ranks[taxid] = rank
147            nodes[taxid] = parent_taxid
148        return nodes, ranks
149
150    def _parse_taxdump(self, fh_taxdump, extended_names):
151        with fh_taxdump.extractfile("nodes.dmp") as fh_nodes:
152            nodes, ranks = self._parse_nodes(fh_nodes)
153        with fh_taxdump.extractfile("names.dmp") as fh_names:
154            names = self._parse_names(fh_names, extended_names=extended_names)
155        with fh_taxdump.extractfile("merged.dmp") as fh_merged:
156            merged = self._parse_merged(fh_merged)
157        return nodes, ranks, names, merged
158
159    def latest(self, node: str):
160        n = super().latest(node)
161        if n == self.undefined_node:
162            n = self.merged(node)
163        return n
164
165    def merged(self, node: str):
166        """
167        Returns relative entry from the merged.dmp file of a given node.
168        """
169        if node in self._merged:
170            return self._merged[node]
171        else:
172            return self.undefined_node
173
174    def search_name(
175        self,
176        text: str,
177        rank: str = None,
178        exact: bool = True,
179        force_extended: bool = False,
180    ):
181        """
182        Search node by exact or partial name.
183
184        Default order (can be skipped with **force_extended=True**):
185
186        1) Search names defined as "scientific name" on nodes.dmp
187
188        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))
189
190        Parameters:
191        * **text** *[str]*: Text to search.
192        * **rank** *[str]*: Filter results by rank.
193        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
194        * **force_extended** *[bool]*: Search for text in all categories at once.
195
196        Returns: list of matching nodes
197        """
198        n = super().search_name(text, rank=rank, exact=exact)
199        if n and not force_extended:
200            return n
201        else:
202            if exact:
203                ret = self._exact_name(text, self._extended_name_nodes)
204            else:
205                ret = self._partial_name(text, self._extended_name_nodes)
206
207            # Only return nodes of chosen rank
208            if rank:
209                ret = filter_function(ret, self.rank, rank)
210
211            return list(set(n + ret))
212
213    def stats(self, **kwargs):
214        s = super().stats(**kwargs)
215        if self._merged:
216            s["merged"] = len(self._merged)
217        if self._extended_name_nodes:
218            s["extended_names"] = len(self._extended_name_nodes)
219        return s
NcbiTx(**kwargs)
12    def __init__(self, **kwargs):
13        self._merged = {}
14        self._extended_name_nodes = {}
15        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def latest(self, node: str):
159    def latest(self, node: str):
160        n = super().latest(node)
161        if n == self.undefined_node:
162            n = self.merged(node)
163        return n

Returns the latest/updated version of a given node. If the node is already the latest, it returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv).

def merged(self, node: str):
165    def merged(self, node: str):
166        """
167        Returns relative entry from the merged.dmp file of a given node.
168        """
169        if node in self._merged:
170            return self._merged[node]
171        else:
172            return self.undefined_node

Returns the entry of a given node in the merged.dmp file (the node it was merged into), or the undefined node if it has no entry.

def search_name( self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
174    def search_name(
175        self,
176        text: str,
177        rank: str = None,
178        exact: bool = True,
179        force_extended: bool = False,
180    ):
181        """
182        Search node by exact or partial name.
183
184        Default order (can be skipped with **force_extended=True**):
185
186        1) Search names defined as "scientific name" on nodes.dmp
187
188        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))
189
190        Parameters:
191        * **text** *[str]*: Text to search.
192        * **rank** *[str]*: Filter results by rank.
193        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
194        * **force_extended** *[bool]*: Search for text in all categories at once.
195
196        Returns: list of matching nodes
197        """
198        n = super().search_name(text, rank=rank, exact=exact)
199        if n and not force_extended:
200            return n
201        else:
202            if exact:
203                ret = self._exact_name(text, self._extended_name_nodes)
204            else:
205                ret = self._partial_name(text, self._extended_name_nodes)
206
207            # Only return nodes of chosen rank
208            if rank:
209                ret = filter_function(ret, self.rank, rank)
210
211            return list(set(n + ret))

Search node by exact or partial name.

Default order (can be skipped with force_extended=True):

1) Search names defined as "scientific name" on names.dmp

2) If nothing was found, search text in all other categories (must be activated with NcbiTx(extended_names=True))

Parameters:

  • text [str]: Text to search.
  • rank [str]: Filter results by rank.
  • exact [bool]: Exact or partial name search (both case sensitive).
  • force_extended [bool]: Search for text in all categories at once.

Returns: list of matching nodes

def stats(self, **kwargs):
213    def stats(self, **kwargs):
214        s = super().stats(**kwargs)
215        if self._merged:
216            s["merged"] = len(self._merged)
217        if self._extended_name_nodes:
218            s["extended_names"] = len(self._extended_name_nodes)
219        return s

Returns a dict with general numbers of the taxonomic tree

Example:

from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()

pprint(tax.stats())
{'leaves': 30238,
 'names': 42739,
 'nodes': 42739,
 'ranked_leaves': Counter({'species': 30238}),
 'ranked_nodes': Counter({'species': 30238,
                          'genus': 8778,
                          'family': 2323,
                          'order': 930,
                          'class': 337,
                          'phylum': 131,
                          'domain': 1,
                          'root': 1}),
 'ranks': 42739}
class OttTx(multitax.multitax.MultiTax):
  7class OttTx(MultiTax):
  8    _default_version = "3.7.3"
  9    _supported_versions = ["3.6", "3.7.3"]
 10    _default_urls = {
 11        "3.6": "https://files.opentreeoflife.org/ott/ott3.6/ott3.6.tgz",
 12        "3.7.3": "https://files.opentreeoflife.org/ott/ott3.7.3/ott3.7.3.tgz",
 13    }
 14    _default_root_node = "805080"
 15
 16    def __init__(self, **kwargs):
 17        self._forwards = {}
 18        self._extended_name_nodes = {}
 19        super().__init__(**kwargs)
 20
 21    def __repr__(self):
 22        return format_repr(inst=self)
 23
 24    def _build_translation(
 25        self,
 26        target_tax,
 27        representatives: bool = False,
 28        file: str = None,
 29        url: str = None,
 30    ):
 31        warnings.warn(
 32            "Translation between taxonomies ["
 33            + self.__class__.__name__
 34            + ","
 35            + target_tax.__class__.__name__
 36            + "] not yet implemented."
 37        )
 38        return {}
 39
 40    def _parse(self, fhs, **kwargs):
 41        fhs_list = list(fhs.values())
 42        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tgz"):
 43            nodes, ranks, names = self._parse_ott(
 44                fhs_list[0], extended_names=kwargs["extended_names"]
 45            )
 46        else:
 47            # nodes.dmp
 48            nodes, ranks, names = self._parse_taxonomy(fhs_list[0])
 49            # [forwards.tsv]
 50            if len(fhs) >= 2:
 51                self._forwards = self._parse_forwards(fhs_list[1])
 52            if len(fhs) == 3 and kwargs["extended_names"]:
 53                self._extended_name_nodes = self._parse_synonyms(fhs_list[2])
 54
 55        return nodes, ranks, names
 56
 57    def _parse_forwards(self, fh):
 58        forwards = {}
 59        # skip first line header
 60        next(fh)
 61        for line in fh:
 62            try:
 63                old_taxid, new_taxid = line.rstrip().split("\t")
 64            except TypeError:
 65                old_taxid, new_taxid = line.decode().rstrip().split("\t")
 66            forwards[old_taxid] = new_taxid
 67        return forwards
 68
 69    def _parse_ott(self, fh_taxdump, extended_names):
 70        # Get files inside folder by name
 71        for e in fh_taxdump.getnames():
 72            if e.endswith("taxonomy.tsv"):
 73                tax = e
 74            if e.endswith("forwards.tsv"):
 75                fwr = e
 76            if e.endswith("synonyms.tsv"):
 77                syn = e
 78
 79        with fh_taxdump.extractfile(tax) as fh_nodes:
 80            nodes, ranks, names = self._parse_taxonomy(fh_nodes)
 81        with fh_taxdump.extractfile(fwr) as fh_forwards:
 82            self._forwards = self._parse_forwards(fh_forwards)
 83        if extended_names:
 84            with fh_taxdump.extractfile(syn) as fh_synonyms:
 85                self._extended_name_nodes = self._parse_synonyms(fh_synonyms)
 86        return nodes, ranks, names
 87
 88    def _parse_synonyms(self, fh):
 89        synonyms = {}
 90        # skip first line header
 91        next(fh)
 92        for line in fh:
 93            try:
 94                name, taxid, _ = line.split("\t|\t", 2)
 95            except TypeError:
 96                name, taxid, _ = line.decode().split("\t|\t", 2)
 97            if name not in synonyms:
 98                synonyms[name] = []
 99            synonyms[name].append(taxid)
100
101        return synonyms
102
103    def _parse_taxonomy(self, fh):
104        nodes = {}
105        ranks = {}
106        names = {}
107        # skip first line header
108        next(fh)
109        for line in fh:
110            try:
111                taxid, parent_taxid, name, rank, _ = line.split("\t|\t", 4)
112            except TypeError:
113                taxid, parent_taxid, name, rank, _ = line.decode().split("\t|\t", 4)
114            ranks[taxid] = rank
115            nodes[taxid] = parent_taxid
116            names[taxid] = name
117        return nodes, ranks, names
118
119    def forwards(self, node: str):
120        """
121        Returns relative entry from the forwards.tsv file of a given node.
122        """
123        if node in self._forwards:
124            return self._forwards[node]
125        else:
126            return self.undefined_node
127
128    def latest(self, node: str):
129        n = super().latest(node)
130        if n == self.undefined_node:
131            n = self.forwards(node)
132        return n
133
134    def search_name(
135        self,
136        text: str,
137        rank: str = None,
138        exact: bool = True,
139        force_extended: bool = False,
140    ):
141        """
142        Search node by exact or partial name.
143
144        Default order (can be skipped with **force_extended=True**):
145
146        1) Search default names defined on "taxonomy.tsv"
147
148        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))
149
150        Parameters:
151        * **text** *[str]*: Text to search.
152        * **rank** *[str]*: Filter results by rank.
153        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
154        * **force_extended** *[bool]*: Search for text in all categories at once.
155
156        Returns: list of matching nodes
157        """
158        n = super().search_name(text, rank=rank, exact=exact)
159        if n and not force_extended:
160            return n
161        else:
162            if exact:
163                ret = self._exact_name(text, self._extended_name_nodes)
164            else:
165                ret = self._partial_name(text, self._extended_name_nodes)
166
167            # Only return nodes of chosen rank
168            if rank:
169                ret = filter_function(ret, self.rank, rank)
170
171            return list(set(n + ret))
172
173    def stats(self, **kwargs):
174        s = super().stats(**kwargs)
175        if self._forwards:
176            s["forwards"] = len(self._forwards)
177        if self._extended_name_nodes:
178            s["extended_names"] = len(self._extended_name_nodes)
179        return s
OttTx(**kwargs)
16    def __init__(self, **kwargs):
17        self._forwards = {}
18        self._extended_name_nodes = {}
19        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def forwards(self, node: str):
119    def forwards(self, node: str):
120        """
121        Returns relative entry from the forwards.tsv file of a given node.
122        """
123        if node in self._forwards:
124            return self._forwards[node]
125        else:
126            return self.undefined_node

Returns the entry of a given node in the forwards.tsv file (its replacement node), or the undefined node if it has no entry.

def latest(self, node: str):
128    def latest(self, node: str):
129        n = super().latest(node)
130        if n == self.undefined_node:
131            n = self.forwards(node)
132        return n

Returns the latest/updated version of a given node. If the node is already the latest, it returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv).

def search_name( self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
134    def search_name(
135        self,
136        text: str,
137        rank: str = None,
138        exact: bool = True,
139        force_extended: bool = False,
140    ):
141        """
142        Search node by exact or partial name.
143
144        Default order (can be skipped with **force_extended=True**):
145
146        1) Search default names defined on "taxonomy.tsv"
147
148        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))
149
150        Parameters:
151        * **text** *[str]*: Text to search.
152        * **rank** *[str]*: Filter results by rank.
153        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
154        * **force_extended** *[bool]*: Search for text in all categories at once.
155
156        Returns: list of matching nodes
157        """
158        n = super().search_name(text, rank=rank, exact=exact)
159        if n and not force_extended:
160            return n
161        else:
162            if exact:
163                ret = self._exact_name(text, self._extended_name_nodes)
164            else:
165                ret = self._partial_name(text, self._extended_name_nodes)
166
167            # Only return nodes of chosen rank
168            if rank:
169                ret = filter_function(ret, self.rank, rank)
170
171            return list(set(n + ret))

Search node by exact or partial name.

Default order (can be skipped with force_extended=True):

1) Search default names defined on "taxonomy.tsv"

2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(extended_names=True))

Parameters:

  • text [str]: Text to search.
  • rank [str]: Filter results by rank.
  • exact [bool]: Exact or partial name search (both case sensitive).
  • force_extended [bool]: Search for text in all categories at once.

Returns: list of matching nodes

def stats(self, **kwargs):
173    def stats(self, **kwargs):
174        s = super().stats(**kwargs)
175        if self._forwards:
176            s["forwards"] = len(self._forwards)
177        if self._extended_name_nodes:
178            s["extended_names"] = len(self._extended_name_nodes)
179        return s

Returns a dict with general numbers of the taxonomic tree

Example:

from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()

pprint(tax.stats())
{'leaves': 30238,
 'names': 42739,
 'nodes': 42739,
 'ranked_leaves': Counter({'species': 30238}),
 'ranked_nodes': Counter({'species': 30238,
                          'genus': 8778,
                          'family': 2323,
                          'order': 930,
                          'class': 337,
                          'phylum': 131,
                          'domain': 1,
                          'root': 1}),
 'ranks': 42739}
class SilvaTx(multitax.multitax.MultiTax):
 7class SilvaTx(MultiTax):
 8    _default_version = "ssu_138.2"
 9    _supported_versions = ["lsu_138.2", "ssu_138.2"]
10    _default_urls = {
11        "ssu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.2.txt.gz",
12        "lsu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.2.txt.gz",
13    }
14
15    def __init__(self, **kwargs):
16        super().__init__(**kwargs)
17
18    def __repr__(self):
19        return format_repr(inst=self)
20
21    def _build_translation(
22        self,
23        target_tax,
24        representatives: bool = False,
25        file: str = None,
26        url: str = None,
27    ):
28        warnings.warn(
29            "Translation between taxonomies ["
30            + self.__class__.__name__
31            + ","
32            + target_tax.__class__.__name__
33            + "] not yet implemented."
34        )
35        return {}
36
37    def _parse(self, fhs, **kwargs):
38        nodes = {}
39        ranks = {}
40        names = {}
41
42        lin = {}
43        for source, fh in fhs.items():
44            for line in fh:
45                try:
46                    name_lineage, taxid, rank, _ = line.split("\t", 3)
47                except TypeError:
48                    name_lineage, taxid, rank, _ = line.decode().split("\t", 3)
49                # Remove last char ";"
50                lineage = name_lineage[:-1]
51                name = lineage.split(";")[-1]
52                # Save lineage to build tree
53                lin[lineage] = taxid
54                names[taxid] = name
55                ranks[taxid] = rank
56
57        # Build parent node connection
58        for lineage, taxid in lin.items():
59            t = taxid
60            lsplit = lineage.split(";")[:-1]
61            while lsplit:
62                parent_taxid = lin[";".join(lsplit)]
63                if t not in nodes:
64                    nodes[t] = parent_taxid
65                t = parent_taxid
66                del lsplit[-1]  # remove last element
67            # Connect last node to root
68            if t not in nodes:
69                nodes[t] = self._default_root_node
70
71        return nodes, ranks, names
SilvaTx(**kwargs)
    def __init__(self, **kwargs):
        # No SILVA-specific state: every keyword argument is forwarded to the base constructor
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")