multitax

 1import importlib.metadata
 2
# Package version, looked up from the metadata of the installed
# distribution whose name matches this package (__name__).
__version__ = importlib.metadata.version(__name__)

# Public names exported by the package: one taxonomy class per supported
# source, each imported from its own submodule below.
__all__ = (
    "CustomTx",
    "DummyTx",
    "GreengenesTx",
    "GtdbTx",
    "NcbiTx",
    "OttTx",
    "SilvaTx",
)
14
15from .customtx import CustomTx
16from .dummytx import DummyTx
17from .greengenestx import GreengenesTx
18from .gtdbtx import GtdbTx
19from .ncbitx import NcbiTx
20from .otttx import OttTx
21from .silvatx import SilvaTx
class CustomTx(multitax.multitax.MultiTax):
 7class CustomTx(MultiTax):
 8    _required_cols = ["node", "parent"]
 9    _possible_cols = ["node", "parent", "rank", "name"]
10
11    def __init__(
12        self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs
13    ):
14        """
15        CustomTx()
16
17        Parameters:
18        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
19        * **sep** *[str]*: Separator of fields
20        * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`
21
22        Example:
23
24            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
25            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
26        """
27
28        self._cols = self._parse_cols(cols)
29        self._sep = sep
30        super().__init__(**kwargs)
31
32    def __repr__(self):
33        return format_repr(inst=self)
34
35    def _build_translation(self, target_tax, file: str = None, url: str = None):
36        warnings.warn(
37            "Translation between taxonomies ["
38            + self.__class__.__name__
39            + ","
40            + target_tax.__class__.__name__
41            + "] not yet implemented."
42        )
43        return {}
44
45    def _parse(self, fhs, **kwargs):
46        nodes = {}
47        ranks = {}
48        names = {}
49        for source, fh in fhs.items():
50            for line in fh:
51                try:
52                    fields = line.rstrip().split(self._sep)
53                except TypeError:
54                    fields = line.decode().rstrip().split(self._sep)
55
56                node = fields[self._cols["node"]]
57                nodes[node] = fields[self._cols["parent"]]
58                if "name" in self._cols:
59                    names[node] = fields[self._cols["name"]]
60                if "rank" in self._cols:
61                    ranks[node] = fields[self._cols["rank"]]
62
63        return nodes, ranks, names
64
65    def _parse_cols(self, cols):
66        if isinstance(cols, list):
67            cols = {c: i for i, c in enumerate(cols)}
68
69        for rc in self._required_cols:
70            if rc not in cols:
71                raise ValueError(rc + " is a required column")
72
73        for c in cols:
74            if c not in self._possible_cols:
75                raise ValueError(
76                    c + " is not a valid column: " + ",".join(self._possible_cols)
77                )
78
79        return cols
CustomTx( cols: list = ['node', 'parent', 'rank', 'name'], sep: str = '\t', **kwargs)
11    def __init__(
12        self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs
13    ):
14        """
15        CustomTx()
16
17        Parameters:
18        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
19        * **sep** *[str]*: Separator of fields
20        * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`
21
22        Example:
23
24            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
25            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
26        """
27
28        self._cols = self._parse_cols(cols)
29        self._sep = sep
30        super().__init__(**kwargs)

CustomTx()

Parameters:

  • cols [list, dict]: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
  • sep [str]: Separator of fields
  • **kwargs defined at multitax.multitax.MultiTax

Example:

tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
class DummyTx(multitax.multitax.MultiTax):
 6class DummyTx(MultiTax):
 7    def __init__(self, **kwargs):
 8        """
 9        DummyTx() - Dummy empty taxonomy
10
11        Parameters:
12
13        * \\*\\*kwargs defined at `multitax.multitax.MultiTax`
14        """
15        super().__init__(**kwargs)
16
17    def __repr__(self):
18        return format_repr(inst=self)
DummyTx(**kwargs)
    def __init__(self, **kwargs):
        """
        DummyTx() - Dummy empty taxonomy

        Parameters:

        * \\*\\*kwargs defined at `multitax.multitax.MultiTax`
        """
        # No state of its own: all keyword arguments go to the base constructor
        super().__init__(**kwargs)

DummyTx() - Dummy empty taxonomy

Parameters:

  • **kwargs defined at multitax.multitax.MultiTax
class GreengenesTx(multitax.multitax.MultiTax):
 7class GreengenesTx(MultiTax):
 8    _default_version = "2024.09"
 9    _supported_versions = ["2022.10", "2024.09"]
10    _default_urls = {
11        "2024.09": "https://ftp.microbio.me/greengenes_release/2024.09/2024.09.taxonomy.id.tsv.gz",
12        "2022.10": "https://ftp.microbio.me/greengenes_release/2022.10/2022.10.taxonomy.id.tsv.gz",
13    }
14
15    _rank_codes = [
16        ("d__", "domain"),
17        ("p__", "phylum"),
18        ("c__", "class"),
19        ("o__", "order"),
20        ("f__", "family"),
21        ("g__", "genus"),
22        ("s__", "species"),
23    ]
24
25    def __init__(self, **kwargs):
26        # forwards.tsv
27        self._forwards = {}
28        super().__init__(**kwargs)
29
30    def __repr__(self):
31        return format_repr(inst=self)
32
33    def _build_translation(self, target_tax, file: str = None, url: str = None):
34        warnings.warn(
35            "Translation between taxonomies ["
36            + self.__class__.__name__
37            + ","
38            + target_tax.__class__.__name__
39            + "] not yet implemented."
40        )
41        return {}
42
43    def _parse(self, fhs, **kwargs):
44        nodes = {}
45        ranks = {}
46        names = {}
47
48        lineages = set()
49        for source, fh in fhs.items():
50            for line in fh:
51                try:
52                    fields = line.rstrip().split("\t")
53                except TypeError:
54                    fields = line.decode().rstrip().split("\t")
55
56                # skip header
57                if fields[0] == "Feature ID":
58                    continue
59
60                lineages.add(fields[1])
61
62        for lineage in lineages:
63            last_taxid = None
64            lin = lineage.split("; ")
65            for i in range(len(lin))[::-1]:
66                # assert rank
67                assert lin[i][:3] == self._rank_codes[i][0]
68
69                name = lin[i][3:]
70                if not name:
71                    continue  # empty entry "s__"
72
73                # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
74                taxid = lin[i]
75                rank = self._rank_codes[i][1]
76
77                if taxid not in nodes:
78                    names[taxid] = name
79                    ranks[taxid] = rank
80                if last_taxid:
81                    nodes[last_taxid] = taxid
82                last_taxid = taxid
83            nodes[last_taxid] = self._default_root_node
84
85        return nodes, ranks, names
GreengenesTx(**kwargs)
    def __init__(self, **kwargs):
        # forwards.tsv
        # Starts empty and is created before the base constructor is called
        self._forwards = {}
        # All keyword arguments are handled by the base constructor
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
class GtdbTx(multitax.multitax.MultiTax):
  7class GtdbTx(MultiTax):
  8    _default_version = "226"
  9    _supported_versions = [
 10        "80",
 11        "83",
 12        "86.2",
 13        "89",
 14        "95",
 15        "202",
 16        "207",
 17        "214.1",
 18        "220",
 19        "226",
 20    ]
 21
 22    _url_prefix = "https://data.gtdb.ecogenomic.org/releases/"
 23    _default_urls = {
 24        "80": [f"{_url_prefix}release80/80.0/bac_taxonomy_r80.tsv"],
 25        "83": [f"{_url_prefix}release83/83.0/bac_taxonomy_r83.tsv"],
 26        "86.2": [
 27            f"{_url_prefix}release86/86.2/ar122_taxonomy_r86.2.tsv",
 28            f"{_url_prefix}release86/86.2/bac120_taxonomy_r86.2.tsv",
 29        ],
 30        "89": [
 31            f"{_url_prefix}release89/89.0/ar122_taxonomy_r89.tsv",
 32            f"{_url_prefix}release89/89.0/bac120_taxonomy_r89.tsv",
 33        ],
 34        "95": [
 35            f"{_url_prefix}release95/95.0/ar122_taxonomy_r95.tsv.gz",
 36            f"{_url_prefix}release95/95.0/bac120_taxonomy_r95.tsv.gz",
 37        ],
 38        "202": [
 39            f"{_url_prefix}release202/202.0/ar122_taxonomy_r202.tsv.gz",
 40            f"{_url_prefix}release202/202.0/bac120_taxonomy_r202.tsv.gz",
 41        ],
 42        "207": [
 43            f"{_url_prefix}release207/207.0/ar53_taxonomy_r207.tsv.gz",
 44            f"{_url_prefix}release207/207.0/bac120_taxonomy_r207.tsv.gz",
 45        ],
 46        "214.1": [
 47            f"{_url_prefix}release214/214.1/ar53_taxonomy_r214.tsv.gz",
 48            f"{_url_prefix}release214/214.1/bac120_taxonomy_r214.tsv.gz",
 49        ],
 50        "220": [
 51            f"{_url_prefix}release220/220.0/ar53_taxonomy_r220.tsv.gz",
 52            f"{_url_prefix}release220/220.0/bac120_taxonomy_r220.tsv.gz",
 53        ],
 54        "226": [
 55            f"{_url_prefix}release226/226.0/ar53_taxonomy_r226.tsv.gz",
 56            f"{_url_prefix}release226/226.0/bac120_taxonomy_r226.tsv.gz",
 57        ],
 58    }
 59
 60    _rank_codes = [
 61        ("d__", "domain"),
 62        ("p__", "phylum"),
 63        ("c__", "class"),
 64        ("o__", "order"),
 65        ("f__", "family"),
 66        ("g__", "genus"),
 67        ("s__", "species"),
 68    ]
 69
 70    def __init__(self, **kwargs):
 71        self._convert_to = {}
 72        self._convert_from = {}
 73        super().__init__(**kwargs)
 74
 75    def __repr__(self):
 76        return format_repr(inst=self)
 77
 78    def _build_translation(self, target_tax, file: str = None, url: str = None):
 79        translated_nodes = {}
 80        if target_tax.__class__.__name__ == "NcbiTx":
 81            if file:
 82                fhs = open_files([file])
 83            else:
 84                if not url:
 85                    url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{self.version}_acc_rep_lin_ncbi.tsv.gz"
 86                fhs = download_files(urls=[url], retry_attempts=3)
 87
 88            accession_col = 0
 89            gtdb_taxonomy_col = 2
 90            ncbi_taxid_col = 3
 91
 92            for source, fh in fhs.items():
 93                for line in fh:
 94                    try:
 95                        fields = line.rstrip().split("\t")
 96                    except TypeError:
 97                        fields = line.decode().rstrip().split("\t")
 98
 99                    # skip header
100                    if fields[accession_col] == "accession":
101                        continue
102
103                    ncbi_leaf_node = target_tax.latest(fields[ncbi_taxid_col])
104                    if ncbi_leaf_node != target_tax.undefined_node:
105                        ncbi_nodes = target_tax.lineage(
106                            ncbi_leaf_node,
107                            ranks=[
108                                "domain",
109                                "phylum",
110                                "class",
111                                "order",
112                                "family",
113                                "genus",
114                                "species",
115                            ],
116                        )
117                    else:
118                        continue
119
120                    # Build GTDB lineage from leaf (species on given lineage)
121                    # to accomodate possible changes in the loaded tax
122                    gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1]
123                    if gtdb_leaf_node != self.undefined_node:
124                        gtdb_nodes = self.lineage(
125                            gtdb_leaf_node,
126                            ranks=[
127                                "domain",
128                                "phylum",
129                                "class",
130                                "order",
131                                "family",
132                                "genus",
133                                "species",
134                            ],
135                        )
136                    else:
137                        continue
138
139                    # Match ranks
140                    for i, gtdb_n in enumerate(gtdb_nodes):
141                        if (
142                            ncbi_nodes[i] != target_tax.undefined_node
143                            and gtdb_n != self.undefined_node
144                        ):
145                            if gtdb_n not in translated_nodes:
146                                translated_nodes[gtdb_n] = set()
147                            translated_nodes[gtdb_n].add(ncbi_nodes[i])
148
149            close_files(fhs)
150        else:
151            warnings.warn(
152                "Translation between taxonomies ["
153                + self.__class__.__name__
154                + ","
155                + target_tax.__class__.__name__
156                + "] not yet implemented."
157            )
158
159        return translated_nodes
160
161    def _parse(self, fhs, **kwargs):
162        nodes = {}
163        ranks = {}
164        names = {}
165        for source, fh in fhs.items():
166            for line in fh:
167                try:
168                    _, lineage = line.rstrip().split("\t")
169                except TypeError:
170                    _, lineage = line.decode().rstrip().split("\t")
171                lin = lineage.split(";")
172                for i in range(len(lin))[::-1]:
173                    # assert rank
174                    assert lin[i][:3] == self._rank_codes[i][0]
175                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
176                    taxid = lin[i]
177                    name = lin[i][3:]
178                    # empty entry "s__"
179                    if not name:
180                        continue
181                    rank = self._rank_codes[i][1]
182                    if i == 0:
183                        parent_taxid = self._default_root_node
184                    else:
185                        parent_taxid = lin[i - 1]
186                    if taxid not in nodes:
187                        nodes[taxid] = parent_taxid
188                        names[taxid] = name
189                        ranks[taxid] = rank
190
191        return nodes, ranks, names
192
193    def _lookup_version_taxa(self, node, version: str):
194        res = set()
195        for acc in self._convert_from.get(node, ""):
196            for tx in self._convert_to[version].get(acc, "").split(";"):
197                # Return only rank of requested node
198                if tx.startswith(node[:1]):
199                    res.add(tx)
200        return res
201
202    def _download_parse_version_taxa(self, version, file, url):
203        if file:
204            fhs = open_files(files=[file])
205        else:
206            if not url:
207                url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{version}_acc_rep_lin_ncbi.tsv.gz"
208            fhs = download_files(urls=[url], retry_attempts=3)
209
210        for fh in fhs.values():
211            for line in fh:
212                try:
213                    yield line.rstrip().split("\t")
214                except TypeError:
215                    yield line.decode().rstrip().split("\t")
216
217    def build_conversion(
218        self,
219        version: str,
220        files: tuple[str, str] = ("", ""),
221        urls: tuple[str, str] = ("", ""),
222    ):
223        """
224        Download and build conversion table against another version.
225        Optional function, conversion tables are automatically downloaded
226        and built on first .convert() call.
227        """
228        if version not in self._supported_versions:
229            raise ValueError(
230                f"Version [{version}] not supported for conversion: {', '.join(self._supported_versions)}"
231            )
232
233        if not self._convert_from:
234            # Collect the accessions of the representative entries for each taxa in the current version
235            tx_accs = {}
236            for acc, rep, lin, _ in self._download_parse_version_taxa(
237                version=self.version, file=files[0], url=urls[0]
238            ):
239                if rep == "t":
240                    for tx in lin.split(";"):
241                        if tx not in tx_accs:
242                            tx_accs[tx] = []
243                        tx_accs[tx].append(acc)
244            # Assign only at the end, in case of download/parse errors
245            self._convert_from = tx_accs
246
247        if version not in self._convert_to:
248            # Collect the lineage for each accession
249            acc_lin = {}
250            for acc, _, lin, _ in self._download_parse_version_taxa(
251                version=version, file=files[1], url=urls[1]
252            ):
253                acc_lin[acc] = lin
254            # Assign only at the end, in case of download/parse errors
255            self._convert_to[version] = acc_lin
256
257    def convert(self, node: str, version: str) -> set[str]:
258        """
259        Converts a taxonomic node from current version to another.
260        It uses a genomic centric strategy, based on the taxa of the representative
261        genome among versions.
262        It may return multiple nodes for ranks above species,
263        since multiple representatives can be split into more taxa.
264        It may return an empty set if node is not found in the current version
265        or if related representative is no longer available in the requested version.
266
267        Example:
268
269            from multitax import GtdbTx
270            tax = GtdbTx(version="95")
271
272            # Species - always one-to-one
273            tax.convert('s__Giesbergeria metamorpha', version="226")
274            {'s__Simplicispira metamorpha'}
275
276            # Other ranks - may be one-to-many
277            tax.convert('g__UBA6715', version="226")
278            {'g__Aquirufa', 'g__Sandaracinomonas'}
279        """
280
281        if version not in self._supported_versions:
282            raise ValueError(
283                f"Version [{version}] not supported: {', '.join(self._supported_versions)}"
284            )
285
286        if not self._convert_from or version not in self._convert_to:
287            self.build_conversion(version=version)
288
289        return self._lookup_version_taxa(node, version)
GtdbTx(**kwargs)
    def __init__(self, **kwargs):
        # Both tables start empty and are created before the base
        # constructor is called
        self._convert_to = {}
        self._convert_from = {}
        # All keyword arguments are handled by the base constructor
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def build_conversion( self, version: str, files: tuple[str, str] = ('', ''), urls: tuple[str, str] = ('', '')):
217    def build_conversion(
218        self,
219        version: str,
220        files: tuple[str, str] = ("", ""),
221        urls: tuple[str, str] = ("", ""),
222    ):
223        """
224        Download and build conversion table against another version.
225        Optional function, conversion tables are automatically downloaded
226        and built on first .convert() call.
227        """
228        if version not in self._supported_versions:
229            raise ValueError(
230                f"Version [{version}] not supported for conversion: {', '.join(self._supported_versions)}"
231            )
232
233        if not self._convert_from:
234            # Collect the accessions of the representative entries for each taxa in the current version
235            tx_accs = {}
236            for acc, rep, lin, _ in self._download_parse_version_taxa(
237                version=self.version, file=files[0], url=urls[0]
238            ):
239                if rep == "t":
240                    for tx in lin.split(";"):
241                        if tx not in tx_accs:
242                            tx_accs[tx] = []
243                        tx_accs[tx].append(acc)
244            # Assign only at the end, in case of download/parse errors
245            self._convert_from = tx_accs
246
247        if version not in self._convert_to:
248            # Collect the lineage for each accession
249            acc_lin = {}
250            for acc, _, lin, _ in self._download_parse_version_taxa(
251                version=version, file=files[1], url=urls[1]
252            ):
253                acc_lin[acc] = lin
254            # Assign only at the end, in case of download/parse errors
255            self._convert_to[version] = acc_lin

Download and build conversion table against another version. Optional function, conversion tables are automatically downloaded and built on first .convert() call.

def convert(self, node: str, version: str) -> set[str]:
257    def convert(self, node: str, version: str) -> set[str]:
258        """
259        Converts a taxonomic node from current version to another.
260        It uses a genomic centric strategy, based on the taxa of the representative
261        genome among versions.
262        It may return multiple nodes for ranks above species,
263        since multiple representatives can be split into more taxa.
264        It may return an empty set if node is not found in the current version
265        or if related representative is no longer available in the requested version.
266
267        Example:
268
269            from multitax import GtdbTx
270            tax = GtdbTx(version="95")
271
272            # Species - always one-to-one
273            tax.convert('s__Giesbergeria metamorpha', version="226")
274            {'s__Simplicispira metamorpha'}
275
276            # Other ranks - may be one-to-many
277            tax.convert('g__UBA6715', version="226")
278            {'g__Aquirufa', 'g__Sandaracinomonas'}
279        """
280
281        if version not in self._supported_versions:
282            raise ValueError(
283                f"Version [{version}] not supported: {', '.join(self._supported_versions)}"
284            )
285
286        if not self._convert_from or version not in self._convert_to:
287            self.build_conversion(version=version)
288
289        return self._lookup_version_taxa(node, version)

Converts a taxonomic node from the current version to another. It uses a genome-centric strategy, based on the taxa of the representative genomes in each version. It may return multiple nodes for ranks above species, since multiple representatives can be split into more taxa. It may return an empty set if the node is not found in the current version or if the related representative is no longer available in the requested version.

Example:

from multitax import GtdbTx
tax = GtdbTx(version="95")

# Species - always one-to-one
tax.convert('s__Giesbergeria metamorpha', version="226")
{'s__Simplicispira metamorpha'}

# Other ranks - may be one-to-many
tax.convert('g__UBA6715', version="226")
{'g__Aquirufa', 'g__Sandaracinomonas'}
class NcbiTx(multitax.multitax.MultiTax):
 13class NcbiTx(MultiTax):
 14    _default_version = "current"
 15    _supported_versions = ["current"]
 16    _default_urls = {"current": "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"}
 17
 18    def __init__(self, **kwargs):
 19        self._merged = {}
 20        self._extended_name_nodes = {}
 21        super().__init__(**kwargs)
 22
 23    def __repr__(self):
 24        return format_repr(inst=self)
 25
 26    def _build_translation(self, target_tax, file: str = None, url: str = None):
 27        translated_nodes = {}
 28        if target_tax.__class__.__name__ == "GtdbTx":
 29            if file:
 30                fhs = open_files([file])
 31            else:
 32                if not url:
 33                    url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{target_tax.version}_acc_rep_lin_ncbi.tsv.gz"
 34                fhs = download_files(urls=[url], retry_attempts=3)
 35
 36            accession_col = 0
 37            gtdb_taxonomy_col = 2
 38            ncbi_taxid_col = 3
 39
 40            for source, fh in fhs.items():
 41                for line in fh:
 42                    try:
 43                        fields = line.rstrip().split("\t")
 44                    except TypeError:
 45                        fields = line.decode().rstrip().split("\t")
 46
 47                    # skip header
 48                    if fields[accession_col] == "accession":
 49                        continue
 50
 51                    # Build GTDB lineage from leaf (species on given lineage)
 52                    # to accomodate possible changes in the loaded tax
 53                    gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1]
 54                    if gtdb_leaf_node != target_tax.undefined_node:
 55                        gtdb_nodes = target_tax.lineage(
 56                            gtdb_leaf_node,
 57                            ranks=[
 58                                "domain",
 59                                "phylum",
 60                                "class",
 61                                "order",
 62                                "family",
 63                                "genus",
 64                                "species",
 65                            ],
 66                        )
 67                    else:
 68                        continue
 69
 70                    # Build NCBI lineage from leaf
 71                    ncbi_leaf_node = self.latest(fields[ncbi_taxid_col])
 72                    if ncbi_leaf_node != self.undefined_node:
 73                        # Additional add connection from leaf to species on GTDB
 74                        # that could represent strain, etc on NCBI tax
 75                        if ncbi_leaf_node not in translated_nodes:
 76                            translated_nodes[ncbi_leaf_node] = set()
 77                        translated_nodes[ncbi_leaf_node].add(gtdb_leaf_node)
 78                        ncbi_nodes = self.lineage(
 79                            ncbi_leaf_node,
 80                            ranks=[
 81                                "domain",
 82                                "phylum",
 83                                "class",
 84                                "order",
 85                                "family",
 86                                "genus",
 87                                "species",
 88                            ],
 89                        )
 90                    else:
 91                        continue
 92
 93                    # Match ranks
 94                    for i, ncbi_n in enumerate(ncbi_nodes):
 95                        if (
 96                            gtdb_nodes[i] != target_tax.undefined_node
 97                            and ncbi_n != self.undefined_node
 98                        ):
 99                            if ncbi_n not in translated_nodes:
100                                translated_nodes[ncbi_n] = set()
101                            translated_nodes[ncbi_n].add(gtdb_nodes[i])
102            close_files(fhs)
103        else:
104            warnings.warn(
105                "Translation between taxonomies ["
106                + self.__class__.__name__
107                + ","
108                + target_tax.__class__.__name__
109                + "] not yet implemented."
110            )
111
112        return translated_nodes
113
114    def _parse(self, fhs, **kwargs):
115        fhs_list = list(fhs.values())
116        # One element tar.gz -> taxdump.tar.gz
117        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tar.gz"):
118            nodes, ranks, names, self._merged = self._parse_taxdump(
119                fhs_list[0], extended_names=kwargs["extended_names"]
120            )
121        else:
122            # nodes.dmp
123            nodes, ranks = self._parse_nodes(fhs_list[0])
124
125            # [names.dmp]
126            if len(fhs) >= 2:
127                names = self._parse_names(
128                    fhs_list[1], extended_names=kwargs["extended_names"]
129                )
130            else:
131                names = {}
132
133            # [merged.dmp]
134            if len(fhs) == 3:
135                self._merged = self._parse_merged(fhs_list[2])
136        return nodes, ranks, names
137
138    def _parse_merged(self, fh):
139        merged = {}
140        for line in fh:
141            try:
142                old_taxid, _, new_taxid, _ = line.split("\t", 3)
143            except TypeError:
144                old_taxid, _, new_taxid, _ = line.decode().split("\t", 3)
145            merged[old_taxid] = new_taxid
146        return merged
147
148    def _parse_names(self, fh, extended_names):
149        names = {}
150        for line in fh:
151            try:
152                node, name, _, name_class = line.split("\t|\t")
153            except TypeError:
154                node, name, _, name_class = line.decode().split("\t|\t")
155            if name_class.replace("\t|\n", "") == "scientific name":
156                names[node] = name
157            elif extended_names:
158                if name not in self._extended_name_nodes:
159                    self._extended_name_nodes[name] = []
160                self._extended_name_nodes[name].append(node)
161
162        return names
163
164    def _parse_nodes(self, fh):
165        nodes = {}
166        ranks = {}
167        for line in fh:
168            try:
169                taxid, parent_taxid, rank, _ = line.split("\t|\t", 3)
170            except TypeError:
171                taxid, parent_taxid, rank, _ = line.decode().split("\t|\t", 3)
172            ranks[taxid] = rank
173            nodes[taxid] = parent_taxid
174        return nodes, ranks
175
176    def _parse_taxdump(self, fh_taxdump, extended_names):
177        with fh_taxdump.extractfile("nodes.dmp") as fh_nodes:
178            nodes, ranks = self._parse_nodes(fh_nodes)
179        with fh_taxdump.extractfile("names.dmp") as fh_names:
180            names = self._parse_names(fh_names, extended_names=extended_names)
181        with fh_taxdump.extractfile("merged.dmp") as fh_merged:
182            merged = self._parse_merged(fh_merged)
183        return nodes, ranks, names, merged
184
185    def latest(self, node: str):
186        n = super().latest(node)
187        if n == self.undefined_node:
188            n = self.merged(node)
189        return n
190
191    def merged(self, node: str):
192        """
193        Returns relative entry from the merged.dmp file of a given node.
194        """
195        if node in self._merged:
196            return self._merged[node]
197        else:
198            return self.undefined_node
199
200    def search_name(
201        self,
202        text: str,
203        rank: str = None,
204        exact: bool = True,
205        force_extended: bool = False,
206    ):
207        """
208        Search node by exact or partial name.
209
210        Default order (can be skipped with **force_extended=True**):
211
212        1) Search names defined as "scientific name" on nodes.dmp
213
214        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))
215
216        Parameters:
217        * **text** *[str]*: Text to search.
218        * **rank** *[str]*: Filter results by rank.
219        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
220        * **force_extended** *[bool]*: Search for text in all categories at once.
221
222        Returns: list of matching nodes
223        """
224        n = super().search_name(text, rank=rank, exact=exact)
225        if n and not force_extended:
226            return n
227        else:
228            if exact:
229                ret = self._exact_name(text, self._extended_name_nodes)
230            else:
231                ret = self._partial_name(text, self._extended_name_nodes)
232
233            # Only return nodes of chosen rank
234            if rank:
235                ret = filter_function(ret, self.rank, rank)
236
237            return list(set(n + ret))
238
239    def stats(self, **kwargs):
240        s = super().stats(**kwargs)
241        if self._merged:
242            s["merged"] = len(self._merged)
243        if self._extended_name_nodes:
244            s["extended_names"] = len(self._extended_name_nodes)
245        return s
NcbiTx(**kwargs)
18    def __init__(self, **kwargs):
19        self._merged = {}
20        self._extended_name_nodes = {}
21        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def latest(self, node: str):
185    def latest(self, node: str):
186        n = super().latest(node)
187        if n == self.undefined_node:
188            n = self.merged(node)
189        return n

Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv)

def merged(self, node: str):
191    def merged(self, node: str):
192        """
193        Returns relative entry from the merged.dmp file of a given node.
194        """
195        if node in self._merged:
196            return self._merged[node]
197        else:
198            return self.undefined_node

Returns relative entry from the merged.dmp file of a given node.

def search_name( self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
200    def search_name(
201        self,
202        text: str,
203        rank: str = None,
204        exact: bool = True,
205        force_extended: bool = False,
206    ):
207        """
208        Search node by exact or partial name.
209
210        Default order (can be skipped with **force_extended=True**):
211
212        1) Search names defined as "scientific name" on nodes.dmp
213
214        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))
215
216        Parameters:
217        * **text** *[str]*: Text to search.
218        * **rank** *[str]*: Filter results by rank.
219        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
220        * **force_extended** *[bool]*: Search for text in all categories at once.
221
222        Returns: list of matching nodes
223        """
224        n = super().search_name(text, rank=rank, exact=exact)
225        if n and not force_extended:
226            return n
227        else:
228            if exact:
229                ret = self._exact_name(text, self._extended_name_nodes)
230            else:
231                ret = self._partial_name(text, self._extended_name_nodes)
232
233            # Only return nodes of chosen rank
234            if rank:
235                ret = filter_function(ret, self.rank, rank)
236
237            return list(set(n + ret))

Search node by exact or partial name.

Default order (can be skipped with force_extended=True):

1) Search names defined as "scientific name" in names.dmp

2) If nothing was found, search text in all other categories (must be activated with NcbiTx(extended_names=True))

Parameters:

  • text [str]: Text to search.
  • rank [str]: Filter results by rank.
  • exact [bool]: Exact or partial name search (both case sensitive).
  • force_extended [bool]: Search for text in all categories at once.

Returns: list of matching nodes

def stats(self, **kwargs):
239    def stats(self, **kwargs):
240        s = super().stats(**kwargs)
241        if self._merged:
242            s["merged"] = len(self._merged)
243        if self._extended_name_nodes:
244            s["extended_names"] = len(self._extended_name_nodes)
245        return s

Returns a dict with general numbers of the taxonomic tree

Example:

from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()

pprint(tax.stats())
{'leaves': 30238,
 'names': 42739,
 'nodes': 42739,
 'ranked_leaves': Counter({'species': 30238}),
 'ranked_nodes': Counter({'species': 30238,
                          'genus': 8778,
                          'family': 2323,
                          'order': 930,
                          'class': 337,
                          'phylum': 131,
                          'domain': 1,
                          'root': 1}),
 'ranks': 42739}
class OttTx(multitax.multitax.MultiTax):
  7class OttTx(MultiTax):
  8    _default_version = "3.7.3"
  9    _supported_versions = ["3.6", "3.7.3"]
 10    _default_urls = {
 11        "3.6": "https://files.opentreeoflife.org/ott/ott3.6/ott3.6.tgz",
 12        "3.7.3": "https://files.opentreeoflife.org/ott/ott3.7.3/ott3.7.3.tgz",
 13    }
 14    _default_root_node = "805080"
 15
 16    def __init__(self, **kwargs):
 17        self._forwards = {}
 18        self._extended_name_nodes = {}
 19        super().__init__(**kwargs)
 20
 21    def __repr__(self):
 22        return format_repr(inst=self)
 23
 24    def _build_translation(self, target_tax, file: str = None, url: str = None):
 25        warnings.warn(
 26            "Translation between taxonomies ["
 27            + self.__class__.__name__
 28            + ","
 29            + target_tax.__class__.__name__
 30            + "] not yet implemented."
 31        )
 32        return {}
 33
 34    def _parse(self, fhs, **kwargs):
 35        fhs_list = list(fhs.values())
 36        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tgz"):
 37            nodes, ranks, names = self._parse_ott(
 38                fhs_list[0], extended_names=kwargs["extended_names"]
 39            )
 40        else:
 41            # nodes.dmp
 42            nodes, ranks, names = self._parse_taxonomy(fhs_list[0])
 43            # [forwards.tsv]
 44            if len(fhs) >= 2:
 45                self._forwards = self._parse_forwards(fhs_list[1])
 46            if len(fhs) == 3 and kwargs["extended_names"]:
 47                self._extended_name_nodes = self._parse_synonyms(fhs_list[2])
 48
 49        return nodes, ranks, names
 50
 51    def _parse_forwards(self, fh):
 52        forwards = {}
 53        # skip first line header
 54        next(fh)
 55        for line in fh:
 56            try:
 57                old_taxid, new_taxid = line.rstrip().split("\t")
 58            except TypeError:
 59                old_taxid, new_taxid = line.decode().rstrip().split("\t")
 60            forwards[old_taxid] = new_taxid
 61        return forwards
 62
 63    def _parse_ott(self, fh_taxdump, extended_names):
 64        # Get files inside folder by name
 65        for e in fh_taxdump.getnames():
 66            if e.endswith("taxonomy.tsv"):
 67                tax = e
 68            if e.endswith("forwards.tsv"):
 69                fwr = e
 70            if e.endswith("synonyms.tsv"):
 71                syn = e
 72
 73        with fh_taxdump.extractfile(tax) as fh_nodes:
 74            nodes, ranks, names = self._parse_taxonomy(fh_nodes)
 75        with fh_taxdump.extractfile(fwr) as fh_forwards:
 76            self._forwards = self._parse_forwards(fh_forwards)
 77        if extended_names:
 78            with fh_taxdump.extractfile(syn) as fh_synonyms:
 79                self._extended_name_nodes = self._parse_synonyms(fh_synonyms)
 80        return nodes, ranks, names
 81
 82    def _parse_synonyms(self, fh):
 83        synonyms = {}
 84        # skip first line header
 85        next(fh)
 86        for line in fh:
 87            try:
 88                name, taxid, _ = line.split("\t|\t", 2)
 89            except TypeError:
 90                name, taxid, _ = line.decode().split("\t|\t", 2)
 91            if name not in synonyms:
 92                synonyms[name] = []
 93            synonyms[name].append(taxid)
 94
 95        return synonyms
 96
 97    def _parse_taxonomy(self, fh):
 98        nodes = {}
 99        ranks = {}
100        names = {}
101        # skip first line header
102        next(fh)
103        for line in fh:
104            try:
105                taxid, parent_taxid, name, rank, _ = line.split("\t|\t", 4)
106            except TypeError:
107                taxid, parent_taxid, name, rank, _ = line.decode().split("\t|\t", 4)
108            ranks[taxid] = rank
109            nodes[taxid] = parent_taxid
110            names[taxid] = name
111        return nodes, ranks, names
112
113    def forwards(self, node: str):
114        """
115        Returns relative entry from the forwards.tsv file of a given node.
116        """
117        if node in self._forwards:
118            return self._forwards[node]
119        else:
120            return self.undefined_node
121
122    def latest(self, node: str):
123        n = super().latest(node)
124        if n == self.undefined_node:
125            n = self.forwards(node)
126        return n
127
128    def search_name(
129        self,
130        text: str,
131        rank: str = None,
132        exact: bool = True,
133        force_extended: bool = False,
134    ):
135        """
136        Search node by exact or partial name.
137
138        Default order (can be skipped with **force_extended=True**):
139
140        1) Search default names defined on "taxonomy.tsv"
141
142        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))
143
144        Parameters:
145        * **text** *[str]*: Text to search.
146        * **rank** *[str]*: Filter results by rank.
147        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
148        * **force_extended** *[bool]*: Search for text in all categories at once.
149
150        Returns: list of matching nodes
151        """
152        n = super().search_name(text, rank=rank, exact=exact)
153        if n and not force_extended:
154            return n
155        else:
156            if exact:
157                ret = self._exact_name(text, self._extended_name_nodes)
158            else:
159                ret = self._partial_name(text, self._extended_name_nodes)
160
161            # Only return nodes of chosen rank
162            if rank:
163                ret = filter_function(ret, self.rank, rank)
164
165            return list(set(n + ret))
166
167    def stats(self, **kwargs):
168        s = super().stats(**kwargs)
169        if self._forwards:
170            s["forwards"] = len(self._forwards)
171        if self._extended_name_nodes:
172            s["extended_names"] = len(self._extended_name_nodes)
173        return s
OttTx(**kwargs)
16    def __init__(self, **kwargs):
17        self._forwards = {}
18        self._extended_name_nodes = {}
19        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def forwards(self, node: str):
113    def forwards(self, node: str):
114        """
115        Returns relative entry from the forwards.tsv file of a given node.
116        """
117        if node in self._forwards:
118            return self._forwards[node]
119        else:
120            return self.undefined_node

Returns relative entry from the forwards.tsv file of a given node.

def latest(self, node: str):
122    def latest(self, node: str):
123        n = super().latest(node)
124        if n == self.undefined_node:
125            n = self.forwards(node)
126        return n

Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv)

def search_name( self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
128    def search_name(
129        self,
130        text: str,
131        rank: str = None,
132        exact: bool = True,
133        force_extended: bool = False,
134    ):
135        """
136        Search node by exact or partial name.
137
138        Default order (can be skipped with **force_extended=True**):
139
140        1) Search default names defined on "taxonomy.tsv"
141
142        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))
143
144        Parameters:
145        * **text** *[str]*: Text to search.
146        * **rank** *[str]*: Filter results by rank.
147        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
148        * **force_extended** *[bool]*: Search for text in all categories at once.
149
150        Returns: list of matching nodes
151        """
152        n = super().search_name(text, rank=rank, exact=exact)
153        if n and not force_extended:
154            return n
155        else:
156            if exact:
157                ret = self._exact_name(text, self._extended_name_nodes)
158            else:
159                ret = self._partial_name(text, self._extended_name_nodes)
160
161            # Only return nodes of chosen rank
162            if rank:
163                ret = filter_function(ret, self.rank, rank)
164
165            return list(set(n + ret))

Search node by exact or partial name.

Default order (can be skipped with force_extended=True):

1) Search default names defined on "taxonomy.tsv"

2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(extended_names=True))

Parameters:

  • text [str]: Text to search.
  • rank [str]: Filter results by rank.
  • exact [bool]: Exact or partial name search (both case sensitive).
  • force_extended [bool]: Search for text in all categories at once.

Returns: list of matching nodes

def stats(self, **kwargs):
167    def stats(self, **kwargs):
168        s = super().stats(**kwargs)
169        if self._forwards:
170            s["forwards"] = len(self._forwards)
171        if self._extended_name_nodes:
172            s["extended_names"] = len(self._extended_name_nodes)
173        return s

Returns a dict with general numbers of the taxonomic tree

Example:

from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()

pprint(tax.stats())
{'leaves': 30238,
 'names': 42739,
 'nodes': 42739,
 'ranked_leaves': Counter({'species': 30238}),
 'ranked_nodes': Counter({'species': 30238,
                          'genus': 8778,
                          'family': 2323,
                          'order': 930,
                          'class': 337,
                          'phylum': 131,
                          'domain': 1,
                          'root': 1}),
 'ranks': 42739}
class SilvaTx(multitax.multitax.MultiTax):
 7class SilvaTx(MultiTax):
 8    _default_version = "ssu_138.2"
 9    _supported_versions = ["lsu_138.2", "ssu_138.2"]
10    _default_urls = {
11        "ssu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.2.txt.gz",
12        "lsu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.2.txt.gz",
13    }
14
15    def __init__(self, **kwargs):
16        super().__init__(**kwargs)
17
18    def __repr__(self):
19        return format_repr(inst=self)
20
21    def _build_translation(self, target_tax, file: str = None, url: str = None):
22        warnings.warn(
23            "Translation between taxonomies ["
24            + self.__class__.__name__
25            + ","
26            + target_tax.__class__.__name__
27            + "] not yet implemented."
28        )
29        return {}
30
31    def _parse(self, fhs, **kwargs):
32        nodes = {}
33        ranks = {}
34        names = {}
35
36        lin = {}
37        for source, fh in fhs.items():
38            for line in fh:
39                try:
40                    name_lineage, taxid, rank, _ = line.split("\t", 3)
41                except TypeError:
42                    name_lineage, taxid, rank, _ = line.decode().split("\t", 3)
43                # Remove last char ";"
44                lineage = name_lineage[:-1]
45                name = lineage.split(";")[-1]
46                # Save lineage to build tree
47                lin[lineage] = taxid
48                names[taxid] = name
49                ranks[taxid] = rank
50
51        # Build parent node connection
52        for lineage, taxid in lin.items():
53            t = taxid
54            lsplit = lineage.split(";")[:-1]
55            while lsplit:
56                parent_taxid = lin[";".join(lsplit)]
57                if t not in nodes:
58                    nodes[t] = parent_taxid
59                t = parent_taxid
60                del lsplit[-1]  # remove last element
61            # Connect last node to root
62            if t not in nodes:
63                nodes[t] = self._default_root_node
64
65        return nodes, ranks, names
SilvaTx(**kwargs)
    def __init__(self, **kwargs):
        # No SILVA specific state: all keyword arguments are passed on
        # unchanged to the parent constructor
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")