multitax

 1import importlib.metadata
 2
 3__version__ = importlib.metadata.version(__name__)
 4
 5__all__ = (
 6    "CustomTx",
 7    "DummyTx",
 8    "GreengenesTx",
 9    "GtdbTx",
10    "NcbiTx",
11    "OttTx",
12    "SilvaTx",
13)
14
15from .customtx import CustomTx
16from .dummytx import DummyTx
17from .greengenestx import GreengenesTx
18from .gtdbtx import GtdbTx
19from .ncbitx import NcbiTx
20from .otttx import OttTx
21from .silvatx import SilvaTx
class CustomTx(multitax.multitax.MultiTax):
 7class CustomTx(MultiTax):
 8    _required_cols = ["node", "parent"]
 9    _possible_cols = ["node", "parent", "rank", "name"]
10
11    def __init__(
12        self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs
13    ):
14        """
15        CustomTx()
16
17        Parameters:
18        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
19        * **sep** *[str]*: Separator of fields
20        * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`
21
22        Example:
23
24            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
25            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
26        """
27
28        self._cols = self._parse_cols(cols)
29        self._sep = sep
30        super().__init__(**kwargs)
31
32    def __repr__(self):
33        return format_repr(inst=self)
34
35    def _build_translation(self, target_tax, file: str = None, url: str = None):
36        warnings.warn(
37            "Translation between taxonomies ["
38            + self.__class__.__name__
39            + ","
40            + target_tax.__class__.__name__
41            + "] not yet implemented."
42        )
43        return {}
44
45    def _parse(self, fhs, **kwargs):
46        nodes = {}
47        ranks = {}
48        names = {}
49        for source, fh in fhs.items():
50            for line in fh:
51                try:
52                    fields = line.rstrip().split(self._sep)
53                except TypeError:
54                    fields = line.decode().rstrip().split(self._sep)
55
56                node = fields[self._cols["node"]]
57                nodes[node] = fields[self._cols["parent"]]
58                if "name" in self._cols:
59                    names[node] = fields[self._cols["name"]]
60                if "rank" in self._cols:
61                    ranks[node] = fields[self._cols["rank"]]
62
63        return nodes, ranks, names
64
65    def _parse_cols(self, cols):
66        if isinstance(cols, list):
67            cols = {c: i for i, c in enumerate(cols)}
68
69        for rc in self._required_cols:
70            if rc not in cols:
71                raise ValueError(rc + " is a required column")
72
73        for c in cols:
74            if c not in self._possible_cols:
75                raise ValueError(
76                    c + " is not a valid column: " + ",".join(self._possible_cols)
77                )
78
79        return cols
CustomTx( cols: list = ['node', 'parent', 'rank', 'name'], sep: str = '\t', **kwargs)
11    def __init__(
12        self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs
13    ):
14        """
15        CustomTx()
16
17        Parameters:
18        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
19        * **sep** *[str]*: Separator of fields
20        * **\\*\\*kwargs** defined at `multitax.multitax.MultiTax`
21
22        Example:
23
24            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
25            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
26        """
27
28        self._cols = self._parse_cols(cols)
29        self._sep = sep
30        super().__init__(**kwargs)

CustomTx()

Parameters:

  • cols [list, dict]: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
  • sep [str]: Separator of fields
  • **kwargs defined at multitax.multitax.MultiTax

Example:

tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
class DummyTx(multitax.multitax.MultiTax):
 6class DummyTx(MultiTax):
 7    def __init__(self, **kwargs):
 8        """
 9        DummyTx() - Dummy empty taxonomy
10
11        Parameters:
12
13        * \\*\\*kwargs defined at `multitax.multitax.MultiTax`
14        """
15        super().__init__(**kwargs)
16
17    def __repr__(self):
18        return format_repr(inst=self)
DummyTx(**kwargs)
    def __init__(self, **kwargs):
        """
        DummyTx() - Dummy empty taxonomy

        Parameters:

        * \\*\\*kwargs defined at `multitax.multitax.MultiTax`
        """
        # No subclass-specific state: all keyword arguments go to the base constructor
        super().__init__(**kwargs)

DummyTx() - Dummy empty taxonomy

Parameters:

  • **kwargs defined at multitax.multitax.MultiTax

class GreengenesTx(multitax.multitax.MultiTax):
 7class GreengenesTx(MultiTax):
 8    _default_version = "2024.09"
 9    _supported_versions = ["2022.10", "2024.09"]
10    _default_urls = {
11        "2024.09": "https://ftp.microbio.me/greengenes_release/2024.09/2024.09.taxonomy.id.tsv.gz",
12        "2022.10": "https://ftp.microbio.me/greengenes_release/2022.10/2022.10.taxonomy.id.tsv.gz",
13    }
14
15    _rank_codes = [
16        ("d__", "domain"),
17        ("p__", "phylum"),
18        ("c__", "class"),
19        ("o__", "order"),
20        ("f__", "family"),
21        ("g__", "genus"),
22        ("s__", "species"),
23    ]
24
25    def __init__(self, **kwargs):
26        # forwards.tsv
27        self._forwards = {}
28        super().__init__(**kwargs)
29
30    def __repr__(self):
31        return format_repr(inst=self)
32
33    def _build_translation(self, target_tax, file: str = None, url: str = None):
34        warnings.warn(
35            "Translation between taxonomies ["
36            + self.__class__.__name__
37            + ","
38            + target_tax.__class__.__name__
39            + "] not yet implemented."
40        )
41        return {}
42
43    def _parse(self, fhs, **kwargs):
44        nodes = {}
45        ranks = {}
46        names = {}
47
48        lineages = set()
49        for source, fh in fhs.items():
50            for line in fh:
51                try:
52                    fields = line.rstrip().split("\t")
53                except TypeError:
54                    fields = line.decode().rstrip().split("\t")
55
56                # skip header
57                if fields[0] == "Feature ID":
58                    continue
59
60                lineages.add(fields[1])
61
62        for lineage in lineages:
63            last_taxid = None
64            lin = lineage.split("; ")
65            for i in range(len(lin))[::-1]:
66                # assert rank
67                assert lin[i][:3] == self._rank_codes[i][0]
68
69                name = lin[i][3:]
70                if not name:
71                    continue  # empty entry "s__"
72
73                # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
74                taxid = lin[i]
75                rank = self._rank_codes[i][1]
76
77                if taxid not in nodes:
78                    names[taxid] = name
79                    ranks[taxid] = rank
80                if last_taxid:
81                    nodes[last_taxid] = taxid
82                last_taxid = taxid
83            nodes[last_taxid] = self._default_root_node
84
85        return nodes, ranks, names
GreengenesTx(**kwargs)
    def __init__(self, **kwargs):
        # forwards.tsv
        # The mapping starts empty and is assigned before the base constructor
        # is called; keep this order.
        self._forwards = {}
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
class GtdbTx(multitax.multitax.MultiTax):
  7class GtdbTx(MultiTax):
  8    _default_version = "232"
  9    _supported_versions = [
 10        "80",
 11        "83",
 12        "86.2",
 13        "89",
 14        "95",
 15        "202",
 16        "207",
 17        "214.1",
 18        "220",
 19        "226",
 20        "232",
 21    ]
 22
 23    _url_prefix = "https://data.gtdb.ecogenomic.org/releases/"
 24    _default_urls = {
 25        "80": [f"{_url_prefix}release80/80.0/bac_taxonomy_r80.tsv"],
 26        "83": [f"{_url_prefix}release83/83.0/bac_taxonomy_r83.tsv"],
 27        "86.2": [
 28            f"{_url_prefix}release86/86.2/ar122_taxonomy_r86.2.tsv",
 29            f"{_url_prefix}release86/86.2/bac120_taxonomy_r86.2.tsv",
 30        ],
 31        "89": [
 32            f"{_url_prefix}release89/89.0/ar122_taxonomy_r89.tsv",
 33            f"{_url_prefix}release89/89.0/bac120_taxonomy_r89.tsv",
 34        ],
 35        "95": [
 36            f"{_url_prefix}release95/95.0/ar122_taxonomy_r95.tsv.gz",
 37            f"{_url_prefix}release95/95.0/bac120_taxonomy_r95.tsv.gz",
 38        ],
 39        "202": [
 40            f"{_url_prefix}release202/202.0/ar122_taxonomy_r202.tsv.gz",
 41            f"{_url_prefix}release202/202.0/bac120_taxonomy_r202.tsv.gz",
 42        ],
 43        "207": [
 44            f"{_url_prefix}release207/207.0/ar53_taxonomy_r207.tsv.gz",
 45            f"{_url_prefix}release207/207.0/bac120_taxonomy_r207.tsv.gz",
 46        ],
 47        "214.1": [
 48            f"{_url_prefix}release214/214.1/ar53_taxonomy_r214.tsv.gz",
 49            f"{_url_prefix}release214/214.1/bac120_taxonomy_r214.tsv.gz",
 50        ],
 51        "220": [
 52            f"{_url_prefix}release220/220.0/ar53_taxonomy_r220.tsv.gz",
 53            f"{_url_prefix}release220/220.0/bac120_taxonomy_r220.tsv.gz",
 54        ],
 55        "226": [
 56            f"{_url_prefix}release226/226.0/ar53_taxonomy_r226.tsv.gz",
 57            f"{_url_prefix}release226/226.0/bac120_taxonomy_r226.tsv.gz",
 58        ],
 59        "232": [
 60            f"{_url_prefix}release232/232.0/ar53_taxonomy_r232.tsv.gz",
 61            f"{_url_prefix}release232/232.0/bac120_taxonomy_r232.tsv.gz",
 62        ],
 63    }
 64
 65    _rank_codes = [
 66        ("d__", "domain"),
 67        ("p__", "phylum"),
 68        ("c__", "class"),
 69        ("o__", "order"),
 70        ("f__", "family"),
 71        ("g__", "genus"),
 72        ("s__", "species"),
 73    ]
 74
 75    def __init__(self, **kwargs):
 76        self._convert_to = {}
 77        self._convert_from = {}
 78        super().__init__(**kwargs)
 79
 80    def __repr__(self):
 81        return format_repr(inst=self)
 82
 83    def _build_translation(self, target_tax, file: str = None, url: str = None):
 84        translated_nodes = {}
 85        if target_tax.__class__.__name__ == "NcbiTx":
 86            if file:
 87                fhs = open_files([file])
 88            else:
 89                if not url:
 90                    url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{self.version}_acc_rep_lin_ncbi.tsv.gz"
 91                fhs = download_files(urls=[url], retry_attempts=3)
 92
 93            accession_col = 0
 94            gtdb_taxonomy_col = 2
 95            ncbi_taxid_col = 3
 96
 97            for source, fh in fhs.items():
 98                for line in fh:
 99                    try:
100                        fields = line.rstrip().split("\t")
101                    except TypeError:
102                        fields = line.decode().rstrip().split("\t")
103
104                    # skip header
105                    if fields[accession_col] == "accession":
106                        continue
107
108                    ncbi_leaf_node = target_tax.latest(fields[ncbi_taxid_col])
109                    if ncbi_leaf_node != target_tax.undefined_node:
110                        ncbi_nodes = target_tax.lineage(
111                            ncbi_leaf_node,
112                            ranks=[
113                                "domain",
114                                "phylum",
115                                "class",
116                                "order",
117                                "family",
118                                "genus",
119                                "species",
120                            ],
121                        )
122                    else:
123                        continue
124
125                    # Build GTDB lineage from leaf (species on given lineage)
126                    # to accomodate possible changes in the loaded tax
127                    gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1]
128                    if gtdb_leaf_node != self.undefined_node:
129                        gtdb_nodes = self.lineage(
130                            gtdb_leaf_node,
131                            ranks=[
132                                "domain",
133                                "phylum",
134                                "class",
135                                "order",
136                                "family",
137                                "genus",
138                                "species",
139                            ],
140                        )
141                    else:
142                        continue
143
144                    # Match ranks
145                    for i, gtdb_n in enumerate(gtdb_nodes):
146                        if (
147                            ncbi_nodes[i] != target_tax.undefined_node
148                            and gtdb_n != self.undefined_node
149                        ):
150                            if gtdb_n not in translated_nodes:
151                                translated_nodes[gtdb_n] = set()
152                            translated_nodes[gtdb_n].add(ncbi_nodes[i])
153
154            close_files(fhs)
155        else:
156            warnings.warn(
157                "Translation between taxonomies ["
158                + self.__class__.__name__
159                + ","
160                + target_tax.__class__.__name__
161                + "] not yet implemented."
162            )
163
164        return translated_nodes
165
166    def _parse(self, fhs, **kwargs):
167        nodes = {}
168        ranks = {}
169        names = {}
170        for source, fh in fhs.items():
171            for line in fh:
172                try:
173                    _, lineage = line.rstrip().split("\t")
174                except TypeError:
175                    _, lineage = line.decode().rstrip().split("\t")
176                lin = lineage.split(";")
177                for i in range(len(lin))[::-1]:
178                    # assert rank
179                    assert lin[i][:3] == self._rank_codes[i][0]
180                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
181                    taxid = lin[i]
182                    name = lin[i][3:]
183                    # empty entry "s__"
184                    if not name:
185                        continue
186                    rank = self._rank_codes[i][1]
187                    if i == 0:
188                        parent_taxid = self._default_root_node
189                    else:
190                        parent_taxid = lin[i - 1]
191                    if taxid not in nodes:
192                        nodes[taxid] = parent_taxid
193                        names[taxid] = name
194                        ranks[taxid] = rank
195
196        return nodes, ranks, names
197
198    def _lookup_version_taxa(self, node, version: str):
199        res = set()
200        for acc in self._convert_from.get(node, ""):
201            for tx in self._convert_to[version].get(acc, "").split(";"):
202                # Return only rank of requested node
203                if tx.startswith(node[:1]):
204                    res.add(tx)
205        return res
206
207    def _download_parse_version_taxa(self, version, file, url):
208        if file:
209            fhs = open_files(files=[file])
210        else:
211            if not url:
212                url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{version}_acc_rep_lin_ncbi.tsv.gz"
213            fhs = download_files(urls=[url], retry_attempts=3)
214
215        for fh in fhs.values():
216            for line in fh:
217                try:
218                    yield line.rstrip().split("\t")
219                except TypeError:
220                    yield line.decode().rstrip().split("\t")
221
222    def build_conversion(
223        self,
224        version: str,
225        files: tuple[str, str] = ("", ""),
226        urls: tuple[str, str] = ("", ""),
227    ):
228        """
229        Download and build conversion table against another version.
230        Optional function, conversion tables are automatically downloaded
231        and built on first .convert() call.
232        """
233        if version not in self._supported_versions:
234            raise ValueError(
235                f"Version [{version}] not supported for conversion: {', '.join(self._supported_versions)}"
236            )
237
238        if not self._convert_from:
239            # Collect the accessions of the representative entries for each taxa in the current version
240            tx_accs = {}
241            for acc, rep, lin, _ in self._download_parse_version_taxa(
242                version=self.version, file=files[0], url=urls[0]
243            ):
244                if rep == "t":
245                    for tx in lin.split(";"):
246                        if tx not in tx_accs:
247                            tx_accs[tx] = []
248                        tx_accs[tx].append(acc)
249            # Assign only at the end, in case of download/parse errors
250            self._convert_from = tx_accs
251
252        if version not in self._convert_to:
253            # Collect the lineage for each accession
254            acc_lin = {}
255            for acc, _, lin, _ in self._download_parse_version_taxa(
256                version=version, file=files[1], url=urls[1]
257            ):
258                acc_lin[acc] = lin
259            # Assign only at the end, in case of download/parse errors
260            self._convert_to[version] = acc_lin
261
262    def convert(self, node: str, version: str) -> set[str]:
263        """
264        Converts a taxonomic node from current version to another.
265        It uses a genomic centric strategy, based on the taxa of the representative
266        genome among versions.
267        It may return multiple nodes for ranks above species,
268        since multiple representatives can be split into more taxa.
269        It may return an empty set if node is not found in the current version
270        or if related representative is no longer available in the requested version.
271
272        Example:
273
274            from multitax import GtdbTx
275            tax = GtdbTx(version="95")
276
277            # Species - always one-to-one
278            tax.convert('s__Giesbergeria metamorpha', version="226")
279            {'s__Simplicispira metamorpha'}
280
281            # Other ranks - may be one-to-many
282            tax.convert('g__UBA6715', version="226")
283            {'g__Aquirufa', 'g__Sandaracinomonas'}
284        """
285
286        if version not in self._supported_versions:
287            raise ValueError(
288                f"Version [{version}] not supported: {', '.join(self._supported_versions)}"
289            )
290
291        if not self._convert_from or version not in self._convert_to:
292            self.build_conversion(version=version)
293
294        return self._lookup_version_taxa(node, version)
GtdbTx(**kwargs)
    def __init__(self, **kwargs):
        # Conversion tables start empty and are assigned before the base
        # constructor is called; keep this order.
        self._convert_to = {}
        self._convert_from = {}
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original name.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def build_conversion( self, version: str, files: tuple[str, str] = ('', ''), urls: tuple[str, str] = ('', '')):
222    def build_conversion(
223        self,
224        version: str,
225        files: tuple[str, str] = ("", ""),
226        urls: tuple[str, str] = ("", ""),
227    ):
228        """
229        Download and build conversion table against another version.
230        Optional function, conversion tables are automatically downloaded
231        and built on first .convert() call.
232        """
233        if version not in self._supported_versions:
234            raise ValueError(
235                f"Version [{version}] not supported for conversion: {', '.join(self._supported_versions)}"
236            )
237
238        if not self._convert_from:
239            # Collect the accessions of the representative entries for each taxa in the current version
240            tx_accs = {}
241            for acc, rep, lin, _ in self._download_parse_version_taxa(
242                version=self.version, file=files[0], url=urls[0]
243            ):
244                if rep == "t":
245                    for tx in lin.split(";"):
246                        if tx not in tx_accs:
247                            tx_accs[tx] = []
248                        tx_accs[tx].append(acc)
249            # Assign only at the end, in case of download/parse errors
250            self._convert_from = tx_accs
251
252        if version not in self._convert_to:
253            # Collect the lineage for each accession
254            acc_lin = {}
255            for acc, _, lin, _ in self._download_parse_version_taxa(
256                version=version, file=files[1], url=urls[1]
257            ):
258                acc_lin[acc] = lin
259            # Assign only at the end, in case of download/parse errors
260            self._convert_to[version] = acc_lin

Download and build conversion table against another version. Optional function, conversion tables are automatically downloaded and built on first .convert() call.

def convert(self, node: str, version: str) -> set[str]:
262    def convert(self, node: str, version: str) -> set[str]:
263        """
264        Converts a taxonomic node from current version to another.
265        It uses a genomic centric strategy, based on the taxa of the representative
266        genome among versions.
267        It may return multiple nodes for ranks above species,
268        since multiple representatives can be split into more taxa.
269        It may return an empty set if node is not found in the current version
270        or if related representative is no longer available in the requested version.
271
272        Example:
273
274            from multitax import GtdbTx
275            tax = GtdbTx(version="95")
276
277            # Species - always one-to-one
278            tax.convert('s__Giesbergeria metamorpha', version="226")
279            {'s__Simplicispira metamorpha'}
280
281            # Other ranks - may be one-to-many
282            tax.convert('g__UBA6715', version="226")
283            {'g__Aquirufa', 'g__Sandaracinomonas'}
284        """
285
286        if version not in self._supported_versions:
287            raise ValueError(
288                f"Version [{version}] not supported: {', '.join(self._supported_versions)}"
289            )
290
291        if not self._convert_from or version not in self._convert_to:
292            self.build_conversion(version=version)
293
294        return self._lookup_version_taxa(node, version)

Converts a taxonomic node from current version to another. It uses a genomic centric strategy, based on the taxa of the representative genome among versions. It may return multiple nodes for ranks above species, since multiple representatives can be split into more taxa. It may return an empty set if node is not found in the current version or if related representative is no longer available in the requested version.

Example:

from multitax import GtdbTx
tax = GtdbTx(version="95")

# Species - always one-to-one
tax.convert('s__Giesbergeria metamorpha', version="226")
{'s__Simplicispira metamorpha'}

# Other ranks - may be one-to-many
tax.convert('g__UBA6715', version="226")
{'g__Aquirufa', 'g__Sandaracinomonas'}
class NcbiTx(multitax.multitax.MultiTax):
 13class NcbiTx(MultiTax):
 14    _default_version = "current"
 15    _supported_versions = ["current"]
 16    _default_urls = {"current": "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"}
 17
 18    def __init__(self, **kwargs):
 19        self._merged = {}
 20        self._extended_name_nodes = {}
 21        super().__init__(**kwargs)
 22
 23    def __repr__(self):
 24        return format_repr(inst=self)
 25
 26    def _build_translation(self, target_tax, file: str = None, url: str = None):
 27        translated_nodes = {}
 28        if target_tax.__class__.__name__ == "GtdbTx":
 29            if file:
 30                fhs = open_files([file])
 31            else:
 32                if not url:
 33                    url = f"https://github.com/pirovc/multitax/raw/refs/heads/main/data/gtdb/{target_tax.version}_acc_rep_lin_ncbi.tsv.gz"
 34                fhs = download_files(urls=[url], retry_attempts=3)
 35
 36            accession_col = 0
 37            gtdb_taxonomy_col = 2
 38            ncbi_taxid_col = 3
 39
 40            for source, fh in fhs.items():
 41                for line in fh:
 42                    try:
 43                        fields = line.rstrip().split("\t")
 44                    except TypeError:
 45                        fields = line.decode().rstrip().split("\t")
 46
 47                    # skip header
 48                    if fields[accession_col] == "accession":
 49                        continue
 50
 51                    # Build GTDB lineage from leaf (species on given lineage)
 52                    # to accomodate possible changes in the loaded tax
 53                    gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1]
 54                    if gtdb_leaf_node != target_tax.undefined_node:
 55                        gtdb_nodes = target_tax.lineage(
 56                            gtdb_leaf_node,
 57                            ranks=[
 58                                "domain",
 59                                "phylum",
 60                                "class",
 61                                "order",
 62                                "family",
 63                                "genus",
 64                                "species",
 65                            ],
 66                        )
 67                    else:
 68                        continue
 69
 70                    # Build NCBI lineage from leaf
 71                    ncbi_leaf_node = self.latest(fields[ncbi_taxid_col])
 72                    if ncbi_leaf_node != self.undefined_node:
 73                        # Additional add connection from leaf to species on GTDB
 74                        # that could represent strain, etc on NCBI tax
 75                        if ncbi_leaf_node not in translated_nodes:
 76                            translated_nodes[ncbi_leaf_node] = set()
 77                        translated_nodes[ncbi_leaf_node].add(gtdb_leaf_node)
 78                        ncbi_nodes = self.lineage(
 79                            ncbi_leaf_node,
 80                            ranks=[
 81                                "domain",
 82                                "phylum",
 83                                "class",
 84                                "order",
 85                                "family",
 86                                "genus",
 87                                "species",
 88                            ],
 89                        )
 90                    else:
 91                        continue
 92
 93                    # Match ranks
 94                    for i, ncbi_n in enumerate(ncbi_nodes):
 95                        if (
 96                            gtdb_nodes[i] != target_tax.undefined_node
 97                            and ncbi_n != self.undefined_node
 98                        ):
 99                            if ncbi_n not in translated_nodes:
100                                translated_nodes[ncbi_n] = set()
101                            translated_nodes[ncbi_n].add(gtdb_nodes[i])
102            close_files(fhs)
103        else:
104            warnings.warn(
105                "Translation between taxonomies ["
106                + self.__class__.__name__
107                + ","
108                + target_tax.__class__.__name__
109                + "] not yet implemented."
110            )
111
112        return translated_nodes
113
114    def _parse(self, fhs, **kwargs):
115        fhs_list = list(fhs.values())
116        # One element tar.gz -> taxdump.tar.gz
117        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tar.gz"):
118            nodes, ranks, names, self._merged = self._parse_taxdump(
119                fhs_list[0], extended_names=kwargs["extended_names"]
120            )
121        else:
122            # nodes.dmp
123            nodes, ranks = self._parse_nodes(fhs_list[0])
124
125            # [names.dmp]
126            if len(fhs) >= 2:
127                names = self._parse_names(
128                    fhs_list[1], extended_names=kwargs["extended_names"]
129                )
130            else:
131                names = {}
132
133            # [merged.dmp]
134            if len(fhs) == 3:
135                self._merged = self._parse_merged(fhs_list[2])
136        return nodes, ranks, names
137
138    def _parse_merged(self, fh):
139        merged = {}
140        for line in fh:
141            try:
142                old_taxid, _, new_taxid, _ = line.split("\t", 3)
143            except TypeError:
144                old_taxid, _, new_taxid, _ = line.decode().split("\t", 3)
145            merged[old_taxid] = new_taxid
146        return merged
147
148    def _parse_names(self, fh, extended_names):
149        names = {}
150        for line in fh:
151            try:
152                node, name, _, name_class = line.split("\t|\t")
153            except TypeError:
154                node, name, _, name_class = line.decode().split("\t|\t")
155            if name_class.replace("\t|\n", "") == "scientific name":
156                names[node] = name
157            elif extended_names:
158                if name not in self._extended_name_nodes:
159                    self._extended_name_nodes[name] = []
160                self._extended_name_nodes[name].append(node)
161
162        return names
163
164    def _parse_nodes(self, fh):
165        nodes = {}
166        ranks = {}
167        for line in fh:
168            try:
169                taxid, parent_taxid, rank, _ = line.split("\t|\t", 3)
170            except TypeError:
171                taxid, parent_taxid, rank, _ = line.decode().split("\t|\t", 3)
172            ranks[taxid] = rank
173            nodes[taxid] = parent_taxid
174        return nodes, ranks
175
176    def _parse_taxdump(self, fh_taxdump, extended_names):
177        with fh_taxdump.extractfile("nodes.dmp") as fh_nodes:
178            nodes, ranks = self._parse_nodes(fh_nodes)
179        with fh_taxdump.extractfile("names.dmp") as fh_names:
180            names = self._parse_names(fh_names, extended_names=extended_names)
181        with fh_taxdump.extractfile("merged.dmp") as fh_merged:
182            merged = self._parse_merged(fh_merged)
183        return nodes, ranks, names, merged
184
185    def latest(self, node: str):
186        n = super().latest(node)
187        if n == self.undefined_node:
188            n = self.merged(node)
189        return n
190
191    def merged(self, node: str):
192        """
193        Returns relative entry from the merged.dmp file of a given node.
194        """
195        if node in self._merged:
196            return self._merged[node]
197        else:
198            return self.undefined_node
199
200    def search_name(
201        self,
202        text: str,
203        rank: str = None,
204        exact: bool = True,
205        force_extended: bool = False,
206    ):
207        """
208        Search node by exact or partial name.
209
210        Default order (can be skipped with **force_extended=True**):
211
212        1) Search names defined as "scientific name" on nodes.dmp
213
214        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))
215
216        Parameters:
217        * **text** *[str]*: Text to search.
218        * **rank** *[str]*: Filter results by rank.
219        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
220        * **force_extended** *[bool]*: Search for text in all categories at once.
221
222        Returns: list of matching nodes
223        """
224        n = super().search_name(text, rank=rank, exact=exact)
225        if n and not force_extended:
226            return n
227        else:
228            if exact:
229                ret = self._exact_name(text, self._extended_name_nodes)
230            else:
231                ret = self._partial_name(text, self._extended_name_nodes)
232
233            # Only return nodes of chosen rank
234            if rank:
235                ret = filter_function(ret, self.rank, rank)
236
237            return list(set(n + ret))
238
239    def stats(self, **kwargs):
240        s = super().stats(**kwargs)
241        if self._merged:
242            s["merged"] = len(self._merged)
243        if self._extended_name_nodes:
244            s["extended_names"] = len(self._extended_name_nodes)
245        return s
NcbiTx(**kwargs)
18    def __init__(self, **kwargs):
19        self._merged = {}
20        self._extended_name_nodes = {}
21        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def latest(self, node: str):
185    def latest(self, node: str):
186        n = super().latest(node)
187        if n == self.undefined_node:
188            n = self.merged(node)
189        return n

Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv).

def merged(self, node: str):
191    def merged(self, node: str):
192        """
193        Returns relative entry from the merged.dmp file of a given node.
194        """
195        if node in self._merged:
196            return self._merged[node]
197        else:
198            return self.undefined_node

Returns relative entry from the merged.dmp file of a given node.

def search_name( self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
200    def search_name(
201        self,
202        text: str,
203        rank: str = None,
204        exact: bool = True,
205        force_extended: bool = False,
206    ):
207        """
208        Search node by exact or partial name.
209
210        Default order (can be skipped with **force_extended=True**):
211
212        1) Search names defined as "scientific name" on nodes.dmp
213
214        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))
215
216        Parameters:
217        * **text** *[str]*: Text to search.
218        * **rank** *[str]*: Filter results by rank.
219        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
220        * **force_extended** *[bool]*: Search for text in all categories at once.
221
222        Returns: list of matching nodes
223        """
224        n = super().search_name(text, rank=rank, exact=exact)
225        if n and not force_extended:
226            return n
227        else:
228            if exact:
229                ret = self._exact_name(text, self._extended_name_nodes)
230            else:
231                ret = self._partial_name(text, self._extended_name_nodes)
232
233            # Only return nodes of chosen rank
234            if rank:
235                ret = filter_function(ret, self.rank, rank)
236
237            return list(set(n + ret))

Search node by exact or partial name.

Default order (can be skipped with force_extended=True):

1) Search names defined as "scientific name" on names.dmp

2) If nothing was found, search text in all other categories (must be activated with NcbiTx(extended_names=True))

Parameters:

  • text [str]: Text to search.
  • rank [str]: Filter results by rank.
  • exact [bool]: Exact or partial name search (both case sensitive).
  • force_extended [bool]: Search for text in all categories at once.

Returns: list of matching nodes

def stats(self, **kwargs):
239    def stats(self, **kwargs):
240        s = super().stats(**kwargs)
241        if self._merged:
242            s["merged"] = len(self._merged)
243        if self._extended_name_nodes:
244            s["extended_names"] = len(self._extended_name_nodes)
245        return s

Returns a dict with general numbers of the taxonomic tree

Example:

from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()

pprint(tax.stats())
{'leaves': 30238,
 'names': 42739,
 'nodes': 42739,
 'ranked_leaves': Counter({'species': 30238}),
 'ranked_nodes': Counter({'species': 30238,
                          'genus': 8778,
                          'family': 2323,
                          'order': 930,
                          'class': 337,
                          'phylum': 131,
                          'domain': 1,
                          'root': 1}),
 'ranks': 42739}
class OttTx(multitax.multitax.MultiTax):
  7class OttTx(MultiTax):
  8    _default_version = "3.7.3"
  9    _supported_versions = ["3.6", "3.7.3"]
 10    _default_urls = {
 11        "3.6": "https://files.opentreeoflife.org/ott/ott3.6/ott3.6.tgz",
 12        "3.7.3": "https://files.opentreeoflife.org/ott/ott3.7.3/ott3.7.3.tgz",
 13    }
 14    _default_root_node = "805080"
 15
 16    def __init__(self, **kwargs):
 17        self._forwards = {}
 18        self._extended_name_nodes = {}
 19        super().__init__(**kwargs)
 20
 21    def __repr__(self):
 22        return format_repr(inst=self)
 23
 24    def _build_translation(self, target_tax, file: str = None, url: str = None):
 25        warnings.warn(
 26            "Translation between taxonomies ["
 27            + self.__class__.__name__
 28            + ","
 29            + target_tax.__class__.__name__
 30            + "] not yet implemented."
 31        )
 32        return {}
 33
 34    def _parse(self, fhs, **kwargs):
 35        fhs_list = list(fhs.values())
 36        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tgz"):
 37            nodes, ranks, names = self._parse_ott(
 38                fhs_list[0], extended_names=kwargs["extended_names"]
 39            )
 40        else:
 41            # nodes.dmp
 42            nodes, ranks, names = self._parse_taxonomy(fhs_list[0])
 43            # [forwards.tsv]
 44            if len(fhs) >= 2:
 45                self._forwards = self._parse_forwards(fhs_list[1])
 46            if len(fhs) == 3 and kwargs["extended_names"]:
 47                self._extended_name_nodes = self._parse_synonyms(fhs_list[2])
 48
 49        return nodes, ranks, names
 50
 51    def _parse_forwards(self, fh):
 52        forwards = {}
 53        # skip first line header
 54        next(fh)
 55        for line in fh:
 56            try:
 57                old_taxid, new_taxid = line.rstrip().split("\t")
 58            except TypeError:
 59                old_taxid, new_taxid = line.decode().rstrip().split("\t")
 60            forwards[old_taxid] = new_taxid
 61        return forwards
 62
 63    def _parse_ott(self, fh_taxdump, extended_names):
 64        # Get files inside folder by name
 65        for e in fh_taxdump.getnames():
 66            if e.endswith("taxonomy.tsv"):
 67                tax = e
 68            if e.endswith("forwards.tsv"):
 69                fwr = e
 70            if e.endswith("synonyms.tsv"):
 71                syn = e
 72
 73        with fh_taxdump.extractfile(tax) as fh_nodes:
 74            nodes, ranks, names = self._parse_taxonomy(fh_nodes)
 75        with fh_taxdump.extractfile(fwr) as fh_forwards:
 76            self._forwards = self._parse_forwards(fh_forwards)
 77        if extended_names:
 78            with fh_taxdump.extractfile(syn) as fh_synonyms:
 79                self._extended_name_nodes = self._parse_synonyms(fh_synonyms)
 80        return nodes, ranks, names
 81
 82    def _parse_synonyms(self, fh):
 83        synonyms = {}
 84        # skip first line header
 85        next(fh)
 86        for line in fh:
 87            try:
 88                name, taxid, _ = line.split("\t|\t", 2)
 89            except TypeError:
 90                name, taxid, _ = line.decode().split("\t|\t", 2)
 91            if name not in synonyms:
 92                synonyms[name] = []
 93            synonyms[name].append(taxid)
 94
 95        return synonyms
 96
 97    def _parse_taxonomy(self, fh):
 98        nodes = {}
 99        ranks = {}
100        names = {}
101        # skip first line header
102        next(fh)
103        for line in fh:
104            try:
105                taxid, parent_taxid, name, rank, _ = line.split("\t|\t", 4)
106            except TypeError:
107                taxid, parent_taxid, name, rank, _ = line.decode().split("\t|\t", 4)
108            ranks[taxid] = rank
109            nodes[taxid] = parent_taxid
110            names[taxid] = name
111        return nodes, ranks, names
112
113    def forwards(self, node: str):
114        """
115        Returns relative entry from the forwards.tsv file of a given node.
116        """
117        if node in self._forwards:
118            return self._forwards[node]
119        else:
120            return self.undefined_node
121
122    def latest(self, node: str):
123        n = super().latest(node)
124        if n == self.undefined_node:
125            n = self.forwards(node)
126        return n
127
128    def search_name(
129        self,
130        text: str,
131        rank: str = None,
132        exact: bool = True,
133        force_extended: bool = False,
134    ):
135        """
136        Search node by exact or partial name.
137
138        Default order (can be skipped with **force_extended=True**):
139
140        1) Search default names defined on "taxonomy.tsv"
141
142        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))
143
144        Parameters:
145        * **text** *[str]*: Text to search.
146        * **rank** *[str]*: Filter results by rank.
147        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
148        * **force_extended** *[bool]*: Search for text in all categories at once.
149
150        Returns: list of matching nodes
151        """
152        n = super().search_name(text, rank=rank, exact=exact)
153        if n and not force_extended:
154            return n
155        else:
156            if exact:
157                ret = self._exact_name(text, self._extended_name_nodes)
158            else:
159                ret = self._partial_name(text, self._extended_name_nodes)
160
161            # Only return nodes of chosen rank
162            if rank:
163                ret = filter_function(ret, self.rank, rank)
164
165            return list(set(n + ret))
166
167    def stats(self, **kwargs):
168        s = super().stats(**kwargs)
169        if self._forwards:
170            s["forwards"] = len(self._forwards)
171        if self._extended_name_nodes:
172            s["extended_names"] = len(self._extended_name_nodes)
173        return s
OttTx(**kwargs)
16    def __init__(self, **kwargs):
17        self._forwards = {}
18        self._extended_name_nodes = {}
19        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
def forwards(self, node: str):
113    def forwards(self, node: str):
114        """
115        Returns relative entry from the forwards.tsv file of a given node.
116        """
117        if node in self._forwards:
118            return self._forwards[node]
119        else:
120            return self.undefined_node

Returns relative entry from the forwards.tsv file of a given node.

def latest(self, node: str):
122    def latest(self, node: str):
123        n = super().latest(node)
124        if n == self.undefined_node:
125            n = self.forwards(node)
126        return n

Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv).

def search_name( self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
128    def search_name(
129        self,
130        text: str,
131        rank: str = None,
132        exact: bool = True,
133        force_extended: bool = False,
134    ):
135        """
136        Search node by exact or partial name.
137
138        Default order (can be skipped with **force_extended=True**):
139
140        1) Search default names defined on "taxonomy.tsv"
141
142        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))
143
144        Parameters:
145        * **text** *[str]*: Text to search.
146        * **rank** *[str]*: Filter results by rank.
147        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
148        * **force_extended** *[bool]*: Search for text in all categories at once.
149
150        Returns: list of matching nodes
151        """
152        n = super().search_name(text, rank=rank, exact=exact)
153        if n and not force_extended:
154            return n
155        else:
156            if exact:
157                ret = self._exact_name(text, self._extended_name_nodes)
158            else:
159                ret = self._partial_name(text, self._extended_name_nodes)
160
161            # Only return nodes of chosen rank
162            if rank:
163                ret = filter_function(ret, self.rank, rank)
164
165            return list(set(n + ret))

Search node by exact or partial name.

Default order (can be skipped with force_extended=True):

1) Search default names defined on "taxonomy.tsv"

2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(extended_names=True))

Parameters:

  • text [str]: Text to search.
  • rank [str]: Filter results by rank.
  • exact [bool]: Exact or partial name search (both case sensitive).
  • force_extended [bool]: Search for text in all categories at once.

Returns: list of matching nodes

def stats(self, **kwargs):
167    def stats(self, **kwargs):
168        s = super().stats(**kwargs)
169        if self._forwards:
170            s["forwards"] = len(self._forwards)
171        if self._extended_name_nodes:
172            s["extended_names"] = len(self._extended_name_nodes)
173        return s

Returns a dict with general numbers of the taxonomic tree

Example:

from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()

pprint(tax.stats())
{'leaves': 30238,
 'names': 42739,
 'nodes': 42739,
 'ranked_leaves': Counter({'species': 30238}),
 'ranked_nodes': Counter({'species': 30238,
                          'genus': 8778,
                          'family': 2323,
                          'order': 930,
                          'class': 337,
                          'phylum': 131,
                          'domain': 1,
                          'root': 1}),
 'ranks': 42739}
class SilvaTx(multitax.multitax.MultiTax):
 7class SilvaTx(MultiTax):
 8    _default_version = "ssu_138.2"
 9    _supported_versions = ["lsu_138.2", "ssu_138.2"]
10    _default_urls = {
11        "ssu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.2.txt.gz",
12        "lsu_138.2": "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.2.txt.gz",
13    }
14
15    def __init__(self, **kwargs):
16        super().__init__(**kwargs)
17
18    def __repr__(self):
19        return format_repr(inst=self)
20
21    def _build_translation(self, target_tax, file: str = None, url: str = None):
22        warnings.warn(
23            "Translation between taxonomies ["
24            + self.__class__.__name__
25            + ","
26            + target_tax.__class__.__name__
27            + "] not yet implemented."
28        )
29        return {}
30
31    def _parse(self, fhs, **kwargs):
32        nodes = {}
33        ranks = {}
34        names = {}
35
36        lin = {}
37        for source, fh in fhs.items():
38            for line in fh:
39                try:
40                    name_lineage, taxid, rank, _ = line.split("\t", 3)
41                except TypeError:
42                    name_lineage, taxid, rank, _ = line.decode().split("\t", 3)
43                # Remove last char ";"
44                lineage = name_lineage[:-1]
45                name = lineage.split(";")[-1]
46                # Save lineage to build tree
47                lin[lineage] = taxid
48                names[taxid] = name
49                ranks[taxid] = rank
50
51        # Build parent node connection
52        for lineage, taxid in lin.items():
53            t = taxid
54            lsplit = lineage.split(";")[:-1]
55            while lsplit:
56                parent_taxid = lin[";".join(lsplit)]
57                if t not in nodes:
58                    nodes[t] = parent_taxid
59                t = parent_taxid
60                del lsplit[-1]  # remove last element
61            # Connect last node to root
62            if t not in nodes:
63                nodes[t] = self._default_root_node
64
65        return nodes, ranks, names
SilvaTx(**kwargs)
    def __init__(self, **kwargs):
        # No SILVA-specific state is set up here: every keyword argument is
        # forwarded unchanged to the base-class constructor.
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • version [str]: Version to download/parse or custom version name (with files/urls).
  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.
  • empty [bool]: Create an empty instance.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")