Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_site_map.py: 91%
56 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:31 -0400
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:31 -0400
1import typing
3from abc import ABC, abstractmethod
4from dataclasses import dataclass
5from functools import cache
7from cogent3.util.misc import extend_docstring_from
# registry of site-map factories, keyed by domain name
_ensembl_site_map = {}


class register_ensembl_site_map:
    """
    registration decorator for Ensembl site-map classes

    The registration key must be a string that is the domain name.

    Parameters
    ----------
    domain: str of domain name, must be unique

    Raises
    ------
    TypeError
        if domain is not a string
    ValueError
        if domain is empty once surrounding whitespace is stripped
    AssertionError
        if domain has already been registered
    """

    def __init__(self, domain: str):
        if not isinstance(domain, str):
            raise TypeError(f"{domain!r} is not a string")

        domain = domain.strip()
        if not domain:
            raise ValueError("cannot have empty string domain")

        # Raised explicitly rather than via an ``assert`` statement so the
        # uniqueness check still runs under ``python -O``. The exception type
        # stays AssertionError so existing callers are unaffected.
        if domain in _ensembl_site_map:
            raise AssertionError(f"{domain!r} already in {list(_ensembl_site_map)}")

        self._domain = domain

    def __call__(self, func):
        """records func in the registry under the domain, returns func unchanged"""
        _ensembl_site_map[self._domain] = func
        return func
# alias for values that are either a string or None
StrOrNone = typing.Optional[str]
47class SiteMapABC(ABC):
48 @abstractmethod
49 def get_seqs_path(self, ensembl_name: str) -> str:
50 """returns the path to genome sequences for species_db_name"""
51 ...
53 @abstractmethod
54 def get_annotations_path(self, ensembl_name: str) -> str: ...
56 @property
57 def alignments_path(self) -> StrOrNone:
58 return self._alignments_path
60 @property
61 def homologies_path(self) -> StrOrNone:
62 return self._homologies_path
64 @property
65 def trees_path(self) -> StrOrNone:
66 return self._trees_path
69@dataclass(slots=True)
70class SiteMap:
71 """records the locations of specific attributes relative to an Ensembl release"""
73 site: str
74 _seqs_path: str = "fasta"
75 _annotations_path: str = "gff3"
76 _alignments_path: str | None = None
77 _homologies_path: str | None = None
78 _trees_path: str | None = None
class EnsemblPrimary(SiteMapABC, SiteMap):
    """site map whose sequence and annotation paths are grouped by species name"""

    def __init__(self, *args, **kwargs):
        # arguments are forwarded unchanged to the next __init__ in the MRO
        super().__init__(*args, **kwargs)

    def get_seqs_path(self, ensembl_name: str) -> str:
        """path to unmasked genome sequences"""
        seqs_root = self._seqs_path
        return f"{seqs_root}/{ensembl_name}/dna"

    def get_annotations_path(self, ensembl_name: str) -> str:
        """path to the annotations for ensembl_name"""
        annotations_root = self._annotations_path
        return f"{annotations_root}/{ensembl_name}"
@extend_docstring_from(SiteMap)
@register_ensembl_site_map("ftp.ensembl.org")
def ensembl_main_sitemap():
    """the site map for the main Ensembl site"""
    compara_paths = {
        "_alignments_path": "maf/ensembl-compara/multiple_alignments",
        "_homologies_path": "tsv/ensembl-compara/homologies",
        "_trees_path": "compara/species_trees",
    }
    # sequence and annotation paths are left at the SiteMap defaults
    return EnsemblPrimary(site="ftp.ensembl.org", **compara_paths)
# Bacteria are not handled here because of complexities related to the
# bacterial collection a species belongs to. For example:
# https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-57/fasta/bacteria_15_collection/_butyribacterium_methylotrophicum_gca_001753695/dna/
# To address this, the sitemap class needs to download the species table from
# Ensembl Bacteria and cache the collection/species mapping.
110# site = "ftp.ensemblgenomes.ebi.ac.uk",
111# _genomes_path = "fasta",
112# _annotations_path = "gff3",
113# _homologies_path = "pan_ensembl/tsv/ensembl-compara/homologies",
@cache
def get_site_map(domain: str) -> SiteMapABC:
    """returns a site map instance for domain

    The result is cached, so repeated calls with the same domain return the
    same instance. A KeyError is raised if domain has not been registered.
    """
    make_site_map = _ensembl_site_map[domain]
    return make_site_map()