Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_site_map.py: 89%
57 statements
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
1import typing
3from abc import ABC, abstractmethod
4from dataclasses import dataclass
5from functools import cache
7from cogent3.util.misc import extend_docstring_from
# registry of site-map factories keyed by domain name; entries are added by
# the register_ensembl_site_map decorator and looked up by get_site_map
_ensembl_site_map = {}
class register_ensembl_site_map:
    """
    registration decorator for callables that create Ensembl site maps

    The registration key must be a string that is the domain name.

    Parameters
    ----------
    domain: str of domain name, must be unique

    Raises
    ------
    TypeError
        if domain is not a string
    ValueError
        if domain is empty after surrounding whitespace is stripped
    AssertionError
        if domain has already been registered
    """

    def __init__(self, domain: str):
        if not isinstance(domain, str):
            raise TypeError(f"{domain!r} is not a string")

        domain = domain.strip()
        if not domain:
            raise ValueError("cannot have empty string domain")

        # An explicit raise is used instead of an assert statement so the
        # uniqueness check is still enforced when python runs with -O. The
        # exception type stays AssertionError for backward compatibility.
        if domain in _ensembl_site_map:
            raise AssertionError(f"{domain!r} already in {list(_ensembl_site_map)}")

        self._domain = domain

    def __call__(self, func):
        """stores func in the registry under the domain, returns func unchanged"""
        _ensembl_site_map[self._domain] = func
        return func
# type alias for attributes that hold either a string or None
StrOrNone = typing.Optional[str]
class SiteMapABC(ABC):
    """abstract interface for site maps

    Subclasses implement the two path-lookup methods. The properties read
    the ``_alignments_path``, ``_homologies_path`` and ``_trees_path``
    attributes, which this class does not define itself, so a concrete
    class has to provide them.
    """

    @abstractmethod
    def get_seqs_path(self, ensembl_name: str) -> str:
        """returns the path to genome sequences for ensembl_name"""

    @abstractmethod
    def get_annotations_path(self, ensembl_name: str) -> str:
        """returns the path to annotations for ensembl_name"""

    @property
    def alignments_path(self) -> StrOrNone:
        """the stored alignments path, which may be None"""
        return self._alignments_path

    @property
    def homologies_path(self) -> StrOrNone:
        """the stored homologies path, which may be None"""
        return self._homologies_path

    @property
    def trees_path(self) -> StrOrNone:
        """the stored trees path, which may be None"""
        return self._trees_path
@dataclass(slots=True)
class SiteMap:
    """records the locations of specific attributes relative to an Ensembl release"""

    # domain name of the site, e.g. "ftp.ensembl.org"
    site: str
    # leading path component for genome sequences
    _seqs_path: str = "fasta"
    # leading path component for annotations
    _annotations_path: str = "gff3"
    # the remaining locations are optional and default to None
    _alignments_path: str | None = None
    _homologies_path: str | None = None
    _trees_path: str | None = None
class EnsemblPrimary(SiteMapABC, SiteMap):
    """site map whose per-species paths are built from the stored path prefixes

    Instances are created with the SiteMap fields. No ``__init__`` is defined
    here because the inherited initialiser already accepts those fields; an
    override that only forwards its arguments to ``super().__init__`` adds
    nothing.
    """

    def get_seqs_path(self, ensembl_name: str) -> str:
        """path to unmasked genome sequences

        Returns ``<_seqs_path>/<ensembl_name>/dna``.
        """
        return f"{self._seqs_path}/{ensembl_name}/dna"

    def get_annotations_path(self, ensembl_name: str) -> str:
        """path to annotations

        Returns ``<_annotations_path>/<ensembl_name>``.
        """
        return f"{self._annotations_path}/{ensembl_name}"
@extend_docstring_from(SiteMap)
@register_ensembl_site_map("ftp.ensembl.org")
def ensembl_main_sitemap():
    """the main Ensembl site map"""
    # optional locations provided for the main site; the sequence and
    # annotation prefixes keep their SiteMap defaults
    optional_paths = {
        "_alignments_path": "maf/ensembl-compara/multiple_alignments",
        "_homologies_path": "tsv/ensembl-compara/homologies",
        "_trees_path": "compara/species_trees",
    }
    return EnsemblPrimary(site="ftp.ensembl.org", **optional_paths)
106# for bacteria there are complexities related to the bacterial collection
107# a species belongs to. For example
108# https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-57/fasta/bacteria_15_collection/_butyribacterium_methylotrophicum_gca_001753695/dna/
109# so to address this, the sitemap class needs to download the species table from ensembl bacteria
110# and cache the collection/species mapping
111# site = "ftp.ensemblgenomes.ebi.ac.uk",
112# _genomes_path = "fasta",
113# _annotations_path = "gff3",
114# _homologies_path = "pan_ensembl/tsv/ensembl-compara/homologies",
@cache
def get_site_map(domain: str) -> SiteMapABC:
    """returns the site map instance registered for domain

    The registered factory is looked up by domain and called with no
    arguments. Results are cached per domain, so repeated calls return the
    same instance. An unregistered domain raises KeyError.
    """
    factory = _ensembl_site_map[domain]
    return factory()