Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_site_map.py: 91%

56 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-06-12 16:31 -0400

1import typing 

2 

3from abc import ABC, abstractmethod 

4from dataclasses import dataclass 

5from functools import cache 

6 

7from cogent3.util.misc import extend_docstring_from 

8 

9 

# Registry of site-map factories keyed by domain name. Entries are added by
# the ``register_ensembl_site_map`` decorator defined immediately below.
_ensembl_site_map = {}


class register_ensembl_site_map:
    """
    registration decorator for Ensembl site-map factories

    The registration key must be a string that is the domain name of the
    site. Leading and trailing whitespace is stripped before the key is used.

    Parameters
    ----------
    domain: str of domain name, must be unique

    Raises
    ------
    TypeError
        if domain is not a string
    ValueError
        if domain is empty once whitespace has been stripped
    AssertionError
        if domain is already a key in the registry
    """

    def __init__(self, domain: str):
        if not isinstance(domain, str):
            raise TypeError(f"{domain!r} is not a string")

        domain = domain.strip()
        if not domain:
            raise ValueError("cannot have empty string domain")

        # Raised explicitly instead of via an ``assert`` statement: asserts
        # are removed when Python runs with -O, which would let a duplicate
        # registration silently replace the existing entry. The exception
        # type and message are kept the same as the assert produced.
        if domain in _ensembl_site_map:
            raise AssertionError(f"{domain!r} already in {list(_ensembl_site_map)}")

        self._domain = domain

    def __call__(self, func):
        """records func in the registry under the domain and returns func unchanged"""
        # pass through
        _ensembl_site_map[self._domain] = func
        return func

42 

43 

# Annotation alias: either a string or None.
StrOrNone = typing.Optional[str]


class SiteMapABC(ABC):
    """abstract interface for objects that describe where data lives on a site

    Concrete classes must implement the two ``get_*_path`` methods. The
    read-only properties return the ``_alignments_path``, ``_homologies_path``
    and ``_trees_path`` attributes, which this class does not define itself;
    they are expected to be provided by the concrete class.
    """

    @abstractmethod
    def get_seqs_path(self, ensembl_name: str) -> str:
        """returns the path to genome sequences for ensembl_name"""

    @abstractmethod
    def get_annotations_path(self, ensembl_name: str) -> str:
        """returns the path to annotations for ensembl_name"""

    @property
    def alignments_path(self) -> StrOrNone:
        """value of the ``_alignments_path`` attribute"""
        return self._alignments_path

    @property
    def homologies_path(self) -> StrOrNone:
        """value of the ``_homologies_path`` attribute"""
        return self._homologies_path

    @property
    def trees_path(self) -> StrOrNone:
        """value of the ``_trees_path`` attribute"""
        return self._trees_path

67 

68 

69@dataclass(slots=True) 

70class SiteMap: 

71 """records the locations of specific attributes relative to an Ensembl release""" 

72 

73 site: str 

74 _seqs_path: str = "fasta" 

75 _annotations_path: str = "gff3" 

76 _alignments_path: str | None = None 

77 _homologies_path: str | None = None 

78 _trees_path: str | None = None 

79 

80 

class EnsemblPrimary(SiteMapABC, SiteMap):
    """site map that builds per-species paths from the SiteMap path fields"""

    def __init__(self, *args, **kwargs):
        # forward everything unchanged to the next __init__ in the MRO
        super().__init__(*args, **kwargs)

    def get_seqs_path(self, ensembl_name: str) -> str:
        """path to unmasked genome sequences"""
        root = self._seqs_path
        return f"{root}/{ensembl_name}/dna"

    def get_annotations_path(self, ensembl_name: str) -> str:
        """path to the annotations for ensembl_name"""
        root = self._annotations_path
        return f"{root}/{ensembl_name}"

91 

92 

@extend_docstring_from(SiteMap)
@register_ensembl_site_map("ftp.ensembl.org")
def ensembl_main_sitemap():
    """the main Ensembl site map"""
    # locations that differ from the SiteMap defaults (which are None)
    compara_paths = {
        "_alignments_path": "maf/ensembl-compara/multiple_alignments",
        "_homologies_path": "tsv/ensembl-compara/homologies",
        "_trees_path": "compara/species_trees",
    }
    return EnsemblPrimary(site="ftp.ensembl.org", **compara_paths)

103 

104 

# For bacteria we have the settings listed below, but there are complexities
# related to the bacterial collection a species belongs to. For example
# https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-57/fasta/bacteria_15_collection/_butyribacterium_methylotrophicum_gca_001753695/dna/
# To address this, the sitemap class needs to download the species table from
# Ensembl Bacteria and cache the collection/species mapping.

110# site = "ftp.ensemblgenomes.ebi.ac.uk", 

111# _genomes_path = "fasta", 

112# _annotations_path = "gff3", 

113# _homologies_path = "pan_ensembl/tsv/ensembl-compara/homologies", 

114 

115 

@cache
def get_site_map(domain: str) -> SiteMapABC:
    """returns a site map instance

    The registered callable for domain is looked up and called. Because the
    function is wrapped in ``functools.cache``, repeated calls with the same
    domain return the same object. An unregistered domain raises KeyError.
    """
    factory = _ensembl_site_map[domain]
    return factory()