Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_site_map.py: 89%

57 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2024-03-25 13:40 +1100

1import typing 

2 

3from abc import ABC, abstractmethod 

4from dataclasses import dataclass 

5from functools import cache 

6 

7from cogent3.util.misc import extend_docstring_from 

8 

9 

# registry mapping a domain name to the callable registered for that domain
_ensembl_site_map = {}


class register_ensembl_site_map:
    """
    registration decorator for Ensembl site-map factories

    The registration key must be a string that is the domain name of the
    site. Leading and trailing whitespace is stripped before the key is
    checked and stored.

    Parameters
    ----------
    domain: str of domain name, must be unique

    Raises
    ------
    TypeError if domain is not a string, ValueError if domain is empty
    after stripping, AssertionError if domain is already registered.
    """

    def __init__(self, domain: str):
        if not isinstance(domain, str):
            raise TypeError(f"{domain!r} is not a string")

        domain = domain.strip()
        if not domain:
            raise ValueError("cannot have empty string domain")

        # raised explicitly instead of using an assert statement so the
        # uniqueness check still runs when python is started with -O;
        # the exception type and message are the same as before
        if domain in _ensembl_site_map:
            raise AssertionError(f"{domain!r} already in {list(_ensembl_site_map)}")

        self._domain = domain

    def __call__(self, func):
        # record func under the validated domain and pass it through unchanged
        _ensembl_site_map[self._domain] = func
        return func

42 

43 

# annotation for values that are either a str or None
StrOrNone = typing.Optional[str]


class SiteMapABC(ABC):
    """interface for site maps

    Subclasses must implement the two path-building methods. The read-only
    properties return the ``_alignments_path``, ``_homologies_path`` and
    ``_trees_path`` attributes of the instance; this class does not set
    those attributes itself.
    """

    @abstractmethod
    def get_seqs_path(self, ensembl_name: str) -> str:
        """returns the path to genome sequences for ensembl_name"""

    @abstractmethod
    def get_annotations_path(self, ensembl_name: str) -> str:
        """returns the path to annotations for ensembl_name"""

    @property
    def alignments_path(self) -> StrOrNone:
        """value of the _alignments_path attribute"""
        return self._alignments_path

    @property
    def homologies_path(self) -> StrOrNone:
        """value of the _homologies_path attribute"""
        return self._homologies_path

    @property
    def trees_path(self) -> StrOrNone:
        """value of the _trees_path attribute"""
        return self._trees_path

68 

69 

70@dataclass(slots=True) 

71class SiteMap: 

72 """records the locations of specific attributes relative to an Ensembl release""" 

73 

74 site: str 

75 _seqs_path: str = "fasta" 

76 _annotations_path: str = "gff3" 

77 _alignments_path: str | None = None 

78 _homologies_path: str | None = None 

79 _trees_path: str | None = None 

80 

81 

class EnsemblPrimary(SiteMapABC, SiteMap):
    """site map that builds sequence and annotation paths from ensembl_name"""

    def __init__(self, *args, **kwargs):
        # every argument is forwarded unchanged to the next __init__ in the MRO
        super().__init__(*args, **kwargs)

    def get_seqs_path(self, ensembl_name: str) -> str:
        """path to unmasked genome sequences"""
        seqs_root = self._seqs_path
        return f"{seqs_root}/{ensembl_name}/dna"

    def get_annotations_path(self, ensembl_name: str) -> str:
        """path to the annotations for ensembl_name"""
        annotations_root = self._annotations_path
        return f"{annotations_root}/{ensembl_name}"

92 

93 

@extend_docstring_from(SiteMap)
@register_ensembl_site_map("ftp.ensembl.org")
def ensembl_main_sitemap():
    """builds the site map for the main Ensembl site"""
    # locations that differ from the SiteMap defaults
    compara_paths = {
        "_alignments_path": "maf/ensembl-compara/multiple_alignments",
        "_homologies_path": "tsv/ensembl-compara/homologies",
        "_trees_path": "compara/species_trees",
    }
    return EnsemblPrimary(site="ftp.ensembl.org", **compara_paths)

104 

105 

# for bacteria we have the settings listed below, but there are complexities
# related to the bacterial collection a species belongs to. For example

108# https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-57/fasta/bacteria_15_collection/_butyribacterium_methylotrophicum_gca_001753695/dna/ 

109# so to address this, the sitemap class needs to download the species table from ensembl bacteria 

110# and cache the collection/species mapping 

111# site = "ftp.ensemblgenomes.ebi.ac.uk", 

112# _genomes_path = "fasta", 

113# _annotations_path = "gff3", 

114# _homologies_path = "pan_ensembl/tsv/ensembl-compara/homologies", 

115 

116 

@cache
def get_site_map(domain: str) -> SiteMapABC:
    """returns the site map produced by the factory registered for domain

    The result is cached per domain, so repeated calls with the same domain
    return the same instance. Raises KeyError if domain is not registered.
    """
    make_site_map = _ensembl_site_map[domain]
    return make_site_map()

120 return _ensembl_site_map[domain]()