Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_species.py: 87%

135 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-06-12 16:31 -0400

1from __future__ import annotations 

2 

3import os 

4import pathlib 

5import typing 

6 

7from cogent3 import load_table 

8from cogent3.core.tree import TreeNode 

9from cogent3.util.table import Table 

10 

11from ._util import ( 

12 ENSEMBLDBRC, 

13 CaseInsensitiveString, 

14 get_resource_path, 

15 get_stableid_prefix, 

16) 

17 

18 

# file name of the tab delimited species table
SPECIES_NAME = "species.tsv"
# annotation alias for values that are either a string or None
StrOrNone = typing.Optional[str]

21 

22 

def load_species(species_path):
    """returns [[latin_name, common_name, stableid prefix],..] from species_path

    Parameters
    ----------
    species_path
        location of a species table readable by load_table; if nothing
        exists at that location, the packaged "species.tsv" resource is
        loaded instead
    """
    chosen = (
        species_path
        if os.path.exists(species_path)
        else get_resource_path("species.tsv")
    )
    return load_table(chosen).to_list()

32 

33 

# species records loaded once at import time from the "species.tsv" under
# ENSEMBLDBRC (load_species falls back to the packaged table when that path
# does not exist); used as the default argument of SpeciesNameMap.__init__
_species_common_map = load_species(os.path.join(ENSEMBLDBRC, "species.tsv"))

35 

36 

class SpeciesNameMap:
    """mapping between common names and latin names

    Maintains lookups between the Latin (species) name, the common name,
    the Ensembl db prefix form of the Latin name (lower case, spaces
    replaced by underscores) and stable id prefixes. Names are wrapped in
    CaseInsensitiveString before being used as keys.
    """

    def __init__(self, species_common=_species_common_map):
        """provides latin name:common name mappings

        Parameters
        ----------
        species_common
            iterable of records; the members of each record are passed
            positionally to amend_species(), i.e. species name, common
            name and, optionally, comma separated stable id prefixes
        """
        self._species_common = {}
        self._common_species = {}
        self._species_ensembl = {}
        self._ensembl_species = {}
        self._stableid_species = {}  # stable id prefix to species map
        for names in species_common:
            names = list(map(CaseInsensitiveString, names))
            self.amend_species(*names)

    def __str__(self) -> str:
        return str(self.to_table())

    def __repr__(self) -> str:
        return repr(self.to_table())

    def __contains__(self, item) -> bool:
        """True if item is a known species, common or ensembl db name"""
        item = CaseInsensitiveString(item)
        return any(
            item in attr
            for attr in (
                self._species_common,
                self._common_species,
                self._ensembl_species,
            )
        )

    def _repr_html_(self) -> str:
        table = self.to_table()
        return table._repr_html_()

    def get_common_name(self, name: str, level="raise") -> StrOrNone:
        """returns the common name for the given name (which can be either a
        species name or the ensembl version)

        Parameters
        ----------
        name
            species name, ensembl db name or common name
        level
            what to do when name is unknown: "raise" raises a ValueError,
            "warn" prints a warning and returns None, anything else
            silently returns None

        Raises
        ------
        ValueError if name is unknown and level is "raise"
        """
        name = CaseInsensitiveString(name)
        if name in self._ensembl_species:
            name = self._ensembl_species[name]

        if name in self._species_common:
            common_name = self._species_common[name]
        elif name in self._common_species:
            common_name = name
        else:
            common_name = None

        if common_name is None:
            # normalise level the same way get_species_name() does
            level = level.lower().strip()
            msg = f"Unknown species name: {name}"
            if level == "raise":
                raise ValueError(msg)
            elif level == "warn":
                print(f"WARN: {msg}")

        return common_name

    def get_species_name(self, name: str, level="ignore") -> StrOrNone:
        """returns the species name for the given common name

        Parameters
        ----------
        name
            species name, common name or ensembl db name
        level
            what to do when name is unknown: "raise" raises a ValueError,
            "warn" prints a warning and returns None, anything else
            silently returns None

        Raises
        ------
        ValueError if name is unknown and level is "raise"
        """
        name = CaseInsensitiveString(name)
        if name in self._species_common:
            return name

        species_name = None
        level = level.lower().strip()
        # no early exit: if both maps contain name, the ensembl map wins
        for data in [self._common_species, self._ensembl_species]:
            if name in data:
                species_name = data[name]
        if species_name is None:
            msg = f"Unknown common name: {name}"
            if level == "raise":
                raise ValueError(msg)
            elif level == "warn":
                print(f"WARN: {msg}")

        return species_name

    def get_species_names(self) -> typing.Sequence[StrOrNone]:
        """returns the list of species names"""
        return sorted(self._species_common.keys())

    def get_ensembl_db_prefix(self, name: str) -> str:
        """returns a string of the species name in the format used by
        ensembl

        Raises
        ------
        ValueError if name is not a known species, common or ensembl name
        """
        name = CaseInsensitiveString(name)
        if name in self._common_species:
            name = self._common_species[name]
        try:
            species_name = self.get_species_name(name, level="raise")
        except ValueError as e:
            # get_species_name() returns any key of _species_common as is,
            # so reaching here means the name is unknown everywhere
            raise ValueError(f"Unknown name {name}") from e

        return str(species_name.lower().replace(" ", "_"))

    def get_db_prefix_from_stableid(self, stableid: str) -> str:
        """returns the db name from a stableid

        Raises
        ------
        KeyError if the prefix of stableid has not been registered
        """
        prefix = get_stableid_prefix(stableid)
        species = self._stableid_species[prefix]
        return species.replace(" ", "_").lower()

    def _purge_species(self, species_name):
        """removes a species record

        Stable id prefixes registered for the species are left in place.
        """
        species_name = CaseInsensitiveString(species_name)
        if species_name not in self._species_common:
            return
        common_name = self._species_common.pop(species_name)
        ensembl_name = self._species_ensembl.pop(species_name)
        self._ensembl_species.pop(ensembl_name, None)
        # only drop the common name if it still refers to this species, a
        # later amend_species() may have assigned it to a different one
        if self._common_species.get(common_name) == species_name:
            del self._common_species[common_name]

    def amend_species(self, species_name, common_name, stableid_prefix=None):
        """add a new species, and common name

        Parameters
        ----------
        species_name
            Latin name, must not contain "_"
        common_name
            the common name to associate with species_name
        stableid_prefix
            optional comma separated stable id prefixes for the species

        Notes
        -----
        An existing record for species_name is replaced.
        """
        species_name = CaseInsensitiveString(species_name)
        common_name = CaseInsensitiveString(common_name)
        assert "_" not in species_name, "'_' in species_name, not a Latin name?"
        self._purge_species(species_name)  # remove if existing
        self._species_common[species_name] = common_name
        self._common_species[common_name] = species_name
        ensembl_name = species_name.lower().replace(" ", "_")
        self._species_ensembl[species_name] = ensembl_name
        self._ensembl_species[ensembl_name] = species_name
        if stableid_prefix:
            for prefix in stableid_prefix.split(","):
                if prefix:  # ignore empty fields, e.g. from a trailing comma
                    self._stableid_species[prefix] = species_name

    def add_stableid_prefix(
        self, species_name: str, stableid_prefix: str | CaseInsensitiveString
    ):
        """registers stableid_prefix as belonging to species_name"""
        # NOTE(review): get_species_name() returns None for an unknown name
        # at its default level, so None can be stored here
        self._stableid_species[str(stableid_prefix)] = self.get_species_name(
            species_name
        )

    def to_table(self):
        """returns cogent3 Table"""
        # Group the prefixes by db prefix once. Owners are stored either as
        # a Latin name (amend_species, add_stableid_prefix) or as a db
        # prefix (update_from_file), so both are normalised to db prefix.
        prefixes_by_db = {}
        for prefix, owner in self._stableid_species.items():
            if owner is None:
                continue
            db_prefix = str(owner).replace(" ", "_").lower()
            prefixes_by_db.setdefault(db_prefix, []).append(prefix)

        rows = []
        for common, species in self._common_species.items():
            ensembl = self._species_ensembl[species]
            # all prefixes for this species
            stableids = ",".join(prefixes_by_db.get(str(ensembl), []))
            rows.append([species, common, ensembl, stableids])
        return Table(
            [
                "Species name",
                "Common name",
                "Ensembl Db Prefix",
                "Ensembl stableid Prefix",
            ],
            data=rows,
            space=2,
        ).sorted()

    def update_from_file(self, species_path: pathlib.Path) -> None:
        """updates instance from tab delimited table at species_path

        Notes
        -----
        Only the "Ensembl Db Prefix" and "Ensembl stableid Prefix" columns
        are read; each comma separated prefix is mapped to the db prefix.
        """
        table = load_table(species_path)
        columns = "Ensembl Db Prefix", "Ensembl stableid Prefix"
        for db_name, prefixes in table.to_list(columns=columns):
            if not prefixes:
                # species without any stable id prefix
                continue
            for prefix in prefixes.split(","):
                if prefix:
                    self._stableid_species[prefix] = db_name

202 

203 

# module level instance built from the default species records; it is the
# lookup used by species_from_ensembl_tree()
Species = SpeciesNameMap()

205 

206 

def species_from_ensembl_tree(tree: TreeNode) -> dict[str, str]:
    """get species identifiers from an Ensembl tree

    Parameters
    ----------
    tree
        tree whose tip names begin with a name known to Species,
        with fields delimited by "_"

    Returns
    -------
    dict mapping the common name to the matched (lower cased) part of
    the tip name

    Raises
    ------
    ValueError if no leading part of a tip name is in Species
    """
    selected_species = {}
    for tip_name in tree.get_tip_names():
        name_fields = tip_name.lower().split("_")
        # try the most specific candidate first, dropping one trailing
        # field at a time; starting one past the end slices the full name
        # and means a single field name still gets tried once
        num_fields = len(name_fields) + 1
        while num_fields > 1:
            candidate = "_".join(name_fields[:num_fields])
            if candidate in Species:
                selected_species[Species.get_common_name(candidate)] = candidate
                break
            num_fields -= 1
        else:
            # loop finished without a match
            raise ValueError(f"cannot establish species for {'_'.join(name_fields)}")

    return selected_species