Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/species.py: 89%

121 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2023-12-25 11:45 +1100

1from __future__ import annotations 

2 

3import os 

4import re 

5import typing 

6 

7from cogent3 import load_table 

8from cogent3.core.tree import TreeNode 

9from cogent3.util.table import Table 

10 

11from .util import ENSEMBLDBRC, CaseInsensitiveString, get_resource_path 

12 

13 

# Matches any single character that is not an ASCII letter, a space or an
# underscore.
_invalid_chars = re.compile("[^a-zA-Z _]")

# Annotation alias for a value that is either a str or None.
StrOrNone = typing.Optional[str]

17 

18 

def load_species(species_path):
    """returns [[latin_name, common_name],..] from species_path

    Parameters
    ----------
    species_path
        path of the species table to read. If nothing exists at that
        path, the "species.tsv" resource located by get_resource_path
        is read instead.

    Returns
    -------
    The rows of the loaded table as a list.
    """
    if os.path.exists(species_path):
        source = species_path
    else:
        source = get_resource_path("species.tsv")

    loaded = load_table(source)
    # remove this after 2023 Q4 (when the to_list() deprecation has been released)
    try:
        records = loaded.to_list()
    except AttributeError:
        # table objects without to_list() expose tolist() instead
        records = loaded.tolist()
    return records

32 

33 

# Species records read once, at import time, from "species.tsv" under
# ENSEMBLDBRC; used as the default data for SpeciesNameMap.
_species_common_map = load_species(
    os.path.join(ENSEMBLDBRC, "species.tsv"),
)

35 

36 

class SpeciesNameMap:
    """mapping between common names and latin names

    Four dictionaries are kept in step with each other:

    - latin name -> common name
    - common name -> latin name
    - latin name -> ensembl db prefix
    - ensembl db prefix -> latin name

    Every name is wrapped in CaseInsensitiveString before it is stored or
    looked up.
    """

    def __init__(self, species_common=_species_common_map):
        """provides latin name:common name mappings

        Parameters
        ----------
        species_common
            iterable of (latin name, common name) pairs. It is only read,
            never modified.
        """
        self._species_common = {}
        self._common_species = {}
        self._species_ensembl = {}
        self._ensembl_species = {}
        for names in species_common:
            names = list(map(CaseInsensitiveString, names))
            self.amend_species(*names)

    def __str__(self) -> str:
        return str(self.to_table())

    def __repr__(self) -> str:
        return repr(self.to_table())

    def __contains__(self, item) -> bool:
        """True if item is a known latin name, common name or ensembl prefix"""
        item = CaseInsensitiveString(item)
        return any(
            item in attr
            for attr in (
                self._species_common,
                self._common_species,
                self._ensembl_species,
            )
        )

    def _repr_html_(self) -> str:
        table = self.to_table()
        return table._repr_html_()

    @staticmethod
    def _report_unknown(msg: str, level: str) -> None:
        """handles an unknown name according to level

        Parameters
        ----------
        msg
            message describing the unknown name
        level
            "raise" raises ValueError(msg), "warn" prints the message with
            a "WARN: " prefix, any other value does nothing. Case and
            surrounding whitespace are ignored.
        """
        level = level.lower().strip()
        if level == "raise":
            raise ValueError(msg)
        if level == "warn":
            print(f"WARN: {msg}")

    def get_common_name(self, name: str, level="raise") -> StrOrNone:
        """returns the common name for the given name (which can be either a
        species name or the ensembl version)

        Parameters
        ----------
        name
            latin name, ensembl db prefix or common name
        level
            what to do when name is unknown: "raise" (ValueError), "warn"
            (print a warning) or anything else to silently return None

        Raises
        ------
        ValueError if name is unknown and level is "raise"
        """
        name = CaseInsensitiveString(name)
        if name in self._ensembl_species:
            # translate the ensembl prefix to its latin name first
            name = self._ensembl_species[name]

        if name in self._species_common:
            common_name = self._species_common[name]
        elif name in self._common_species:
            # already a common name
            common_name = name
        else:
            common_name = None

        if common_name is None:
            # level is normalised here exactly as in get_species_name
            self._report_unknown(f"Unknown species name: {name}", level)

        return common_name

    def get_species_name(self, name: str, level="ignore") -> StrOrNone:
        """returns the species name for the given common name

        Parameters
        ----------
        name
            common name, ensembl db prefix or latin name
        level
            what to do when name is unknown: "raise" (ValueError), "warn"
            (print a warning) or anything else to silently return None

        Raises
        ------
        ValueError if name is unknown and level is "raise"
        """
        name = CaseInsensitiveString(name)
        if name in self._species_common:
            # already a latin name
            return name

        species_name = None
        # no early exit: a match in the ensembl mapping takes precedence
        # over one in the common name mapping
        for data in (self._common_species, self._ensembl_species):
            if name in data:
                species_name = data[name]

        if species_name is None:
            self._report_unknown(f"Unknown common name: {name}", level)

        return species_name

    def get_species_names(self) -> typing.Sequence[StrOrNone]:
        """returns the list of species names"""
        return sorted(self._species_common)

    def get_ensembl_db_prefix(self, name) -> str:
        """returns a string of the species name in the format used by
        ensembl

        The result is the lower-cased latin name with spaces replaced by
        underscores.

        Raises
        ------
        ValueError if name is not a known latin name, common name or
        ensembl db prefix
        """
        name = CaseInsensitiveString(name)
        if name in self._common_species:
            name = self._common_species[name]
        try:
            # get_species_name returns latin names unchanged, so a
            # ValueError here always means the name is unknown
            species_name = self.get_species_name(name, level="raise")
        except ValueError as e:
            raise ValueError(f"Unknown name {name}") from e

        return str(species_name.lower().replace(" ", "_"))

    def _purge_species(self, species_name):
        """removes a species record

        Does nothing if species_name is not a known latin name. Reverse
        mappings are only deleted when they still point at this species, so
        a common name that has since been assigned to another species is
        left in place.
        """
        species_name = CaseInsensitiveString(species_name)
        if species_name not in self._species_common:
            return
        common_name = self._species_common.pop(species_name)
        ensembl_name = self._species_ensembl.pop(species_name, None)
        if self._ensembl_species.get(ensembl_name) == species_name:
            del self._ensembl_species[ensembl_name]
        if self._common_species.get(common_name) == species_name:
            del self._common_species[common_name]

    def amend_species(self, species_name, common_name):
        """add a new species, and common name

        An existing record for species_name is replaced.

        Parameters
        ----------
        species_name
            latin name, must not contain "_"
        common_name
            common name to associate with species_name
        """
        species_name = CaseInsensitiveString(species_name)
        common_name = CaseInsensitiveString(common_name)
        assert "_" not in species_name, "'_' in species_name, not a Latin name?"
        self._purge_species(species_name)  # remove if existing
        self._species_common[species_name] = common_name
        self._common_species[common_name] = species_name
        ensembl_name = species_name.lower().replace(" ", "_")
        self._species_ensembl[species_name] = ensembl_name
        self._ensembl_species[ensembl_name] = species_name

    def to_table(self):
        """returns cogent3 Table

        One row per species with the columns "Species name", "Common name"
        and "Ensembl Db Prefix"; the table is returned sorted.
        """
        # iterate the latin name mapping so that every species gets a row,
        # even when several species share one common name
        rows = [
            [species, common, self._species_ensembl[species]]
            for species, common in self._species_common.items()
        ]
        return Table(
            [
                "Species name",
                "Common name",
                "Ensembl Db Prefix",
            ],
            data=rows,
            space=2,
        ).sorted()

172 

173 

# Module-level SpeciesNameMap instance built from the default species records.
Species = SpeciesNameMap()

175 

176 

def species_from_ensembl_tree(tree: TreeNode) -> dict[str, str]:
    """get species identifiers from an Ensembl tree

    Parameters
    ----------
    tree
        tree whose tip names begin with a name known to Species,
        optionally followed by further "_" separated fields

    Returns
    -------
    dict mapping the common name to the (lower-cased) leading part of the
    tip name that was recognised by Species

    Raises
    ------
    ValueError if no leading part of a tip name is known to Species
    """
    matched = {}
    for tip in tree.get_tip_names():
        fields = tip.lower().split("_")
        # candidate names from the most specific (all fields) down to the
        # first two fields; the first slice bound exceeds len(fields),
        # which is harmless and means a single-field name is still tried
        candidates = (
            "_".join(fields[:stop]) for stop in range(len(fields) + 1, 1, -1)
        )
        found = next((c for c in candidates if c in Species), None)
        if found is None:
            raise ValueError(f"cannot establish species for {'_'.join(fields)}")
        matched[Species.get_common_name(found)] = found

    return matched