Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_species.py: 87%
135 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:31 -0400
1from __future__ import annotations
3import os
4import pathlib
5import typing
7from cogent3 import load_table
8from cogent3.core.tree import TreeNode
9from cogent3.util.table import Table
11from ._util import (
12 ENSEMBLDBRC,
13 CaseInsensitiveString,
14 get_resource_path,
15 get_stableid_prefix,
16)
# File name of the tab-delimited species table.
SPECIES_NAME = "species.tsv"
# Annotation alias for a value that is either a str or None.
StrOrNone = typing.Optional[str]
def load_species(species_path):
    """returns [[latin_name, common_name, stableid prefix],..] from species_path

    if species_path does not exist, defaults to default one

    Parameters
    ----------
    species_path
        location of a species table readable by cogent3 ``load_table``

    Returns
    -------
    The table rows as a list of lists (``Table.to_list()``).
    """
    if not os.path.exists(species_path):
        # fall back to the default table located via get_resource_path()
        species_path = get_resource_path(SPECIES_NAME)

    table = load_table(species_path)
    return table.to_list()
# Species records loaded once at import time; this is the default data
# used to populate SpeciesNameMap.
_species_common_map = load_species(os.path.join(ENSEMBLDBRC, SPECIES_NAME))
class SpeciesNameMap:
    """mapping between common names and latin names

    Every name is wrapped in CaseInsensitiveString before it is stored or
    looked up. Five dicts are maintained:

    - ``_species_common``: species (latin) name -> common name
    - ``_common_species``: common name -> species name
    - ``_species_ensembl``: species name -> ensembl db prefix
      (lower case, spaces replaced by underscores)
    - ``_ensembl_species``: ensembl db prefix -> species name
    - ``_stableid_species``: stable id prefix -> owning species. The owner
      is a species name when set by amend_species() / add_stableid_prefix()
      and an ensembl db prefix when set by update_from_file().
    """

    def __init__(self, species_common=None):
        """provides latin name:common name mappings

        Parameters
        ----------
        species_common
            iterable of records; the items of each record are passed
            positionally to amend_species(), i.e. species name, common
            name and, optionally, a comma separated string of stableid
            prefixes. If None, the module level ``_species_common_map``
            is used.
        """
        if species_common is None:
            # resolved at call time rather than bound at definition time
            species_common = _species_common_map

        self._species_common = {}
        self._common_species = {}
        self._species_ensembl = {}
        self._ensembl_species = {}
        self._stableid_species = {}  # stable id prefix to species map
        for names in species_common:
            names = list(map(CaseInsensitiveString, names))
            self.amend_species(*names)

    def __str__(self) -> str:
        return str(self.to_table())

    def __repr__(self) -> str:
        return repr(self.to_table())

    def __contains__(self, item) -> bool:
        """True if item is a known species, common or ensembl db name"""
        item = CaseInsensitiveString(item)
        return any(
            item in attr
            for attr in (
                self._species_common,
                self._common_species,
                self._ensembl_species,
            )
        )

    def _repr_html_(self) -> str:
        table = self.to_table()
        return table._repr_html_()

    def get_common_name(self, name: str, level="raise") -> StrOrNone:
        """returns the common name for the given name (which can be either a
        species name or the ensembl version)

        Parameters
        ----------
        name
            species name, ensembl db name or common name
        level
            what to do when name is unknown: "raise" raises a ValueError,
            "warn" prints a warning and returns None, any other value
            silently returns None. Case and surrounding whitespace are
            ignored, as in get_species_name().
        """
        name = CaseInsensitiveString(name)
        if name in self._ensembl_species:
            name = self._ensembl_species[name]

        if name in self._species_common:
            common_name = self._species_common[name]
        elif name in self._common_species:
            common_name = name
        else:
            common_name = None

        if common_name is None:
            msg = f"Unknown species name: {name}"
            # normalise so that e.g. "RAISE" behaves like "raise"
            level = str(level).lower().strip()
            if level == "raise":
                raise ValueError(msg)
            elif level == "warn":
                print(f"WARN: {msg}")

        return common_name

    def get_species_name(self, name: str, level="ignore") -> StrOrNone:
        """returns the species name for the given common name

        Parameters
        ----------
        name
            common name, ensembl db name or species name
        level
            what to do when name is unknown: "raise" raises a ValueError,
            "warn" prints a warning and returns None, any other value
            silently returns None
        """
        name = CaseInsensitiveString(name)
        if name in self._species_common:
            return name

        species_name = None
        level = level.lower().strip()
        # no early exit: if name is in both dicts the ensembl match wins
        for data in [self._common_species, self._ensembl_species]:
            if name in data:
                species_name = data[name]
        if species_name is None:
            msg = f"Unknown common name: {name}"
            if level == "raise":
                raise ValueError(msg)
            elif level == "warn":
                print(f"WARN: {msg}")

        return species_name

    def get_species_names(self) -> typing.Sequence[StrOrNone]:
        """returns the list of species names"""
        return sorted(self._species_common.keys())

    def get_ensembl_db_prefix(self, name: str) -> str:
        """returns a string of the species name in the format used by
        ensembl

        Raises
        ------
        ValueError if name cannot be resolved to a known species
        """
        name = CaseInsensitiveString(name)
        if name in self._common_species:
            name = self._common_species[name]
        try:
            species_name = self.get_species_name(name, level="raise")
        except ValueError as e:
            if name not in self._species_common:
                raise ValueError(f"Unknown name {name}") from e
            species_name = name

        return str(species_name.lower().replace(" ", "_"))

    def get_db_prefix_from_stableid(self, stableid: str) -> str:
        """returns the db name from a stableid

        Raises
        ------
        KeyError if the prefix of stableid has not been registered
        """
        prefix = get_stableid_prefix(stableid)
        species = self._stableid_species[prefix]
        # the stored owner may be a species name or already a db prefix,
        # the conversion below is a no-op for the latter
        return species.replace(" ", "_").lower()

    def _purge_species(self, species_name):
        """removes a species record

        Only the four name dicts are updated, stable id prefixes registered
        for the species are kept.
        """
        species_name = CaseInsensitiveString(species_name)
        if species_name not in self._species_common:
            return
        common_name = self._species_common.pop(species_name)
        ensembl_name = self._species_ensembl.pop(species_name)
        self._ensembl_species.pop(ensembl_name)
        self._common_species.pop(common_name)

    def amend_species(self, species_name, common_name, stableid_prefix=None):
        """add a new species, and common name

        Parameters
        ----------
        species_name
            latin name, must not contain "_"
        common_name
            common name for the species
        stableid_prefix
            optional comma separated string of stable id prefixes
        """
        species_name = CaseInsensitiveString(species_name)
        common_name = CaseInsensitiveString(common_name)
        assert "_" not in species_name, "'_' in species_name, not a Latin name?"
        self._purge_species(species_name)  # remove if existing
        self._species_common[species_name] = common_name
        self._common_species[common_name] = species_name
        ensembl_name = species_name.lower().replace(" ", "_")
        self._species_ensembl[species_name] = ensembl_name
        self._ensembl_species[ensembl_name] = species_name
        if stableid_prefix:
            # make sure stableid just a string
            for prefix in stableid_prefix.split(","):
                self._stableid_species[prefix] = species_name

    def add_stableid_prefix(
        self, species_name: str, stableid_prefix: str | CaseInsensitiveString
    ):
        """registers stableid_prefix as belonging to species_name"""
        # NOTE(review): get_species_name() defaults to level="ignore", so an
        # unknown species_name stores None for this prefix -- confirm whether
        # that is intended
        self._stableid_species[str(stableid_prefix)] = self.get_species_name(
            species_name
        )

    def _stableid_prefixes(self, ensembl_name) -> str:
        """returns the comma separated stable id prefixes registered for the
        species with the given ensembl db prefix

        The stored owner of a prefix is either a species name or an ensembl
        db prefix, so it is converted to db prefix form before comparing.
        """
        wanted = str(ensembl_name).lower()
        return ",".join(
            prefix
            for prefix, owner in self._stableid_species.items()
            if owner is not None
            and str(owner).replace(" ", "_").lower() == wanted
        )

    def to_table(self):
        """returns cogent3 Table"""
        rows = []
        for common, species in self._common_species.items():
            ensembl = self._species_ensembl[species]
            # all prefixes for this species
            stableids = self._stableid_prefixes(ensembl)
            rows.append([species, common, ensembl, stableids])
        return Table(
            [
                "Species name",
                "Common name",
                "Ensembl Db Prefix",
                "Ensembl stableid Prefix",
            ],
            data=rows,
            space=2,
        ).sorted()

    def update_from_file(self, species_path: pathlib.Path) -> None:
        """updates instance from tab delimited table at species_path

        Only the stable id prefix mapping is updated, using the
        "Ensembl Db Prefix" and "Ensembl stableid Prefix" columns.
        """
        table = load_table(species_path)
        columns = "Ensembl Db Prefix", "Ensembl stableid Prefix"
        for db_name, prefixes in table.to_list(columns=columns):
            for prefix in prefixes.split(","):
                if not prefix:
                    # an empty cell must not register "" as a prefix
                    continue
                self._stableid_species[prefix] = db_name
# Shared module-level instance, populated from the default species data.
Species = SpeciesNameMap()


def species_from_ensembl_tree(tree: TreeNode) -> dict[str, str]:
    """get species identifiers from an Ensembl tree

    Parameters
    ----------
    tree
        tree whose tip names start with an ensembl style species name,
        i.e. lower case words joined by "_"

    Returns
    -------
    dict mapping the common name of each identified species to the part of
    the tip name that matched it

    Raises
    ------
    ValueError if no species can be identified for a tip name
    """
    tip_names = tree.get_tip_names()
    selected_species = {}
    for tip_name in tip_names:
        name_fields = tip_name.lower().split("_")
        # produce parts of name starting with highly specific to
        # more general and look for matches. Candidates shrink down to the
        # first two fields; a single field name is tried as is.
        for j in range(max(len(name_fields), 2), 1, -1):
            n = "_".join(name_fields[:j])
            if n in Species:
                selected_species[Species.get_common_name(n)] = n
                break
        else:
            raise ValueError(f"cannot establish species for {'_'.join(name_fields)}")

    return selected_species