Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/species.py: 89%
121 statements
« prev ^ index » next coverage.py v7.2.3, created at 2023-12-25 11:45 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2023-12-25 11:45 +1100
1from __future__ import annotations
3import os
4import re
5import typing
7from cogent3 import load_table
8from cogent3.core.tree import TreeNode
9from cogent3.util.table import Table
11from .util import ENSEMBLDBRC, CaseInsensitiveString, get_resource_path
# matches any character other than ASCII letters, a space or an underscore
_invalid_chars = re.compile("[^a-zA-Z _]")

# annotation alias for values that are either a str or None
StrOrNone = typing.Optional[str]
def load_species(species_path):
    """returns [[latin_name, common_name],..] from species_path

    Parameters
    ----------
    species_path
        location of a species table. When nothing exists at that path,
        the "species.tsv" resource located by get_resource_path() is
        loaded instead.

    Returns
    -------
    the rows of the loaded table as a list
    """
    if os.path.exists(species_path):
        path = species_path
    else:
        path = get_resource_path("species.tsv")

    species_table = load_table(path)
    try:
        # remove this after 2023 Q4 (when the to_list() deprecation has been released)
        rows = species_table.to_list()
    except AttributeError:
        # table object without to_list(), use the older method name
        rows = species_table.tolist()
    return rows
# species records read once at import time from "species.tsv" under
# ENSEMBLDBRC; load_species() falls back to the get_resource_path() copy
# when that file does not exist
_species_common_map = load_species(
    os.path.join(ENSEMBLDBRC, "species.tsv"),
)
class SpeciesNameMap:
    """mapping between common names and latin names

    Four dictionaries are maintained together, all keyed by
    CaseInsensitiveString:

    - species name -> common name
    - common name -> species name
    - species name -> Ensembl db prefix
    - Ensembl db prefix -> species name

    The Ensembl db prefix is the lower-cased species name with spaces
    replaced by underscores.
    """

    def __init__(self, species_common=_species_common_map):
        """provides latin name:common name mappings

        Parameters
        ----------
        species_common
            series of (species name, common name) pairs, each pair is
            registered via amend_species()
        """
        self._species_common = {}
        self._common_species = {}
        self._species_ensembl = {}
        self._ensembl_species = {}
        for names in species_common:
            self.amend_species(*map(CaseInsensitiveString, names))

    def __str__(self) -> str:
        return str(self.to_table())

    def __repr__(self) -> str:
        return repr(self.to_table())

    def __contains__(self, item) -> bool:
        """True if item is a known species name, common name or Ensembl
        db prefix"""
        item = CaseInsensitiveString(item)
        return any(
            item in attr
            for attr in (
                self._species_common,
                self._common_species,
                self._ensembl_species,
            )
        )

    def _repr_html_(self) -> str:
        """returns the html representation of the table from to_table()"""
        return self.to_table()._repr_html_()

    def get_common_name(self, name: str, level="raise") -> StrOrNone:
        """returns the common name for the given name (which can be either a
        species name or the ensembl version)

        Parameters
        ----------
        name
            a species name, an Ensembl db prefix or a common name
        level
            what to do when name is unknown: "raise" raises a ValueError,
            "warn" prints a warning, anything else is silent. Case and
            surrounding whitespace are ignored, as in get_species_name().

        Returns
        -------
        the common name, or None if name is unknown and level is not "raise"
        """
        name = CaseInsensitiveString(name)
        if name in self._ensembl_species:
            name = self._ensembl_species[name]

        if name in self._species_common:
            return self._species_common[name]

        if name in self._common_species:
            # name is already a common name
            return name

        # str() so a non-string level (e.g. None) keeps being ignored
        level = str(level).lower().strip()
        msg = f"Unknown species name: {name}"
        if level == "raise":
            raise ValueError(msg)
        if level == "warn":
            print(f"WARN: {msg}")

        return None

    def get_species_name(self, name: str, level="ignore") -> StrOrNone:
        """returns the species name for the given common name

        Parameters
        ----------
        name
            a common name, an Ensembl db prefix or a species name (which is
            returned unchanged, as a CaseInsensitiveString)
        level
            what to do when name is unknown: "raise" raises a ValueError,
            "warn" prints a warning, anything else is silent

        Returns
        -------
        the species name, or None if name is unknown and level is not "raise"
        """
        name = CaseInsensitiveString(name)
        if name in self._species_common:
            return name

        species_name = None
        level = level.lower().strip()
        # both mappings are consulted, a match among the Ensembl db
        # prefixes takes precedence over one among the common names
        for data in (self._common_species, self._ensembl_species):
            if name in data:
                species_name = data[name]

        if species_name is None:
            msg = f"Unknown common name: {name}"
            if level == "raise":
                raise ValueError(msg)
            if level == "warn":
                print(f"WARN: {msg}")

        return species_name

    def get_species_names(self) -> typing.Sequence[StrOrNone]:
        """returns the list of species names"""
        return sorted(self._species_common)

    def get_ensembl_db_prefix(self, name) -> str:
        """returns a string of the species name in the format used by
        ensembl

        Raises
        ------
        ValueError if name is not a known species name, common name or
        Ensembl db prefix
        """
        name = CaseInsensitiveString(name)
        if name in self._common_species:
            name = self._common_species[name]

        try:
            species_name = self.get_species_name(name, level="raise")
        except ValueError as e:
            # get_species_name() returns any known species name directly, so
            # reaching here means name is unknown in every mapping
            raise ValueError(f"Unknown name {name}") from e

        return str(species_name.lower().replace(" ", "_"))

    def _purge_species(self, species_name):
        """removes a species record, does nothing if species_name is not
        a known species name"""
        species_name = CaseInsensitiveString(species_name)
        if species_name not in self._species_common:
            return

        common_name = self._species_common.pop(species_name)
        ensembl_name = self._species_ensembl.pop(species_name)
        self._ensembl_species.pop(ensembl_name, None)
        # The common name may since have been assigned to a different
        # species, in which case the reverse mapping belongs to that species
        # and must be left in place.
        # NOTE(review): relies on CaseInsensitiveString supporting ==
        if self._common_species.get(common_name) == species_name:
            del self._common_species[common_name]

    def amend_species(self, species_name, common_name):
        """add a new species, and common name

        An existing record for species_name is replaced.

        Parameters
        ----------
        species_name
            the latin name, must not contain "_"
        common_name
            the common name to associate with species_name
        """
        species_name = CaseInsensitiveString(species_name)
        common_name = CaseInsensitiveString(common_name)
        # kept as an assert so the exception type seen by callers is unchanged
        assert "_" not in species_name, "'_' in species_name, not a Latin name?"
        self._purge_species(species_name)  # remove if existing
        self._species_common[species_name] = common_name
        self._common_species[common_name] = species_name
        ensembl_name = species_name.lower().replace(" ", "_")
        self._species_ensembl[species_name] = ensembl_name
        self._ensembl_species[ensembl_name] = species_name

    def to_table(self):
        """returns cogent3 Table

        One row per common name with columns "Species name", "Common name"
        and "Ensembl Db Prefix". The result of Table.sorted() is returned.
        """
        rows = [
            [species, common, self._species_ensembl[species]]
            for common, species in self._common_species.items()
        ]
        return Table(
            [
                "Species name",
                "Common name",
                "Ensembl Db Prefix",
            ],
            data=rows,
            space=2,
        ).sorted()
# module-level SpeciesNameMap built from the default species records,
# species_from_ensembl_tree() uses it for name lookups
Species = SpeciesNameMap()
def species_from_ensembl_tree(tree: TreeNode) -> dict[str, str]:
    """get species identifiers from an Ensembl tree

    Parameters
    ----------
    tree
        tree whose tip names begin with "_" delimited fields that
        identify a species known to Species

    Returns
    -------
    dict mapping the common name to the matched, lower-cased, tip name prefix

    Raises
    ------
    ValueError if no prefix of a tip name is present in Species
    """
    selected_species = {}
    for tip_name in tree.get_tip_names():
        name_fields = tip_name.lower().split("_")
        # candidate names run from the most specific (all fields) to the
        # most general (first two fields). Starting at len + 1 means a
        # name consisting of a single field is still tried once.
        candidates = (
            "_".join(name_fields[:num_fields])
            for num_fields in range(len(name_fields) + 1, 1, -1)
        )
        matched = next((name for name in candidates if name in Species), None)
        if matched is None:
            raise ValueError(f"cannot establish species for {'_'.join(name_fields)}")

        selected_species[Species.get_common_name(matched)] = matched

    return selected_species