Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_name.py: 96%
104 statements
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
1from __future__ import annotations
3import re
4import typing
6from dataclasses import dataclass
8from ._species import Species
# a run of digits; the first such run in a db name is taken as the release
_release = re.compile(r"\d+")


def get_version_from_name(name):
    """Return the release and build identifiers from an Ensembl db name.

    When ``name`` contains no digits the result is ``(None, None)``.
    Otherwise the result is the first run of digits (as a string) and a
    list of the non-empty, underscore-delimited fields that follow it.
    """
    found = _release.search(name)
    if found is None:
        return None, None

    # The first digit run is the release and the build follows it. Under the
    # ensemblgenomes naming system the second digit run is the standard
    # Ensembl release and the first one belongs to the specified genome.
    release = found.group()
    trailing = name[found.end() :]
    build = [field for field in _name_delim.split(trailing) if field]
    return release, build


# fields of a db name are separated by underscores
_name_delim = re.compile("_")
def get_dbtype_from_name(name):
    """Return the database type encoded in a db name.

    Only the text ahead of the first run of digits is examined. If its first
    underscore-delimited field is "ensembl" the type is the second field,
    otherwise it is the last field.
    """
    head = _release.split(name)[0]
    fields = [field for field in _name_delim.split(head) if field]
    if fields[0] == "ensembl":
        return fields[1]
    return fields[-1]
def get_db_prefix(name):
    """Return the db prefix, typically an organism name or "ensembl".

    Only the text ahead of the first run of digits is examined. A name whose
    first field is "ensembl" has the prefix "ensembl"; otherwise the prefix
    is every field but the last, re-joined with underscores.

    Raises
    ------
    ValueError
        if the first field is not "ensembl" and there are fewer than three
        fields ahead of the first run of digits.
    """
    head = _release.split(name)[0]
    fields = [field for field in _name_delim.split(head) if field]
    if fields[0] == "ensembl":
        return "ensembl"

    if len(fields) <= 2:
        raise ValueError(f"Unknown name structure: {'_'.join(fields)}")

    return "_".join(fields[:-1])
class EnsemblDbName:
    """Container for a db name.

    Attributes such as the type, prefix, release, build and species are
    inferred from the name. Comparison and hashing are based solely on the
    full name, and instances compare directly against plain values.
    """

    def __init__(self, db_name):
        """db_name: an Ensembl database name"""
        self.name = db_name
        self.type = get_dbtype_from_name(db_name)
        self.prefix = get_db_prefix(db_name)

        self.release, trailing = get_version_from_name(db_name)
        # unless the name says otherwise, the general release is the release
        self.general_release = self.release
        self.build = None

        if trailing:
            if len(trailing) > 1:
                # two or more fields after the release: the first is the
                # general release, the second is the build
                self.build = trailing[1]
                self.general_release = trailing[0]
            elif self.type == "compara":
                # a single trailing field on a compara db is a release
                self.general_release = trailing[0]
            else:
                self.build = trailing[0]

        self.species = Species.get_species_name(self.prefix)

    def __repr__(self):
        build_part = "" if self.build is None else f"; build='{self.build}'"
        return f"db(prefix='{self.prefix}'; type='{self.type}'; release='{self.release}'{build_part})"

    def __str__(self):
        return self.name

    def __lt__(self, other):
        rhs = other.name if isinstance(other, type(self)) else other
        return self.name < rhs

    def __eq__(self, other):
        rhs = other.name if isinstance(other, type(self)) else other
        return self.name == rhs

    def __ne__(self, other):
        rhs = other.name if isinstance(other, type(self)) else other
        return self.name != rhs

    def __hash__(self):
        return hash(self.name)
104@dataclass(slots=True)
105class EmfName:
106 """stores information from EMF SEQ records"""
108 species: str
109 seqid: str
110 start: int
111 stop: int
112 strand: str
113 coord_length: str
115 def __post_init__(self):
116 # adjust the lengths to be ints and put into python coord
117 self.start = int(self.start) - 1
118 self.stop = int(self.stop)
120 def __str__(self):
121 attrs = "species", "seqid", "start", "stop", "strand"
122 n = [str(getattr(self, attr)) for attr in attrs]
123 return ":".join(n)
125 def __hash__(self):
126 return hash(str(self))
128 def to_dict(self) -> dict:
129 attrs = "species", "seqid", "start", "stop", "strand"
130 return {attr: getattr(self, attr) for attr in attrs}
133@dataclass(slots=True)
134class MafName:
135 """stores source information from Maf records"""
137 species: str
138 seqid: str
139 start: int
140 stop: int
141 strand: str
142 coord_length: typing.Optional[str | int]
144 def __post_init__(self):
145 # adjust the lengths to be ints
146 self.start = int(self.start)
147 self.stop = int(self.stop)
148 self.coord_length = int(self.coord_length) if self.coord_length else None
150 def __str__(self):
151 attrs = "species", "seqid", "start", "stop", "strand"
152 n = [str(getattr(self, attr)) for attr in attrs]
153 return ":".join(n)
155 def __hash__(self):
156 return hash(str(self))
158 def to_dict(self) -> dict:
159 attrs = "species", "seqid", "start", "stop", "strand"
160 return {attr: getattr(self, attr) for attr in attrs}