Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_util.py: 87%
194 statements
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
1from __future__ import annotations
3import contextlib
4import functools
5import os
6import pathlib
7import re
8import shutil
9import subprocess
10import sys
11import uuid
13from hashlib import md5
14from tempfile import mkdtemp
15from typing import IO, Callable, Union
17import blosc2
18import numba
19import numpy
21from cogent3.app.composable import define_app
def md5sum(data: bytes, *args) -> str:
    """returns the hexadecimal MD5 digest of data

    Parameters
    ----------
    data
        the bytes to be hashed
    *args
        ignored

    Notes
    -----
    The extra positional arguments are accepted, and discarded, so that
    this function has a signature compatible with checksum.
    """
    hasher = md5()
    hasher.update(data)
    return hasher.hexdigest()
# based on https://www.reddit.com/r/learnpython/comments/9bpgjl/implementing_bsd_16bit_checksum/
# and https://www.gnu.org/software/coreutils/manual/html_node/sum-invocation.html#sum-invocation
@numba.jit(nopython=True)
def checksum(data: bytes, size: int):  # pragma: no cover
    """computes BSD style checksum

    Parameters
    ----------
    data
        the bytes to be summed
    size
        number of bytes, used to compute the number of 1024 byte blocks

    Returns
    -------
    the 16-bit checksum and the number of blocks (rounded up)

    Notes
    -----
    Equivalent to the command line BSD sum.
    """
    total = 0
    for byte in data:
        # rotate the 16-bit running total right by one bit
        total = (total >> 1) + ((total & 1) << 15)
        # add the next byte, keeping only the low 16 bits
        total = (total + byte) & 0xFFFF

    num_blocks = numpy.ceil(size / 1024)
    return total, int(num_blocks)
def _get_resource_dir() -> os.PathLike:
    """returns the absolute path to the resource directory

    Notes
    -----
    The value of the ENSEMBLDBRC environment variable is used if it is
    defined, otherwise the directory containing the ensembl_lite.data
    package is used.

    Raises
    ------
    ValueError if the directory does not exist
    """
    location = os.environ.get("ENSEMBLDBRC")
    if location is None:
        # imported here since it's only required as the fallback
        from ensembl_lite import data

        location = pathlib.Path(data.__file__).parent

    resource_dir = pathlib.Path(location).expanduser().absolute()
    if resource_dir.exists():
        return resource_dir

    raise ValueError(f"ENSEMBLDBRC directory {str(resource_dir)!r} does not exist")
def get_resource_path(resource: Union[str, os.PathLike]) -> os.PathLike:
    """returns the path to resource within the ENSEMBLDBRC directory

    Notes
    -----
    An assertion checks that the path exists.
    """
    resolved = ENSEMBLDBRC / resource
    assert resolved.exists()
    return resolved
# Directory in which the essential files live, such as the
# species/common name map and the sample download.cfg. It is
# resolved once, when this module is imported.
ENSEMBLDBRC = _get_resource_dir()
def exec_command(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """executes shell command and returns stdout if completes exit code 0

    Parameters
    ----------

    cmnd : str
        shell command to be executed
    stdout, stderr : streams
        Default value (PIPE) intercepts process output, setting to None
        blocks this.

    Returns
    -------
    The captured stdout decoded as utf8, or None if stdout was not captured.

    Notes
    -----
    If the command exits with a non-zero code, a message is written to
    sys.stderr and sys.exit() is called with that code.

    cmnd is handed to the shell as is (shell=True), so it must never be
    built from untrusted input.
    """
    proc = subprocess.Popen(cmnd, shell=True, stdout=stdout, stderr=stderr)
    out, err = proc.communicate()
    if proc.returncode != 0:
        # err is bytes when captured, None when stderr was not piped;
        # decode so the report shows the text and not a bytes repr
        msg = "" if err is None else err.decode("utf8", errors="replace")
        sys.stderr.write(f"FAILED: {cmnd}\n{msg}")
        sys.exit(proc.returncode)
    return out.decode("utf8") if out is not None else None
class CaseInsensitiveString(str):
    """A case-insensitive string class. Comparisons are also case-insensitive.

    Notes
    -----
    Equality, inequality and hashing are all based on the lower cased
    value. The original case is retained for display.
    """

    def __new__(cls, arg, h=None):
        # h is unused, it's retained for signature compatibility
        n = str.__new__(cls, str(arg))
        n._lower = str.lower(n)
        n._hash = hash(n._lower)
        return n

    def __eq__(self, other):
        if isinstance(other, CaseInsensitiveString):
            return self._lower == other._lower

        try:
            # joining allows any iterable of str, including a plain str
            return self._lower == "".join(other).lower()
        except TypeError:
            # not an iterable of str, let python try the other operand
            return NotImplemented

    def __ne__(self, other):
        # must be defined, otherwise the case-sensitive str.__ne__ is
        # inherited and != disagrees with ==
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __hash__(self):
        # dict hashing done via lower case
        return self._hash

    def __str__(self):
        # returns a plain str with the original case
        return "".join(self)
def load_ensembl_checksum(path: os.PathLike) -> dict:
    """loads the BSD checksums from Ensembl CHECKSUMS file

    Returns
    -------
    dict mapping the file name to a tuple of (checksum, number of blocks),
    both as int. Any entry for README is excluded.
    """
    # blank lines are skipped, all other lines must have exactly 3 fields
    records = (
        line.split() for line in path.read_text().splitlines() if line.strip()
    )
    sums = {name: (int(cksum), int(blocks)) for cksum, blocks, name in records}
    sums.pop("README", None)
    return sums
def load_ensembl_md5sum(path: os.PathLike) -> dict:
    """loads the md5 sum from Ensembl MD5SUM file

    Returns
    -------
    dict mapping the file name to its md5 sum, as a str. Any entry for
    README is excluded.
    """
    # blank lines are skipped, all other lines must have exactly 2 fields
    records = (
        line.split() for line in path.read_text().splitlines() if line.strip()
    )
    sums = {name: digest for digest, name in records}
    sums.pop("README", None)
    return sums
class atomic_write:
    """performs atomic write operations, cleans up if fails

    Notes
    -----
    Data is written to a temporary file, which replaces path only if no
    exception occurred. The directory containing the temporary file is
    always removed on exit. This includes a tmpdir provided by the caller,
    so that directory should be dedicated to this write.
    """

    def __init__(self, path: os.PathLike, tmpdir=None, mode="wb", encoding=None):
        """

        Parameters
        ----------
        path
            path to file
        tmpdir
            directory where temporary file will be created
        mode
            file writing mode
        encoding
            text encoding, only applied if mode is a text mode
        """
        path = pathlib.Path(path).expanduser()

        self._path = path
        self._mode = mode
        self._file = None
        self._encoding = encoding
        self._tmppath = self._make_tmppath(tmpdir)

        # None until exit, then True if path was written, False otherwise
        self.succeeded = None
        self._close_func = self._close_rename_standard

    def _make_tmppath(self, tmpdir):
        """returns path of temporary file

        Parameters
        ----------
        tmpdir: Path
            to directory

        Returns
        -------
        full path to a temporary file

        Raises
        ------
        FileNotFoundError if tmpdir does not exist

        Notes
        -----
        Uses a random uuid as the file name, adds suffixes from path.
        If tmpdir is None, a new directory is created alongside path.
        """
        suffixes = "".join(self._path.suffixes)
        parent = self._path.parent
        name = f"{uuid.uuid4()}{suffixes}"
        tmpdir = (
            pathlib.Path(mkdtemp(dir=parent))
            if tmpdir is None
            else pathlib.Path(tmpdir)
        )

        if not tmpdir.exists():
            raise FileNotFoundError(f"{tmpdir} directory does not exist")

        return tmpdir / name

    def _get_fileobj(self):
        """returns file to be written to, opening it on first use"""
        if self._file is None:
            # open() rejects an encoding for binary modes, so only
            # provide it for text modes
            encoding = None if "b" in self._mode else self._encoding
            self._file = open(self._tmppath, self._mode, encoding=encoding)

        return self._file

    def __enter__(self) -> IO:
        return self._get_fileobj()

    def _close_rename_standard(self, src):
        """moves src onto the destination path, overwriting any existing file"""
        dest = pathlib.Path(self._path)
        # a single replace, so there's no moment at which dest is missing
        src.replace(dest)

    def __exit__(self, exc_type, exc_val, exc_tb):
        opened = self._file is not None
        if opened:
            self._file.close()

        self.succeeded = False
        try:
            # nothing to move if no file was opened or the write failed
            if opened and exc_type is None:
                self._close_func(self._tmppath)
                self.succeeded = True
        finally:
            # remove the temporary directory even if the move failed
            shutil.rmtree(self._tmppath.parent, ignore_errors=True)

    def write(self, text):
        """writes text to file"""
        fileobj = self._get_fileobj()
        fileobj.write(text)

    def close(self):
        """closes file, moving the temporary file to path"""
        self.__exit__(None, None, None)
# maps the name of an Ensembl signature file to the function that loads it
_sig_load_funcs = {
    "CHECKSUMS": load_ensembl_checksum,
    "MD5SUM": load_ensembl_md5sum,
}
# maps the name of an Ensembl signature file to the function that
# calculates that type of signature
_sig_calc_funcs = {"CHECKSUMS": checksum, "MD5SUM": md5sum}
# matches CHECKSUMS, MD5SUM or README, used by dont_checksum()
_dont_checksum = re.compile("(CHECKSUMS|MD5SUM|README)")
# matches CHECKSUMS or MD5SUM, used by is_signature()
_sig_file = re.compile("(CHECKSUMS|MD5SUM)")
def dont_checksum(path: os.PathLike) -> bool:
    """returns True if path contains CHECKSUMS, MD5SUM or README"""
    found = _dont_checksum.search(str(path))
    # a match object is always truthy, None indicates no match
    return bool(found)
@functools.singledispatch
def is_signature(path: os.PathLike) -> bool:
    """returns True if the file name of path contains CHECKSUMS or MD5SUM"""
    found = _sig_file.search(path.name)
    return bool(found)


@is_signature.register
def _(path: str) -> bool:
    # for a str, the entire value is searched
    found = _sig_file.search(path)
    return bool(found)
@functools.singledispatch
def get_sig_calc_func(sig_path) -> Callable:
    """returns signature calculating function based on Ensembl path name

    Raises
    ------
    NotImplementedError if the type of sig_path has no registered handler
    """
    unsupported = type(sig_path)
    raise NotImplementedError(f"{unsupported} not supported")


@get_sig_calc_func.register
def _(sig_path: str) -> Callable:
    # sig_path must be one of the keys of _sig_calc_funcs
    calc_func = _sig_calc_funcs[sig_path]
    return calc_func
def get_signature_data(path: os.PathLike) -> dict:
    """returns the signatures loaded from an Ensembl signature file

    Parameters
    ----------
    path
        path to the signature file, whose file name selects the loading
        function from _sig_load_funcs

    Raises
    ------
    KeyError if the file name is not a key of _sig_load_funcs
    """
    return _sig_load_funcs[path.name](path)
def rich_display(c3t, title_justify="left"):
    """converts a cogent3 Table to a Rich Table and displays it

    Parameters
    ----------
    c3t
        the cogent3 Table to display
    title_justify
        justification of the title, passed to the Rich Table
    """
    from cogent3.format.table import formatted_array
    from rich.console import Console
    from rich.table import Table

    data = c3t.columns
    # the first item returned by formatted_array is used as the column values
    str_columns = []
    for name in c3t.header:
        formatted = formatted_array(data[name], pad=False)
        str_columns.append(formatted[0])

    display = Table(
        title=c3t.title,
        highlight=True,
        title_justify=title_justify,
        title_style="bold blue",
    )
    for name in c3t.header:
        dtype_name = data[name].dtype.name
        # numeric columns are right justified and not wrapped
        is_numeric = "int" in dtype_name or "float" in dtype_name
        justify = "right" if is_numeric else "left"
        display.add_column(name, justify=justify, no_wrap=is_numeric)

    for values in zip(*str_columns):
        display.add_row(*values)

    Console().print(display)
# characters at which a file name is split: hyphen, period, underscore
# or whitespace
_seps = re.compile(r"[-._\s]")


def _name_parts(path: str) -> list[str]:
    """returns the lower cased file name of path, split at separators"""
    file_name = pathlib.Path(path).name
    return _seps.split(file_name.lower())
def _simple_check(align_parts: list[str], tree_parts: list[str]) -> int:
    """evaluates whether the start of the two paths match

    Parameters
    ----------
    align_parts, tree_parts
        the components of the two file names, as produced by _name_parts

    Returns
    -------
    the number of leading components that are identical
    """
    matches = 0
    for a, b in zip(align_parts, tree_parts):
        if a != b:
            # only an uninterrupted run from the start counts
            break
        matches += 1

    return matches
def trees_for_aligns(aligns, trees) -> dict[str, str]:
    """returns the best matching tree path for each alignment path

    Parameters
    ----------
    aligns
        paths to alignment files
    trees
        paths to tree files

    Returns
    -------
    dict mapping each alignment path to the tree path whose file name
    shares the most leading components with it

    Raises
    ------
    ValueError if no tree shares the leading name component of an
    alignment, including when trees is empty
    """
    align_names = {p: _name_parts(p) for p in aligns}
    tree_names = {p: _name_parts(p) for p in trees}
    result = {}
    for align, align_parts in align_names.items():
        scores = [
            (_simple_check(align_parts, tree_parts), tree)
            for tree, tree_parts in tree_names.items()
        ]
        # the default means no trees is reported the same as no match,
        # rather than as a failure of max()
        best, tree = max(scores, default=(0, None))
        if best == 0:
            raise ValueError(f"no tree for {align}")

        result[align] = tree

    return result
@define_app
def _str_to_bytes(data: str) -> bytes:
    """returns data encoded as utf8 bytes"""
    encoded = data.encode("utf8")
    return encoded
@define_app
def _bytes_to_str(data: bytes) -> str:
    """returns data decoded from utf8 as a string"""
    decoded = data.decode("utf8")
    return decoded
@define_app
def blosc_compress_it(data: bytes) -> bytes:
    """returns data compressed by blosc2 using clevel 9 and the shuffle filter"""
    compressed = blosc2.compress(data, clevel=9, filter=blosc2.Filter.SHUFFLE)
    return compressed
@define_app
def blosc_decompress_it(data: bytes, as_bytearray=True) -> bytes:
    """returns data decompressed by blosc2, as bytes

    Parameters
    ----------
    data
        the compressed bytes
    as_bytearray
        passed to blosc2.decompress, the result is converted to bytes
        in either case
    """
    decompressed = blosc2.decompress(data, as_bytearray=as_bytearray)
    return bytes(decompressed)
# composed app, encodes a str as bytes then compresses it with blosc2
elt_compress_it = _str_to_bytes() + blosc_compress_it()
# composed app applying the reverse, decompresses then decodes to a str
elt_decompress_it = blosc_decompress_it() + _bytes_to_str()
# matches the biotype prefixes, including the trailing colon
_biotypes = re.compile(r"(gene|transcript|exon|mRNA|rRNA|protein):")


def sanitise_stableid(stableid: str) -> str:
    """returns the Ensembl stable ID without the <biotype>: component

    Notes
    -----
    The GFF3 files from Ensembl store identifiers as <biotype>:<identifier>,
    this function removes the redundant biotype component. A value without
    a biotype component is returned unchanged.
    """
    cleaned = _biotypes.sub("", stableid)
    return cleaned
@contextlib.contextmanager
def fake_wake(*args, **kwargs):
    """context manager that accepts any arguments and does nothing"""
    yield None