Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/util.py: 80%
196 statements
« prev ^ index » next coverage.py v7.2.3, created at 2023-12-25 11:36 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2023-12-25 11:36 +1100
1from __future__ import annotations
3import functools
4import os
5import pathlib
6import re
7import shutil
8import subprocess
9import sys
10import uuid
12from hashlib import md5
13from tempfile import mkdtemp
14from typing import IO, Callable, Union
16import blosc2
17import numba
18import numpy
20from cogent3.app.composable import define_app
def md5sum(data: bytes, *args) -> str:
    """returns the MD5 hex digest of data

    Parameters
    ----------
    data
        the bytes to be hashed
    *args
        ignored

    Notes
    -----
    *args exists only for signature compatibility with checksum
    """
    digest = md5(data)
    return digest.hexdigest()
# based on https://www.reddit.com/r/learnpython/comments/9bpgjl/implementing_bsd_16bit_checksum/
# and https://www.gnu.org/software/coreutils/manual/html_node/sum-invocation.html#sum-invocation
@numba.jit(nopython=True)
def checksum(data: bytes, size: int):
    """computes BSD style checksum

    Parameters
    ----------
    data
        the bytes to be summed
    size
        number of bytes, used only to derive the block count

    Returns
    -------
    (16-bit checksum, number of 1024 byte blocks rounded up)
    """
    # equivalent to command line BSD sum
    num_blocks = numpy.ceil(size / 1024)
    total = 0
    for byte in data:
        # rotate the 16-bit accumulator right by one bit
        rotated = (total >> 1) + ((total & 1) << 15)
        # add the next byte, keeping only the low 16 bits
        total = (rotated + byte) & 0xFFFF
    return total, int(num_blocks)
def _get_resource_dir() -> os.PathLike:
    """returns path to resource directory

    Notes
    -----
    The ENSEMBLDBRC environment variable takes precedence. If it is not
    set, the directory containing the ensembl_lite.data package is used.

    Raises
    ------
    ValueError if the resolved directory does not exist
    """
    if "ENSEMBLDBRC" in os.environ:
        path = os.environ["ENSEMBLDBRC"]
    else:
        # imported lazily, only needed when the env variable is absent
        from ensembl_lite import data

        path = pathlib.Path(data.__file__).parent

    path = pathlib.Path(path).expanduser().absolute()
    if not path.exists():
        # include the offending path so the user can see what was tried
        raise ValueError(f"ENSEMBLDBRC directory '{path}' does not exist")

    return path
def get_resource_path(resource: Union[str, os.PathLike]) -> os.PathLike:
    """returns the path of resource within the ENSEMBLDBRC directory

    Notes
    -----
    Fails with an AssertionError if the resulting path does not exist.
    """
    resolved = ENSEMBLDBRC / resource
    assert resolved.exists()
    return resolved
# the following is where essential files live, such as
# the species/common name map and sample download.cfg
# NOTE: resolved once, at import time, by _get_resource_dir()
ENSEMBLDBRC = _get_resource_dir()
def exec_command(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """executes shell command and returns stdout if completes exit code 0

    Parameters
    ----------

    cmnd : str
      shell command to be executed
    stdout, stderr : streams
      Default value (PIPE) intercepts process output, setting to None
      blocks this.

    Returns
    -------
    the captured stdout decoded as utf8, or None if stdout was not captured

    Notes
    -----
    If the command exits with a non-zero code, the command and its captured
    stderr are written to sys.stderr and sys.exit() is called with that code.
    """
    # NOTE(review): shell=True hands cmnd to the shell verbatim, so callers
    # must never build cmnd from untrusted input
    proc = subprocess.Popen(cmnd, shell=True, stdout=stdout, stderr=stderr)
    out, err = proc.communicate()
    if proc.returncode != 0:
        # err is bytes when piped and None when stderr was not captured
        msg = err.decode("utf8", errors="replace") if isinstance(err, bytes) else ""
        sys.stderr.write(f"FAILED: {cmnd}\n{msg}")
        sys.exit(proc.returncode)
    return out.decode("utf8") if out is not None else None
class CaseInsensitiveString(str):
    """A case-insensitive string class. Comparisons are also case-insensitive.

    Notes
    -----
    The original casing is retained for display. Equality, inequality and
    hashing all use the lower-cased value, so instances differing only in
    case refer to the same dict key.
    """

    def __new__(cls, arg, h=None):
        # h is unused, retained for backwards compatibility of the signature
        n = str.__new__(cls, str(arg))
        n._lower = str.lower(n)
        n._hash = hash(n._lower)
        return n

    def __eq__(self, other):
        if not isinstance(other, str):
            # let python fall back to its default handling rather than
            # raising on operands that are not strings
            return NotImplemented
        return self._lower == other.lower()

    def __ne__(self, other):
        # str defines its own case-sensitive __ne__, which would otherwise
        # be inherited and disagree with __eq__
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __hash__(self):
        # dict hashing done via lower case
        return self._hash

    def __str__(self):
        # returns a plain str with the original casing
        return "".join(self)
def load_ensembl_checksum(path: os.PathLike) -> dict:
    """loads the BSD checksums from Ensembl CHECKSUMS file

    Returns
    -------
    dict mapping file name to (checksum, number of blocks), both as int.
    Any entry for README is dropped.
    """
    sums = {}
    for fields in map(str.split, path.read_text().splitlines()):
        if not fields:
            # blank or whitespace only line
            continue
        cksum, blocks, name = fields
        sums[name] = int(cksum), int(blocks)

    sums.pop("README", None)
    return sums
def load_ensembl_md5sum(path: os.PathLike) -> dict:
    """loads the md5 sum from Ensembl MD5SUM file

    Returns
    -------
    dict mapping file name to its md5 sum string. Any entry for README
    is dropped.
    """
    sums = {}
    for fields in map(str.split, path.read_text().splitlines()):
        if not fields:
            # blank or whitespace only line
            continue
        digest, name = fields
        sums[name] = digest

    sums.pop("README", None)
    return sums
class atomic_write:
    """performs atomic write operations, cleans up if fails

    Notes
    -----
    Data is written to a temporary file which is moved over the destination
    only when writing completes without an exception. Usable as a context
    manager, or by calling write() followed by close().
    """

    def __init__(self, path: os.PathLike, tmpdir=None, mode="wb", encoding=None):
        """

        Parameters
        ----------
        path
            path to file
        tmpdir
            directory where temporary file will be created. If None, a new
            temporary directory is created alongside path and removed
            afterwards. A directory provided by the caller is never removed.
        mode
            file writing mode
        encoding
            text encoding, only applied when mode is not binary
        """
        path = pathlib.Path(path).expanduser()

        self._path = path
        self._mode = mode
        self._file = None
        self._encoding = encoding
        # only a directory we created ourselves may be deleted on clean up
        self._owns_tmpdir = tmpdir is None
        self._tmppath = self._make_tmppath(tmpdir)

        self.succeeded = None
        self._close_func = self._close_rename_standard

    def _make_tmppath(self, tmpdir):
        """returns path of temporary file

        Parameters
        ----------
        tmpdir: Path
            to directory

        Returns
        -------
        full path to a temporary file

        Raises
        ------
        FileNotFoundError if tmpdir does not exist

        Notes
        -----
        Uses a random uuid as the file name, adds suffixes from path
        """
        suffixes = "".join(self._path.suffixes)
        parent = self._path.parent
        name = f"{uuid.uuid4()}{suffixes}"
        tmpdir = (
            pathlib.Path(mkdtemp(dir=parent))
            if tmpdir is None
            else pathlib.Path(tmpdir)
        )

        if not tmpdir.exists():
            raise FileNotFoundError(f"{tmpdir} directory does not exist")

        return tmpdir / name

    def _get_fileobj(self):
        """returns file to be written to"""
        if self._file is None:
            # open() rejects an encoding argument in binary mode
            encoding = None if "b" in self._mode else self._encoding
            self._file = open(self._tmppath, self._mode, encoding=encoding)

        return self._file

    def __enter__(self) -> IO:
        return self._get_fileobj()

    def _cleanup(self):
        """removes the temporary file, and its directory if created here"""
        if self._owns_tmpdir:
            shutil.rmtree(self._tmppath.parent, ignore_errors=True)
            return

        try:
            self._tmppath.unlink()
        except FileNotFoundError:
            # already moved to the destination, or never created
            pass

    def _close_rename_standard(self, src):
        """moves src over the destination path"""
        # replace() overwrites an existing destination in a single step, so
        # the destination is never deleted unless the new file is in place
        src.replace(pathlib.Path(self._path))

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self._file is None:
            # nothing was written, so leave any existing destination alone
            self._cleanup()
            return

        self._file.close()
        try:
            if exc_type is None:
                self._close_func(self._tmppath)
                self.succeeded = True
            else:
                self.succeeded = False
        finally:
            self._cleanup()

    def write(self, text):
        """writes text to file"""
        fileobj = self._get_fileobj()
        fileobj.write(text)

    def close(self):
        """closes file"""
        self.__exit__(None, None, None)
# signature file name -> function that loads the signatures from that file
_sig_load_funcs = {
    "CHECKSUMS": load_ensembl_checksum,
    "MD5SUM": load_ensembl_md5sum,
}
# signature file name -> function that computes a signature of that kind
_sig_calc_funcs = {
    "CHECKSUMS": checksum,
    "MD5SUM": md5sum,
}
# matches names of files that do not get checksummed
_dont_checksum = re.compile("(CHECKSUMS|MD5SUM|README)")
# matches names of the signature files themselves
_sig_file = re.compile("(CHECKSUMS|MD5SUM)")
def dont_checksum(path: os.PathLike) -> bool:
    """returns True if path contains CHECKSUMS, MD5SUM or README"""
    match = _dont_checksum.search(str(path))
    return match is not None
@functools.singledispatch
def is_signature(path: os.PathLike) -> bool:
    """returns True if the name of path contains CHECKSUMS or MD5SUM

    Notes
    -----
    This default implementation reads path.name. A str argument is
    dispatched to the registered overload, which searches the whole string.
    """
    match = _sig_file.search(path.name)
    return match is not None


@is_signature.register
def _(path: str) -> bool:
    # str overload, the entire string is searched
    match = _sig_file.search(path)
    return match is not None
@functools.singledispatch
def get_sig_calc_func(sig_path: os.PathLike) -> Callable:
    """returns the function for computing the signature type of sig_path

    Notes
    -----
    This default implementation looks up sig_path.name. A str argument is
    dispatched to the registered overload, which uses the string itself as
    the key. A KeyError results if the key is not a known signature file
    name.
    """
    key = sig_path.name
    return _sig_calc_funcs[key]


@get_sig_calc_func.register
def _(sig_path: str) -> Callable:
    # str overload, the string itself is the lookup key
    key = sig_path
    return _sig_calc_funcs[key]
def get_signature_data(path: os.PathLike) -> dict:
    """returns the signatures loaded from a signature file

    Parameters
    ----------
    path
        path to a signature file, path.name selects the loader

    Returns
    -------
    the dict produced by the loader registered for path.name

    Raises
    ------
    KeyError if path.name is not a known signature file name
    """
    loader = _sig_load_funcs[path.name]
    return loader(path)
def rich_display(c3t, title_justify="left"):
    """converts a cogent3 Table to a Rich Table and displays it

    Parameters
    ----------
    c3t
        the cogent3 Table
    title_justify
        passed to the Rich Table as its title_justify
    """
    from cogent3.format.table import formatted_array
    from rich.console import Console
    from rich.table import Table

    cols = c3t.columns
    header = c3t.header
    # each column is converted to its formatted string values
    columns = [formatted_array(cols[name], pad=False)[0] for name in header]
    rich_table = Table(
        title=c3t.title,
        highlight=True,
        title_justify=title_justify,
        title_style="bold blue",
    )
    for name in header:
        dtype_name = cols[name].dtype.name
        # numeric columns are right justified and never wrapped
        numeric_type = "int" in dtype_name or "float" in dtype_name
        justify = "right" if numeric_type else "left"
        rich_table.add_column(name, justify=justify, no_wrap=numeric_type)

    for row in zip(*columns):
        rich_table.add_row(*row)

    Console().print(rich_table)
# characters treated as separators within a file name
_seps = re.compile(r"[-._\s]")


def _name_parts(path: str) -> list[str]:
    """returns the lower cased file name of path split at separators"""
    filename = pathlib.Path(path).name
    return _seps.split(filename.lower())
def _simple_check(align_parts: str, tree_parts: str) -> int:
    """evaluates whether the start of the two paths match

    Returns
    -------
    the number of leading elements that are equal between the two inputs
    """
    count = 0
    for count, (left, right) in enumerate(zip(align_parts, tree_parts), start=1):
        if left != right:
            # this pair differs, so only the preceding pairs matched
            return count - 1

    return count
def trees_for_aligns(aligns, trees) -> dict[str, str]:
    """pairs each alignment path with the best matching tree path

    Parameters
    ----------
    aligns
        paths of alignment files
    trees
        paths of tree files

    Returns
    -------
    dict mapping each alignment path to a tree path

    Raises
    ------
    ValueError if no tree file name shares a leading name part with an
    alignment, including when trees is empty

    Notes
    -----
    The tree chosen is the one whose file name has the most leading parts
    in common with the alignment file name. Ties are resolved in favour of
    the tree path that sorts last.
    """
    aligns = {p: _name_parts(p) for p in aligns}
    trees = {p: _name_parts(p) for p in trees}
    result = {}
    for align, align_parts in aligns.items():
        scores = [
            (_simple_check(align_parts, tree_parts), tree)
            for tree, tree_parts in trees.items()
        ]
        # the default covers there being no trees at all
        best_score, best_tree = max(scores, default=(0, None))
        if best_score == 0:
            raise ValueError(f"no tree for {align}")

        result[align] = best_tree

    return result
@define_app
def _str_to_bytes(data: str) -> bytes:
    """converts string to bytes using utf8"""
    encoded = data.encode("utf8")
    return encoded
@define_app
def _bytes_to_str(data: bytes) -> str:
    """converts bytes into string using utf8"""
    decoded = data.decode("utf8")
    return decoded
@define_app
def blosc_compress_it(data: bytes) -> bytes:
    """compresses data with blosc2 at compression level 9 using the shuffle filter"""
    compressed = blosc2.compress(data, clevel=9, filter=blosc2.Filter.SHUFFLE)
    return compressed
@define_app
def blosc_decompress_it(data: bytes, as_bytearray=True) -> bytes:
    """decompresses blosc2 compressed data

    Parameters
    ----------
    data
        the compressed bytes
    as_bytearray
        passed through to blosc2.decompress

    Returns
    -------
    the decompressed data, always converted to bytes
    """
    decompressed = blosc2.decompress(data, as_bytearray=as_bytearray)
    return bytes(decompressed)
# composed apps: str -> utf8 bytes -> blosc2 compressed bytes
elt_compress_it = _str_to_bytes() + blosc_compress_it()
# and the reverse: blosc2 compressed bytes -> bytes -> utf8 decoded str
elt_decompress_it = blosc_decompress_it() + _bytes_to_str()