Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/util.py: 80%

196 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2023-12-25 11:36 +1100

1from __future__ import annotations 

2 

3import functools 

4import os 

5import pathlib 

6import re 

7import shutil 

8import subprocess 

9import sys 

10import uuid 

11 

12from hashlib import md5 

13from tempfile import mkdtemp 

14from typing import IO, Callable, Union 

15 

16import blosc2 

17import numba 

18import numpy 

19 

20from cogent3.app.composable import define_app 

21 

22 

def md5sum(data: bytes, *args) -> str:
    """Return the hexadecimal MD5 digest of ``data``.

    Notes
    -----
    Any extra positional arguments are accepted and ignored, which keeps
    the call signature compatible with ``checksum``.
    """
    digest = md5()
    digest.update(data)
    return digest.hexdigest()

31 

32 

# based on https://www.reddit.com/r/learnpython/comments/9bpgjl/implementing_bsd_16bit_checksum/
# and https://www.gnu.org/software/coreutils/manual/html_node/sum-invocation.html#sum-invocation
@numba.jit(nopython=True)
def checksum(data: bytes, size: int):
    """Compute the BSD style checksum, equivalent to command line BSD sum.

    Parameters
    ----------
    data
        the bytes to be summed
    size
        used to derive the number of 1024 sized blocks (rounded up)

    Returns
    -------
    (16-bit checksum, number of blocks)
    """
    total = 0
    for byte in data:
        # rotate the running 16-bit value right by one bit
        rotated = (total >> 1) + ((total & 1) << 15)
        # add the next byte, keeping only the low 16 bits
        total = (rotated + byte) & 0xFFFF
    num_blocks = numpy.ceil(size / 1024)
    return total, int(num_blocks)

46 

47 

def _get_resource_dir() -> os.PathLike:
    """returns path to resource directory

    The directory is taken from the ``ENSEMBLDBRC`` environment variable
    when that is defined, otherwise the directory containing the
    ``ensembl_lite.data`` package is used.

    Returns
    -------
    The absolute path, with ``~`` expanded.

    Raises
    ------
    ValueError
        if the resulting directory does not exist
    """
    if "ENSEMBLDBRC" in os.environ:
        path = os.environ["ENSEMBLDBRC"]
    else:
        # only imported when the environment variable is not defined
        from ensembl_lite import data

        path = pathlib.Path(data.__file__).parent

    path = pathlib.Path(path).expanduser().absolute()
    if not path.exists():
        # report the directory that was looked for
        raise ValueError(f"ENSEMBLDBRC directory '{path}' does not exist")

    return path

62 

63 

def get_resource_path(resource: Union[str, os.PathLike]) -> os.PathLike:
    """Return the path to ``resource`` within the ENSEMBLDBRC directory.

    The resulting path is asserted to exist.
    """
    resolved = ENSEMBLDBRC / resource
    assert resolved.exists()
    return resolved

68 

69 

# directory in which the essential files live, such as the
# species/common name map and the sample download.cfg
ENSEMBLDBRC = _get_resource_dir()

73 

74 

def exec_command(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """executes shell command and returns stdout if completes exit code 0

    Parameters
    ----------
    cmnd : str
        shell command to be executed
    stdout, stderr : streams
        Default value (PIPE) intercepts process output, setting to None
        blocks this.

    Returns
    -------
    The captured standard output decoded as utf8, or None if standard
    output was not captured.

    Notes
    -----
    If the command finishes with a non-zero exit code, a message with the
    command and its captured standard error is written to ``sys.stderr``
    and ``sys.exit`` is called with that exit code.
    """
    # NOTE(review): shell=True passes cmnd to the shell as is, so it must
    # not be assembled from untrusted input
    proc = subprocess.Popen(cmnd, shell=True, stdout=stdout, stderr=stderr)
    out, err = proc.communicate()
    if proc.returncode != 0:
        # err is bytes when stderr was piped, None otherwise; decode it so
        # the report shows the text rather than a bytes repr
        if isinstance(err, bytes):
            msg = err.decode("utf8", errors="replace")
        else:
            msg = err or ""
        sys.stderr.write(f"FAILED: {cmnd}\n{msg}")
        sys.exit(proc.returncode)
    return out.decode("utf8") if out is not None else None

93 

94 

class CaseInsensitiveString(str):
    """A case-insensitive string class. Comparisons are also case-insensitive.

    The text is stored with its original case (``str(obj)`` returns it
    unchanged), while ``==``, ``!=`` and hashing all use the lower-cased
    text.
    """

    def __new__(cls, arg, h=None):
        # h is not used, it is retained so existing calls keep working
        n = str.__new__(cls, str(arg))
        n._lower = str.lower(n)
        n._hash = hash(n._lower)
        return n

    def __eq__(self, other):
        try:
            other_lower = "".join(other).lower()
        except TypeError:
            # other is not a string or an iterable of strings, let python
            # fall back to its default handling instead of failing
            return NotImplemented
        return self._lower == other_lower

    def __ne__(self, other):
        # str defines its own case-sensitive __ne__, so it has to be
        # overridden for != to agree with ==
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __hash__(self):
        # dict hashing done via lower case
        return self._hash

    def __str__(self):
        return "".join(self)

113 

114 

def load_ensembl_checksum(path: os.PathLike) -> dict:
    """loads the BSD checksums from Ensembl CHECKSUMS file

    Every non-blank line is split on whitespace into checksum, block count
    and file name. The result maps file name to ``(checksum, blocks)`` as
    ints, with any entry named README dropped.
    """
    rows = (line.split() for line in path.read_text().splitlines())
    result = {
        name: (int(cksum), int(blocks))
        # a blank line splits into an empty list and is skipped
        for cksum, blocks, name in (row for row in rows if row)
    }
    result.pop("README", None)
    return result

126 

127 

def load_ensembl_md5sum(path: os.PathLike) -> dict:
    """loads the md5 sum from Ensembl MD5SUM file

    Every non-blank line is split on whitespace into md5 sum and file
    name. The result maps file name to md5 sum, with any entry named
    README dropped.
    """
    rows = (line.split() for line in path.read_text().splitlines())
    # a blank line splits into an empty list and is skipped
    result = {name: digest for digest, name in (row for row in rows if row)}
    result.pop("README", None)
    return result

139 

140 

class atomic_write:
    """performs atomic write operations, cleans up if fails

    Data is written to a temporary file which is moved onto ``path`` only
    when writing finished without an exception. Usable as a context
    manager (yielding the open file), or via ``write()`` / ``close()``.
    """

    def __init__(self, path: os.PathLike, tmpdir=None, mode="wb", encoding=None):
        """

        Parameters
        ----------
        path
            path to file
        tmpdir
            directory where temporary file will be created. If None, a
            temporary directory is created alongside path and removed
            when finished. A directory provided by the caller is never
            removed, only the temporary file within it.
        mode
            file writing mode
        encoding
            text encoding, applied when mode is not a binary mode
        """
        path = pathlib.Path(path).expanduser()

        self._path = path
        self._mode = mode
        self._file = None
        self._encoding = encoding
        # only a directory created by this instance may be deleted
        self._owns_tmpdir = tmpdir is None
        self._tmppath = self._make_tmppath(tmpdir)

        # None until finalised, then True / False
        self.succeeded = None
        self._close_func = self._close_rename_standard

    def _make_tmppath(self, tmpdir):
        """returns path of temporary file

        Parameters
        ----------
        tmpdir: Path
            to directory

        Returns
        -------
        full path to a temporary file

        Raises
        ------
        FileNotFoundError
            if tmpdir does not exist

        Notes
        -----
        Uses a random uuid as the file name, adds suffixes from path
        """
        suffixes = "".join(self._path.suffixes)
        parent = self._path.parent
        name = f"{uuid.uuid4()}{suffixes}"
        tmpdir = (
            pathlib.Path(mkdtemp(dir=parent))
            if tmpdir is None
            else pathlib.Path(tmpdir)
        )

        if not tmpdir.exists():
            raise FileNotFoundError(f"{tmpdir} directory does not exist")

        return tmpdir / name

    def _get_fileobj(self):
        """returns file to be written to, opening it on first use"""
        if self._file is None:
            # open() rejects an encoding argument for binary modes
            encoding = None if "b" in self._mode else self._encoding
            self._file = open(self._tmppath, self._mode, encoding=encoding)

        return self._file

    def __enter__(self) -> IO:
        return self._get_fileobj()

    def _close_rename_standard(self, src):
        """moves the temporary file src onto the destination path"""
        # replace() overwrites an existing destination in a single step, so
        # there is no moment at which the destination is missing
        src.replace(self._path)

    def _cleanup(self):
        """removes whatever temporary artefacts this instance created"""
        if self._owns_tmpdir:
            shutil.rmtree(self._tmppath.parent, ignore_errors=True)
            return

        try:
            self._tmppath.unlink()
        except OSError:
            # best effort, the file is already gone after a successful move
            pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.succeeded is not None:
            # already finalised, doing it again would discard the result
            return

        # opens the file if nothing was written, giving an empty file just
        # like an empty with block does
        self._get_fileobj().close()
        self.succeeded = False
        try:
            if exc_type is None:
                self._close_func(self._tmppath)
                self.succeeded = True
        finally:
            # also runs if the move itself raised
            self._cleanup()

    def write(self, text):
        """writes text to file"""
        self._get_fileobj().write(text)

    def close(self):
        """closes file and moves it to its destination"""
        self.__exit__(None, None, None)

238 

239 

# functions for loading / calculating signatures, keyed by the name of
# the signature file
_sig_load_funcs = {
    "CHECKSUMS": load_ensembl_checksum,
    "MD5SUM": load_ensembl_md5sum,
}
_sig_calc_funcs = {"CHECKSUMS": checksum, "MD5SUM": md5sum}
# matches names that are not themselves checksummed
_dont_checksum = re.compile("(CHECKSUMS|MD5SUM|README)")
# matches the names of signature files
_sig_file = re.compile("(CHECKSUMS|MD5SUM)")

244 

245 

def dont_checksum(path: os.PathLike) -> bool:
    """Return True if ``str(path)`` contains CHECKSUMS, MD5SUM or README."""
    found = _dont_checksum.search(str(path))
    return found is not None

248 

249 

@functools.singledispatch
def is_signature(path: os.PathLike) -> bool:
    """Return True if the file name of ``path`` contains CHECKSUMS or MD5SUM.

    A ``str`` argument is searched as is, any other argument via its
    ``name`` attribute.
    """
    # a match object is always truthy, no match gives None
    return bool(_sig_file.search(path.name))


@is_signature.register(str)
def _(path: str) -> bool:
    return bool(_sig_file.search(path))

258 

259 

@functools.singledispatch
def get_sig_calc_func(sig_path: os.PathLike) -> Callable:
    """Return the function that calculates the signature for ``sig_path``.

    A ``str`` argument is used directly as the lookup key, any other
    argument is looked up by its ``name`` attribute. An unknown key
    raises ``KeyError``.
    """
    key = sig_path.name
    return _sig_calc_funcs[key]


@get_sig_calc_func.register(str)
def _(sig_path: str) -> Callable:
    key = sig_path
    return _sig_calc_funcs[key]

268 

269 

def get_signature_data(path: os.PathLike) -> dict:
    """Load the signatures stored in the signature file at ``path``.

    The loader is selected by the file name of ``path`` (CHECKSUMS or
    MD5SUM) and its result, a dict keyed by file name, is returned. Any
    other file name raises ``KeyError``.
    """
    loader = _sig_load_funcs[path.name]
    return loader(path)

272 

273 

def rich_display(c3t, title_justify="left"):
    """converts a cogent3 Table to a Rich Table and displays it

    Parameters
    ----------
    c3t
        the cogent3 Table
    title_justify
        passed to the Rich Table as its title justification
    """
    from cogent3.format.table import formatted_array
    from rich.console import Console
    from rich.table import Table

    header = c3t.header
    data = c3t.columns
    # the first element of what formatted_array returns supplies the cells
    formatted = [formatted_array(data[name], pad=False)[0] for name in header]

    table = Table(
        title=c3t.title,
        highlight=True,
        title_justify=title_justify,
        title_style="bold blue",
    )
    for name in header:
        dtype_name = data[name].dtype.name
        # int / float columns are right justified and never wrapped
        is_numeric = any(kind in dtype_name for kind in ("int", "float"))
        justify = "right" if is_numeric else "left"
        table.add_column(name, justify=justify, no_wrap=is_numeric)

    # transpose the column oriented values into rows
    for cells in zip(*formatted):
        table.add_row(*cells)

    Console().print(table)

298 

299 

# the characters on which a file name is split into parts
_seps = re.compile(r"[-._\s]")


def _name_parts(path: str) -> list[str]:
    """Return the lower-cased file name of ``path`` split on the separators."""
    file_name = pathlib.Path(path).name
    return _seps.split(file_name.lower())

305 

306 

def _simple_check(align_parts: str, tree_parts: str) -> int:
    """Return how many leading elements the two sequences have in common.

    Elements are compared pairwise from the start, counting stops at the
    first pair that differs or when the shorter sequence is exhausted.
    """
    shared = 0
    for left, right in zip(align_parts, tree_parts):
        if left == right:
            shared += 1
        else:
            return shared

    return shared

316 

317 

def trees_for_aligns(aligns, trees) -> dict[str, str]:
    """Pair each alignment path with the tree path whose name matches best.

    Parameters
    ----------
    aligns
        paths of alignments
    trees
        paths of trees

    Returns
    -------
    dict mapping each alignment path to the selected tree path

    Raises
    ------
    ValueError
        if no tree shares at least the first name part with an alignment,
        which includes trees being empty

    Notes
    -----
    The score is the number of leading file name parts in common. Ties
    are resolved in favour of the tree path that sorts last.
    """
    aligns = {p: _name_parts(p) for p in aligns}
    trees = {p: _name_parts(p) for p in trees}
    result = {}
    for align, align_parts in aligns.items():
        scores = [
            (_simple_check(align_parts, tree_parts), tree)
            for tree, tree_parts in trees.items()
        ]
        # the default makes "no trees at all" report as "no tree for ..."
        # instead of max() failing on an empty sequence
        best_score, best_tree = max(scores, default=(0, None))
        if best_score == 0:
            raise ValueError(f"no tree for {align}")

        result[align] = best_tree

    return result

336 

337 

@define_app
def _str_to_bytes(data: str) -> bytes:
    """converts string to bytes using the utf8 encoding"""
    encoded = data.encode("utf8")
    return encoded

342 

343 

@define_app
def _bytes_to_str(data: bytes) -> str:
    """converts bytes into string using the utf8 encoding"""
    decoded = data.decode("utf8")
    return decoded

348 

349 

@define_app
def blosc_compress_it(data: bytes) -> bytes:
    """compresses data with blosc2 at level 9 using the shuffle filter"""
    shuffle = blosc2.Filter.SHUFFLE
    return blosc2.compress(data, clevel=9, filter=shuffle)

353 

354 

@define_app
def blosc_decompress_it(data: bytes, as_bytearray=True) -> bytes:
    """decompresses data with blosc2, the result is converted to bytes

    as_bytearray is passed through to blosc2.decompress.
    """
    decompressed = blosc2.decompress(data, as_bytearray=as_bytearray)
    return bytes(decompressed)

358 

359 

# composed apps: str -> utf8 bytes -> blosc2 compressed bytes
elt_compress_it = _str_to_bytes() + blosc_compress_it()
# and the reverse: blosc2 compressed bytes -> utf8 bytes -> str
elt_decompress_it = blosc_decompress_it() + _bytes_to_str()