Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_util.py: 87%

194 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2024-03-25 13:40 +1100

1from __future__ import annotations 

2 

3import contextlib 

4import functools 

5import os 

6import pathlib 

7import re 

8import shutil 

9import subprocess 

10import sys 

11import uuid 

12 

13from hashlib import md5 

14from tempfile import mkdtemp 

15from typing import IO, Callable, Union 

16 

17import blosc2 

18import numba 

19import numpy 

20 

21from cogent3.app.composable import define_app 

22 

23 

def md5sum(data: bytes, *args) -> str:
    """returns the hexadecimal MD5 digest of data

    Notes
    -----
    *args is ignored, it exists only for signature compatibility
    with checksum
    """
    digest = md5(data)
    return digest.hexdigest()

32 

33 

34# based on https://www.reddit.com/r/learnpython/comments/9bpgjl/implementing_bsd_16bit_checksum/ 

35# and https://www.gnu.org/software/coreutils/manual/html_node/sum-invocation.html#sum-invocation 

@numba.jit(nopython=True)
def checksum(data: bytes, size: int):  # pragma: no cover
    """computes BSD style checksum

    Parameters
    ----------
    data
        the bytes to be summed
    size
        number of bytes, used to derive the number of 1024-byte blocks

    Returns
    -------
    the 16-bit checksum and the number of blocks
    """
    # equivalent to command line BSD sum
    total = 0
    for byte in data:
        # rotate the 16-bit value right by one bit, then add the next byte
        total = (total >> 1) + ((total & 1) << 15)
        total += byte
        total &= 0xFFFF
    num_blocks = numpy.ceil(size / 1024)
    return total, int(num_blocks)

47 

48 

49def _get_resource_dir() -> os.PathLike: 

50 """returns path to resource directory""" 

51 if "ENSEMBLDBRC" in os.environ: 

52 path = os.environ["ENSEMBLDBRC"] 

53 else: 

54 from ensembl_lite import data 

55 

56 path = pathlib.Path(data.__file__).parent 

57 

58 path = pathlib.Path(path).expanduser().absolute() 

59 if not path.exists(): 

60 raise ValueError(f"ENSEMBLDBRC directory {str(path)!r} does not exist") 

61 

62 return pathlib.Path(path) 

63 

64 

def get_resource_path(resource: Union[str, os.PathLike]) -> os.PathLike:
    """returns the path to resource inside the ENSEMBLDBRC directory

    The resource is asserted to exist.
    """
    located = ENSEMBLDBRC / resource
    assert located.exists()
    return located

69 

70 

# ENSEMBLDBRC is the directory where essential files live, such as
# the species/common name map and the sample download.cfg. It is
# resolved once, when this module is imported.
ENSEMBLDBRC = _get_resource_dir()

74 

75 

def exec_command(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """executes shell command and returns stdout if completes exit code 0

    Parameters
    ----------
    cmnd : str
        shell command to be executed
    stdout, stderr : streams
        Default value (PIPE) intercepts process output, setting to None
        blocks this.

    Returns
    -------
    The captured stdout decoded as utf8, or None if stdout was not captured.

    Notes
    -----
    If the command exits with a non-zero code, a failure message is written
    to sys.stderr and SystemExit is raised with that exit code.
    """
    # NOTE(review): shell=True passes cmnd to the shell verbatim, so cmnd
    # must never be assembled from untrusted input
    proc = subprocess.Popen(cmnd, shell=True, stdout=stdout, stderr=stderr)
    out, err = proc.communicate()
    if proc.returncode != 0:
        # err is bytes when stderr was captured and None otherwise, decode
        # it so the message is readable text rather than a bytes repr
        msg = err.decode("utf8", errors="replace") if err is not None else ""
        sys.stderr.write(f"FAILED: {cmnd}\n{msg}")
        sys.exit(proc.returncode)
    return out.decode("utf8") if out is not None else None

94 

95 

class CaseInsensitiveString(str):
    """A case-insensitive string class. Comparisons are also case-insensitive.

    The original text is preserved, while equality, inequality and hashing
    all operate on the lower-cased form.
    """

    def __new__(cls, arg, h=None):
        # h is unused, it is retained so existing callers keep working
        n = str.__new__(cls, str(arg))
        # str.lower always returns a plain str, even for subclasses
        n._lower = str.lower(n)
        n._hash = hash(n._lower)
        return n

    def __eq__(self, other):
        if isinstance(other, str):
            return self._lower == str.lower(other)
        # previous behaviour joined any iterable of str fragments before
        # comparing, this is retained for backwards compatibility
        try:
            joined = "".join(other)
        except TypeError:
            # not comparable (e.g. an int), let Python fall back so that
            # == evaluates to False instead of raising
            return NotImplemented
        return self._lower == joined.lower()

    def __ne__(self, other):
        # must be defined explicitly, otherwise str.__ne__ is used and
        # != would be case-sensitive while == is not
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __hash__(self):
        # dict hashing done via lower case
        return self._hash

    def __str__(self):
        # returns a plain str with the original case
        return "".join(self)

114 

115 

def load_ensembl_checksum(path: os.PathLike) -> dict:
    """loads the BSD checksums from Ensembl CHECKSUMS file

    Returns
    -------
    dict mapping each file name to (checksum, number of blocks), with
    any entry for README removed
    """
    # blank lines are skipped, every other line holds three
    # whitespace separated fields
    records = (ln.split() for ln in path.read_text().splitlines() if ln.strip())
    sums = {name: (int(cksum), int(blocks)) for cksum, blocks, name in records}
    sums.pop("README", None)
    return sums

127 

128 

def load_ensembl_md5sum(path: os.PathLike) -> dict:
    """loads the md5 sum from Ensembl MD5SUM file

    Returns
    -------
    dict mapping each file name to its md5 sum, with any entry for
    README removed
    """
    # blank lines are skipped, every other line holds two
    # whitespace separated fields
    records = (ln.split() for ln in path.read_text().splitlines() if ln.strip())
    sums = {name: digest for digest, name in records}
    sums.pop("README", None)
    return sums

140 

141 

class atomic_write:
    """performs atomic write operations, cleans up if fails

    Data is written to a uniquely named temporary file, which is moved onto
    the destination path only when writing finishes without an exception.
    Usable as a context manager, or directly via write() and close().
    """

    def __init__(self, path: os.PathLike, tmpdir=None, mode="wb", encoding=None):
        """

        Parameters
        ----------
        path
            path to file
        tmpdir
            directory where temporary file will be created. If None, a new
            temporary directory is made alongside path and removed on exit.
            A directory provided by the caller is never removed.
        mode
            file writing mode
        encoding
            text encoding, applied only when mode is not binary
        """
        path = pathlib.Path(path).expanduser()

        self._path = path
        self._mode = mode
        self._file = None
        self._encoding = encoding
        # only a directory created by this instance may be deleted
        self._owns_tmpdir = tmpdir is None
        self._tmppath = self._make_tmppath(tmpdir)

        self.succeeded = None
        self._close_func = self._close_rename_standard

    def _make_tmppath(self, tmpdir):
        """returns path of temporary file

        Parameters
        ----------
        tmpdir: Path
            to directory

        Returns
        -------
        full path to a temporary file

        Raises
        ------
        FileNotFoundError if tmpdir does not exist

        Notes
        -----
        Uses a random uuid as the file name, adds suffixes from path
        """
        suffixes = "".join(self._path.suffixes)
        parent = self._path.parent
        name = f"{uuid.uuid4()}{suffixes}"
        tmpdir = (
            pathlib.Path(mkdtemp(dir=parent))
            if tmpdir is None
            else pathlib.Path(tmpdir)
        )

        if not tmpdir.exists():
            raise FileNotFoundError(f"{tmpdir} directory does not exist")

        return tmpdir / name

    def _get_fileobj(self):
        """returns file to be written to, opening it on first use"""
        if self._file is None:
            # open() rejects an encoding argument for binary modes
            encoding = None if "b" in self._mode else self._encoding
            self._file = open(self._tmppath, self._mode, encoding=encoding)

        return self._file

    def __enter__(self) -> IO:
        return self._get_fileobj()

    def _cleanup(self):
        """removes the temporary file and, if made by this instance, its directory"""
        if self._owns_tmpdir:
            shutil.rmtree(self._tmppath.parent, ignore_errors=True)
            return

        # the directory belongs to the caller, so only our file is removed
        try:
            self._tmppath.unlink()
        except FileNotFoundError:
            # already moved to the destination, or never created
            pass

    def _close_rename_standard(self, src):
        """moves the temporary file src onto the destination path"""
        dest = pathlib.Path(self._path)
        # replace() overwrites an existing destination in one step, so
        # there is no window in which the destination is missing
        src.replace(dest)
        self._cleanup()

    def __exit__(self, exc_type, exc_val, exc_tb):
        # opening here means closing without having written anything
        # produces an empty file instead of failing on a missing handle
        self._get_fileobj().close()
        try:
            if exc_type is None:
                self._close_func(self._tmppath)
                self.succeeded = True
            else:
                self.succeeded = False
        finally:
            # runs even if moving the file failed
            self._cleanup()

    def write(self, text):
        """writes text to file"""
        fileobj = self._get_fileobj()
        fileobj.write(text)

    def close(self):
        """closes file"""
        self.__exit__(None, None, None)

239 

240 

# maps the name of an Ensembl signature file to the function that loads it
_sig_load_funcs = dict(CHECKSUMS=load_ensembl_checksum, MD5SUM=load_ensembl_md5sum)
# maps the name of an Ensembl signature file to the function that
# calculates that type of signature
_sig_calc_funcs = dict(CHECKSUMS=checksum, MD5SUM=md5sum)
# matches text containing CHECKSUMS, MD5SUM or README
_dont_checksum = re.compile("(CHECKSUMS|MD5SUM|README)")
# matches text containing CHECKSUMS or MD5SUM
_sig_file = re.compile("(CHECKSUMS|MD5SUM)")

245 

246 

def dont_checksum(path: os.PathLike) -> bool:
    """returns True if path matches the _dont_checksum pattern"""
    found = _dont_checksum.search(str(path))
    return found is not None

249 

250 

@functools.singledispatch
def is_signature(path: os.PathLike) -> bool:
    """returns True if the name of path matches the _sig_file pattern

    Notes
    -----
    For str input the whole string is searched, for all other types
    only the name attribute is searched.
    """
    found = _sig_file.search(path.name)
    return found is not None


@is_signature.register(str)
def _(path: str) -> bool:
    found = _sig_file.search(path)
    return found is not None

259 

260 

@functools.singledispatch
def get_sig_calc_func(sig_path) -> Callable:
    """returns signature calculating function based on Ensembl path name

    Raises
    ------
    NotImplementedError for any type without a registered implementation
    """
    raise NotImplementedError(f"{type(sig_path)} not supported")


@get_sig_calc_func.register(str)
def _(sig_path: str) -> Callable:
    # sig_path must be a key of _sig_calc_funcs, otherwise KeyError
    calc_func = _sig_calc_funcs[sig_path]
    return calc_func

270 

271 

def get_signature_data(path: os.PathLike) -> dict:
    """loads the signature data from an Ensembl signature file

    Parameters
    ----------
    path
        path to a signature file, its name selects the loading function

    Returns
    -------
    the dict produced by the loading function registered for path.name

    Raises
    ------
    KeyError if path.name is not a key of _sig_load_funcs
    """
    loader = _sig_load_funcs[path.name]
    return loader(path)

274 

275 

def rich_display(c3t, title_justify="left"):
    """converts a cogent3 Table to a Rich Table and displays it

    Parameters
    ----------
    c3t
        the cogent3 Table to display
    title_justify
        justification of the title, passed to the Rich Table
    """
    from cogent3.format.table import formatted_array
    from rich.console import Console
    from rich.table import Table

    cols = c3t.columns
    # format the values of every column, in header order
    columns = []
    for name in c3t.header:
        formatted = formatted_array(cols[name], pad=False)
        columns.append(formatted[0])

    rich_table = Table(
        title=c3t.title,
        highlight=True,
        title_justify=title_justify,
        title_style="bold blue",
    )
    for name in c3t.header:
        dtype_name = cols[name].dtype.name
        # columns whose dtype name contains int or float are right
        # justified and never wrapped
        is_numeric = any(kind in dtype_name for kind in ("int", "float"))
        rich_table.add_column(
            name,
            justify="right" if is_numeric else "left",
            no_wrap=is_numeric,
        )

    for values in zip(*columns):
        rich_table.add_row(*values)

    Console().print(rich_table)

300 

301 

302_seps = re.compile(r"[-._\s]") 

303 

304 

305def _name_parts(path: str) -> list[str]: 

306 return _seps.split(pathlib.Path(path).name.lower()) 

307 

308 

309def _simple_check(align_parts: str, tree_parts: str) -> int: 

310 """evaluates whether the start of the two paths match""" 

311 matches = 0 

312 for a, b in zip(align_parts, tree_parts): 

313 if a != b: 

314 break 

315 matches += 1 

316 

317 return matches 

318 

319 

def trees_for_aligns(aligns, trees) -> dict[str, str]:
    """returns the best matching tree path for each alignment path

    Parameters
    ----------
    aligns
        paths of alignment files
    trees
        paths of tree files

    Returns
    -------
    dict mapping each alignment path to the tree path whose file name
    shares the most leading name components with it

    Raises
    ------
    ValueError if no tree shares a leading name component with an
    alignment, which includes the case of trees being empty

    Notes
    -----
    When trees tie on the number of shared components, the tree path
    that compares greatest is selected.
    """
    align_names = {p: _name_parts(p) for p in aligns}
    tree_names = {p: _name_parts(p) for p in trees}
    result = {}
    for align, align_parts in align_names.items():
        scored = [
            (_simple_check(align_parts, tree_parts), tree)
            for tree, tree_parts in tree_names.items()
        ]
        # the default means an empty trees gives the informative error
        # below rather than the one max() raises for an empty sequence
        score, best = max(scored, default=(0, None))
        if score == 0:
            raise ValueError(f"no tree for {align}")

        result[align] = best

    return result

336 

337 

@define_app
def _str_to_bytes(data: str) -> bytes:
    """encodes a string as utf8 bytes"""
    encoded = data.encode(encoding="utf8")
    return encoded

342 

343 

@define_app
def _bytes_to_str(data: bytes) -> str:
    """decodes utf8 bytes into a string"""
    decoded = data.decode(encoding="utf8")
    return decoded

348 

349 

@define_app
def blosc_compress_it(data: bytes) -> bytes:
    """compresses data with blosc2 at compression level 9 using the SHUFFLE filter"""
    shuffle = blosc2.Filter.SHUFFLE
    return blosc2.compress(data, clevel=9, filter=shuffle)

353 

354 

@define_app
def blosc_decompress_it(data: bytes, as_bytearray=True) -> bytes:
    """decompresses blosc2 compressed data, returning it as bytes

    Parameters
    ----------
    data
        the compressed data
    as_bytearray
        passed through to blosc2.decompress
    """
    decompressed = blosc2.decompress(data, as_bytearray=as_bytearray)
    return bytes(decompressed)

358 

359 

# app instances combined with +, utf8 encoding and blosc2 compression
# NOTE(review): presumably applied left to right, confirm against the
# cogent3 composable app documentation
elt_compress_it = _str_to_bytes() + blosc_compress_it()
# app instances combined with +, blosc2 decompression and utf8 decoding
elt_decompress_it = blosc_decompress_it() + _bytes_to_str()

362 

# matches a biotype prefix, including the trailing colon
_biotypes = re.compile(r"(gene|transcript|exon|mRNA|rRNA|protein):")


def sanitise_stableid(stableid: str) -> str:
    """removes the <biotype>: component from an Ensembl stable ID

    Notes
    -----
    The GFF3 files from Ensembl store identifiers as <biotype>:<identifier>,
    this function removes the redundant biotype component.
    """
    cleaned = re.sub(_biotypes, "", stableid)
    return cleaned

375 

376 

@contextlib.contextmanager
def fake_wake(*args, **kwargs):
    """context manager that accepts any arguments and does nothing"""
    yield None