Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_ftp_download.py: 25%
48 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:32 -0400
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:32 -0400
1import pathlib
3from ftplib import FTP
4from typing import Callable, Iterable
6from rich.progress import track
7from unsync import unsync
9from ensembl_lite._util import (
10 PathType,
11 atomic_write,
12 dont_checksum,
13 get_sig_calc_func,
14 get_signature_data,
15 is_signature,
16)
def configured_ftp(host: str = "ftp.ensembl.org") -> FTP:
    """Return a logged-in FTP connection to ``host``.

    ``login()`` is called with no arguments, so ftplib's default
    credentials are used. The caller is responsible for closing the
    returned connection.
    """
    connection = FTP(host)
    connection.login()
    return connection
def listdir(host: str, path: str, pattern: Callable = None):
    """Yield the entries of a remote FTP directory.

    Parameters
    ----------
    host
        FTP host name to connect to.
    path
        Remote directory to list.
    pattern
        Optional callable taking a file name and returning a truthy value
        for names to keep. When omitted (or falsy) every name is kept.

    Yields
    ------
    str
        ``f"{path}/{name}"`` for each listed name accepted by ``pattern``.

    Notes
    -----
    This is a generator, so no connection is made until iteration starts.
    The complete listing is fetched and the connection closed before the
    first item is yielded, so the connection is released even when
    ``cwd``/``nlst`` raise, ``pattern`` raises, or the consumer stops
    iterating early.
    """
    pattern = pattern or (lambda x: True)
    ftp = configured_ftp(host=host)
    try:
        ftp.cwd(path)
        # nlst() returns the whole listing as a list, so nothing below
        # needs the connection to stay open
        names = ftp.nlst()
    finally:
        ftp.close()

    for fn in names:
        if pattern(fn):
            yield f"{path}/{fn}"
def _copy_to_local(host: str, src: PathType, dest: PathType) -> PathType:
    """Download the remote file ``src`` from ``host`` to the local path ``dest``.

    Parameters
    ----------
    host
        FTP host name to connect to.
    src
        Remote path, interpolated into the FTP ``RETR`` command.
    dest
        Local destination; must provide ``exists()`` (e.g. a
        ``pathlib.Path``).

    Returns
    -------
    ``dest``. If ``dest`` already exists it is returned immediately and no
    connection is opened.

    Notes
    -----
    The FTP connection is closed in a ``finally`` clause so it is released
    even if the transfer or the write to ``dest`` fails.
    """
    if dest.exists():
        return dest

    ftp = configured_ftp(host=host)
    try:
        # pass in checksum and keep going until it's correct?
        # NOTE(review): atomic_write presumably only leaves ``dest`` in place
        # when the block completes without error -- confirm in _util
        with atomic_write(dest, mode="wb") as outfile:
            ftp.retrbinary(f"RETR {src}", outfile.write)
    finally:
        ftp.close()
    return dest
# unsync-wrapped variant of _copy_to_local: calling it returns a task object
# whose ``.result()`` gives the saved path (used by _get_saved_paths_unsync)
unsynced_copy_to_local = unsync(_copy_to_local)
def _get_saved_paths_unsync(description, host, local_dest, remote_paths):
    """Download every path in ``remote_paths`` via the unsync-wrapped copier.

    Each remote path is saved under ``local_dest`` using its final path
    component as the file name. All tasks are created first, then their
    results are collected, in submission order, behind a transient progress
    bar labelled ``description``.

    Returns a list with the result of each task.
    """
    pending = []
    for remote in remote_paths:
        target = local_dest / pathlib.Path(remote).name
        pending.append(unsynced_copy_to_local(host, remote, target))

    results = []
    for job in track(pending, description=description, transient=True):
        results.append(job.result())
    return results
def _get_saved_paths(description, host, local_dest, remote_paths):  # pragma: no cover
    """Sequentially download ``remote_paths`` into ``local_dest``.

    Same contract as ``_get_saved_paths_unsync`` but each file is copied
    one at a time through ``_copy_to_local``.
    """
    # keep this, it's useful for debugging
    return [
        _copy_to_local(host, remote, local_dest / pathlib.Path(remote).name)
        for remote in track(remote_paths, description=description, transient=True)
    ]
def download_data(
    *,
    host: str,
    local_dest: PathType,
    remote_paths: Iterable[PathType],
    description,
    do_checksum: bool,
) -> bool:
    """Download remote files and optionally validate them against checksums.

    Parameters
    ----------
    host
        FTP host name to download from.
    local_dest
        Local directory the files are saved into.
    remote_paths
        Remote paths to download.
    description
        Label shown on the download progress bar.
    do_checksum
        If True, each downloaded file (except those for which
        ``dont_checksum`` is true) is checked against the signature data
        found in its directory.

    Returns
    -------
    True once all downloads, and validation if requested, have completed.

    Raises
    ------
    AssertionError
        If a computed signature differs from the expected one. The
        offending path is the exception argument.
    KeyError
        If a file to be validated has no signature data for its directory,
        or no entry for its name in that data.
    """
    saved_paths = _get_saved_paths_unsync(description, host, local_dest, remote_paths)
    if not do_checksum:
        # nothing to validate, so don't bother parsing the signature files
        return True

    # load the signature data and sig calc keyed by parent dir
    all_checksums = {}
    all_check_funcs = {}
    for path in saved_paths:
        if is_signature(path):
            key = str(path.parent)
            all_checksums[key] = get_signature_data(path)
            all_check_funcs[key] = get_sig_calc_func(path.name)

    for path in track(
        saved_paths, description="Validating checksums...", transient=True
    ):
        if dont_checksum(path):
            continue
        key = str(path.parent)
        expect_sig = all_checksums[key][path.name]
        calc_sig = all_check_funcs[key]
        signature = calc_sig(path.read_bytes(), path.stat().st_size)
        if signature != expect_sig:
            # raised explicitly rather than via ``assert`` so the check is
            # not stripped when Python runs with -O; the exception type is
            # kept for existing callers
            raise AssertionError(path)

    return True