Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/ftp_download.py: 98%

49 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2023-12-25 11:36 +1100

1import os 

2import pathlib 

3 

4from ftplib import FTP 

5from typing import Callable, Iterable 

6 

7from rich.progress import track 

8from unsync import unsync 

9 

10from ensembl_lite.util import ( 

11 atomic_write, 

12 dont_checksum, 

13 get_sig_calc_func, 

14 get_signature_data, 

15 is_signature, 

16) 

17 

18 

def configured_ftp(host: str = "ftp.ensembl.org") -> FTP:
    """Return a logged-in FTP connection to ``host``.

    ``login()`` is called without arguments, so ftplib's default
    (anonymous) credentials are used. The caller is responsible for
    closing the returned connection.
    """
    connection = FTP(host)
    connection.login()
    return connection

23 

24 

def listdir(host: str, path: str, pattern: Callable = None) -> Iterable[str]:
    """Yield the remote path of each entry in an FTP directory.

    Parameters
    ----------
    host
        FTP host passed to ``configured_ftp``.
    path
        remote directory to change into and list.
    pattern
        optional callable applied to each entry name; only names for
        which it returns a truthy value are yielded. When omitted, every
        entry is accepted.

    Yields
    ------
    str
        ``f"{path}/{name}"`` for each accepted entry name.

    Notes
    -----
    This is a generator, so no connection is made until iteration
    starts. The connection is closed when iteration finishes, when an
    error is raised, or when the generator is closed before exhaustion.
    """
    pattern = pattern or (lambda x: True)
    ftp = configured_ftp(host=host)
    try:
        ftp.cwd(path)
        for fn in ftp.nlst():
            if pattern(fn):
                yield f"{path}/{fn}"
    finally:
        # also runs on early generator close and on errors, so the
        # connection is not left open when the caller stops iterating
        ftp.close()

34 

35 

def _copy_to_local(host: str, src: os.PathLike, dest: os.PathLike) -> os.PathLike:
    """Download ``src`` from ``host`` to ``dest`` unless ``dest`` exists.

    Parameters
    ----------
    host
        FTP host passed to ``configured_ftp``.
    src
        remote path, used verbatim in the ``RETR`` command.
    dest
        local destination; must provide ``.exists()`` (e.g. a
        ``pathlib.Path``).

    Returns
    -------
    ``dest``, whether the file was downloaded now or already existed.
    """
    if dest.exists():
        # already present locally, skip the transfer
        return dest

    ftp = configured_ftp(host=host)
    try:
        # pass in checksum and keep going until it's correct?
        with atomic_write(dest, mode="wb") as outfile:
            ftp.retrbinary(f"RETR {src}", outfile.write)
    finally:
        # release the connection even when the transfer or write fails
        ftp.close()
    return dest

46 

47 

# `_copy_to_local` wrapped by `unsync`; `_get_saved_paths_unsync` calls this
# once per remote path and then collects each call's `.result()`.
# NOTE(review): presumably unsync runs each call on a background thread so
# the downloads overlap — confirm against the unsync documentation.
unsynced_copy_to_local = unsync(_copy_to_local)

49 

50 

def _get_saved_paths_unsync(description, host, local_dest, remote_paths):
    """Copy every remote path via ``unsynced_copy_to_local``.

    All copies are submitted first; their results are then collected in
    submission order while iterating through ``track`` (labelled with
    ``description``, ``transient=True``). Each file is saved under
    ``local_dest`` using the final component of its remote path.
    """
    pending = []
    for remote in remote_paths:
        target = local_dest / pathlib.Path(remote).name
        pending.append(unsynced_copy_to_local(host, remote, target))

    saved = []
    for job in track(pending, description=description, transient=True):
        saved.append(job.result())
    return saved

59 

60 

def _get_saved_paths(description, host, local_dest, remote_paths):  # pragma: no cover
    """Copy remote paths one at a time with ``_copy_to_local``.

    Takes the same arguments as ``_get_saved_paths_unsync`` and returns
    the saved paths in input order.
    """
    # keep this, it's useful for debugging
    progress = track(remote_paths, description=description, transient=True)
    return [
        _copy_to_local(host, remote, local_dest / pathlib.Path(remote).name)
        for remote in progress
    ]

68 

69 

def download_data(
    *,
    host: str,
    local_dest: os.PathLike,
    remote_paths: Iterable[os.PathLike],
    description,
    do_checksum: bool,
) -> bool:
    """Download remote files and optionally validate their checksums.

    Parameters
    ----------
    host
        FTP host to download from.
    local_dest
        local directory the files are saved into.
    remote_paths
        remote paths to fetch.
    description
        label for the download progress display.
    do_checksum
        when true, each saved file not excluded by ``dont_checksum`` is
        checked against the signature data found in its directory.

    Returns
    -------
    ``True`` once downloading (and validation, if requested) completes.

    Raises
    ------
    AssertionError
        if a computed signature differs from the expected one; the
        offending path is the exception argument.
    """
    saved_paths = _get_saved_paths_unsync(description, host, local_dest, remote_paths)

    # load the signature data and sig calc keyed by parent dir
    all_checksums = {}
    all_check_funcs = {}
    for path in saved_paths:
        if is_signature(path):
            all_checksums[str(path.parent)] = get_signature_data(path)
            all_check_funcs[str(path.parent)] = get_sig_calc_func(path.name)

    if do_checksum:
        for path in track(
            saved_paths, description="Validating checksums...", transient=True
        ):
            if dont_checksum(path):
                continue
            key = str(path.parent)
            expect_sig = all_checksums[key][path.name]
            calc_sig = all_check_funcs[key]
            signature = calc_sig(path.read_bytes(), path.stat().st_size)
            if signature != expect_sig:
                # explicit raise rather than `assert`: assert statements are
                # removed under `python -O`, which would skip validation
                raise AssertionError(path)

    return True