Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_download.py: 53%

from __future__ import annotations

import os
import pathlib
import re
import shutil

import click

from cogent3 import load_tree

from ensembl_lite._config import Config
from ensembl_lite._ftp_download import download_data, listdir
from ensembl_lite._site_map import get_site_map
from ensembl_lite._species import Species, species_from_ensembl_tree
from ensembl_lite._util import (
    dont_checksum,
    get_resource_path,
    is_signature,
    trees_for_aligns,
)


_cfg = get_resource_path("sample.cfg")

_invalid_seq = re.compile("(dna_(sm|rm)|(toplevel|primary_assembly)[.]fa[.]gz)")


def valid_seq_file(name: str) -> bool:
    """unmasked genomic DNA sequences"""
    return _invalid_seq.search(name) is None
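
# Filter note: names matching _invalid_seq are excluded, so soft/repeat-masked
# sequences ("dna_sm"/"dna_rm") and the concatenated "toplevel.fa.gz" /
# "primary_assembly.fa.gz" archives are skipped, while per-chromosome unmasked
# files (illustratively, "Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz") and the
# README / CHECKSUMS entries pass through.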


class valid_gff3_file:
    """whole genome gff3"""

    def __init__(self, release: str) -> None:
        self._valid = re.compile(f"([.]{release}[.]gff3[.]gz|README|CHECKSUMS)")

    def __call__(self, name: str) -> bool:
        return self._valid.search(name) is not None
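
# With a release of, say, "110" (illustrative), the compiled pattern accepts
# names containing ".110.gff3.gz" (the single whole-genome annotation file)
# together with the README and CHECKSUMS entries, and rejects listings such as
# per-chromosome gff3 files, whose names put extra components between the
# release number and the ".gff3.gz" extension.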


def _remove_tmpdirs(path: os.PathLike):
    """delete any tmp dirs left over from unsuccessful runs"""
    tmpdirs = [p for p in path.glob("tmp*") if p.is_dir()]
    for tmpdir in tmpdirs:
        shutil.rmtree(tmpdir)


def download_species(config: Config, debug: bool, verbose: bool):
    """download seq and gff data"""
    remote_template = f"{config.remote_path}/release-{config.release}/" + "{}"
    site_map = get_site_map(config.host)
    if verbose:
        click.secho(f"DOWNLOADING\n ensembl release={config.release}", fg="green")
        click.secho("\n".join(f" {d}" for d in config.species_dbs), fg="green")
        click.secho(f"\nWRITING to output path={config.staging_genomes}\n", fg="green")

    patterns = dict(fasta=valid_seq_file, gff3=valid_gff3_file(config.release))
    for key in config.species_dbs:
        db_prefix = Species.get_ensembl_db_prefix(key)
        local_root = config.staging_genomes / db_prefix
        local_root.mkdir(parents=True, exist_ok=True)
        for subdir in ("fasta", "gff3"):
            if subdir == "fasta":
                remote = site_map.get_seqs_path(db_prefix)
            else:
                remote = site_map.get_annotations_path(db_prefix)

            remote_dir = remote_template.format(remote)
            remote_paths = list(
                listdir(config.host, path=remote_dir, pattern=patterns[subdir])
            )
            if verbose:
                print(f"{remote_paths=}")
            if debug:
                # we need the checksum files
                paths = [p for p in remote_paths if is_signature(p)]
                # but fewer data files, to reduce time for debugging
                remote_paths = [p for p in remote_paths if not dont_checksum(p)]
                remote_paths = remote_paths[:4] + paths

            dest_path = config.staging_genomes / db_prefix / subdir
            dest_path.mkdir(parents=True, exist_ok=True)
            _remove_tmpdirs(dest_path)
            download_data(
                host=config.host,
                local_dest=dest_path,
                remote_paths=remote_paths,
                description=f"{db_prefix[:10]}.../{subdir}",
                do_checksum=True,
            )

    return
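
# Hedged usage sketch (assumes a Config instance already constructed elsewhere,
# e.g. from the bundled sample.cfg via the package's config handling):
#
#     download_species(config, debug=False, verbose=True)
#
# For each configured species this creates fasta/ and gff3/ subdirectories under
# config.staging_genomes/<db_prefix>/ and fetches the matching files with
# checksum verification enabled (do_checksum=True).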


class valid_compara_align:
    """whole genome alignment data"""

    def __init__(self) -> None:
        self._valid = re.compile("([.](emf|maf)[.]gz|README|MD5SUM)")

    def __call__(self, name: str) -> bool:
        return self._valid.search(name) is not None


def download_aligns(config: Config, debug: bool, verbose: bool):
    """download whole genome alignments"""
    if not config.align_names:
        return

    site_map = get_site_map(config.host)
    remote_template = (
        f"{config.remote_path}/release-{config.release}/{site_map.alignments_path}/"
        + "{}"
    )
    valid_compara = valid_compara_align()
    for align_name in config.align_names:
        remote_path = remote_template.format(align_name)
        remote_paths = list(listdir(config.host, remote_path, valid_compara))
        if verbose:
            print(remote_paths)

        if debug:
            # we need the checksum files
            paths = [p for p in remote_paths if is_signature(p)]
            remote_paths = [p for p in remote_paths if not is_signature(p)]
            remote_paths = remote_paths[:4] + paths

        local_dir = config.staging_aligns / align_name
        local_dir.mkdir(parents=True, exist_ok=True)
        _remove_tmpdirs(local_dir)
        download_data(
            host=config.host,
            local_dest=local_dir,
            remote_paths=remote_paths,
            description=f"compara/{align_name[:10]}...",
            do_checksum=True,
        )

    return
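
# Hedged example: if config.align_names included a Compara alignment directory
# name (illustratively, "10_primates.epo"), its MAF/EMF files plus the README
# and MD5SUM entries would be written to config.staging_aligns/<align_name>/.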


class valid_compara_homology:
    """homology tsv files"""

    def __init__(self) -> None:
        self._valid = re.compile("([.]tsv[.]gz|README|MD5SUM)")

    def __call__(self, name: str) -> bool:
        return self._valid.search(name) is not None


def download_homology(config: Config, debug: bool, verbose: bool):
    """downloads tsv homology files for each genome"""
    if not any((config.align_names, config.tree_names)):
        return

    site_map = get_site_map(config.host)
    remote_template = (
        f"{config.remote_path}/release-{config.release}/{site_map.homologies_path}/"
        + "{}"
    )

    local = config.staging_homologies

    for db_name in config.db_names:
        remote_path = remote_template.format(db_name)
        remote_paths = list(listdir(config.host, remote_path, valid_compara_homology()))
        if verbose:
            print(remote_paths)

        if debug:
            # drop the checksum files and keep only a few data files, to reduce
            # time for debugging; checksums are not validated for homologies
            remote_paths = [p for p in remote_paths if not is_signature(p)]
            remote_paths = remote_paths[:4]

        local_dir = local / db_name
        local_dir.mkdir(parents=True, exist_ok=True)
        _remove_tmpdirs(local_dir)
        download_data(
            host=config.host,
            local_dest=local_dir,
            remote_paths=remote_paths,
            description=f"homologies/{db_name[:10]}...",
            do_checksum=False,  # no checksums for species homology files
        )
    return
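
# Note: valid_compara_homology still admits README and MD5SUM entries, but
# download_data is told not to validate them here (do_checksum=False), matching
# the inline comment that there are no checksums for species homology files.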


def download_ensembl_tree(host: str, remote_path: str, release: str, tree_fname: str):
    """loads a tree from Ensembl"""
    site_map = get_site_map(host)
    url = f"https://{host}/{remote_path}/release-{release}/{site_map.trees_path}/{tree_fname}"
    return load_tree(url)


def get_ensembl_trees(host: str, remote_path: str, release: str) -> list[str]:
    """returns trees from ensembl compara"""
    site_map = get_site_map(host)
    path = f"{remote_path}/release-{release}/{site_map.trees_path}"
    return list(listdir(host=host, path=path, pattern=lambda x: x.endswith(".nh")))
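
# Here, as with the valid_* matchers above, the pattern passed to listdir is a
# callable filename predicate; the lambda selects the Newick (.nh) tree files.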


def get_species_for_alignments(
    host: str, remote_path: str, release: str, align_names: list[str]
) -> dict[str, list[str]]:
    """return the species for the indicated alignments"""
    ensembl_trees = get_ensembl_trees(
        host=host, remote_path=remote_path, release=release
    )
    aligns_trees = trees_for_aligns(align_names, ensembl_trees)
    species = {}
    for tree_path in aligns_trees.values():
        tree_path = pathlib.Path(tree_path)
        tree = download_ensembl_tree(
            host=host,
            remote_path=remote_path,
            release=release,
            tree_fname=tree_path.name,
        )
        # dict structure is {common name: db prefix}, just use common name
        species |= {n: ["core"] for n in species_from_ensembl_tree(tree).keys()}
    return species
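
# Hedged sketch of the return shape (species names are illustrative and depend
# on the alignment and release):
#
#     get_species_for_alignments(
#         host=host, remote_path=remote_path, release=release,
#         align_names=["10_primates.epo"],
#     )
#     # -> {"human": ["core"], "chimpanzee": ["core"], ...}
#
# i.e. each common name from the alignment's guide tree mapped to ["core"].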