Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/download.py: 53%
120 statements
« prev ^ index » next coverage.py v7.2.3, created at 2023-12-25 11:36 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2023-12-25 11:36 +1100
1from __future__ import annotations
3import os
4import pathlib
5import re
6import shutil
8import click
10from cogent3 import load_tree
12from ensembl_lite._config import Config
13from ensembl_lite._site_map import get_site_map
14from ensembl_lite.ftp_download import download_data, listdir
15from ensembl_lite.species import Species, species_from_ensembl_tree
16from ensembl_lite.util import (
17 dont_checksum,
18 get_resource_path,
19 is_signature,
20 trees_for_aligns,
21)
# "sample.cfg" as resolved by the project's get_resource_path helper
# (presumably the sample configuration shipped with the package -- confirm)
_cfg = get_resource_path("sample.cfg")
# File names that must NOT be downloaded: anything containing "dna_sm" or
# "dna_rm" (presumably Ensembl's soft-masked / repeat-masked variants), and the
# "toplevel.fa.gz" / "primary_assembly.fa.gz" files.
# The dots are written as "[.]" so they only match a literal "." -- a bare "."
# would match any character.
_invalid_seq = re.compile(r"(dna_(sm|rm)|(toplevel|primary_assembly)[.]fa[.]gz)")


def valid_seq_file(name: str) -> bool:
    """Return True if ``name`` is a sequence file that should be downloaded.

    A name is accepted when it matches none of the exclusions in
    ``_invalid_seq``. Names that match nothing at all (e.g. ``README``,
    ``CHECKSUMS``) are therefore accepted too.

    Parameters
    ----------
    name
        file name (or path) to check

    Returns
    -------
    bool
        False for masked (``dna_sm`` / ``dna_rm``), ``toplevel.fa.gz`` and
        ``primary_assembly.fa.gz`` files, True otherwise
    """
    return _invalid_seq.search(name) is None
class valid_gff3_file:
    """Callable file-name filter selecting the whole genome gff3 file.

    An instance accepts a name when it contains ``.<release>.gff3.gz``,
    ``README`` or ``CHECKSUMS``. Names with anything between the release and
    ``.gff3.gz`` (e.g. ``.<release>.chromosome.1.gff3.gz``) are rejected.
    """

    def __init__(self, release: str) -> None:
        """
        Parameters
        ----------
        release
            Ensembl release identifier, matched literally in the file name
        """
        # escape the release so regex metacharacters in it (e.g. the "." in
        # "1.0") are matched literally; str() also tolerates an int release
        literal_release = re.escape(str(release))
        self._valid = re.compile(
            f"([.]{literal_release}[.]gff3[.]gz|README|CHECKSUMS)"
        )

    def __call__(self, name: str) -> bool:
        """Return True if ``name`` should be downloaded."""
        return self._valid.search(name) is not None
def _remove_tmpdirs(path: os.PathLike) -> None:
    """Delete any tmp dirs left over from unsuccessful runs.

    Every directory directly inside ``path`` whose name starts with ``tmp`` is
    removed together with its contents. Regular files are left alone, even if
    their name starts with ``tmp``.

    Parameters
    ----------
    path
        directory to clean; a ``pathlib.Path``, any other ``os.PathLike`` or a
        plain string
    """
    # coerce first: only pathlib.Path offers .glob(), os.PathLike in general
    # (and str) does not
    root = pathlib.Path(path)
    # collect before deleting so the directory is not modified while it is
    # being listed
    leftovers = [entry for entry in root.glob("tmp*") if entry.is_dir()]
    for leftover in leftovers:
        shutil.rmtree(leftover)
def download_species(config: Config, debug: bool, verbose: bool):
    """Download the sequence (fasta) and annotation (gff3) files per species.

    For every entry of ``config.species_dbs`` the remote fasta and gff3
    directories are listed, filtered by file name and the selected files are
    written to ``config.staging_genomes / <db prefix> / <fasta|gff3>``.

    Parameters
    ----------
    config
        supplies host, remote_path, release, species_dbs and staging_genomes
    debug
        if True, only the first 4 paths not flagged by ``dont_checksum`` plus
        the signature files are downloaded
    verbose
        if True, report the settings and the remote paths found
    """
    remote_template = f"{config.remote_path}/release-{config.release}/" + "{}"
    site_map = get_site_map(config.host)
    if verbose:
        click.secho(f"DOWNLOADING\n ensembl release={config.release}", fg="green")
        click.secho("\n".join(f" {d}" for d in config.species_dbs), fg="green")
        click.secho(f"\nWRITING to output path={config.staging_genomes}\n", fg="green")

    # one file-name filter per kind of data; dict order fixes download order
    name_filters = {"fasta": valid_seq_file, "gff3": valid_gff3_file(config.release)}
    for species_key in config.species_dbs:
        db_prefix = Species.get_ensembl_db_prefix(species_key)
        species_dir = config.staging_genomes / db_prefix
        species_dir.mkdir(parents=True, exist_ok=True)
        for kind, accept in name_filters.items():
            # the site map knows where each kind of data lives on the server
            locate = (
                site_map.get_seqs_path
                if kind == "fasta"
                else site_map.get_annotations_path
            )
            remote_dir = remote_template.format(locate(db_prefix))
            remote_paths = list(listdir(config.host, path=remote_dir, pattern=accept))
            if verbose:
                print(f"{remote_paths=}")

            if debug:
                # keep every signature file, but only a few of the other
                # files so a debugging run finishes quickly
                signatures = [p for p in remote_paths if is_signature(p)]
                others = [p for p in remote_paths if not dont_checksum(p)]
                remote_paths = others[:4] + signatures

            dest_path = species_dir / kind
            dest_path.mkdir(parents=True, exist_ok=True)
            _remove_tmpdirs(dest_path)
            download_data(
                host=config.host,
                local_dest=dest_path,
                remote_paths=remote_paths,
                description=f"{db_prefix[:10]}.../{kind}",
                do_checksum=True,
            )
class valid_compara_align:
    """Callable file-name filter for whole genome alignment data.

    Accepts names containing ``.emf.gz``, ``.maf.gz``, ``README`` or
    ``MD5SUM``.
    """

    def __init__(self) -> None:
        accepted = "([.](emf|maf)[.]gz|README|MD5SUM)"
        self._valid = re.compile(accepted)

    def __call__(self, name: str) -> bool:
        """Return True if ``name`` should be downloaded."""
        found = self._valid.search(name)
        return found is not None
def download_aligns(config: Config, debug: bool, verbose: bool):
    """Download the whole genome alignments named in ``config.align_names``.

    Does nothing when no alignment names are configured. Files for each
    alignment are written to ``config.staging_aligns / <alignment name>``.

    Parameters
    ----------
    config
        supplies host, remote_path, release, align_names and staging_aligns
    debug
        if True, only the first 4 non-signature files plus the signature
        files are downloaded
    verbose
        if True, print the remote paths found for each alignment
    """
    if not config.align_names:
        return

    site_map = get_site_map(config.host)
    release_root = f"{config.remote_path}/release-{config.release}"
    remote_template = f"{release_root}/{site_map.alignments_path}/" + "{}"
    accept = valid_compara_align()
    for align_name in config.align_names:
        remote_dir = remote_template.format(align_name)
        remote_paths = list(listdir(config.host, remote_dir, accept))
        if verbose:
            print(remote_paths)

        if debug:
            # split into signature files (all kept) and the rest (truncated)
            signatures, others = [], []
            for candidate in remote_paths:
                bucket = signatures if is_signature(candidate) else others
                bucket.append(candidate)
            remote_paths = others[:4] + signatures

        dest_dir = config.staging_aligns / align_name
        dest_dir.mkdir(parents=True, exist_ok=True)
        _remove_tmpdirs(dest_dir)
        download_data(
            host=config.host,
            local_dest=dest_dir,
            remote_paths=remote_paths,
            description=f"compara/{align_name[:10]}...",
            do_checksum=True,
        )
class valid_compara_homology:
    """Callable file-name filter for homology tsv files.

    Accepts names containing ``.tsv.gz``, ``README`` or ``MD5SUM``.
    """

    def __init__(self) -> None:
        accepted = "([.]tsv[.]gz|README|MD5SUM)"
        self._valid = re.compile(accepted)

    def __call__(self, name: str) -> bool:
        """Return True if ``name`` should be downloaded."""
        found = self._valid.search(name)
        return found is not None
def download_homology(config: Config, debug: bool, verbose: bool):
    """Download the tsv homology files for each entry of ``config.db_names``.

    Does nothing unless alignment names or tree names are configured. Files
    are written to ``config.staging_homologies / <db name>`` and are not
    checksum-verified.

    Parameters
    ----------
    config
        supplies host, remote_path, release, align_names, tree_names,
        db_names and staging_homologies
    debug
        if True, only the first 4 non-signature files are downloaded
    verbose
        if True, print the remote paths found for each db
    """
    if not (config.align_names or config.tree_names):
        return

    site_map = get_site_map(config.host)
    release_root = f"{config.remote_path}/release-{config.release}"
    remote_template = f"{release_root}/{site_map.homologies_path}/" + "{}"
    accept = valid_compara_homology()
    staging_root = config.staging_homologies

    for db_name in config.db_names:
        remote_dir = remote_template.format(db_name)
        remote_paths = list(listdir(config.host, remote_dir, accept))
        if verbose:
            print(remote_paths)

        if debug:
            # signature files are dropped (nothing is checksummed below) and
            # only a handful of data files kept, to shorten debugging runs
            remote_paths = [p for p in remote_paths if not is_signature(p)][:4]

        dest_dir = staging_root / db_name
        dest_dir.mkdir(parents=True, exist_ok=True)
        _remove_tmpdirs(dest_dir)
        download_data(
            host=config.host,
            local_dest=dest_dir,
            remote_paths=remote_paths,
            description=f"homologies/{db_name[:10]}...",
            do_checksum=False,  # no checksums for species homology files
        )
def download_ensembl_tree(host: str, remote_path: str, release: str, tree_fname: str):
    """Load one tree file from the Ensembl site over https.

    Parameters
    ----------
    host
        server name, without the scheme
    remote_path
        path on the server that contains the ``release-<release>`` directories
    release
        Ensembl release identifier
    tree_fname
        file name of the tree inside the site's trees directory

    Returns
    -------
    whatever ``cogent3.load_tree`` returns for the assembled URL
    """
    trees_dir = get_site_map(host).trees_path
    release_dir = f"release-{release}"
    url = f"https://{host}/{remote_path}/{release_dir}/{trees_dir}/{tree_fname}"
    return load_tree(url)
def get_ensembl_trees(host: str, remote_path: str, release: str) -> list[str]:
    """List the ``.nh`` tree files in the site's trees directory for a release.

    Parameters
    ----------
    host
        server name
    remote_path
        path on the server that contains the ``release-<release>`` directories
    release
        Ensembl release identifier

    Returns
    -------
    list
        the entries yielded by ``listdir`` whose name ends with ``.nh``
    """

    def has_nh_suffix(fname):
        return fname.endswith(".nh")

    trees_dir = get_site_map(host).trees_path
    remote_dir = f"{remote_path}/release-{release}/{trees_dir}"
    return list(listdir(host=host, path=remote_dir, pattern=has_nh_suffix))
def get_species_for_alignments(
    host: str, remote_path: str, release: str, align_names: list[str]
) -> dict[str, list[str]]:
    """Return the species covered by the indicated alignments.

    The tree matching each alignment (as chosen by ``trees_for_aligns``) is
    loaded from the server and the species found in it are collected.

    Parameters
    ----------
    host
        server name
    remote_path
        path on the server that contains the ``release-<release>`` directories
    release
        Ensembl release identifier
    align_names
        names of the alignments of interest

    Returns
    -------
    dict
        ``{species name: ["core"]}`` for every species in any of the trees
    """
    available_trees = get_ensembl_trees(
        host=host, remote_path=remote_path, release=release
    )
    trees_by_align = trees_for_aligns(align_names, available_trees)
    species = {}
    for remote_tree in trees_by_align.values():
        tree = download_ensembl_tree(
            host=host,
            remote_path=remote_path,
            release=release,
            tree_fname=pathlib.Path(remote_tree).name,
        )
        # the mapping is {common name: db prefix}; only the common names are
        # used, each one getting its own fresh ["core"] list
        for common_name in species_from_ensembl_tree(tree):
            species[common_name] = ["core"]
    return species