Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_install.py: 12%
129 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:31 -0400
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:31 -0400
1from __future__ import annotations
3import shutil
4import typing
6from rich.progress import Progress
8from ensembl_lite._aligndb import AlignDb
9from ensembl_lite._config import Config
10from ensembl_lite._genomedb import (
11 _ANNOTDB_NAME,
12 fasta_to_hdf5,
13 make_annotation_db,
14)
15from ensembl_lite._homologydb import (
16 HomologyDb,
17 compressor,
18 inflate,
19 load_homologies,
20 pickler,
21)
22from ensembl_lite._maf import load_align_records
23from ensembl_lite._species import SPECIES_NAME, Species
24from ensembl_lite._util import PathType, get_iterable_tasks
def _make_src_dest_annotation_paths(
    src_dir: PathType, dest_dir: PathType
) -> list[tuple[PathType, PathType]]:
    """Pair every staged gff3 file of a species with its annotation db path.

    Parameters
    ----------
    src_dir
        staging directory of one species, gff3 files are looked up in its
        ``gff3`` subdirectory
    dest_dir
        install directory of the same species

    Returns
    -------
    One ``(gff3 path, annotation db path)`` tuple per file matching
    ``*.gff3.gz``. All tuples share the same destination, namely
    ``dest_dir / _ANNOTDB_NAME``.
    """
    gff3_dir = src_dir / "gff3"
    annot_db_path = dest_dir / _ANNOTDB_NAME
    return [(gff3_path, annot_db_path) for gff3_path in gff3_dir.glob("*.gff3.gz")]
def local_install_genomes(
    config: Config,
    force_overwrite: bool,
    max_workers: int | None,
    verbose: bool = False,
    progress: typing.Optional[Progress] = None,
):
    """Install the staged annotations and sequences of every genome.

    Parameters
    ----------
    config
        provides ``db_names``, ``staging_genomes`` and ``install_genomes``
    force_overwrite
        if True, an existing ``config.install_genomes`` directory is removed
        before installing
    max_workers
        upper bound on the workers handed to ``get_iterable_tasks``, capped
        at the number of genomes + 1. A falsy value is passed on unchanged.
    verbose
        print status messages
    progress
        optional rich progress display, one task is added for the features
        and one for the sequences

    Raises
    ------
    RuntimeError
        if writing the sequences of a genome yields a falsy result

    Notes
    -----
    Stable id prefixes produced while building the annotation db's are
    registered with ``Species`` and the resulting species table is written
    to ``config.install_genomes / SPECIES_NAME``.
    """
    if force_overwrite:
        shutil.rmtree(config.install_genomes, ignore_errors=True)
    # we create the local installation
    config.install_genomes.mkdir(parents=True, exist_ok=True)

    db_names = list(config.db_names)
    # in a single pass over the species we create its install subdirectory
    # and identify the staged gff3 files plus the annotation db they go to
    src_dest_paths = []
    for db_name in db_names:
        dest_dir = config.install_genomes / db_name
        dest_dir.mkdir(parents=True, exist_ok=True)
        src_dir = config.staging_genomes / db_name
        src_dest_paths.extend(_make_src_dest_annotation_paths(src_dir, dest_dir))

    if max_workers:
        max_workers = min(len(db_names) + 1, max_workers)

    if verbose:
        print(f"genomes {max_workers=}")

    # we load the individual gff3 files and write to annotation db's
    msg = "Installing features 📚"
    if progress is not None:
        writing = progress.add_task(total=len(src_dest_paths), description=msg)

    tasks = get_iterable_tasks(
        func=make_annotation_db, series=src_dest_paths, max_workers=max_workers
    )
    for db_name, prefixes in tasks:
        if verbose:
            print(f"{db_name=} {prefixes=}")

        # prefixes may be empty / None, in which case nothing is registered
        for prefix in prefixes or ():
            Species.add_stableid_prefix(db_name, prefix)

        if progress is not None:
            progress.update(writing, description=msg, advance=1)

    species_table = Species.to_table()
    species_table.write(config.install_genomes / SPECIES_NAME)
    if verbose:
        print("Finished installing features ")

    msg = "Installing 🧬🧬"
    if progress is not None:
        # advance is an argument of Progress.update(), not of add_task()
        writing = progress.add_task(total=len(db_names), description=msg)

    # we parallelise across databases
    writer = fasta_to_hdf5(config=config)
    tasks = get_iterable_tasks(func=writer, series=db_names, max_workers=max_workers)
    for result in tasks:
        if not result:
            print(result)
            raise RuntimeError(f"installing genome sequences failed: {result}")

        if progress is not None:
            progress.update(writing, description=msg, advance=1)

    if verbose:
        print("Finished installing sequences ")
def local_install_alignments(
    config: Config,
    force_overwrite: bool,
    max_workers: int | None,
    verbose: bool = False,
    progress: typing.Optional[Progress] = None,
):
    """Install the staged maf alignments, one sqlite db per alignment name.

    Parameters
    ----------
    config
        provides ``db_names``, ``align_names``, ``staging_aligns`` and
        ``install_aligns``
    force_overwrite
        if True, an existing ``config.install_aligns`` directory is removed
        before installing
    max_workers
        upper bound on the workers handed to ``get_iterable_tasks``. When
        greater than 1 it is capped, separately for each alignment, at the
        number of maf files + 1.
    verbose
        print status messages
    progress
        optional rich progress display, one task is added per alignment

    Raises
    ------
    RuntimeError
        if loading the records of a maf file yields a falsy result
    """
    if force_overwrite:
        shutil.rmtree(config.install_aligns, ignore_errors=True)

    aln_loader = load_align_records(set(config.db_names))
    dest_dir = config.install_aligns
    msg = "Installing alignments"

    for align_name in config.align_names:
        src_dir = config.staging_aligns / align_name
        dest_dir.mkdir(parents=True, exist_ok=True)
        paths = list(src_dir.glob(f"{align_name}*maf*"))

        # we adjust the maximum workers to the number of paths, using a
        # separate name so that an alignment with few files does not reduce
        # the workers available to the alignments that follow
        num_workers = max_workers
        if max_workers and max_workers > 1:
            num_workers = min(len(paths) + 1, max_workers)

        if verbose:
            print(f"max_workers={num_workers!r}")

        series = get_iterable_tasks(
            func=aln_loader, series=paths, max_workers=num_workers
        )

        if progress is not None:
            # advance is an argument of Progress.update(), not of add_task()
            writing = progress.add_task(total=len(paths), description=msg)

        # write out to a db with align_name
        db = AlignDb(source=(dest_dir / f"{align_name}.sqlitedb"))
        try:
            records = []
            for result in series:
                if not result:
                    print(result)
                    raise RuntimeError(
                        f"loading alignment records for {align_name!r} "
                        f"failed: {result}"
                    )

                records.extend(result)

                if progress is not None:
                    progress.update(writing, description=msg, advance=1)

            db.add_records(records=records)
        finally:
            # the db is closed even when loading the records fails
            db.close()

    if verbose:
        print("Finished installing alignments")
def local_install_homology(
    config: Config,
    force_overwrite: bool,
    max_workers: int | None,
    verbose: bool = False,
    progress: typing.Optional[Progress] = None,
):
    """Install the staged homology tsv files into a single sqlite db.

    Parameters
    ----------
    config
        provides ``db_names``, ``staging_homologies`` and
        ``install_homologies``
    force_overwrite
        if True, an existing ``config.install_homologies`` directory is
        removed before installing
    max_workers
        upper bound on the workers handed to ``get_iterable_tasks``, capped
        at the number of tsv files + 1. A falsy value means 1 worker.
    verbose
        print status messages
    progress
        optional rich progress display, a single task is added

    Notes
    -----
    The db is written to ``config.install_homologies / "homologies.sqlitedb"``.
    If it holds no records once all files are processed, the file is deleted.
    """
    if force_overwrite:
        shutil.rmtree(config.install_homologies, ignore_errors=True)

    config.install_homologies.mkdir(parents=True, exist_ok=True)

    outpath = config.install_homologies / "homologies.sqlitedb"
    db = HomologyDb(source=outpath)
    try:
        tsv_paths = []
        for sp in config.db_names:
            path = config.staging_homologies / sp
            tsv_paths.extend(path.glob("*.tsv.gz"))

        if max_workers:
            max_workers = min(len(tsv_paths) + 1, max_workers)
        else:
            max_workers = 1

        if verbose:
            print(f"homologies {max_workers=}")

        loader = load_homologies(allowed_species=set(config.db_names))
        if max_workers > 1:
            # with multiple workers the results come back pickled and
            # compressed, they are inflated again in the loop below
            loader = loader + pickler + compressor

        msg = "Installing homologies"
        if progress is not None:
            # advance is an argument of Progress.update(), not of add_task()
            writing = progress.add_task(total=len(tsv_paths), description=msg)

        tasks = get_iterable_tasks(
            func=loader, series=tsv_paths, max_workers=max_workers
        )
        for result in tasks:
            if max_workers > 1:
                # reconstitute the blosc compressed data
                result = inflate(result)

            for rel_type, records in result.items():
                db.add_records(records=records, relationship_type=rel_type)

            if progress is not None:
                progress.update(writing, description=msg, advance=1)

        # has to be established while the db is still open
        no_records = len(db) == 0
    finally:
        # the db is closed even when loading or inserting records fails
        db.close()

    if no_records:
        outpath.unlink()

    if verbose:
        print("Finished installing homologies")