Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/cli.py: 33%
291 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:32 -0400
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:32 -0400
1import pathlib
2import shutil
4import click
7try:
8 from wakepy.keep import running as keep_running
9except ImportError:
10 from ensembl_lite._util import fake_wake as keep_running
12from trogon import tui
14from ensembl_lite import __version__
15from ensembl_lite import _config as elt_config
16from ensembl_lite import _download as elt_download
17from ensembl_lite._species import Species
18from ensembl_lite._util import PathType
try:
    # Enter the keep-awake context manager once at import time. If doing so
    # raises NotImplementedError (observed as flaky behaviour on linux), swap
    # in the project's fake_wake stand-in so later `with keep_running():`
    # blocks still work.
    with keep_running():
        pass
except NotImplementedError:
    from ensembl_lite._util import fake_wake as keep_running
def _get_installed_config_path(ctx, param, path) -> PathType:
    """Resolve the ``--installed`` option value to the installed config file.

    This is a click option callback (``ctx`` and ``param`` are unused).

    Parameters
    ----------
    path
        either the config file itself (its name equals
        ``elt_config.INSTALLED_CONFIG_NAME``) or the directory holding it

    Returns
    -------
    pathlib.Path to the installed config file

    Raises
    ------
    SystemExit
        with exit code 1, after printing a message, if the config file
        does not exist
    """
    path = pathlib.Path(path)
    if path.name != elt_config.INSTALLED_CONFIG_NAME:
        # a directory was given, the config file lives directly inside it
        path = path / elt_config.INSTALLED_CONFIG_NAME

    # check existence for both forms of input so that a bad path is always
    # reported here rather than failing later when the config is read
    if not path.exists():
        click.secho(f"{str(path)} missing", fg="red")
        raise SystemExit(1)
    return path
43def _values_from_csv(ctx, param, value) -> list[str] | None:
44 if value is None:
45 return
47 return [f.strip() for f in value.split(",")]
def _species_names_from_csv(ctx, param, species) -> list[str] | None:
    """Convert a comma separated list of species into Ensembl db prefixes.

    click option callback. The raw value is split with ``_values_from_csv``
    and each name is passed through ``Species.get_ensembl_db_prefix``.

    Returns
    -------
    None if no value was provided, otherwise a list with one converted
    name per input name, in input order.

    Raises
    ------
    SystemExit
        with exit code 1, after printing a message, if
        ``Species.get_ensembl_db_prefix`` raises ValueError for a name
    """
    names = _values_from_csv(ctx, param, species)
    if names is None:
        return None

    db_names = []
    for name in names:
        try:
            db_name = Species.get_ensembl_db_prefix(name)
        except ValueError:
            click.secho(f"ERROR: unknown species {name!r}", fg="red")
            # SystemExit rather than the site-provided exit() helper, which
            # is intended for interactive sessions only
            raise SystemExit(1) from None

        db_names.append(db_name)

    return db_names
# defining some of the options
# Each name below is a reusable click option decorator that is applied to one
# or more of the commands defined further down.
_cfgpath = click.option(
    "-c",
    "--configpath",
    default=elt_download._cfg,
    type=pathlib.Path,
    help="path to config file specifying databases, only "
    "species or compara at present",
)
# NOTE: "-d" is also the short flag of _debug; the two options are never
# applied to the same command in this module.
_download = click.option(
    "-d",
    "--download",
    type=pathlib.Path,
    help="path to local download directory, contains a cfg file",
)
_installed = click.option(
    "-i",
    "--installed",
    required=True,
    callback=_get_installed_config_path,
    help="string pointing to installation",
)
# NOTE(review): the only command using this option (homologs) treats the value
# as a directory and writes fasta files into it -- confirm the help text.
_outpath = click.option(
    "-o", "--outpath", required=True, type=pathlib.Path, help="path to write json file"
)
_align_name = click.option(
    "--align_name",
    default=None,
    help="Ensembl name of the alignment or a glob pattern, e.g. '*primates*'",
)
_ref = click.option("--ref", default=None, help="Reference species.")
_ref_genes_file = click.option(
    "--ref_genes_file",
    default=None,
    type=click.Path(resolve_path=True, exists=True),
    help=".csv or .tsv file with a header containing a stableid column",
)
_limit = click.option(
    "--limit",
    type=int,
    default=None,
    help="Limit to this number of genes.",
    show_default=True,
)

_verbose = click.option(
    "-v",
    "--verbose",
    is_flag=True,
)
_force = click.option(
    "-f",
    "--force_overwrite",
    is_flag=True,
    help="drop existing database if it exists prior to installing",
)
_debug = click.option(
    "-d",
    "--debug",
    is_flag=True,
    help="maximum verbosity, and reduces number of downloads",
)
# shares the "-o/--outpath" spelling with _outpath but is optional
_dbrc_out = click.option(
    "-o",
    "--outpath",
    type=pathlib.Path,
    help="path to directory to export all rc contents",
)
_nprocs = click.option(
    "-np",
    "--num_procs",
    type=int,
    default=1,
    help="number of procs to use, defaults to 1",
)


_outdir = click.option(
    "--outdir",
    type=pathlib.Path,
    default=".",
    help="Output directory name.",
    show_default=True,
)

_species = click.option(
    "--species",
    required=True,
    callback=_species_names_from_csv,
    help="Single species name, or multiple (comma separated).",
)

_mask_features = click.option(
    "--mask_features",
    callback=_values_from_csv,
    help="biotypes to mask (comma separated).",
)
@tui()
@click.group()
@click.version_option(__version__)
def main():
    """tools for obtaining and interrogating subsets of https://ensembl.org genomic data"""
    # The group does no work itself; the docstring above is the CLI help text
    # and the subcommands are registered below via @main.command.
@main.command(no_args_is_help=True)
@_dbrc_out
def exportrc(outpath):
    """exports sample config and species table to the nominated path

    setting an environment variable ENSEMBLDBRC with this path
    will force its contents to override the default ensembl_lite settings"""
    from ensembl_lite._util import ENSEMBLDBRC

    # copy everything first, then prune what should not be exported
    shutil.copytree(ENSEMBLDBRC, outpath)

    # Only top-level entries whose stem is purely alphabetic are kept; anything
    # else (e.g. a __pycache__ directory) is deleted from the copy.
    for entry in pathlib.Path(outpath).glob("*"):
        if entry.stem.isalpha():
            continue

        if entry.is_file():
            entry.unlink()
        else:
            # a directory, such as __pycache__
            shutil.rmtree(entry)

    click.secho(f"Contents written to {outpath}", fg="green")
@main.command(no_args_is_help=True)
@_cfgpath
@_debug
@_verbose
def download(configpath, debug, verbose):
    """download data from Ensembl's ftp site"""
    # NOTE(review): this compares the file *name* of configpath with
    # elt_download._cfg -- confirm _cfg is a bare file name and not a path.
    if configpath.name == elt_download._cfg:
        # todo is this statement correct if we're setting a root dir now?
        click.secho(
            "WARN: using the built in demo cfg, will write to /tmp", fg="yellow"
        )

    # the current working directory is used as the root directory
    config = elt_config.read_config(configpath, root_dir=pathlib.Path(".").resolve())

    if verbose:
        print(config)

    # nothing to do unless the config names genomes or alignments
    if not any((config.species_dbs, config.align_names)):
        click.secho("No genomes, no alignments specified", fg="red")
        # SystemExit rather than the site-provided exit() helper, which is
        # intended for interactive sessions only
        raise SystemExit(1)

    if not config.species_dbs:
        # only alignments were named, so look up the species for them
        species = elt_download.get_species_for_alignments(
            host=config.host,
            remote_path=config.remote_path,
            release=config.release,
            align_names=config.align_names,
        )
        config.update_species(species)

    if verbose:
        print(config.species_dbs)

    config.write()
    # keep_running is a context manager wrapped around the whole download
    with keep_running():
        elt_download.download_species(config, debug, verbose)
        elt_download.download_homology(config, debug, verbose)
        elt_download.download_aligns(config, debug, verbose)

    click.secho(f"Downloaded to {config.staging_path}", fg="green")
@main.command(no_args_is_help=True)
@_download
@_nprocs
@_force
@_verbose
def install(download, num_procs, force_overwrite, verbose):
    """create the local representations of the data"""
    # imported under an alias so the module is not rebound to the Progress
    # instance created below
    from rich import progress as rich_progress

    from ensembl_lite._install import (
        local_install_alignments,
        local_install_genomes,
        local_install_homology,
    )

    # --download is not a required option, so it is None when omitted and the
    # path join below would fail with a TypeError
    if download is None:
        click.secho(
            "ERROR: a download directory is required, use --download", fg="red"
        )
        raise SystemExit(1)

    configpath = download / elt_config.DOWNLOADED_CONFIG_NAME
    config = elt_config.read_config(configpath)
    if verbose:
        print(f"{config.install_path=}")

    if force_overwrite:
        # start from a clean slate; a missing directory is not an error
        shutil.rmtree(config.install_path, ignore_errors=True)

    config.install_path.mkdir(parents=True, exist_ok=True)
    elt_config.write_installed_cfg(config)
    with keep_running(), rich_progress.Progress(
        rich_progress.TextColumn("[progress.description]{task.description}"),
        rich_progress.BarColumn(),
        rich_progress.TaskProgressColumn(),
        rich_progress.TimeRemainingColumn(),
        rich_progress.TimeElapsedColumn(),
    ) as progress:
        # every install step receives the same settings and progress display
        shared = dict(
            force_overwrite=force_overwrite,
            max_workers=num_procs,
            verbose=verbose,
            progress=progress,
        )
        local_install_genomes(config, **shared)
        # On test cases, only 30% speedup from running install homology data
        # in parallel due to overhead of pickling the data, but considerable
        # increase in memory. So, run in serial to avoid memory issues since
        # it's reasonably fast anyway. (At least until we have
        # a more robust solution.)
        # NOTE(review): max_workers=num_procs is still passed here, so any
        # serial execution must be enforced inside local_install_homology --
        # confirm.
        local_install_homology(config, **shared)
        local_install_alignments(config, **shared)

    click.secho(f"Contents installed to {str(config.install_path)!r}", fg="green")
@main.command(no_args_is_help=True)
@_installed
def installed(installed):
    """show what is installed"""
    from cogent3 import make_table

    from ensembl_lite._species import Species
    from ensembl_lite._util import rich_display

    config = elt_config.read_installed_cfg(installed)

    # table of installed genomes: one row per entry of the genomes directory
    # for which Species returns a common name
    genomes_root = config.genomes_path
    if genomes_root.exists():
        names = []
        common_names = []
        for dir_name in [entry.name for entry in genomes_root.glob("*")]:
            common = Species.get_common_name(dir_name, level="ignore")
            if common:
                names.append(dir_name)
                common_names.append(common)

        genome_table = make_table(
            data={"species": names, "common name": common_names},
            title="Installed genomes",
        )
        rich_display(genome_table)

    # TODO as above
    # table of installed alignments: stems of the non-hidden entries in the
    # alignments directory
    aligns_root = config.aligns_path
    if aligns_root.exists():
        visible = (
            entry for entry in aligns_root.glob("*") if not entry.name.startswith(".")
        )
        align_table = make_table(
            data={"align name": [entry.stem for entry in visible]},
            title="Installed whole genome alignments",
        )
        rich_display(align_table)
@main.command(no_args_is_help=True)
@_installed
@_species
def species_summary(installed, species):
    """genome summary data for a species"""
    # absolute imports, matching the other commands; relative imports fail
    # when this module is executed directly as a script
    from ensembl_lite._genomedb import (
        _ANNOTDB_NAME,
        get_species_summary,
        load_annotations_for_species,
    )
    from ensembl_lite._util import rich_display

    config = elt_config.read_installed_cfg(installed)
    if species is None:
        click.secho("ERROR: a species name is required", fg="red")
        raise SystemExit(1)

    # the --species callback produces a list, but only one species is allowed
    if len(species) > 1:
        click.secho(f"ERROR: one species at a time, not {species!r}", fg="red")
        raise SystemExit(1)

    species = species[0]
    path = config.installed_genome(species=species) / _ANNOTDB_NAME
    if not path.exists():
        click.secho(f"{species!r} not in {str(config.install_path.parent)!r}", fg="red")
        raise SystemExit(1)

    annot_db = load_annotations_for_species(path=path)
    summary = get_species_summary(annot_db=annot_db, species=species)
    rich_display(summary)
@main.command(no_args_is_help=True)
@_installed
@_outdir
@_align_name
@_ref
@_ref_genes_file
@_mask_features
@_limit
@_force
@_verbose
def alignments(
    installed,
    outdir,
    align_name,
    ref,
    ref_genes_file,
    mask_features,
    limit,
    force_overwrite,
    verbose,
):
    """dump alignments for named genes"""
    from cogent3 import load_table

    from ensembl_lite._aligndb import AlignDb, write_alignments
    from ensembl_lite._genomedb import load_genome, update_stableid_prefixes
    from ensembl_lite._species import Species

    # todo support genomic coordinates, e.g. coord_name:start-stop:strand, for
    # a reference species

    if not ref:
        click.secho(
            "ERROR: must specify a reference genome",
            fg="red",
        )
        raise SystemExit(1)

    # --ref_genes_file defaults to None and is needed to load the stable IDs,
    # so report its absence before anything is removed from disk
    if ref_genes_file is None:
        click.secho(
            "ERROR: must specify a file of gene stable IDs, use --ref_genes_file",
            fg="red",
        )
        raise SystemExit(1)

    if force_overwrite:
        shutil.rmtree(outdir, ignore_errors=True)

    outdir.mkdir(parents=True, exist_ok=True)

    config = elt_config.read_installed_cfg(installed)
    # update the prefixes
    update_stableid_prefixes(config)
    align_path = config.path_to_alignment(align_name)
    if align_path is None:
        click.secho(
            f"{align_name!r} does not match any alignments under {str(config.aligns_path)!r}",
            fg="red",
        )
        raise SystemExit(1)

    # load the gene stable ID's
    table = load_table(ref_genes_file)
    if "stableid" not in table.columns:
        click.secho(
            f"'stableid' column missing from {str(ref_genes_file)!r}",
            fg="red",
        )
        raise SystemExit(1)

    align_db = AlignDb(source=align_path)
    # the alignment db is queried with the converted form of the --ref name
    ref_species = Species.get_ensembl_db_prefix(ref)
    if ref_species not in align_db.get_species_names():
        click.secho(
            f"species {ref!r} not in the alignment",
            fg="red",
        )
        raise SystemExit(1)

    # get all the genomes
    if verbose:
        print(f"working on species {align_db.get_species_names()}")

    genomes = {
        sp: load_genome(config=config, species=sp)
        for sp in align_db.get_species_names()
    }

    write_alignments(
        align_db=align_db,
        genomes=genomes,
        limit=limit,
        mask_features=mask_features,
        outdir=outdir,
        ref_species=ref_species,
        stableids=table.columns["stableid"],
    )

    click.secho("Done!", fg="green")
@main.command(no_args_is_help=True)
@_installed
@_outpath
@click.option(
    "-r",
    "--relationship",
    type=click.Choice(["ortholog_one2one"]),
    default="ortholog_one2one",
    help="type of homology",
)
@_ref
@_nprocs
@_limit
@_force
@_verbose
def homologs(
    installed, outpath, relationship, ref, num_procs, limit, force_overwrite, verbose
):
    """exports all homolog groups of type relationship in fasta format"""
    from rich.progress import Progress

    from ensembl_lite._genomedb import load_genome
    from ensembl_lite._homologydb import (
        _HOMOLOGYDB_NAME,
        collect_seqs,
        load_homology_db,
    )

    if ref is None:
        click.secho("ERROR: a reference species name is required, use --ref", fg="red")
        # SystemExit rather than the site-provided exit() helper, which is
        # intended for interactive sessions only
        raise SystemExit(1)

    if force_overwrite:
        shutil.rmtree(outpath, ignore_errors=True)

    # outpath is a directory, one fasta file is written into it per group
    outpath.mkdir(parents=True, exist_ok=True)

    config = elt_config.read_installed_cfg(installed)
    Species.update_from_file(config.genomes_path / "species.tsv")
    # get all the gene IDs (biotype "gene") from the reference species
    genome = load_genome(config=config, species=ref)
    if verbose:
        print(f"loaded genome for {ref}")
    gene_ids = list(genome.get_ids_for_biotype(biotype="gene"))
    if verbose:
        print(f"found {len(gene_ids)} gene IDs for {ref}")
    db = load_homology_db(path=config.homologies_path / _HOMOLOGYDB_NAME)
    related = []
    with Progress(transient=False) as progress:
        # NOTE(review): with --limit the total counts homolog groups, but the
        # bar advances once per gene ID searched -- confirm this is intended.
        searching = progress.add_task(
            total=limit or len(gene_ids), description="Homolog search"
        )
        for gid in gene_ids:
            if rel := db.get_related_to(gene_id=gid, relationship_type=relationship):
                related.append(rel)
            progress.update(searching, advance=1)

            # stop as soon as the requested number of groups has been found
            if limit and len(related) >= limit:
                break

    if verbose:
        print(f"Found {len(related)} homolog groups")
    # todo create a directory data store writer and write all output to
    # that. This requires homolog_group has a .source attribute
    get_seqs = collect_seqs(config=config)
    with Progress(transient=False) as progress:
        reading = progress.add_task(total=len(related), description="Extracting 🧬")
        for seqs in get_seqs.as_completed(
            related,
            parallel=True,
            show_progress=False,
            par_kw=dict(max_workers=num_procs),
        ):
            progress.update(reading, advance=1)
            # skip falsy results and results without any sequences
            if not seqs:
                if verbose:
                    print(f"{seqs=}")
                continue
            if not seqs.obj.seqs:
                if verbose:
                    print(f"{seqs.obj.seqs=}")
                continue

            # todo also need to be writing out a logfile, plus a meta data table of
            # gene IDs and location info
            txt = [seq.to_fasta() for seq in seqs.obj.seqs]
            outname = outpath / f"{seqs.source.source}.fasta"
            outname.write_text("".join(txt))
@main.command(no_args_is_help=True)
@_installed
@_species
@_outdir
@_limit
def dump_genes(installed, species, outdir, limit):
    """Dump meta data table for genes from one species to <species>-<release>-gene_metadata.tsv"""
    from ensembl_lite._genomedb import (
        _ANNOTDB_NAME,
        get_gene_table_for_species,
        load_annotations_for_species,
    )

    config = elt_config.read_installed_cfg(installed)
    if species is None:
        click.secho("ERROR: a species name is required", fg="red")
        raise SystemExit(1)

    # the --species callback produces a list, but only one species is allowed
    if len(species) > 1:
        click.secho(f"ERROR: one species at a time, not {species!r}", fg="red")
        raise SystemExit(1)

    # unwrap the single name so messages show the name, not a list repr
    species = species[0]
    path = config.installed_genome(species=species) / _ANNOTDB_NAME
    if not path.exists():
        click.secho(f"{species!r} not in {str(config.install_path.parent)!r}", fg="red")
        raise SystemExit(1)

    annot_db = load_annotations_for_species(path=path)
    # the output file name uses the stem of the directory holding the
    # annotation db source
    path = annot_db.source
    table = get_gene_table_for_species(annot_db=annot_db, limit=limit)
    outdir.mkdir(parents=True, exist_ok=True)
    outpath = outdir / f"{path.parent.stem}-{config.release}-gene_metadata.tsv"
    table.write(outpath)
    click.secho(f"Finished: wrote {str(outpath)!r}!", fg="green")
# run the command group when this module is executed as a script
if __name__ == "__main__":
    main()