Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/cli.py: 40%
269 statements
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
1import os
2import pathlib
3import shutil
5from collections import defaultdict
7import click
10try:
11 from wakepy.keep import running as keep_running
12except ImportError:
13 from ensembl_lite._util import fake_wake as keep_running
15from rich.progress import track
16from trogon import tui
18from ensembl_lite import __version__
19from ensembl_lite import _config as elt_config
20from ensembl_lite import _download as elt_download
21from ensembl_lite._species import Species
# Exercise the keep-awake context manager once at import time. If entering it
# raises NotImplementedError, rebind `keep_running` to
# ensembl_lite._util.fake_wake so the `with keep_running():` blocks used by
# the commands below do not fail.
try:
    # trap flaky behaviour on linux
    with keep_running():
        ...
except NotImplementedError:
    from ensembl_lite._util import fake_wake as keep_running
def _get_installed_config_path(ctx, param, path) -> os.PathLike:
    """click callback that resolves the path to the installed config file

    Parameters
    ----------
    ctx, param
        standard click callback arguments, not used
    path
        either the config file itself (its name equals
        ``elt_config.INSTALLED_CONFIG_NAME``) or the directory containing it

    Returns
    -------
    pathlib.Path pointing at the config file

    Notes
    -----
    Prints a message and terminates with exit status 1 if the resolved file
    does not exist. This applies whether a directory or the file itself was
    given.
    """
    path = pathlib.Path(path)
    if path.name != elt_config.INSTALLED_CONFIG_NAME:
        # a directory was provided, the config file sits directly inside it
        path = path / elt_config.INSTALLED_CONFIG_NAME

    if not path.exists():
        click.secho(f"{str(path)} missing", fg="red")
        # SystemExit is raised directly, the exit() builtin is injected by
        # the site module and is not guaranteed to be available
        raise SystemExit(1)
    return path
46def _values_from_csv(ctx, param, value) -> list[str] | None:
47 if value is None:
48 return
50 return [f.strip() for f in value.split(",")]
def _species_names_from_csv(ctx, param, species) -> list[str] | None:
    """click callback converting comma separated species names to db prefixes

    Parameters
    ----------
    ctx, param
        standard click callback arguments, forwarded to _values_from_csv
    species
        comma separated species names, or None

    Returns
    -------
    list with the result of ``Species.get_ensembl_db_prefix`` for each name,
    in the order given, or None if no value was provided

    Notes
    -----
    Prints an error and terminates with exit status 1 on the first name for
    which ``Species.get_ensembl_db_prefix`` raises ValueError.
    """
    names = _values_from_csv(ctx, param, species)
    if names is None:
        return None

    db_names = []
    for name in names:
        try:
            db_name = Species.get_ensembl_db_prefix(name)
        except ValueError:
            click.secho(f"ERROR: unknown species {name!r}", fg="red")
            # SystemExit is raised directly rather than relying on the
            # exit() helper provided by the site module
            raise SystemExit(1) from None

        db_names.append(db_name)

    return db_names
# defining some of the options
# Each name below is a reusable click.option decorator that is applied to one
# or more of the commands defined later in this module.

# config file for the download command, defaults to elt_download._cfg
_cfgpath = click.option(
    "-c",
    "--configpath",
    default=elt_download._cfg,
    type=pathlib.Path,
    help="path to config file specifying databases, only "
    "species or compara at present",
)
# directory holding previously downloaded data, used by the install command
_download = click.option(
    "-d",
    "--download",
    type=pathlib.Path,
    help="path to local download directory, contains a cfg file",
)
# not applied to any command visible in this module
_installation = click.option(
    "--installation",
    type=pathlib.Path,
    help="path to local installation directory",
)
# the callback turns the value into the path of the installed config file and
# terminates the program if that file cannot be found
_installed = click.option(
    "-i",
    "--installed",
    required=True,
    callback=_get_installed_config_path,
    help="string pointing to installation",
)
# used by the homologs command, which creates this path as a directory and
# writes one fasta file per homolog group into it
_outpath = click.option(
    "-o",
    "--outpath",
    required=True,
    type=pathlib.Path,
    help="path to directory for writing output files",
)
# NOTE: the --outdir option (`_outdir`) is defined further down this module,
# alongside --species and --mask_features
_align_name = click.option(
    "--align_name",
    default=None,
    help="Ensembl name of the alignment or a glob pattern, e.g. '*primates*'",
)
_ref = click.option("--ref", default=None, help="Reference species.")
# resolve_path / exists mean click validates the file before the command runs
_ref_genes_file = click.option(
    "--ref_genes_file",
    default=None,
    type=click.Path(resolve_path=True, exists=True),
    help=".csv or .tsv file with a header containing a stableid column",
)
_limit = click.option(
    "--limit",
    type=int,
    default=None,
    help="Limit to this number of genes.",
    show_default=True,
)
# boolean flag shared by several commands
_verbose = click.option(
    "-v",
    "--verbose",
    is_flag=True,
    help="causes stdout/stderr from rsync download to be " "written to screen",
)
# not applied to any command visible in this module (install uses _nprocs)
_numprocs = click.option(
    "-n",
    "--numprocs",
    type=int,
    default=1,
    help="number of processes to use for download",
)
# exposed to the command functions as the `force_overwrite` argument
_force = click.option(
    "-f",
    "--force_overwrite",
    is_flag=True,
    help="drop existing database if it exists prior to " "installing",
)
# NOTE: shares the "-d" short flag with _download; within this module the two
# are attached to different commands (download and install respectively)
_debug = click.option(
    "-d",
    "--debug",
    is_flag=True,
    help="maximum verbosity, and reduces number of downloads",
)
# destination directory for the exportrc command; uses the same "-o" /
# "--outpath" flags as _outpath but is not marked as required
_dbrc_out = click.option(
    "-o",
    "--outpath",
    type=pathlib.Path,
    help="path to directory to export all rc contents",
)
# not applied to any command visible in this module
_release = click.option("-r", "--release", type=int, help="Ensembl release number")
# worker count for the install command, passed on as `max_workers`
_nprocs = click.option(
    "-np",
    "--num_procs",
    type=int,
    default=None,
    help="number of procs to use, defaults to all",
)

# output directory for the alignments and dump_genes commands, defaults to
# the current directory
_outdir = click.option(
    "--outdir",
    type=pathlib.Path,
    default=".",
    help="Output directory name.",
    show_default=True,
)

# the callback converts the comma separated names into a list of Ensembl db
# prefixes and terminates the program on an unknown species
_species = click.option(
    "--species",
    required=True,
    callback=_species_names_from_csv,
    help="Single species name, or multiple (comma separated).",
)

# the callback converts the comma separated value into a list of strings
_mask_features = click.option(
    "--mask_features",
    callback=_values_from_csv,
    help="biotypes to mask (comma separated).",
)
@tui()
@click.group()
@click.version_option(__version__)
def main():
    """tools for obtaining and interrogating subsets of https://ensembl.org genomic data"""
    # Root click group, wrapped by trogon's `tui` decorator. It has no logic
    # of its own: the sub-commands register themselves via @main.command.
@main.command(no_args_is_help=True)
@_dbrc_out
def exportrc(outpath):
    """exports sample config and species table to the nominated path

    setting an environment variable ENSEMBLDBRC with this path
    will force its contents to override the default ensembl_lite settings"""
    from ensembl_lite._util import ENSEMBLDBRC

    # copy the whole resource directory, then prune what should not be exported
    shutil.copytree(ENSEMBLDBRC, outpath)
    exported = pathlib.Path(outpath)
    # an entry is kept only if its stem consists purely of alphabetic
    # characters; anything else is removed from the copy
    unwanted = [entry for entry in exported.glob("*") if not entry.stem.isalpha()]
    for entry in unwanted:
        if entry.is_file():
            entry.unlink()
            continue
        # directories, such as __pycache__
        shutil.rmtree(entry)

    click.secho(f"Contents written to {outpath}", fg="green")
@main.command(no_args_is_help=True)
@_cfgpath
@_debug
@_verbose
def download(configpath, debug, verbose):
    """download data from Ensembl's ftp site"""
    # configpath: pathlib.Path of the config file (default elt_download._cfg)
    # debug, verbose: flags forwarded to the elt_download functions
    #
    # Warn when the built in demo config is being used. Comparing just the
    # file name with elt_download._cfg (the original test) can only match if
    # _cfg is a bare file name, so the complete path is compared as well.
    # NOTE(review): _cfg is defined in ensembl_lite._download, confirm its type
    demo_cfg = elt_download._cfg
    if configpath.name == demo_cfg or configpath == pathlib.Path(demo_cfg):
        click.secho(
            "WARN: using the built in demo cfg, will write to /tmp", fg="yellow"
        )

    config = elt_config.read_config(configpath)
    if not any((config.species_dbs, config.align_names)):
        click.secho("No genomes, no alignments specified", fg="red")
        # raised directly rather than via the site-provided exit() helper
        raise SystemExit(1)

    if not config.species_dbs:
        # only alignments were named, so look up the species they involve
        species = elt_download.get_species_for_alignments(
            host=config.host,
            remote_path=config.remote_path,
            release=config.release,
            align_names=config.align_names,
        )
        config.update_species(species)

    if verbose:
        print(config.species_dbs)

    config.write()
    with keep_running():
        elt_download.download_species(config, debug, verbose)
        elt_download.download_homology(config, debug, verbose)
        elt_download.download_aligns(config, debug, verbose)

    click.secho(f"Downloaded to {config.staging_path}", fg="green")
@main.command(no_args_is_help=True)
@_download
@_nprocs
@_force
@_verbose
def install(download, num_procs, force_overwrite, verbose):
    """create the local representations of the data"""
    # download: directory containing the downloaded data and its config file
    # num_procs: passed to the installers as max_workers
    # force_overwrite: remove an existing installation before installing
    # verbose: print the installation path
    from ensembl_lite._install import (
        local_install_compara,
        local_install_genomes,
        local_install_homology,
    )

    # --download is not a required option, so it is None when the command is
    # invoked with other options only; fail with a message instead of a
    # TypeError from the path join below
    if download is None:
        click.secho(
            "ERROR: a download directory is required (-d / --download)", fg="red"
        )
        raise SystemExit(1)

    configpath = download / elt_config.DOWNLOADED_CONFIG_NAME
    config = elt_config.read_config(configpath)
    if verbose:
        print(f"{config.install_path=}")

    if force_overwrite:
        shutil.rmtree(config.install_path, ignore_errors=True)

    config.install_path.mkdir(parents=True, exist_ok=True)
    elt_config.write_installed_cfg(config)
    with keep_running():
        local_install_genomes(
            config, force_overwrite=force_overwrite, max_workers=num_procs
        )
        local_install_compara(
            config, force_overwrite=force_overwrite, max_workers=num_procs
        )
        local_install_homology(
            config, force_overwrite=force_overwrite, max_workers=num_procs
        )

    click.secho(f"Contents installed to {str(config.install_path)!r}", fg="green")
@main.command(no_args_is_help=True)
@_installed
def installed(installed):
    """show what is installed"""
    from cogent3 import make_table

    from ensembl_lite._species import Species
    from ensembl_lite._util import rich_display

    config = elt_config.read_installed_cfg(installed)

    # one table row per entry of the genomes directory that has a common name
    genomes_dir = config.genomes_path
    if genomes_dir.exists():
        names, common_names = [], []
        for entry in genomes_dir.glob("*"):
            common = Species.get_common_name(entry.name, level="ignore")
            if common:
                names.append(entry.name)
                common_names.append(common)

        genome_table = make_table(
            data={"species": names, "common name": common_names},
            title="Installed genomes",
        )
        rich_display(genome_table)

    # TODO as above
    # one table row per entry of the alignments directory, hidden files skipped
    aligns_dir = config.aligns_path
    if aligns_dir.exists():
        align_names = []
        for entry in aligns_dir.glob("*"):
            if entry.name.startswith("."):
                continue
            align_names.append(entry.stem)

        align_table = make_table(
            data={"align name": align_names}, title="Installed whole genome alignments"
        )
        rich_display(align_table)
@main.command(no_args_is_help=True)
@_installed
@_species
def species_summary(installed, species):
    """genome summary data for a species"""
    # installed: path to the installed config file (resolved by the option
    #   callback)
    # species: list of Ensembl db prefixes produced by the option callback
    #
    # absolute imports, matching every other command in this module; relative
    # imports fail when the module is executed directly as a script
    from ensembl_lite._genomedb import (
        get_annotations_for_species,
        get_species_summary,
    )
    from ensembl_lite._util import rich_display

    config = elt_config.read_installed_cfg(installed)
    if species is None:
        click.secho("ERROR: a species name is required", fg="red")
        # raised directly rather than via the site-provided exit() helper
        raise SystemExit(1)

    if len(species) > 1:
        click.secho(f"ERROR: one species at a time, not {species!r}", fg="red")
        raise SystemExit(1)

    species = species[0]
    annot_db = get_annotations_for_species(config=config, species=species)
    summary = get_species_summary(annot_db=annot_db, species=species)
    rich_display(summary)
@main.command(no_args_is_help=True)
@_installed
@_outdir
@_align_name
@_ref
@_ref_genes_file
@_mask_features
@_limit
@_force
@_verbose
def alignments(
    installed,
    outdir,
    align_name,
    ref,
    ref_genes_file,
    mask_features,
    limit,
    force_overwrite,
    verbose,
):
    """dump alignments for named genes"""
    # installed: path to the installed config file
    # outdir: directory the alignments are written to (default ".")
    # align_name: alignment name or glob pattern, resolved through the config
    # ref: name of the reference species
    # ref_genes_file: table with a "stableid" column naming the genes
    # mask_features, limit: forwarded to write_alignments
    # force_overwrite: remove outdir before writing
    # verbose: accepted for CLI consistency, not used by this command
    from cogent3 import load_table

    from ensembl_lite._aligndb import AlignDb, write_alignments
    from ensembl_lite._genomedb import load_genome
    from ensembl_lite._species import Species

    # todo support genomic coordinates, e.g. coord_name:start-stop:strand, for
    # a reference species

    if not ref:
        click.secho(
            "ERROR: must specify a reference genome",
            fg="red",
        )
        # raised directly rather than via the site-provided exit() helper
        raise SystemExit(1)

    # --ref_genes_file defaults to None, check before anything is deleted or
    # created so the failure is not deferred until load_table
    if not ref_genes_file:
        click.secho(
            "ERROR: must specify a reference genes file",
            fg="red",
        )
        raise SystemExit(1)

    if force_overwrite:
        # --outdir defaults to ".", never remove the directory the program is
        # running in (or one of its parents)
        target = outdir.resolve()
        cwd = pathlib.Path.cwd().resolve()
        if target == cwd or target in cwd.parents:
            click.secho(
                f"ERROR: refusing to remove {str(outdir)!r} as it contains the "
                "current working directory",
                fg="red",
            )
            raise SystemExit(1)
        shutil.rmtree(outdir, ignore_errors=True)

    outdir.mkdir(parents=True, exist_ok=True)

    config = elt_config.read_installed_cfg(installed)
    align_path = config.path_to_alignment(align_name)
    if align_path is None:
        click.secho(
            f"{align_name!r} does not match any alignments under {str(config.aligns_path)!r}",
            fg="red",
        )
        raise SystemExit(1)

    # load the gene stable ID's
    table = load_table(ref_genes_file)
    if "stableid" not in table.columns:
        click.secho(
            f"'stableid' column missing from {str(ref_genes_file)!r}",
            fg="red",
        )
        raise SystemExit(1)

    align_db = AlignDb(source=align_path)
    ref_species = Species.get_ensembl_db_prefix(ref)
    if ref_species not in align_db.get_species_names():
        click.secho(
            f"species {ref!r} not in the alignment",
            fg="red",
        )
        raise SystemExit(1)

    # get all the genomes
    genomes = {
        sp: load_genome(config=config, species=sp)
        for sp in align_db.get_species_names()
    }

    write_alignments(
        align_db=align_db,
        genomes=genomes,
        limit=limit,
        mask_features=mask_features,
        outdir=outdir,
        ref_species=ref_species,
        stableids=table.columns["stableid"],
    )

    click.secho("Done!", fg="green")
@main.command(no_args_is_help=True)
@_installed
@_outpath
@click.option(
    "-r",
    "--relationship",
    type=click.Choice(["ortholog_one2one"]),
    default="ortholog_one2one",
    help="type of homology",
)
@_limit
@_force
@_verbose
def homologs(installed, outpath, relationship, limit, force_overwrite, verbose):
    """exports all homolog groups of type relationship in fasta format"""
    # installed: path to the installed config file
    # outpath: directory that receives one seqcoll-<group>.fasta per group
    # relationship: homology relationship type to select
    # limit: keep only this many related groups
    # force_overwrite: remove outpath before writing
    # verbose: progress description lists the species still to be processed
    from rich.progress import Progress

    from ensembl_lite._genomedb import get_selected_seqs
    from ensembl_lite._homologydb import id_by_species_group, load_homology_db
    from ensembl_lite._species import Species

    if force_overwrite:
        shutil.rmtree(outpath, ignore_errors=True)

    outpath.mkdir(parents=True, exist_ok=True)

    config = elt_config.read_installed_cfg(installed)
    db = load_homology_db(config=config)
    related = db.get_related_groups(relationship_type=relationship)
    if limit:
        related = list(related)[:limit]

    get_seqs = get_selected_seqs(config=config)
    sp_gene_groups, gene_map = id_by_species_group(related)
    # we now get all the sequences for all species
    grouped = defaultdict(list)
    todo = {s.species for s in sp_gene_groups}
    with Progress(transient=True) as progress:
        reading = progress.add_task(
            total=len(sp_gene_groups), description="Extracting 🧬"
        )
        # NOTE: the number of workers is fixed at 11
        for seqs in get_seqs.as_completed(
            sp_gene_groups,
            parallel=True,
            par_kw=dict(max_workers=11),
            show_progress=False,
        ):
            if not seqs:
                # a falsy result is displayed before terminating
                print(seqs)
                # raised directly rather than via the site-provided exit()
                raise SystemExit(1)

            done_species = seqs.obj[0].info.species
            common = Species.get_common_name(done_species)
            msg = f"Done {common!r} 🧬"
            if verbose:
                todo = todo - {done_species}
                msg = f"Remaining {todo} 🧬"

            progress.update(reading, description=msg, advance=1)
            # gene_map gives the group each sequence name belongs to
            for seq in seqs.obj:
                grouped[gene_map[seq.info.name]].append(seq)

    # todo also need to be writing out a logfile, plus a meta data table of
    # gene IDs and location info
    # todo why is this loop so slow if we use make_unaligned_seqs??
    for group, seqs in track(
        grouped.items(), description="✏️ 🧬", total=len(grouped), transient=True
    ):
        txt = [seq.to_fasta() for seq in seqs]
        outname = outpath / f"seqcoll-{group}.fasta"
        with outname.open(mode="w") as outfile:
            outfile.write("".join(txt))
@main.command(no_args_is_help=True)
@_installed
@_species
@_outdir
@_limit
def dump_genes(installed, species, outdir, limit):
    """Dump meta data table for genes from one species to <species>-<release>-gene_metadata.tsv"""
    # installed: path to the installed config file
    # species: list of Ensembl db prefixes produced by the option callback,
    #   exactly one is accepted
    # outdir: directory the table is written to, created if required
    # limit: forwarded to get_gene_table_for_species
    from ensembl_lite._genomedb import (
        get_annotations_for_species,
        get_gene_table_for_species,
    )

    config = elt_config.read_installed_cfg(installed)
    if species is None:
        click.secho("ERROR: a species name is required", fg="red")
        # raised directly rather than via the site-provided exit() helper
        raise SystemExit(1)

    if len(species) > 1:
        click.secho(f"ERROR: one species at a time, not {species!r}", fg="red")
        raise SystemExit(1)

    annot_db = get_annotations_for_species(config=config, species=species[0])
    path = annot_db.source
    table = get_gene_table_for_species(annot_db=annot_db, limit=limit)
    outdir.mkdir(parents=True, exist_ok=True)
    # the file name prefix is the name of the directory holding the
    # annotation db
    outpath = outdir / f"{path.parent.stem}-{config.release}-gene_metadata.tsv"
    table.write(outpath)
    click.secho(f"Finished: wrote {str(outpath)!r}!", fg="green")
# invoke the click group when this module is executed as a script
if __name__ == "__main__":
    main()