Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/cli.py: 33%

291 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-06-12 16:32 -0400

1import pathlib 

2import shutil 

3 

4import click 

5 

6 

7try: 

8 from wakepy.keep import running as keep_running 

9except ImportError: 

10 from ensembl_lite._util import fake_wake as keep_running 

11 

12from trogon import tui 

13 

14from ensembl_lite import __version__ 

15from ensembl_lite import _config as elt_config 

16from ensembl_lite import _download as elt_download 

17from ensembl_lite._species import Species 

18from ensembl_lite._util import PathType 

19 

20 

# trap flaky behaviour on linux: enter the keep_running context once at import
# time and, if doing so raises NotImplementedError, swap in fake_wake from
# ensembl_lite._util for the rest of the module
try:
    with keep_running():
        pass
except NotImplementedError:
    from ensembl_lite._util import fake_wake as keep_running

28 

29 

def _get_installed_config_path(ctx, param, path) -> PathType:
    """click callback returning the path to installed.cfg

    Parameters
    ----------
    ctx, param
        supplied by click, not used
    path
        either the config file itself (its name equals
        elt_config.INSTALLED_CONFIG_NAME) or a directory expected to
        contain that file

    Returns
    -------
    path to the config file

    Notes
    -----
    Prints an error and exits with status 1 if the config file does
    not exist.
    """
    path = pathlib.Path(path)
    if path.name != elt_config.INSTALLED_CONFIG_NAME:
        path = path / elt_config.INSTALLED_CONFIG_NAME

    # check both ways of nominating the file, so a mistyped path to the
    # config itself is reported here rather than when it is later read
    if not path.exists():
        click.secho(f"{str(path)} missing", fg="red")
        # exit() is a site-module convenience for interactive use, raise directly
        raise SystemExit(1)
    return path

41 

42 

43def _values_from_csv(ctx, param, value) -> list[str] | None: 

44 if value is None: 

45 return 

46 

47 return [f.strip() for f in value.split(",")] 

48 

49 

def _species_names_from_csv(ctx, param, species) -> list[str] | None:
    """click callback returning species names

    Parameters
    ----------
    ctx, param
        supplied by click, passed through to _values_from_csv
    species
        a single species name, or several separated by commas

    Returns
    -------
    None if no value was provided, otherwise the result of
    Species.get_ensembl_db_prefix() for each name, in the order given

    Notes
    -----
    Prints an error and exits with status 1 on the first name for which
    Species.get_ensembl_db_prefix() raises ValueError.
    """
    species = _values_from_csv(ctx, param, species)
    if species is None:
        return None

    db_names = []
    for name in species:
        try:
            db_name = Species.get_ensembl_db_prefix(name)
        except ValueError:
            click.secho(f"ERROR: unknown species {name!r}", fg="red")
            # exit() is a site-module convenience for interactive use, raise directly
            raise SystemExit(1) from None

        db_names.append(db_name)

    return db_names

67 

68 

# defining some of the options
# NOTE: some short flags are shared between options ("-d" by _download and
# _debug, "-o" by _outpath and _dbrc_out), so the members of each pair must
# not be applied to the same command
_cfgpath = click.option(
    "-c",
    "--configpath",
    default=elt_download._cfg,
    type=pathlib.Path,
    help="path to config file specifying databases, only "
    "species or compara at present",
)
_download = click.option(
    "-d",
    "--download",
    type=pathlib.Path,
    help="path to local download directory, contains a cfg file",
)
_installed = click.option(
    "-i",
    "--installed",
    required=True,
    callback=_get_installed_config_path,
    help="string pointing to installation",
)
# the homologs command creates this path as a directory and writes one file
# per homolog group into it
_outpath = click.option(
    "-o",
    "--outpath",
    required=True,
    type=pathlib.Path,
    help="path to directory to write output files",
)
_align_name = click.option(
    "--align_name",
    default=None,
    help="Ensembl name of the alignment or a glob pattern, e.g. '*primates*'",
)
_ref = click.option("--ref", default=None, help="Reference species.")
_ref_genes_file = click.option(
    "--ref_genes_file",
    default=None,
    type=click.Path(resolve_path=True, exists=True),
    help=".csv or .tsv file with a header containing a stableid column",
)
_limit = click.option(
    "--limit",
    type=int,
    default=None,
    help="Limit to this number of genes.",
    show_default=True,
)

_verbose = click.option(
    "-v",
    "--verbose",
    is_flag=True,
)
_force = click.option(
    "-f",
    "--force_overwrite",
    is_flag=True,
    help="drop existing database if it exists prior to installing",
)
_debug = click.option(
    "-d",
    "--debug",
    is_flag=True,
    help="maximum verbosity, and reduces number of downloads",
)
_dbrc_out = click.option(
    "-o",
    "--outpath",
    type=pathlib.Path,
    help="path to directory to export all rc contents",
)
_nprocs = click.option(
    "-np",
    "--num_procs",
    type=int,
    default=1,
    help="number of procs to use, defaults to 1",
)


# this is the only definition of _outdir: an earlier, required "-od/--outdir"
# variant was rebound by this one before any command used it, so it was dead
# code and has been removed
_outdir = click.option(
    "--outdir",
    type=pathlib.Path,
    default=".",
    help="Output directory name.",
    show_default=True,
)

_species = click.option(
    "--species",
    required=True,
    callback=_species_names_from_csv,
    help="Single species name, or multiple (comma separated).",
)

_mask_features = click.option(
    "--mask_features",
    callback=_values_from_csv,
    help="biotypes to mask (comma separated).",
)

169 

170 

# root command group; the sub-commands in this module attach themselves to it
# via @main.command
@tui()
@click.group()
@click.version_option(__version__)
def main():
    """tools for obtaining and interrogating subsets of https://ensembl.org genomic data"""

177 

178 

@main.command(no_args_is_help=True)
@_dbrc_out
def exportrc(outpath):
    """exports sample config and species table to the nominated path

    setting an environment variable ENSEMBLDBRC with this path
    will force its contents to override the default ensembl_lite settings"""
    from ensembl_lite._util import ENSEMBLDBRC

    shutil.copytree(ENSEMBLDBRC, outpath)
    # prune the copy: only top-level entries whose stem is purely alphabetic
    # are considered valid, anything else is deleted
    for entry in pathlib.Path(outpath).glob("*"):
        if entry.stem.isalpha():
            continue

        if entry.is_file():
            entry.unlink()
        else:
            # __pycache__ directory
            shutil.rmtree(entry)
    click.secho(f"Contents written to {outpath}", fg="green")

198 

199 

@main.command(no_args_is_help=True)
@_cfgpath
@_debug
@_verbose
def download(configpath, debug, verbose):
    """download data from Ensembl's ftp site"""
    # NOTE(review): this compares a file *name* with elt_download._cfg, which
    # is also the default value of --configpath; if that default is a full
    # path the warning below can never be shown -- confirm
    if configpath.name == elt_download._cfg:
        # todo is this statement correct if we're setting a root dir now?
        click.secho(
            "WARN: using the built in demo cfg, will write to /tmp", fg="yellow"
        )
    # the current working directory is handed to read_config as root_dir
    config = elt_config.read_config(configpath, root_dir=pathlib.Path(".").resolve())

    if verbose:
        print(config)

    if not any((config.species_dbs, config.align_names)):
        click.secho("No genomes, no alignments specified", fg="red")
        # exit() is a site-module convenience for interactive use, raise directly
        raise SystemExit(1)

    if not config.species_dbs:
        # only alignments were nominated, so get the species from them
        species = elt_download.get_species_for_alignments(
            host=config.host,
            remote_path=config.remote_path,
            release=config.release,
            align_names=config.align_names,
        )
        config.update_species(species)

    if verbose:
        print(config.species_dbs)

    config.write()
    with keep_running():
        elt_download.download_species(config, debug, verbose)
        elt_download.download_homology(config, debug, verbose)
        elt_download.download_aligns(config, debug, verbose)

    click.secho(f"Downloaded to {config.staging_path}", fg="green")

239 

240 

@main.command(no_args_is_help=True)
@_download
@_nprocs
@_force
@_verbose
def install(download, num_procs, force_overwrite, verbose):
    """create the local representations of the data"""
    from rich import progress

    from ensembl_lite._install import (
        local_install_alignments,
        local_install_genomes,
        local_install_homology,
    )

    # --download is not a required option, so fail with a message instead of
    # a TypeError from `None / name` below
    if download is None:
        click.secho(
            "ERROR: a download directory is required, use --download", fg="red"
        )
        raise SystemExit(1)

    configpath = download / elt_config.DOWNLOADED_CONFIG_NAME
    config = elt_config.read_config(configpath)
    if verbose:
        print(f"{config.install_path=}")

    if force_overwrite:
        shutil.rmtree(config.install_path, ignore_errors=True)

    config.install_path.mkdir(parents=True, exist_ok=True)
    elt_config.write_installed_cfg(config)
    with keep_running():
        # bound to a distinct name so the imported `progress` module is not
        # rebound by the context manager
        with progress.Progress(
            progress.TextColumn("[progress.description]{task.description}"),
            progress.BarColumn(),
            progress.TaskProgressColumn(),
            progress.TimeRemainingColumn(),
            progress.TimeElapsedColumn(),
        ) as progress_bar:
            local_install_genomes(
                config,
                force_overwrite=force_overwrite,
                max_workers=num_procs,
                verbose=verbose,
                progress=progress_bar,
            )
            # On test cases, only 30% speedup from running install homology data
            # in parallel due to overhead of pickling the data, but considerable
            # increase in memory. So, run in serial to avoid memory issues since
            # it's reasonably fast anyway. (At least until we have
            # a more robust solution.)
            # NOTE(review): the comment above says serial, yet
            # max_workers=num_procs is passed -- confirm which is intended
            local_install_homology(
                config,
                force_overwrite=force_overwrite,
                max_workers=num_procs,
                verbose=verbose,
                progress=progress_bar,
            )
            local_install_alignments(
                config,
                force_overwrite=force_overwrite,
                max_workers=num_procs,
                verbose=verbose,
                progress=progress_bar,
            )

    click.secho(f"Contents installed to {str(config.install_path)!r}", fg="green")

302 

303 

@main.command(no_args_is_help=True)
@_installed
def installed(installed):
    """show what is installed"""
    from cogent3 import make_table

    from ensembl_lite._species import Species
    from ensembl_lite._util import rich_display

    config = elt_config.read_installed_cfg(installed)

    # one table row per entry under the genomes directory for which
    # Species.get_common_name() returns something
    genomes_root = config.genomes_path
    if genomes_root.exists():
        entry_names = [entry.name for entry in genomes_root.glob("*")]
        names, common_names = [], []
        for entry_name in entry_names:
            common = Species.get_common_name(entry_name, level="ignore")
            if common:
                names.append(entry_name)
                common_names.append(common)

        genome_table = make_table(
            data={"species": names, "common name": common_names},
            title="Installed genomes",
        )
        rich_display(genome_table)

    # TODO as above
    # one table row per non-hidden entry under the alignments directory
    aligns_root = config.aligns_path
    if aligns_root.exists():
        align_names = []
        for entry in aligns_root.glob("*"):
            if entry.name.startswith("."):
                continue
            align_names.append(entry.stem)

        align_table = make_table(
            data={"align name": align_names}, title="Installed whole genome alignments"
        )
        rich_display(align_table)

339 

340 

@main.command(no_args_is_help=True)
@_installed
@_species
def species_summary(installed, species):
    """genome summary data for a species"""
    # absolute imports, as used by every other command in this module; the
    # relative form fails when the module is executed directly as a script
    from ensembl_lite._genomedb import (
        _ANNOTDB_NAME,
        get_species_summary,
        load_annotations_for_species,
    )
    from ensembl_lite._util import rich_display

    config = elt_config.read_installed_cfg(installed)
    if species is None:
        click.secho("ERROR: a species name is required", fg="red")
        # exit() is a site-module convenience for interactive use, raise directly
        raise SystemExit(1)

    # the --species callback returns a list, this command handles exactly one
    if len(species) > 1:
        click.secho(f"ERROR: one species at a time, not {species!r}", fg="red")
        raise SystemExit(1)

    species = species[0]
    path = config.installed_genome(species=species) / _ANNOTDB_NAME
    if not path.exists():
        click.secho(f"{species!r} not in {str(config.install_path.parent)!r}", fg="red")
        raise SystemExit(1)

    annot_db = load_annotations_for_species(path=path)
    summary = get_species_summary(annot_db=annot_db, species=species)
    rich_display(summary)

371 

372 

@main.command(no_args_is_help=True)
@_installed
@_outdir
@_align_name
@_ref
@_ref_genes_file
@_mask_features
@_limit
@_force
@_verbose
def alignments(
    installed,
    outdir,
    align_name,
    ref,
    ref_genes_file,
    mask_features,
    limit,
    force_overwrite,
    verbose,
):
    """dump alignments for named genes"""
    from cogent3 import load_table

    from ensembl_lite._aligndb import AlignDb, write_alignments
    from ensembl_lite._genomedb import load_genome, update_stableid_prefixes
    from ensembl_lite._species import Species

    # todo support genomic coordinates, e.g. coord_name:start-stop:strand, for
    # a reference species

    if not ref:
        click.secho(
            "ERROR: must specify a reference genome",
            fg="red",
        )
        # exit() is a site-module convenience for interactive use, raise directly
        raise SystemExit(1)

    # the stable IDs are always read from this file, so it is effectively
    # required even though the option defaults to None
    if not ref_genes_file:
        click.secho(
            "ERROR: must specify a file of gene stable IDs, use --ref_genes_file",
            fg="red",
        )
        raise SystemExit(1)

    if force_overwrite:
        # --outdir defaults to ".", never let a forced overwrite delete the
        # current working directory or one of its ancestors
        resolved = outdir.resolve()
        cwd = pathlib.Path.cwd()
        if resolved == cwd or resolved in cwd.parents:
            click.secho(
                f"ERROR: will not delete {str(outdir)!r} as it is, or contains, "
                "the current working directory",
                fg="red",
            )
            raise SystemExit(1)

    config = elt_config.read_installed_cfg(installed)
    # update the prefixes
    update_stableid_prefixes(config)
    align_path = config.path_to_alignment(align_name)
    if align_path is None:
        click.secho(
            f"{align_name!r} does not match any alignments under {str(config.aligns_path)!r}",
            fg="red",
        )
        raise SystemExit(1)

    # load the gene stable ID's
    table = load_table(ref_genes_file)
    if "stableid" not in table.columns:
        click.secho(
            f"'stableid' column missing from {str(ref_genes_file)!r}",
            fg="red",
        )
        raise SystemExit(1)

    align_db = AlignDb(source=align_path)
    try:
        ref_species = Species.get_ensembl_db_prefix(ref)
    except ValueError:
        # same handling as the --species option callback
        click.secho(f"ERROR: unknown species {ref!r}", fg="red")
        raise SystemExit(1) from None

    if ref_species not in align_db.get_species_names():
        click.secho(
            f"species {ref!r} not in the alignment",
            fg="red",
        )
        raise SystemExit(1)

    # all inputs have been validated, only now is existing output removed
    if force_overwrite:
        shutil.rmtree(outdir, ignore_errors=True)

    outdir.mkdir(parents=True, exist_ok=True)

    # get all the genomes
    if verbose:
        print(f"working on species {align_db.get_species_names()}")

    genomes = {
        sp: load_genome(config=config, species=sp)
        for sp in align_db.get_species_names()
    }

    write_alignments(
        align_db=align_db,
        genomes=genomes,
        limit=limit,
        mask_features=mask_features,
        outdir=outdir,
        ref_species=ref_species,
        stableids=table.columns["stableid"],
    )

    click.secho("Done!", fg="green")

465 

466 

@main.command(no_args_is_help=True)
@_installed
@_outpath
@click.option(
    "-r",
    "--relationship",
    type=click.Choice(["ortholog_one2one"]),
    default="ortholog_one2one",
    help="type of homology",
)
@_ref
@_nprocs
@_limit
@_force
@_verbose
def homologs(
    installed, outpath, relationship, ref, num_procs, limit, force_overwrite, verbose
):
    """exports all homolog groups of type relationship in fasta format"""
    from rich.progress import Progress

    from ensembl_lite._genomedb import load_genome
    from ensembl_lite._homologydb import (
        _HOMOLOGYDB_NAME,
        collect_seqs,
        load_homology_db,
    )

    if ref is None:
        click.secho("ERROR: a reference species name is required, use --ref", fg="red")
        # exit() is a site-module convenience for interactive use, raise directly
        raise SystemExit(1)

    # outpath is used as a directory, one fasta file is written per group
    if force_overwrite:
        shutil.rmtree(outpath, ignore_errors=True)

    outpath.mkdir(parents=True, exist_ok=True)

    config = elt_config.read_installed_cfg(installed)
    Species.update_from_file(config.genomes_path / "species.tsv")
    # we get all the gene IDs from the reference species
    # NOTE(review): the original comment said "protein coding" but the query
    # below uses biotype="gene" -- confirm which is intended
    # NOTE(review): ref is passed as given, not converted with
    # Species.get_ensembl_db_prefix() as the alignments command does -- confirm
    genome = load_genome(config=config, species=ref)
    if verbose:
        print(f"loaded genome for {ref}")
    gene_ids = list(genome.get_ids_for_biotype(biotype="gene"))
    if verbose:
        print(f"found {len(gene_ids)} gene IDs for {ref}")
    db = load_homology_db(path=config.homologies_path / _HOMOLOGYDB_NAME)
    related = []
    with Progress(transient=False) as progress:
        searching = progress.add_task(
            total=limit or len(gene_ids), description="Homolog search"
        )
        for gid in gene_ids:
            if rel := db.get_related_to(gene_id=gid, relationship_type=relationship):
                related.append(rel)
                progress.update(searching, advance=1)

            # limit applies to the number of homolog groups found
            if limit and len(related) >= limit:
                break

    if verbose:
        print(f"Found {len(related)} homolog groups")
    # todo create a directory data store writer and write all output to
    # that. This requires homolog_group has a .source attribute
    get_seqs = collect_seqs(config=config)
    with Progress(transient=False) as progress:
        reading = progress.add_task(total=len(related), description="Extracting 🧬")
        for seqs in get_seqs.as_completed(
            related,
            parallel=True,
            show_progress=False,
            par_kw=dict(max_workers=num_procs),
        ):
            progress.update(reading, advance=1)
            # falsy results and results without sequences are skipped
            if not seqs:
                if verbose:
                    print(f"{seqs=}")
                continue
            if not seqs.obj.seqs:
                if verbose:
                    print(f"{seqs.obj.seqs=}")
                continue

            # todo also need to be writing out a logfile, plus a meta data table of
            # gene IDs and location info
            txt = [seq.to_fasta() for seq in seqs.obj.seqs]
            outname = outpath / f"{seqs.source.source}.fasta"
            outname.write_text("".join(txt))

555 

556 

@main.command(no_args_is_help=True)
@_installed
@_species
@_outdir
@_limit
def dump_genes(installed, species, outdir, limit):
    """Dump meta data table for genes from one species to <species>-<release>-gene_metadata.tsv"""
    from ensembl_lite._genomedb import (
        _ANNOTDB_NAME,
        get_gene_table_for_species,
        load_annotations_for_species,
    )

    config = elt_config.read_installed_cfg(installed)
    if species is None:
        click.secho("ERROR: a species name is required", fg="red")
        # exit() is a site-module convenience for interactive use, raise directly
        raise SystemExit(1)

    # the --species callback returns a list, this command handles exactly one
    if len(species) > 1:
        click.secho(f"ERROR: one species at a time, not {species!r}", fg="red")
        raise SystemExit(1)

    # unpack so the message below shows the name, not the list, matching
    # species_summary
    species = species[0]
    path = config.installed_genome(species=species) / _ANNOTDB_NAME
    if not path.exists():
        click.secho(f"{species!r} not in {str(config.install_path.parent)!r}", fg="red")
        raise SystemExit(1)

    annot_db = load_annotations_for_species(path=path)
    # the output file name is built from the parent directory of the
    # annotation db's source
    path = annot_db.source
    table = get_gene_table_for_species(annot_db=annot_db, limit=limit)
    outdir.mkdir(parents=True, exist_ok=True)
    outpath = outdir / f"{path.parent.stem}-{config.release}-gene_metadata.tsv"
    table.write(outpath)
    click.secho(f"Finished: wrote {str(outpath)!r}!", fg="green")

591 

592 

# invoke the click group when this module is executed directly as a script
if __name__ == "__main__":
    main()