Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/cli.py: 40%

269 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2024-03-25 13:40 +1100

1import os 

2import pathlib 

3import shutil 

4 

5from collections import defaultdict 

6 

7import click 

8 

9 

10try: 

11 from wakepy.keep import running as keep_running 

12except ImportError: 

13 from ensembl_lite._util import fake_wake as keep_running 

14 

15from rich.progress import track 

16from trogon import tui 

17 

18from ensembl_lite import __version__ 

19from ensembl_lite import _config as elt_config 

20from ensembl_lite import _download as elt_download 

21from ensembl_lite._species import Species 

22 

23 

try:
    # trap flaky behaviour on linux: enter the context manager once at import
    # time so an unsupported platform is detected before any command runs
    with keep_running():
        ...

except NotImplementedError:
    # rebind keep_running to ensembl_lite's fake_wake so the commands below
    # can still use `with keep_running():`
    from ensembl_lite._util import fake_wake as keep_running

31 

32 

def _get_installed_config_path(ctx, param, path) -> os.PathLike:
    """click callback returning the path to installed.cfg

    Parameters
    ----------
    ctx, param
        supplied by click, not used
    path
        either the config file itself (its name equals
        elt_config.INSTALLED_CONFIG_NAME) or a directory expected to
        contain that file

    Returns
    -------
    pathlib.Path to the config file

    Notes
    -----
    Prints a message and exits with status 1 if the file does not exist.
    The existence check is applied whichever form of path was given.
    """
    path = pathlib.Path(path)
    if path.name != elt_config.INSTALLED_CONFIG_NAME:
        path = path / elt_config.INSTALLED_CONFIG_NAME

    if not path.exists():
        click.secho(f"{str(path)} missing", fg="red")
        # the builtin exit() is injected by the site module and is intended
        # for interactive use, raising SystemExit is equivalent
        raise SystemExit(1)
    return path

44 

45 

46def _values_from_csv(ctx, param, value) -> list[str] | None: 

47 if value is None: 

48 return 

49 

50 return [f.strip() for f in value.split(",")] 

51 

52 

def _species_names_from_csv(ctx, param, species) -> list[str] | None:
    """click callback converting comma separated species names

    Parameters
    ----------
    ctx, param
        supplied by click, passed through to _values_from_csv
    species
        comma separated species names, or None

    Returns
    -------
    None if species is None, otherwise the result of
    Species.get_ensembl_db_prefix() for each name, in the order given.

    Notes
    -----
    Prints a message and exits with status 1 on the first name for which
    Species.get_ensembl_db_prefix() raises ValueError.
    """
    species = _values_from_csv(ctx, param, species)
    if species is None:
        return None

    db_names = []
    for name in species:
        try:
            db_name = Species.get_ensembl_db_prefix(name)
        except ValueError:
            click.secho(f"ERROR: unknown species {name!r}", fg="red")
            # equivalent to the site-provided exit(), without relying on it
            raise SystemExit(1)

        db_names.append(db_name)

    return db_names

70 

71 

# defining some of the options
# each name below is a click.option decorator that is applied to one or more
# of the commands defined later in this module
_cfgpath = click.option(
    "-c",
    "--configpath",
    default=elt_download._cfg,
    type=pathlib.Path,
    help="path to config file specifying databases, only "
    "species or compara at present",
)
_download = click.option(
    "-d",
    "--download",
    type=pathlib.Path,
    help="path to local download directory, contains a cfg file",
)
_installation = click.option(
    "--installation",
    type=pathlib.Path,
    help="path to local installation directory",
)
_installed = click.option(
    "-i",
    "--installed",
    required=True,
    callback=_get_installed_config_path,
    help="string pointing to installation",
)
# NOTE(review): the homologs command treats this value as a directory and
# writes fasta files into it, the help text may be out of date
_outpath = click.option(
    "-o", "--outpath", required=True, type=pathlib.Path, help="path to write json file"
)
_align_name = click.option(
    "--align_name",
    default=None,
    help="Ensembl name of the alignment or a glob pattern, e.g. '*primates*'",
)
_ref = click.option("--ref", default=None, help="Reference species.")
_ref_genes_file = click.option(
    "--ref_genes_file",
    default=None,
    type=click.Path(resolve_path=True, exists=True),
    help=".csv or .tsv file with a header containing a stableid column",
)
_limit = click.option(
    "--limit",
    type=int,
    default=None,
    help="Limit to this number of genes.",
    show_default=True,
)

_verbose = click.option(
    "-v",
    "--verbose",
    is_flag=True,
    help="causes stdout/stderr from rsync download to be written to screen",
)
_numprocs = click.option(
    "-n",
    "--numprocs",
    type=int,
    default=1,
    help="number of processes to use for download",
)
_force = click.option(
    "-f",
    "--force_overwrite",
    is_flag=True,
    help="drop existing database if it exists prior to installing",
)
# shares the "-d" short flag with _download, the two are never applied to
# the same command
_debug = click.option(
    "-d",
    "--debug",
    is_flag=True,
    help="maximum verbosity, and reduces number of downloads",
)
# shares "-o/--outpath" with _outpath, the two are never applied to the
# same command
_dbrc_out = click.option(
    "-o",
    "--outpath",
    type=pathlib.Path,
    help="path to directory to export all rc contents",
)
_release = click.option("-r", "--release", type=int, help="Ensembl release number")
_nprocs = click.option(
    "-np",
    "--num_procs",
    type=int,
    default=None,
    help="number of procs to use, defaults to all",
)


# an earlier, required "-od/--outdir" definition of _outdir was dead code as
# this assignment rebinds the name before any command uses it
_outdir = click.option(
    "--outdir",
    type=pathlib.Path,
    default=".",
    help="Output directory name.",
    show_default=True,
)

_species = click.option(
    "--species",
    required=True,
    callback=_species_names_from_csv,
    help="Single species name, or multiple (comma separated).",
)

_mask_features = click.option(
    "--mask_features",
    callback=_values_from_csv,
    help="biotypes to mask (comma separated).",
)

186 

187 

# the docstring below is displayed by click as the help text of the group
@tui()
@click.group()
@click.version_option(__version__)
def main():
    """tools for obtaining and interrogating subsets of https://ensembl.org genomic data"""

194 

195 

@main.command(no_args_is_help=True)
@_dbrc_out
def exportrc(outpath):
    """exports sample config and species table to the nominated path

    setting an environment variable ENSEMBLDBRC with this path
    will force its contents to override the default ensembl_lite settings"""
    from ensembl_lite._util import ENSEMBLDBRC

    # copytree raises FileExistsError if outpath already exists
    shutil.copytree(ENSEMBLDBRC, outpath)
    # only entries whose stem is entirely alphabetic are kept, anything else
    # (e.g. names containing underscores or digits) is removed from the copy
    for fn in pathlib.Path(outpath).glob("*"):
        if fn.stem.isalpha():
            continue

        if fn.is_file():
            fn.unlink()
        else:
            # __pycache__ directory
            shutil.rmtree(fn)
    click.secho(f"Contents written to {outpath}", fg="green")

215 

216 

@main.command(no_args_is_help=True)
@_cfgpath
@_debug
@_verbose
def download(configpath, debug, verbose):
    """download data from Ensembl's ftp site"""
    # configpath: pathlib.Path to the config file (defaults to
    #   elt_download._cfg)
    # debug, verbose: flags passed through to the elt_download functions
    #
    # NOTE(review): this compares the file name with elt_download._cfg, which
    # only matches if _cfg is a bare file name -- confirm against _download
    if configpath.name == elt_download._cfg:
        click.secho(
            "WARN: using the built in demo cfg, will write to /tmp", fg="yellow"
        )

    config = elt_config.read_config(configpath)
    if not any((config.species_dbs, config.align_names)):
        click.secho("No genomes, no alignments specified", fg="red")
        # equivalent to the site-provided exit(), without relying on it
        raise SystemExit(1)

    if not config.species_dbs:
        # only alignments were named, so the species are obtained from them
        species = elt_download.get_species_for_alignments(
            host=config.host,
            remote_path=config.remote_path,
            release=config.release,
            align_names=config.align_names,
        )
        config.update_species(species)

    if verbose:
        print(config.species_dbs)

    config.write()
    with keep_running():
        elt_download.download_species(config, debug, verbose)
        elt_download.download_homology(config, debug, verbose)
        elt_download.download_aligns(config, debug, verbose)

    click.secho(f"Downloaded to {config.staging_path}", fg="green")

252 

253 

@main.command(no_args_is_help=True)
@_download
@_nprocs
@_force
@_verbose
def install(download, num_procs, force_overwrite, verbose):
    """create the local representations of the data"""
    # download: pathlib.Path to the download directory, which must contain
    #   the file named elt_config.DOWNLOADED_CONFIG_NAME
    # num_procs: passed as max_workers to each install step
    # force_overwrite: remove any existing install directory first
    # verbose: print the install path
    from ensembl_lite._install import (
        local_install_compara,
        local_install_genomes,
        local_install_homology,
    )

    if download is None:
        # --download is not a required option, so it can be absent when
        # other options are given; fail clearly rather than with TypeError
        click.secho("ERROR: a download directory is required", fg="red")
        raise SystemExit(1)

    configpath = download / elt_config.DOWNLOADED_CONFIG_NAME
    config = elt_config.read_config(configpath)
    if verbose:
        print(f"{config.install_path=}")

    if force_overwrite:
        shutil.rmtree(config.install_path, ignore_errors=True)

    config.install_path.mkdir(parents=True, exist_ok=True)
    elt_config.write_installed_cfg(config)
    with keep_running():
        local_install_genomes(
            config, force_overwrite=force_overwrite, max_workers=num_procs
        )
        local_install_compara(
            config, force_overwrite=force_overwrite, max_workers=num_procs
        )
        local_install_homology(
            config, force_overwrite=force_overwrite, max_workers=num_procs
        )

    click.secho(f"Contents installed to {str(config.install_path)!r}", fg="green")

289 

290 

@main.command(no_args_is_help=True)
@_installed
def installed(installed):
    """show what is installed"""
    # installed: path to the installed config file, as produced by the
    #   _get_installed_config_path callback
    from cogent3 import make_table

    from ensembl_lite._species import Species
    from ensembl_lite._util import rich_display

    config = elt_config.read_installed_cfg(installed)

    genome_dir = config.genomes_path
    if genome_dir.exists():
        species = [fn.name for fn in genome_dir.glob("*")]
        data = {"species": [], "common name": []}
        for name in species:
            cn = Species.get_common_name(name, level="ignore")
            if not cn:
                # entries without a common name are left out of the table
                continue
            data["species"].append(name)
            data["common name"].append(cn)

        table = make_table(data=data, title="Installed genomes")
        rich_display(table)

    # TODO as above
    compara_aligns = config.aligns_path
    if compara_aligns.exists():
        # entries whose name starts with "." are skipped
        align_names = [
            fn.stem for fn in compara_aligns.glob("*") if not fn.name.startswith(".")
        ]
        table = make_table(
            data={"align name": align_names}, title="Installed whole genome alignments"
        )
        rich_display(table)

326 

327 

@main.command(no_args_is_help=True)
@_installed
@_species
def species_summary(installed, species):
    """genome summary data for a species"""
    # installed: path to the installed config file
    # species: list of names produced by the _species_names_from_csv
    #   callback, exactly one is accepted
    #
    # absolute imports, consistent with the other commands in this module
    # and usable when the module is executed via its __main__ guard
    from ensembl_lite._genomedb import (
        get_annotations_for_species,
        get_species_summary,
    )
    from ensembl_lite._util import rich_display

    config = elt_config.read_installed_cfg(installed)
    if species is None:
        click.secho("ERROR: a species name is required", fg="red")
        raise SystemExit(1)

    if len(species) > 1:
        click.secho(f"ERROR: one species at a time, not {species!r}", fg="red")
        raise SystemExit(1)

    species = species[0]
    annot_db = get_annotations_for_species(config=config, species=species)
    summary = get_species_summary(annot_db=annot_db, species=species)
    rich_display(summary)

349 

350 

@main.command(no_args_is_help=True)
@_installed
@_outdir
@_align_name
@_ref
@_ref_genes_file
@_mask_features
@_limit
@_force
@_verbose
def alignments(
    installed,
    outdir,
    align_name,
    ref,
    ref_genes_file,
    mask_features,
    limit,
    force_overwrite,
    verbose,
):
    """dump alignments for named genes"""
    # installed: path to the installed config file
    # outdir: pathlib.Path of the directory to write to, defaults to "."
    # align_name: name or glob pattern passed to config.path_to_alignment()
    # ref: reference species name
    # ref_genes_file: table file that must have a "stableid" column
    # mask_features, limit: passed through to write_alignments()
    # force_overwrite: delete outdir before writing
    # verbose: accepted for interface consistency, not used here
    from cogent3 import load_table

    from ensembl_lite._aligndb import AlignDb, write_alignments
    from ensembl_lite._genomedb import load_genome
    from ensembl_lite._species import Species

    # todo support genomic coordinates, e.g. coord_name:start-stop:strand, for
    # a reference species

    if not ref:
        click.secho(
            "ERROR: must specify a reference genome",
            fg="red",
        )
        raise SystemExit(1)

    if ref_genes_file is None:
        # the option defaults to None, which load_table() cannot read
        click.secho(
            "ERROR: must specify --ref_genes_file",
            fg="red",
        )
        raise SystemExit(1)

    config = elt_config.read_installed_cfg(installed)
    align_path = config.path_to_alignment(align_name)
    if align_path is None:
        click.secho(
            f"{align_name!r} does not match any alignments under {str(config.aligns_path)!r}",
            fg="red",
        )
        raise SystemExit(1)

    # load the gene stable ID's
    table = load_table(ref_genes_file)
    if "stableid" not in table.columns:
        click.secho(
            f"'stableid' column missing from {str(ref_genes_file)!r}",
            fg="red",
        )
        raise SystemExit(1)

    align_db = AlignDb(source=align_path)
    ref_species = Species.get_ensembl_db_prefix(ref)
    if ref_species not in align_db.get_species_names():
        click.secho(
            f"species {ref!r} not in the alignment",
            fg="red",
        )
        raise SystemExit(1)

    # the output directory is only touched once every input has been
    # validated, so a failed invocation never destroys existing results
    if force_overwrite:
        if outdir.resolve() == pathlib.Path.cwd().resolve():
            # outdir defaults to ".", deleting it would wipe the directory
            # the command was launched from
            click.secho(
                "ERROR: refusing to overwrite the current working directory, "
                "specify a different --outdir",
                fg="red",
            )
            raise SystemExit(1)
        shutil.rmtree(outdir, ignore_errors=True)

    outdir.mkdir(parents=True, exist_ok=True)

    # get all the genomes
    genomes = {
        sp: load_genome(config=config, species=sp)
        for sp in align_db.get_species_names()
    }

    write_alignments(
        align_db=align_db,
        genomes=genomes,
        limit=limit,
        mask_features=mask_features,
        outdir=outdir,
        ref_species=ref_species,
        stableids=table.columns["stableid"],
    )

    click.secho("Done!", fg="green")

438 

439 

@main.command(no_args_is_help=True)
@_installed
@_outpath
@click.option(
    "-r",
    "--relationship",
    type=click.Choice(["ortholog_one2one"]),
    default="ortholog_one2one",
    help="type of homology",
)
@_limit
@_force
@_verbose
def homologs(installed, outpath, relationship, limit, force_overwrite, verbose):
    """exports all homolog groups of type relationship in fasta format"""
    # installed: path to the installed config file
    # outpath: pathlib.Path of the directory that receives one
    #   seqcoll-<group>.fasta file per homolog group
    # relationship: relationship type passed to db.get_related_groups()
    # limit: if truthy, only the first `limit` groups are processed
    # force_overwrite: delete outpath before writing
    # verbose: report the species still to be processed in the progress bar
    from rich.progress import Progress

    from ensembl_lite._genomedb import get_selected_seqs
    from ensembl_lite._homologydb import id_by_species_group, load_homology_db
    from ensembl_lite._species import Species

    if force_overwrite:
        shutil.rmtree(outpath, ignore_errors=True)

    outpath.mkdir(parents=True, exist_ok=True)

    config = elt_config.read_installed_cfg(installed)
    db = load_homology_db(config=config)
    related = db.get_related_groups(relationship_type=relationship)
    if limit:
        related = list(related)[:limit]

    get_seqs = get_selected_seqs(config=config)
    sp_gene_groups, gene_map = id_by_species_group(related)
    # we now get all the sequences for all species
    grouped = defaultdict(list)
    todo = {s.species for s in sp_gene_groups}
    with Progress(transient=True) as progress:
        reading = progress.add_task(
            total=len(sp_gene_groups), description="Extracting 🧬"
        )
        # NOTE(review): max_workers is hard-coded to 11, consider exposing
        # it through the num_procs option used by the install command
        for seqs in get_seqs.as_completed(
            sp_gene_groups,
            parallel=True,
            par_kw=dict(max_workers=11),
            show_progress=False,
        ):
            if not seqs:
                # a falsy result is displayed and treated as fatal
                print(seqs)
                raise SystemExit(1)

            common = Species.get_common_name(seqs.obj[0].info.species)
            msg = f"Done {common!r} 🧬"
            if verbose:
                todo = todo - {seqs.obj[0].info.species}
                msg = f"Remaining {todo} 🧬"

            progress.update(reading, description=msg, advance=1)
            # regroup the per-species sequences by homolog group
            for seq in seqs.obj:
                grouped[gene_map[seq.info.name]].append(seq)

    # todo also need to be writing out a logfile, plus a meta data table of
    # gene IDs and location info
    # todo why is this loop so slow if we use make_unaligned_seqs??
    for group, seqs in track(
        grouped.items(), description="✏️ 🧬", total=len(grouped), transient=True
    ):
        txt = [seq.to_fasta() for seq in seqs]
        outname = outpath / f"seqcoll-{group}.fasta"
        with outname.open(mode="w") as outfile:
            outfile.write("".join(txt))

511 

512 

@main.command(no_args_is_help=True)
@_installed
@_species
@_outdir
@_limit
def dump_genes(installed, species, outdir, limit):
    """Dump meta data table for genes from one species to <species>-<release>-gene_metadata.tsv"""
    # installed: path to the installed config file
    # species: list of names produced by the _species_names_from_csv
    #   callback, exactly one is accepted
    # outdir: pathlib.Path of the directory to write to, created if missing
    # limit: passed through to get_gene_table_for_species()
    from ensembl_lite._genomedb import (
        get_annotations_for_species,
        get_gene_table_for_species,
    )

    config = elt_config.read_installed_cfg(installed)
    if species is None:
        click.secho("ERROR: a species name is required", fg="red")
        raise SystemExit(1)

    if len(species) > 1:
        click.secho(f"ERROR: one species at a time, not {species!r}", fg="red")
        raise SystemExit(1)

    annot_db = get_annotations_for_species(config=config, species=species[0])
    path = annot_db.source
    table = get_gene_table_for_species(annot_db=annot_db, limit=limit)
    outdir.mkdir(parents=True, exist_ok=True)
    # the file name is built from the name of the directory holding the
    # annotation db and the release recorded in the config
    outpath = outdir / f"{path.parent.stem}-{config.release}-gene_metadata.tsv"
    table.write(outpath)
    click.secho(f"Finished: wrote {str(outpath)!r}!", fg="green")

541 

542 

# run the click command group when this module is executed as a script
if __name__ == "__main__":
    main()