Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_install.py: 12%

129 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-06-12 16:31 -0400

1from __future__ import annotations 

2 

3import shutil 

4import typing 

5 

6from rich.progress import Progress 

7 

8from ensembl_lite._aligndb import AlignDb 

9from ensembl_lite._config import Config 

10from ensembl_lite._genomedb import ( 

11 _ANNOTDB_NAME, 

12 fasta_to_hdf5, 

13 make_annotation_db, 

14) 

15from ensembl_lite._homologydb import ( 

16 HomologyDb, 

17 compressor, 

18 inflate, 

19 load_homologies, 

20 pickler, 

21) 

22from ensembl_lite._maf import load_align_records 

23from ensembl_lite._species import SPECIES_NAME, Species 

24from ensembl_lite._util import PathType, get_iterable_tasks 

25 

26 

def _make_src_dest_annotation_paths(
    src_dir: PathType, dest_dir: PathType
) -> list[tuple[PathType, PathType]]:
    """Pair each gff3 source file with the annotation db destination.

    Every ``*.gff3.gz`` under ``src_dir / "gff3"`` maps to the single
    annotation db path inside ``dest_dir``.
    """
    dest = dest_dir / _ANNOTDB_NAME
    return [(src, dest) for src in (src_dir / "gff3").glob("*.gff3.gz")]

34 

35 

def local_install_genomes(
    config: Config,
    force_overwrite: bool,
    max_workers: int | None,
    verbose: bool = False,
    progress: Progress | None = None,
):
    """Install annotation databases and genome sequences locally.

    Parameters
    ----------
    config
        provides the staging / install paths and the species db names
    force_overwrite
        remove any previous installation before writing
    max_workers
        upper bound on parallel workers; falsy means serial execution
    verbose
        print extra progress detail
    progress
        rich Progress instance used to display progress bars

    Raises
    ------
    RuntimeError
        if writing a genome sequence database fails
    """
    if force_overwrite:
        shutil.rmtree(config.install_genomes, ignore_errors=True)

    # we create the local installation, with a subdirectory per species
    config.install_genomes.mkdir(parents=True, exist_ok=True)
    for db_name in config.db_names:
        sp_dir = config.install_genomes / db_name
        sp_dir.mkdir(parents=True, exist_ok=True)

    db_names = list(config.db_names)
    if max_workers:
        # no point having more workers than there are databases
        max_workers = min(len(db_names) + 1, max_workers)

    if verbose:
        print(f"genomes {max_workers=}")

    # for each species, identify the download and dest paths for annotations
    src_dest_paths = []
    for db_name in config.db_names:
        src_dir = config.staging_genomes / db_name
        dest_dir = config.install_genomes / db_name
        src_dest_paths.extend(_make_src_dest_annotation_paths(src_dir, dest_dir))

    msg = "Installing features 📚"
    if progress is not None:
        writing = progress.add_task(total=len(src_dest_paths), description=msg)

    # we load the individual gff3 files and write to annotation db's
    tasks = get_iterable_tasks(
        func=make_annotation_db, series=src_dest_paths, max_workers=max_workers
    )
    for db_name, prefixes in tasks:
        if verbose:
            print(f"{db_name=} {prefixes=}")

        if prefixes:
            # record stable ID prefixes so identifiers can later be
            # resolved back to their species
            for prefix in prefixes:
                Species.add_stableid_prefix(db_name, prefix)

        if progress is not None:
            progress.update(writing, description=msg, advance=1)

    # persist the (possibly prefix-updated) species table with the install
    species_table = Species.to_table()
    species_table.write(config.install_genomes / SPECIES_NAME)
    if verbose:
        print("Finished installing features ")

    msg = "Installing 🧬🧬"
    if progress is not None:
        writing = progress.add_task(total=len(db_names), description=msg, advance=0)
    # we parallelise across databases
    writer = fasta_to_hdf5(config=config)
    tasks = get_iterable_tasks(func=writer, series=db_names, max_workers=max_workers)
    for result in tasks:
        if not result:
            # a falsy result signals a failed write; include it in the
            # exception rather than raising a bare RuntimeError
            raise RuntimeError(f"failed writing genome sequences: {result!r}")

        if progress is not None:
            progress.update(writing, description=msg, advance=1)

    if verbose:
        print("Finished installing sequences ")
    return

107 

108 

def local_install_alignments(
    config: Config,
    force_overwrite: bool,
    max_workers: int | None,
    verbose: bool = False,
    progress: Progress | None = None,
):
    """Install whole genome alignments locally, one sqlite db per alignment.

    Parameters
    ----------
    config
        provides the staging / install paths, db names and align names
    force_overwrite
        remove any previous installation before writing
    max_workers
        upper bound on parallel workers; falsy or 1 means serial execution
    verbose
        print extra progress detail
    progress
        rich Progress instance used to display progress bars

    Raises
    ------
    RuntimeError
        if loading an alignment record file fails
    """
    if force_overwrite:
        shutil.rmtree(config.install_aligns, ignore_errors=True)

    aln_loader = load_align_records(set(config.db_names))

    for align_name in config.align_names:
        src_dir = config.staging_aligns / align_name
        dest_dir = config.install_aligns
        dest_dir.mkdir(parents=True, exist_ok=True)
        # write out to a db with align_name
        db = AlignDb(source=(dest_dir / f"{align_name}.sqlitedb"))
        records = []
        paths = list(src_dir.glob(f"{align_name}*maf*"))

        if max_workers and max_workers > 1:
            # we adjust the maximum workers to the number of paths
            # (the redundant `max_workers or 0` guard has been removed;
            # this branch already guarantees max_workers is truthy)
            max_workers = min(len(paths) + 1, max_workers)

        if verbose:
            print(f"{max_workers=}")

        series = get_iterable_tasks(
            func=aln_loader, series=paths, max_workers=max_workers
        )

        msg = "Installing alignments"
        if progress is not None:
            writing = progress.add_task(total=len(paths), description=msg, advance=0)

        for result in series:
            if not result:
                # a falsy result signals a failed load; carry it in the
                # exception rather than raising a bare RuntimeError
                raise RuntimeError(f"failed loading alignment records: {result!r}")

            records.extend(result)

            if progress is not None:
                progress.update(writing, description=msg, advance=1)

        db.add_records(records=records)
        db.close()

    if verbose:
        # bug fix: this previously said "Finished installing homologies",
        # copied from local_install_homology
        print("Finished installing alignments")

    return

162 

163 

def local_install_homology(
    config: Config,
    force_overwrite: bool,
    max_workers: int | None,
    verbose: bool = False,
    progress: Progress | None = None,
):
    """Install homology relationships into a local sqlite db.

    Parameters
    ----------
    config
        provides the staging / install paths and species db names
    force_overwrite
        remove any previous installation before writing
    max_workers
        upper bound on parallel workers; falsy means serial execution
    verbose
        print extra progress detail
    progress
        rich Progress instance used to display progress bars
    """
    if force_overwrite:
        shutil.rmtree(config.install_homologies, ignore_errors=True)

    config.install_homologies.mkdir(parents=True, exist_ok=True)

    outpath = config.install_homologies / "homologies.sqlitedb"
    db = HomologyDb(source=outpath)

    # gather every homology tsv across all species
    dirnames = []
    for sp in config.db_names:
        path = config.staging_homologies / sp
        dirnames.extend(list(path.glob("*.tsv.gz")))

    if max_workers:
        # no point having more workers than there are files
        max_workers = min(len(dirnames) + 1, max_workers)
    else:
        max_workers = 1

    if verbose:
        print(f"homologies {max_workers=}")

    loader = load_homologies(allowed_species=set(config.db_names))
    if max_workers > 1:
        # when parallel, pickle + compress the records so they can be
        # returned cheaply across process boundaries
        loader = loader + pickler + compressor

    msg = "Installing homologies"
    if progress is not None:
        writing = progress.add_task(total=len(dirnames), description=msg, advance=0)

    tasks = get_iterable_tasks(func=loader, series=dirnames, max_workers=max_workers)
    for result in tasks:
        if max_workers > 1:
            # reconstitute the blosc compressed data
            result = inflate(result)

        for rel_type, records in result.items():
            db.add_records(records=records, relationship_type=rel_type)

        if progress is not None:
            progress.update(writing, description=msg, advance=1)

    no_records = len(db) == 0
    db.close()
    if no_records:
        # don't leave an empty database behind
        outpath.unlink()

    if verbose:
        print("Finished installing homologies")