Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_genomedb.py: 85%

155 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2024-03-25 13:40 +1100

1import typing 

2 

3import click 

4 

5from cogent3 import make_seq, make_table 

6from cogent3.app.composable import define_app 

7from cogent3.core.annotation import Feature 

8from cogent3.core.annotation_db import GffAnnotationDb 

9from cogent3.core.sequence import Sequence 

10from cogent3.util.table import Table 

11 

12from ensembl_lite._config import InstalledConfig 

13from ensembl_lite._db_base import SqliteDbMixin 

14from ensembl_lite._homologydb import species_genes 

15from ensembl_lite._util import elt_compress_it, elt_decompress_it 

16 

17 

# file name of the genome sequence database; load_genome() joins it onto the
# installed genome directory of a species
18_SEQDB_NAME = "genome_sequence.seqdb" 

# file name of the feature annotation database; load_genome() joins it onto the
# installed genome directory of a species
19_ANNOTDB_NAME = "features.gff3db" 

20 

21 

class GenomeSeqsDb(SqliteDbMixin):
    """SQLite-backed store of genome sequences, one row per seqid.

    To be replaced by the cogent3 sequence collection when that has been
    modernised.
    """

    table_name = "genome"
    _genome_schema = {"seqid": "TEXT PRIMARY KEY", "seq": "TEXT", "length": "INT"}
    _metadata_schema = {"species": "TEXT"}

    def __init__(self, *, source: str = ":memory:", species: str | None = None):
        """
        Parameters
        ----------
        source
            location of the database, defaults to an in-memory database
        species
            species name, written to the metadata table
        """
        self.source = source
        self._init_tables()
        # the metadata table stores species info
        self._execute_sql("INSERT INTO metadata(species) VALUES (?)", (species,))
        self.db.commit()

    def __hash__(self):
        # hash is based on object identity, not on database content
        return id(self)

    def add_record(self, *, seqid: str, seq: str):
        """inserts a single sequence, recording its length, and commits"""
        sql = f"INSERT INTO {self.table_name}(seqid, seq, length) VALUES (?, ?, ?)"
        self._execute_sql(sql, (seqid, seq, len(seq)))
        self.db.commit()

    def add_records(self, *, records: typing.Iterable[tuple[str, str]]):
        """inserts a series of (seqid, seq) pairs in one batch and commits"""
        sql = f"INSERT INTO {self.table_name}(seqid, seq, length) VALUES (?, ?, ?)"
        self.db.executemany(sql, [(n, s, len(s)) for n, s in records])
        self.db.commit()

    def get_seq(
        self, *, seqid: str, start: int | None = None, stop: int | None = None
    ) -> str:
        """returns the sequence, or a slice of it, for seqid

        Parameters
        ----------
        seqid
            name of chromosome etc..
        start
            starting position of slice in python coordinates, defaults
            to 0
        stop
            ending position of slice in python coordinates, defaults
            to length of coordinate

        Notes
        -----
        The slice is done by SQLite using SUBSTR. Negative start / stop
        values are not converted to python slice semantics. If stop is not
        greater than start an empty string is returned, as for a python
        slice.

        NOTE(review): an unknown seqid presumably makes fetchone() return
        None, so the final indexing raises a TypeError -- confirm against
        SqliteDbMixin._execute_sql.
        """
        # SQLite counts from 1
        sql_start = 1 if start is None else start + 1

        if stop is None:
            # using the stored length as the number of characters means
            # SUBSTR runs to the end of the sequence
            sql = (
                f"SELECT SUBSTR(seq, ?, length) FROM {self.table_name} where seqid = ?"
            )
            values = sql_start, seqid
        else:
            # SUBSTR takes a character count, not an end position. A negative
            # count makes SQLite return the characters *preceding* the start,
            # so we clamp at 0 to get the empty result a python slice gives.
            num_chars = max(stop - (sql_start - 1), 0)
            sql = f"SELECT SUBSTR(seq, ?, ?) FROM {self.table_name} where seqid = ?"
            values = sql_start, num_chars, seqid

        return self._execute_sql(sql, values).fetchone()[0]

82 

83 

class CompressedGenomeSeqsDb(GenomeSeqsDb):
    """Variant of GenomeSeqsDb that stores each sequence as a compressed blob.

    To be replaced by the cogent3 sequence collection when that has been
    modernised.
    """

    _genome_schema = {"seqid": "TEXT PRIMARY KEY", "seq": "BLOB", "length": "INT"}

    def __hash__(self):
        # hash is based on object identity, not on database content
        return id(self)

    def _insert_rows(self, rows: list[tuple[str, bytes, int]]) -> None:
        """batch inserts (seqid, compressed seq, sequence length) rows and commits"""
        sql = f"INSERT INTO {self.table_name}(seqid, seq, length) VALUES (?, ?, ?)"
        self.db.executemany(sql, rows)
        self.db.commit()

    def add_record(self, *, seqid: str, seq: str):
        """compresses and inserts a single sequence, then commits"""
        sql = f"INSERT INTO {self.table_name}(seqid, seq, length) VALUES (?, ?, ?)"
        self._execute_sql(sql, (seqid, elt_compress_it(seq), len(seq)))
        self.db.commit()

    def add_records(self, *, records: typing.Iterable[tuple[str, str]]):
        """compresses and inserts a series of (seqid, seq) pairs"""
        # the length column holds the sequence length, as in add_record(),
        # so it is measured before compression
        self._insert_rows([(n, elt_compress_it(s), len(s)) for n, s in records])

    def add_compressed_records(self, *, records: typing.Iterable[tuple[str, bytes]]):
        """inserts a series of (seqid, seq) pairs whose sequences are
        already compressed

        Notes
        -----
        Each blob is decompressed once to obtain the sequence length, so the
        length column has the same meaning as for add_record(). The size of
        the compressed blob is not the sequence length.
        """
        self._insert_rows([(n, s, len(elt_decompress_it(s))) for n, s in records])

    def get_seq(
        self, *, seqid: str, start: int | None = None, stop: int | None = None
    ) -> str:
        """returns the decompressed sequence, or a slice of it, for seqid

        Parameters
        ----------
        seqid
            name of chromosome etc..
        start
            starting position of slice in python coordinates, defaults
            to 0
        stop
            ending position of slice in python coordinates, defaults
            to length of coordinate

        Notes
        -----
        The entire sequence is decompressed before slicing.

        NOTE(review): an unknown seqid presumably makes fetchone() return
        None, so the indexing raises a TypeError -- confirm against
        SqliteDbMixin._execute_sql.
        """
        sql = f"SELECT seq FROM {self.table_name} where seqid = ?"

        seq = elt_decompress_it(self._execute_sql(sql, (seqid,)).fetchone()[0])
        # slicing with None bounds gives the full sequence, while an explicit
        # 0 (e.g. stop=0) is honoured rather than being treated as "not set"
        return seq[start:stop]

131 

132 

133# todo: this wrapping class is required for memory efficiency because 

134# the cogent3 SequenceCollection class is not designed for large sequence 

135# collections, either large sequences or large numbers of sequences. The 

136# correct solution is to improve that. 

class Genome:
    """Pairs the sequence store and the annotation db of a single species.

    To be replaced by the cogent3 sequence collection when that has been
    modernised.
    """

    def __init__(
        self,
        *,
        species: str,
        seqs: GenomeSeqsDb | CompressedGenomeSeqsDb,
        annots: GffAnnotationDb,
    ) -> None:
        """
        Parameters
        ----------
        species
            species name, used when naming sequences
        seqs
            the sequence store
        annots
            the annotation db, exposed as the annotation_db attribute
        """
        self.species = species
        self._seqs = seqs
        self.annotation_db = annots

    def get_seq(
        self,
        *,
        seqid: str,
        start: int | None = None,
        stop: int | None = None,
        namer: typing.Callable | None = None,
    ) -> Sequence:
        """returns annotated sequence

        Parameters
        ----------
        seqid
            name of chromosome etc..
        start
            starting position of slice in python coordinates, defaults
            to 0
        stop
            ending position of slice in python coordinates, defaults
            to length of coordinate
        namer
            callback for naming the sequence. Callback must take four
            arguments: species, seqid,start, stop. Default is
            species:seqid:start-stop.

        Notes
        -----
        Annotations partially within region are included.
        """
        raw = self._seqs.get_seq(seqid=seqid, start=start, stop=stop)
        if namer:
            name = namer(self.species, seqid, start, stop)
        else:
            name = f"{self.species}:{seqid}:{start}-{stop}"
        # we use seqid to make the sequence here because that identifies the
        # parent seq identity, required for querying annotations
        seq = make_seq(raw, name=seqid, moltype="dna")
        seq.name = name
        if self.annotation_db:
            # the offset records where this slice starts on the parent seq
            seq.annotation_offset = start or 0
            seq.annotation_db = self.annotation_db.subset(
                seqid=seqid, start=start, stop=stop, allow_partial=True
            )
        return seq

    def get_features(
        self,
        *,
        biotype: str | None = None,
        seqid: str | None = None,
        name: str | None = None,
        start: int | None = None,
        stop: int | None = None,
    ) -> typing.Iterable[Feature]:
        """yields features in blocks of seqid

        Parameters
        ----------
        biotype, name, start, stop
            query conditions, passed on to the annotation db and to the
            sequence get_features() method when they have a value
        seqid
            limits the query to this sequence. If not provided, every seqid
            with a feature matching the other conditions is used.

        Raises
        ------
        TypeError
            if getting the sequence for a seqid fails with a TypeError
        """
        conditions = {"biotype": biotype, "name": name, "start": start, "stop": stop}
        # only unset values are dropped, a coordinate of 0 is a valid condition
        kwargs = {k: v for k, v in conditions.items() if v is not None and v != ""}
        if seqid:
            seqids = [seqid]
        else:
            seqids = {
                ft["seqid"] for ft in self.annotation_db.get_features_matching(**kwargs)
            }
        for sid in seqids:
            try:
                seq = self.get_seq(seqid=sid)
            except TypeError as err:
                msg = f"ERROR (report me): {self.species!r}, {sid!r}"
                # chain the original exception so the cause is not lost
                raise TypeError(msg) from err
            # because self.get_seq() automatically names seqs differently
            seq.name = sid
            yield from seq.get_features(**kwargs)

    def close(self):
        """closes the sequence store and the annotation db connection"""
        self._seqs.close()
        if self.annotation_db is not None:
            self.annotation_db.db.close()

226 

227 

def load_genome(*, config: InstalledConfig, species: str):
    """Build the Genome for species from its installed sequence and feature dbs.

    Both database files are looked up inside the directory returned by
    config.installed_genome(species).
    """
    seq_store = CompressedGenomeSeqsDb(
        source=config.installed_genome(species) / _SEQDB_NAME,
        species=species,
    )
    feature_store = GffAnnotationDb(
        source=config.installed_genome(species) / _ANNOTDB_NAME
    )
    return Genome(species=species, seqs=seq_store, annots=feature_store)

235 

236 

def get_seqs_for_ids(
    *,
    config: InstalledConfig,
    species: str,
    names: list[str],
    make_seq_name: typing.Callable | None = None,
) -> typing.Iterable[Sequence]:
    """yields the sequence of the first CDS of the longest mRNA for each name

    Parameters
    ----------
    config
        the installed configuration, used to load the genome of species
    species
        species name
    names
        identifiers to look up, each is matched against feature names
        using the pattern "%<name>"
    make_seq_name
        callback that takes the CDS feature and returns the sequence name.
        Default name is "<species>-<name>".

    Notes
    -----
    A name is skipped if it has no matching feature, or its feature has no
    mRNA children, or the longest mRNA has no CDS children.

    The genome is closed when the generator finishes, including when it is
    closed early or an exception is raised.
    """
    genome = load_genome(config=config, species=species)
    try:
        # is it possible to do batch query for all names?
        for name in names:
            # only the first match is used; no match means nothing to yield
            feature = next(iter(genome.get_features(name=f"%{name}")), None)
            if feature is None:
                continue

            transcripts = list(feature.get_children(biotype="mRNA"))
            if not transcripts:
                continue

            longest = max(transcripts, key=len)
            cds = list(longest.get_children(biotype="CDS"))
            if not cds:
                continue

            feature = cds[0]
            seq = feature.get_slice()
            if callable(make_seq_name):
                seq.name = make_seq_name(feature)
            else:
                seq.name = f"{species}-{name}"
            seq.info["species"] = species
            seq.info["name"] = name
            # disconnect from annotation so the closure of the genome
            # does not cause issues when run in parallel
            seq.annotation_db = None
            yield seq
    finally:
        # runs on exhaustion, on generator close and on error, so the
        # database connections are always released
        genome.close()

272 

273 

@define_app
def get_selected_seqs(species_gene_ids: species_genes, config: InstalledConfig) -> list:
    """return the gene sequences for a species_genes instance

    Notes
    -----
    define_app turns this function into a class that is created using
    config. Calling an instance of that class with a species_genes instance
    returns the sequences for that species' gene ID's as a list.
    """
    selected = get_seqs_for_ids(
        config=config,
        species=species_gene_ids.species,
        names=species_gene_ids.gene_ids,
    )
    return list(selected)

287 

288 

def get_annotations_for_species(
    *, config: InstalledConfig, species: str
) -> GffAnnotationDb:
    """returns the annotation Db for species

    Parameters
    ----------
    config
        the installed configuration
    species
        species name

    Raises
    ------
    SystemExit
        with exit code 1, after printing a message in red, if the genome
        directory for species or the annotation db file within it does
        not exist
    """
    path = config.installed_genome(species=species)
    if not path.exists():
        click.secho(f"{species!r} not in {str(config.install_path.parent)!r}", fg="red")
        # raised directly since the exit() builtin is only provided by the
        # site module
        raise SystemExit(1)

    # the module level constant is the single definition of this file name
    path = path / _ANNOTDB_NAME
    if not path.exists():
        click.secho(f"{path.name!r} is missing", fg="red")
        raise SystemExit(1)
    return GffAnnotationDb(source=path)

303 

304 

def get_gene_table_for_species(
    *, annot_db: GffAnnotationDb, limit: int | None, species: str | None = None
) -> Table:
    """
    returns gene data from a GffDb

    Parameters
    ----------
    annot_db
        feature db
    limit
        maximum number of records to include, None means all records
    species
        species name, overrides inference from annot_db.source

    Notes
    -----
    Only records whose biotype is "gene" are included. A column missing
    from a record is given the value None.
    """
    species = species or annot_db.source.parent.name

    columns = (
        "species",
        "name",
        "seqid",
        "source",
        "biotype",
        "start",
        "stop",
        "score",
        "strand",
        "phase",
    )
    rows = []
    for record in annot_db.get_records_matching(biotype="gene"):
        # checked before appending so that no more than limit rows are kept
        if limit is not None and len(rows) >= limit:
            break
        # species is not a column of the record, so it's added here
        rows.append([species] + [record.get(c, None) for c in columns[1:]])

    return make_table(header=columns, data=rows)

341 

342 

def get_species_summary(
    *, annot_db: GffAnnotationDb, species: str | None = None
) -> Table:
    """
    returns a Table of the number of features per biotype for a species

    Parameters
    ----------
    annot_db
        feature db
    species
        species name, overrides inference from annot_db.source

    Notes
    -----
    The table title uses the common name of the species. If looking that up
    raises a ValueError, the species name is used instead.
    """
    from ._species import Species

    # for now, just biotype
    species_name = species if species else annot_db.source.parent.name
    biotype_tally = annot_db.biotype_counts()
    try:
        display_name = Species.get_common_name(species_name)
    except ValueError:
        display_name = species_name

    rows = list(biotype_tally.items())
    return Table(
        header=("biotype", "count"),
        data=rows,
        title=f"{display_name} features",
    )