Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_genomedb.py: 85%
155 statements
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
1import typing
3import click
5from cogent3 import make_seq, make_table
6from cogent3.app.composable import define_app
7from cogent3.core.annotation import Feature
8from cogent3.core.annotation_db import GffAnnotationDb
9from cogent3.core.sequence import Sequence
10from cogent3.util.table import Table
12from ensembl_lite._config import InstalledConfig
13from ensembl_lite._db_base import SqliteDbMixin
14from ensembl_lite._homologydb import species_genes
15from ensembl_lite._util import elt_compress_it, elt_decompress_it
# file name of the sequence database within an installed genome directory
_SEQDB_NAME = "genome_sequence.seqdb"
# file name of the annotation database within an installed genome directory
_ANNOTDB_NAME = "features.gff3db"
class GenomeSeqsDb(SqliteDbMixin):
    """SQLite-backed store of genome sequences, keyed by seqid.

    To be replaced by the cogent3 sequence collection when that has been
    modernised.

    Notes
    -----
    Sequences are stored as text in the ``genome`` table together with
    their length. The species name is written to the ``metadata`` table
    when an instance is created.
    """

    table_name = "genome"
    _genome_schema = {"seqid": "TEXT PRIMARY KEY", "seq": "TEXT", "length": "INT"}
    _metadata_schema = {"species": "TEXT"}

    def __init__(self, *, source: str = ":memory:", species: str | None = None):
        """
        Parameters
        ----------
        source
            location of the database, defaults to ":memory:"
        species
            species name, recorded in the metadata table
        """
        self.source = source
        self._init_tables()
        # the metadata table stores species info
        self._execute_sql("INSERT INTO metadata(species) VALUES (?)", (species,))
        self.db.commit()

    def __hash__(self):
        # instances are hashed by identity
        return id(self)

    def add_record(self, *, seqid: str, seq: str):
        """adds a single sequence and commits

        Parameters
        ----------
        seqid
            name of chromosome etc..
        seq
            the sequence, its length is recorded alongside it
        """
        sql = f"INSERT INTO {self.table_name}(seqid, seq, length) VALUES (?, ?, ?)"
        self._execute_sql(sql, (seqid, seq, len(seq)))
        self.db.commit()

    def add_records(self, *, records: typing.Iterable[tuple[str, str]]):
        """adds multiple sequences in one statement and commits

        Parameters
        ----------
        records
            series of (seqid, seq) pairs
        """
        sql = f"INSERT INTO {self.table_name}(seqid, seq, length) VALUES (?, ?, ?)"
        self.db.executemany(sql, [(n, s, len(s)) for n, s in records])
        self.db.commit()

    def get_seq(
        self, *, seqid: str, start: int | None = None, stop: int | None = None
    ) -> str:
        """returns the sequence, or a slice of it, for seqid

        Parameters
        ----------
        seqid
            name of chromosome etc..
        start
            starting position of slice in python coordinates, defaults
            to 0
        stop
            ending position of slice in python coordinates, defaults
            to length of coordinate

        Notes
        -----
        Slicing is done by SQLite. A stop that precedes start returns an
        empty string, as for a python slice. Negative indices are not
        translated to SQLite semantics, so should not be used.
        NOTE(review): an unknown seqid presumably surfaces as a TypeError
        from indexing the empty query result -- confirm against
        SqliteDbMixin._execute_sql.
        """
        # SQLite counts from 1, python from 0
        begin = 1 if start is None else start + 1

        if stop is None:
            sql = (
                f"SELECT SUBSTR(seq, ?, length) FROM {self.table_name} where seqid = ?"
            )
            values = begin, seqid
        else:
            # SUBSTR takes a length, not an end position. A negative length
            # makes SQLite return the characters *preceding* begin, so it is
            # clamped to 0 to match python where seq[5:3] == ""
            num_chars = max(stop - (begin - 1), 0)
            sql = f"SELECT SUBSTR(seq, ?, ?) FROM {self.table_name} where seqid = ?"
            values = begin, num_chars, seqid

        return self._execute_sql(sql, values).fetchone()[0]
class CompressedGenomeSeqsDb(GenomeSeqsDb):
    """variant of GenomeSeqsDb that stores each sequence compressed

    To be replaced by the cogent3 sequence collection when that has been
    modernised.

    Notes
    -----
    Sequences are compressed with elt_compress_it() and stored as a BLOB.
    The whole sequence is decompressed on every get_seq() call, slicing
    is then done in python.
    """

    _genome_schema = {"seqid": "TEXT PRIMARY KEY", "seq": "BLOB", "length": "INT"}

    def __hash__(self):
        # instances are hashed by identity
        return id(self)

    def add_record(self, *, seqid: str, seq: str):
        """compresses and adds a single sequence, then commits

        Parameters
        ----------
        seqid
            name of chromosome etc..
        seq
            the uncompressed sequence, whose length is what gets recorded
        """
        sql = f"INSERT INTO {self.table_name}(seqid, seq, length) VALUES (?, ?, ?)"
        self._execute_sql(sql, (seqid, elt_compress_it(seq), len(seq)))
        self.db.commit()

    def add_records(self, *, records: typing.Iterable[tuple[str, str]]):
        """compresses and adds multiple sequences, then commits

        Parameters
        ----------
        records
            series of (seqid, seq) pairs, seq being uncompressed
        """
        sql = f"INSERT INTO {self.table_name}(seqid, seq, length) VALUES (?, ?, ?)"
        # the recorded length is that of the uncompressed sequence, which
        # is consistent with add_record()
        self.db.executemany(
            sql, [(n, elt_compress_it(s), len(s)) for n, s in records]
        )
        self.db.commit()

    def add_compressed_records(self, *, records: typing.Iterable[tuple[str, bytes]]):
        """sequences already compressed

        Parameters
        ----------
        records
            series of (seqid, compressed seq) pairs

        Notes
        -----
        The uncompressed length is not known here, so the length column
        records the size of the compressed data.
        """
        sql = f"INSERT INTO {self.table_name}(seqid, seq, length) VALUES (?, ?, ?)"

        self.db.executemany(sql, [(n, s, len(s)) for n, s in records])
        self.db.commit()

    def get_seq(
        self, *, seqid: str, start: int | None = None, stop: int | None = None
    ) -> str:
        """returns the decompressed sequence, or a slice of it, for seqid

        Parameters
        ----------
        seqid
            name of chromosome etc..
        start
            starting position of slice in python coordinates, defaults
            to 0
        stop
            ending position of slice in python coordinates, defaults
            to length of coordinate
        """
        sql = f"SELECT seq FROM {self.table_name} where seqid = ?"

        seq = elt_decompress_it(self._execute_sql(sql, (seqid,)).fetchone()[0])
        # slice unconditionally: a None bound means "unbounded", whereas a
        # truthiness check would treat a stop (or start) of 0 as "not given"
        # and return the entire sequence instead of an empty one
        return seq[start:stop]
# todo: this wrapping class is required for memory efficiency because
# the cogent3 SequenceCollection class is not designed for large sequence
# collections, either large sequences or large numbers of sequences. The
# correct solution is to improve that.
class Genome:
    """pairs the sequence db and the annotation db of a single species

    To be replaced by the cogent3 sequence collection when that has been
    modernised.
    """

    def __init__(
        self,
        *,
        species: str,
        seqs: GenomeSeqsDb | CompressedGenomeSeqsDb,
        annots: GffAnnotationDb,
    ) -> None:
        """
        Parameters
        ----------
        species
            species name, used in the default sequence names
        seqs
            db providing the sequences
        annots
            db providing the features
        """
        self.species = species
        self._seqs = seqs
        self.annotation_db = annots

    def get_seq(
        self,
        *,
        seqid: str,
        start: int | None = None,
        stop: int | None = None,
        namer: typing.Callable | None = None,
    ) -> Sequence:
        """returns annotated sequence

        Parameters
        ----------
        seqid
            name of chromosome etc..
        start
            starting position of slice in python coordinates, defaults
            to 0
        stop
            ending position of slice in python coordinates, defaults
            to length of coordinate
        namer
            callback for naming the sequence. Callback must take four
            arguments: species, seqid,start, stop. Default is
            species:seqid:start-stop.

        Notes
        -----
        Annotations partially within region are included.
        """
        raw = self._seqs.get_seq(seqid=seqid, start=start, stop=stop)
        if namer:
            name = namer(self.species, seqid, start, stop)
        else:
            name = f"{self.species}:{seqid}:{start}-{stop}"
        # we use seqid to make the sequence here because that identifies the
        # parent seq identity, required for querying annotations
        seq = make_seq(raw, name=seqid, moltype="dna")
        seq.name = name
        if self.annotation_db:
            # features are stored in genome coordinates, the offset records
            # where this segment starts
            seq.annotation_offset = start or 0
            seq.annotation_db = self.annotation_db.subset(
                seqid=seqid, start=start, stop=stop, allow_partial=True
            )
        return seq

    def get_features(
        self,
        *,
        biotype: str | None = None,
        seqid: str | None = None,
        name: str | None = None,
        start: int | None = None,
        stop: int | None = None,
    ) -> typing.Iterable[Feature]:
        """yields features in blocks of seqid

        Parameters
        ----------
        biotype
            feature biotype to match
        seqid
            restrict to this sequence, otherwise every sequence that has
            a matching feature is used
        name
            feature name to match
        start, stop
            coordinates to match

        Raises
        ------
        TypeError
            if a sequence could not be obtained for a seqid
        """
        criteria = {"biotype": biotype, "name": name, "start": start, "stop": stop}
        # only unset values are dropped: a coordinate of 0 is falsy but is
        # still a valid value to query on
        kwargs = {k: v for k, v in criteria.items() if v is not None and v != ""}
        if seqid:
            seqids = [seqid]
        else:
            seqids = {
                ft["seqid"] for ft in self.annotation_db.get_features_matching(**kwargs)
            }
        for seqid in seqids:
            try:
                seq = self.get_seq(seqid=seqid)
            except TypeError as err:
                msg = f"ERROR (report me): {self.species!r}, {seqid!r}"
                raise TypeError(msg) from err
            # because self.get_seq() automatically names seqs differently
            seq.name = seqid
            yield from seq.get_features(**kwargs)

    def close(self):
        """closes the sequence db and the annotation db connection"""
        self._seqs.close()
        self.annotation_db.db.close()
def load_genome(*, config: InstalledConfig, species: str):
    """returns the Genome for species with its sequence and feature dbs bound

    Parameters
    ----------
    config
        provides the directory of the installed genome
    species
        species name
    """
    seq_db = CompressedGenomeSeqsDb(
        source=config.installed_genome(species) / _SEQDB_NAME,
        species=species,
    )
    annot_db = GffAnnotationDb(
        source=config.installed_genome(species) / _ANNOTDB_NAME
    )
    return Genome(species=species, seqs=seq_db, annots=annot_db)
def get_seqs_for_ids(
    *,
    config: InstalledConfig,
    species: str,
    names: list[str],
    make_seq_name: typing.Callable | None = None,
) -> typing.Iterable[Sequence]:
    """yields a CDS sequence for each named gene of species

    Parameters
    ----------
    config
        used to load the installed genome
    species
        species name
    names
        gene identifiers, matched against the end of feature names
    make_seq_name
        callback that takes the selected feature and returns the name to
        give the sequence. Default name is species-name.

    Notes
    -----
    For each gene, the longest mRNA child is selected and the sequence of
    its first CDS child is returned. Names with no matching feature, no
    mRNA or no CDS are skipped.
    """
    genome = load_genome(config=config, species=species)
    # the finally clause ensures the genome is closed even when the caller
    # stops iterating early, or an exception is raised
    try:
        # is it possible to do batch query for all names?
        for name in names:
            matched = list(genome.get_features(name=f"%{name}"))
            if not matched:
                continue

            transcripts = list(matched[0].get_children(biotype="mRNA"))
            if not transcripts:
                continue

            longest = max(transcripts, key=len)
            cds = list(longest.get_children(biotype="CDS"))
            if not cds:
                continue

            feature = cds[0]
            seq = feature.get_slice()
            if callable(make_seq_name):
                seq.name = make_seq_name(feature)
            else:
                seq.name = f"{species}-{name}"
            seq.info["species"] = species
            seq.info["name"] = name
            # disconnect from annotation so the closure of the genome
            # does not cause issues when run in parallel
            seq.annotation_db = None
            yield seq
    finally:
        genome.close()
@define_app
def get_selected_seqs(species_gene_ids: species_genes, config: InstalledConfig) -> list:
    """return the gene sequences for a species_genes instance

    Notes
    -----
    This function becomes a class, created using config. Calling the class
    instance with a species_genes instance extracts the sequences for the
    gene ID's of that species.
    """
    selected = get_seqs_for_ids(
        config=config,
        species=species_gene_ids.species,
        names=species_gene_ids.gene_ids,
    )
    return list(selected)
def get_annotations_for_species(
    *, config: InstalledConfig, species: str
) -> GffAnnotationDb:
    """returns the annotation Db for species

    Parameters
    ----------
    config
        provides the directory of the installed genome
    species
        species name

    Raises
    ------
    SystemExit
        with exit code 1, after printing a message, if either the genome
        directory or the annotation db file does not exist
    """
    path = config.installed_genome(species=species)
    if not path.exists():
        click.secho(f"{species!r} not in {str(config.install_path.parent)!r}", fg="red")
        # raise directly, the exit() builtin is only available when the
        # site module has been loaded
        raise SystemExit(1)

    path = path / _ANNOTDB_NAME
    if not path.exists():
        click.secho(f"{path.name!r} is missing", fg="red")
        raise SystemExit(1)

    return GffAnnotationDb(source=path)
def get_gene_table_for_species(
    *, annot_db: GffAnnotationDb, limit: int | None, species: str | None = None
) -> Table:
    """
    returns gene data from a GffDb

    Parameters
    ----------
    annot_db
        feature db
    limit
        limit number of records to, None (or a negative value) means
        all records
    species
        species name, overrides inference from annot_db.source
    """
    species = species or annot_db.source.parent.name

    columns = (
        "species",
        "name",
        "seqid",
        "source",
        "biotype",
        "start",
        "stop",
        "score",
        "strand",
        "phase",
    )
    # species is not a field of the records, so it is set separately
    record_fields = columns[1:]
    rows = []
    for i, record in enumerate(annot_db.get_records_matching(biotype="gene")):
        # check before appending, so exactly limit rows are returned
        if limit is not None and 0 <= limit <= i:
            break
        rows.append([species] + [record.get(c, None) for c in record_fields])

    return make_table(header=columns, data=rows)
def get_species_summary(
    *, annot_db: GffAnnotationDb, species: str | None = None
) -> Table:
    """
    returns a Table of the number of features per biotype for a species

    Parameters
    ----------
    annot_db
        feature db
    species
        species name, overrides inference from annot_db.source

    Notes
    -----
    The table title uses the common name of the species, falling back to
    species itself when the common name lookup raises a ValueError.
    """
    from ._species import Species

    # for now, just biotype
    if not species:
        species = annot_db.source.parent.name

    biotype_counts = annot_db.biotype_counts()

    try:
        display_name = Species.get_common_name(species)
    except ValueError:
        display_name = species

    rows = list(biotype_counts.items())
    return Table(
        header=("biotype", "count"),
        data=rows,
        title=f"{display_name} features",
    )