Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_homologydb.py: 96%
77 statements
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
1from __future__ import annotations
3import dataclasses
4import typing
6from rich.progress import track
8from ensembl_lite._config import InstalledConfig
9from ensembl_lite._db_base import SqliteDbMixin
# file name of the homology database; load_homology_db() joins it onto the
# installation's homologies_path
_HOMOLOGYDB_NAME = "homologies.sqlitedb"
15@dataclasses.dataclass(slots=True, eq=True)
16class HomologyRecord:
17 source: str | None = None
18 species_1: str | None = None
19 gene_id_1: str | None = None
20 prot_id_1: str | None = None
21 species_2: str | None = None
22 gene_id_2: str | None = None
23 prot_id_2: str | None = None
24 relationship: str | None = None
26 def __getitem__(self, item):
27 return getattr(self, item)
29 def __setitem__(self, item, value):
30 setattr(self, item, value)
def grouped_related(
    data: list[HomologyRecord],
) -> set[frozenset[tuple[str, str]]]:
    """determines related groups of genes

    Parameters
    ----------
    data
        list of full records from the HomologyDb

    Returns
    -------
    a set of groups, each group being a frozenset of (species, gene id)
    tuples. Genes connected by a chain of records end up in one group.

    Notes
    -----
    I assume that for a specific relationship type, a gene can only belong
    to one group. To honour that, a record that links two genes already
    placed in different groups causes those groups to be merged.
    """
    # grouped is {gene id: set(group)}. Genes that belong to the same
    # group share the *same* set object as their value
    grouped: dict[str, set[tuple[str, str]]] = {}
    for record in track(data, description="Grouping related...", transient=True):
        pair = (
            (record.species_1, record.gene_id_1),
            (record.species_2, record.gene_id_2),
        )
        group_1 = grouped.get(record.gene_id_1)
        group_2 = grouped.get(record.gene_id_2)
        if group_1 is None and group_2 is None:
            val = set()
        elif group_2 is None or group_1 is group_2:
            val = group_1
        elif group_1 is None:
            val = group_2
        else:
            # both genes already belong to distinct groups: fold the smaller
            # group into the larger one and re-point every absorbed gene,
            # otherwise the absorbed group survives as a stale duplicate
            val, absorbed = (
                (group_1, group_2)
                if len(group_1) >= len(group_2)
                else (group_2, group_1)
            )
            val.update(absorbed)
            for _, gene_id in absorbed:
                grouped[gene_id] = val
        val.update(pair)
        grouped[record.gene_id_1] = grouped[record.gene_id_2] = val
    return {frozenset(v) for v in grouped.values()}
71# the homology db stores pairwise relationship information
class HomologyDb(SqliteDbMixin):
    """sqlite backed storage of pairwise homology relationships

    Notes
    -----
    Each row records one relationship between a gene in species_1 and a
    gene in species_2. Table creation, the ``db`` connection and
    ``_execute_sql`` are not defined here; presumably they come from
    SqliteDbMixin -- confirm against that class.
    """

    table_name = "homology"

    _homology_schema = {
        "source": "TEXT",  # the file path
        "species_1": "TEXT",
        "gene_id_1": "TEXT",
        "prot_id_1": "TEXT",
        "species_2": "TEXT",
        "gene_id_2": "TEXT",
        "prot_id_2": "TEXT",
        "relationship": "TEXT",  # defined by Ensembl
    }

    def __init__(self, source=":memory:"):
        """
        Parameters
        ----------
        source
            location of the database, defaults to an in-memory database
        """
        self.source = source
        self._init_tables()

    def add_records(
        self,
        *,
        records: typing.Sequence,
        col_order: typing.Collection[str],
    ) -> None:
        """bulk insert of records

        Parameters
        ----------
        records
            series of rows, the values in each row ordered as per col_order
        col_order
            the column names corresponding to the values in each row
        """
        # NOTE: column names are interpolated into the statement (only the
        # values are bound as parameters), so col_order must come from
        # trusted code, never from user input
        val_placeholder = ", ".join("?" * len(col_order))
        sql = f"INSERT INTO {self.table_name} ({', '.join(col_order)}) VALUES ({val_placeholder})"
        self.db.executemany(sql, records)
        self.db.commit()

    def get_related_to(
        self, *, gene_id: str, relationship_type: str
    ) -> set[tuple[str, str]]:
        """return genes with relationship type to gene_id

        Returns
        -------
        set of (species, gene id) tuples taken from both sides of every
        matching row, so when there are matches the query gene itself is
        a member of the result
        """
        sql = (
            f"SELECT species_1, gene_id_1, species_2, gene_id_2 from {self.table_name}"
            f" WHERE relationship = ? AND (gene_id_1=? OR gene_id_2=?)"
        )
        result = set()
        for r in self._execute_sql(
            sql, (relationship_type, gene_id, gene_id)
        ).fetchall():
            # the gene can be on either side of the row, so collect both
            for num in (1, 2):
                species_key = f"species_{num}"
                gene_id_key = f"gene_id_{num}"
                result.add((r[species_key], r[gene_id_key]))
        return result

    def get_related_groups(
        self, relationship_type: str
    ) -> set[frozenset[tuple[str, str]]]:
        """returns all groups of relationship type"""
        # fetch every full record of this relationship type, the grouping
        # itself is delegated to grouped_related()
        sql = f"SELECT * from {self.table_name} WHERE relationship=?"
        results = [
            HomologyRecord(**dict(zip(r.keys(), r)))
            for r in self._execute_sql(sql, (relationship_type,)).fetchall()
        ]
        return grouped_related(results)
def load_homology_db(
    *,
    config: InstalledConfig,
) -> HomologyDb:
    """returns the HomologyDb located under config.homologies_path"""
    db_path = config.homologies_path / _HOMOLOGYDB_NAME
    return HomologyDb(source=db_path)
140@dataclasses.dataclass(slots=True)
141class species_genes:
142 """contains gene IDs for species"""
144 species: str
145 gene_ids: list[str] = None
147 def __hash__(self):
148 return hash(self.species)
150 def __post_init__(self):
151 self.gene_ids = []
def id_by_species_group(related) -> tuple[list[species_genes], dict[str, int]]:
    """returns species gene sets and relationship index

    Parameters
    ----------
    related
        series of groups, each group a series of (species, gene id) pairs

    Returns
    -------
    a list with one species_genes per species, in order of first
    appearance, and a dict mapping each gene id to the position in related
    of the group it was (last) seen in
    """
    by_species = {}
    gene_to_group = {}
    for index, members in enumerate(related):
        for species, gene in members:
            # create the per-species container the first time it is seen
            if species not in by_species:
                by_species[species] = species_genes(species=species)
            by_species[species].gene_ids.append(gene)
            gene_to_group[gene] = index
    return list(by_species.values()), gene_to_group