Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_homologydb.py: 96%

77 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2024-03-25 13:40 +1100

1from __future__ import annotations 

2 

3import dataclasses 

4import typing 

5 

6from rich.progress import track 

7 

8from ensembl_lite._config import InstalledConfig 

9from ensembl_lite._db_base import SqliteDbMixin 

10 

11 

# file name of the sqlite homology database; load_homology_db() joins it
# onto the installed config's homologies_path
_HOMOLOGYDB_NAME = "homologies.sqlitedb"

13 

14 

15@dataclasses.dataclass(slots=True, eq=True) 

16class HomologyRecord: 

17 source: str | None = None 

18 species_1: str | None = None 

19 gene_id_1: str | None = None 

20 prot_id_1: str | None = None 

21 species_2: str | None = None 

22 gene_id_2: str | None = None 

23 prot_id_2: str | None = None 

24 relationship: str | None = None 

25 

26 def __getitem__(self, item): 

27 return getattr(self, item) 

28 

29 def __setitem__(self, item, value): 

30 setattr(self, item, value) 

31 

32 

def grouped_related(
    data: list[HomologyRecord],
) -> set[frozenset[tuple[str, str]]]:
    """determines related groups of genes

    Parameters
    ----------
    data
        list of full records from the HomologyDb

    Returns
    -------
    a set of groups, each group being a frozenset of (species, gene id)
    tuples. Convert to lists before attempting json serialisation.

    Notes
    -----
    I assume that for a specific relationship type, a gene can only belong
    to one group. To honour that, when a record links genes that were
    previously placed in two different groups, those groups are merged into
    one rather than left overlapping.
    """
    # grouped is {gene id: set(group)}. So gene's that belong to the same
    # group share the same set instance as their value
    grouped: dict[str, set[tuple[str, str]]] = {}
    for record in track(data, description="Grouping related...", transient=True):
        gene_id_1, gene_id_2 = record.gene_id_1, record.gene_id_2
        group_1 = grouped.get(gene_id_1)
        group_2 = grouped.get(gene_id_2)

        if group_1 is None and group_2 is None:
            # neither gene seen before, start a new group
            group = set()
        elif group_1 is None:
            group = group_2
        elif group_2 is None or group_1 is group_2:
            group = group_1
        else:
            # the record bridges two distinct groups: fold the smaller into
            # the larger and repoint every absorbed gene at the merged set,
            # otherwise the absorbed group would persist as a stale overlap
            if len(group_1) >= len(group_2):
                group, absorbed = group_1, group_2
            else:
                group, absorbed = group_2, group_1
            group.update(absorbed)
            for _, gene_id in absorbed:
                grouped[gene_id] = group

        group.add((record.species_1, gene_id_1))
        group.add((record.species_2, gene_id_2))
        grouped[gene_id_1] = grouped[gene_id_2] = group

    # many gene ids map to the same set, the set comprehension de-duplicates
    return {frozenset(v) for v in grouped.values()}

69 

70 

# the homology db stores pairwise relationship information
class HomologyDb(SqliteDbMixin):
    """sqlite backed store of pairwise homology records

    Notes
    -----
    ``self.db``, ``self._init_tables()`` and ``self._execute_sql()`` are not
    defined in this class; presumably they are provided by SqliteDbMixin.
    Rows are accessed by column name and via ``.keys()``, so the connection
    is assumed to produce ``sqlite3.Row``-like objects -- TODO confirm
    against the mixin.
    """

    table_name = "homology"

    # column name -> sqlite type. Presumably consumed by _init_tables() to
    # create the table -- verify against SqliteDbMixin
    _homology_schema = {
        "source": "TEXT",  # the file path
        "species_1": "TEXT",
        "gene_id_1": "TEXT",
        "prot_id_1": "TEXT",
        "species_2": "TEXT",
        "gene_id_2": "TEXT",
        "prot_id_2": "TEXT",
        "relationship": "TEXT",  # defined by Ensembl
    }

    def __init__(self, source=":memory:"):
        """
        Parameters
        ----------
        source
            location of the database, defaults to an in-memory database
        """
        self.source = source
        self._init_tables()

    def add_records(
        self,
        *,
        records: typing.Sequence,
        col_order: typing.Collection[str],
    ) -> None:
        """bulk insert records and commit

        Parameters
        ----------
        records
            series of rows, the values in each row being in the same order
            as col_order
        col_order
            the column names corresponding to the values in each row. These
            are interpolated into the SQL statement, so they must be trusted
            column names, never user input.
        """
        # one "?" placeholder per column, the values themselves are bound
        # by sqlite
        val_placeholder = ", ".join("?" * len(col_order))
        sql = f"INSERT INTO {self.table_name} ({', '.join(col_order)}) VALUES ({val_placeholder})"
        self.db.executemany(sql, records)
        self.db.commit()

    def get_related_to(
        self, *, gene_id: str, relationship_type: str
    ) -> set[tuple[str, str]]:
        """return genes with relationship type to gene_id

        Returns
        -------
        set of (species, gene id) tuples taken from both members of every
        matching record, so gene_id itself is included when there is at
        least one match
        """
        sql = (
            f"SELECT species_1, gene_id_1, species_2, gene_id_2 from {self.table_name}"
            " WHERE relationship = ? AND (gene_id_1=? OR gene_id_2=?)"
        )
        result = set()
        # gene_id can be in either position of the pair, so it's bound twice
        for r in self._execute_sql(
            sql, (relationship_type, gene_id, gene_id)
        ).fetchall():
            result.add((r["species_1"], r["gene_id_1"]))
            result.add((r["species_2"], r["gene_id_2"]))
        return result

    def get_related_groups(
        self, relationship_type: str
    ) -> set[frozenset[tuple[str, str]]]:
        """returns all groups of relationship type"""
        # get every full record of this relationship type, converting each
        # row into a HomologyRecord keyed by column name
        sql = f"SELECT * from {self.table_name} WHERE relationship=?"
        results = [
            HomologyRecord(**dict(zip(r.keys(), r)))
            for r in self._execute_sql(sql, (relationship_type,)).fetchall()
        ]
        return grouped_related(results)

131 

132 

def load_homology_db(*, config: InstalledConfig) -> HomologyDb:
    """returns the HomologyDb located under the config's homologies_path"""
    db_path = config.homologies_path / _HOMOLOGYDB_NAME
    return HomologyDb(source=db_path)

138 

139 

140@dataclasses.dataclass(slots=True) 

141class species_genes: 

142 """contains gene IDs for species""" 

143 

144 species: str 

145 gene_ids: list[str] = None 

146 

147 def __hash__(self): 

148 return hash(self.species) 

149 

150 def __post_init__(self): 

151 self.gene_ids = [] 

152 

153 

def id_by_species_group(related) -> tuple[list[species_genes], dict[str, int]]:
    """returns species gene sets and relationship index

    Parameters
    ----------
    related
        iterable of groups, each group an iterable of (species, gene id)
        pairs

    Returns
    -------
    A list with one species_genes instance per species (in order of first
    occurrence) holding every gene ID seen for it, and a dict mapping each
    gene ID to the position of its group within related. If a gene ID is
    in more than one group, the last group seen wins.
    """
    by_species = {}
    gene_to_group = {}
    for index, members in enumerate(related):
        for species, gene in members:
            try:
                entry = by_species[species]
            except KeyError:
                # first time this species has been seen
                entry = by_species[species] = species_genes(species=species)
            entry.gene_ids.append(gene)
            gene_to_group[gene] = index
    return list(by_species.values()), gene_to_group