Coverage for crateweb/nlp_classification/models.py: 83%

124 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

from typing import Any, Optional

from django.conf import settings
from django.db import models

from crate_anon.crateweb.nlp_classification.database_connection import (
    DatabaseConnection,
)
from crate_anon.nlp_manager.constants import (
    FN_SRCFIELD,
    FN_SRCPKFIELD,
    FN_SRCPKVAL,
    FN_SRCTABLE,
)
from crate_anon.nlp_manager.regex_parser import (
    FN_CONTENT,
    FN_END,
    FN_START,
)

22 

23 

class Task(models.Model):
    """
    The top-level concept that everything else hangs off, e.g. "assessing CRP
    accuracy for Bob's study".

    A single task may have more than one Assignment.
    """

    # Human-readable label for the task (at most 100 characters).
    name = models.CharField(max_length=100)

    def __str__(self) -> Any:
        """Display the task by its name."""
        return self.name

35 

36 

class Option(models.Model):
    """
    A possible answer, associated with one or more questions.

    Examples: "Yes", "No"
    """

    # Text of the option (at most 100 characters).
    description = models.CharField(max_length=100)

    def __str__(self) -> Any:
        """Display the option by its description."""
        return self.description

49 

50 

class Question(models.Model):
    """
    The question put to the user who is validating the NLP records.

    Example: "Does this text show a C-reactive protein (CRP) value?"
    (i.e. has NLP identified CRP at all, even if it didn't extract the right
    value)

    or, more specifically: "Does this text show a C-reactive protein (CRP)
    value AND that value matches the NLP output?".

    Yes/no answers make precision and recall easier to assess, but more than
    two choices will be supported.
    """

    # Wording of the question (at most 100 characters).
    title = models.CharField(max_length=100)
    # Owning task; deleting the task deletes its questions.
    task = models.ForeignKey(Task, on_delete=models.CASCADE)
    # The answers a user may choose from.
    options = models.ManyToManyField(Option)

    def __str__(self) -> Any:
        """Display the question by its title."""
        return self.title

71 

72 

class TableDefinition(models.Model):
    """
    Identifies one table in a database: which connection to use, the table's
    name and the name of its primary key column.
    """

    db_connection_name = models.CharField(max_length=100)
    table_name = models.CharField(max_length=100)
    pk_column_name = models.CharField(max_length=100)

    def __str__(self) -> Any:
        """Display as ``<connection name>.<table name>``."""
        connection_name = self.db_connection_name
        table_name = self.table_name

        return f"{connection_name}.{table_name}"

84 

85 

class Column(models.Model):
    """
    Identifies one column within a table of a database.

    Required because the columns present in the table are not known in
    advance.
    """

    # The table this column belongs to; deleted along with that table
    # definition.
    table_definition = models.ForeignKey(
        TableDefinition, on_delete=models.CASCADE
    )
    name = models.CharField(max_length=100)

    def __str__(self) -> Any:
        """Display as ``<table definition>.<column name>``."""
        table = self.table_definition

        return f"{table}.{self.name}"

99 

100 

class SampleSpec(models.Model):
    """
    Specification from which SourceRecords are created, across one or more
    source tables and their corresponding NLP records.

    Open question: size may need to be a maximum. What should happen if the
    sample contains fewer matching records?
    """

    # Column holding the source text to sample from.
    source_column = models.ForeignKey(Column, on_delete=models.CASCADE)
    # Table holding the NLP output for that source.
    nlp_table_definition = models.ForeignKey(
        TableDefinition, on_delete=models.CASCADE
    )
    search_term = models.CharField(max_length=100)
    size = models.IntegerField()
    seed = models.PositiveIntegerField()  # default range 0-2147483647

    def __str__(self) -> Any:
        """Summarise the size, source column, seed and search term."""
        parts = (
            f"{self.size} records from '{self.source_column}' ",
            f"with seed {self.seed} and search term '{self.search_term}'",
        )

        return "".join(parts)

123 

124 

class SourceRecord(models.Model):
    """
    This is an individual entry for a source table with optional NLP record.

    Only the keys are stored here. The source text and the NLP row are read
    on first access through :class:`DatabaseConnection` and then cached on
    the instance, so each is fetched at most once per instance.
    """

    source_column = models.ForeignKey(Column, on_delete=models.CASCADE)
    # Nullable: set to NULL if the NLP table definition is deleted.
    nlp_table_definition = models.ForeignKey(
        TableDefinition,
        null=True,
        on_delete=models.SET_NULL,
        related_name="source_records",
    )
    source_pk_value = models.CharField(max_length=100)
    # Empty string when there is no NLP record for this source row.
    nlp_pk_value = models.CharField(max_length=100)

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialise the model and the per-instance lookup caches."""
        super().__init__(*args, **kwargs)

        # None means "not fetched yet" for each of these caches.
        self._nlp_dict: Optional[dict[str, Any]] = None
        self._source_text: Optional[str] = None
        self._extra_nlp_column_names: Optional[list[str]] = None

    @property
    def extra_nlp_column_names(self) -> list[str]:
        """
        Names of the Column objects registered against the NLP table
        definition (looked up once, then cached).
        """
        if self._extra_nlp_column_names is None:
            self._extra_nlp_column_names = [
                c.name
                for c in Column.objects.filter(
                    table_definition=self.nlp_table_definition
                )
            ]

        return self._extra_nlp_column_names

    @property
    def nlp_dict(self) -> dict[str, Any]:
        """
        The NLP row for this record, keyed by column name.

        Returns an empty dict when there is no NLP record. That result is
        cached too, so repeated access does not re-query the database.
        """
        if self._nlp_dict is None:
            self._nlp_dict = self._fetch_nlp_dict()

        return self._nlp_dict

    def _fetch_nlp_dict(self) -> dict[str, Any]:
        """Read the NLP row from the NLP database; ``{}`` if there is none."""
        table_definition = self.nlp_table_definition

        # The foreign key is nullable and an empty nlp_pk_value marks "no NLP
        # record", so in either case there is nothing to look up.
        if table_definition is None or self.nlp_pk_value in (None, ""):
            return {}

        column_names = [
            FN_SRCFIELD,
            FN_SRCTABLE,
            FN_SRCPKFIELD,
            FN_SRCPKVAL,
            FN_CONTENT,
            FN_START,
            FN_END,
        ] + self.extra_nlp_column_names

        connection = self.get_nlp_database_connection()
        row = connection.fetchone_as_dict(
            column_names,
            table_definition.table_name,
            where=f"{table_definition.pk_column_name} = %s",
            params=[self.nlp_pk_value],
        )

        # Normalise any falsy "no row" result to an empty dict.
        return row or {}

    @property
    def before(self) -> str:
        """
        Source text preceding the NLP match, or the whole source text when
        there is no NLP record.
        """
        nlp_dict = self.nlp_dict
        if nlp_dict:
            return self.source_text[: nlp_dict[FN_START]]

        return self.source_text

    @property
    def after(self) -> str:
        """Source text following the NLP match; "" without an NLP record."""
        nlp_dict = self.nlp_dict
        if nlp_dict:
            return self.source_text[nlp_dict[FN_END] :]

        return ""

    @property
    def match(self) -> str:
        """The content matched by NLP; "" without an NLP record."""
        nlp_dict = self.nlp_dict
        if nlp_dict:
            return nlp_dict[FN_CONTENT]

        return ""

    @property
    def extra_nlp_fields(self) -> dict[str, Any]:
        """Values of the extra NLP columns that are present in the NLP row."""
        nlp_dict = self.nlp_dict

        return {
            k: nlp_dict[k]
            for k in self.extra_nlp_column_names
            if k in nlp_dict
        }

    @property
    def source_text(self) -> str:
        """
        The text of the source column for this record (fetched once, then
        cached).

        A missing source row or a NULL value yields "", so that the slicing
        in ``before`` and ``after`` always operates on a string.
        """
        if self._source_text is None:
            table_definition = self.source_column.table_definition
            source_column_name = self.source_column.name

            connection = self.get_source_database_connection()

            row = connection.fetchone_as_dict(
                [source_column_name],
                table_definition.table_name,
                where=f"{table_definition.pk_column_name} = %s",
                params=[self.source_pk_value],
            )
            text = row[source_column_name] if row else None
            self._source_text = "" if text is None else text

        return self._source_text

    def get_source_database_connection(self) -> DatabaseConnection:
        """Connection to the database that holds the source table."""
        return DatabaseConnection(
            self.source_column.table_definition.db_connection_name
        )

    def get_nlp_database_connection(self) -> DatabaseConnection:
        """
        Connection to the database that holds the NLP table.

        Requires ``nlp_table_definition`` to be set.
        """
        return DatabaseConnection(self.nlp_table_definition.db_connection_name)

    def __str__(self) -> Any:
        """Display as ``Item <table definition>.<pk column>=<pk value>``."""
        pk_column_name = self.source_column.table_definition.pk_column_name

        return (
            f"Item {self.source_column.table_definition}.{pk_column_name}="
            f"{self.source_pk_value}"
        )

247 

248 

class Assignment(models.Model):
    """
    Ties together a Task, a SampleSpec and a user, along with the
    SourceRecords attached to this assignment.
    """

    task = models.ForeignKey(Task, on_delete=models.CASCADE)
    sample_spec = models.ForeignKey(SampleSpec, on_delete=models.CASCADE)
    user = models.ForeignKey(
        settings.AUTH_USER_MODEL, on_delete=models.CASCADE
    )

    source_records = models.ManyToManyField(SourceRecord)

    def assign_source_records(self) -> None:
        """
        Create (or reuse) a SourceRecord for every row of the sample spec's
        source table and attach them all to this assignment.

        For each source row, the NLP table is searched for a row whose
        ``FN_SRCPKVAL`` column equals the source primary key. If one is found
        its primary key is stored on the SourceRecord; otherwise an empty
        string is stored.

        The records are attached in one batched call after the loop, so if
        the loop raises part-way through, nothing is attached (SourceRecords
        already created remain in the database).
        """
        # NOTE(review): sample_spec.size, .seed and .search_term are not used
        # here; every row of the source table is assigned.
        # NOTE(review): the NLP lookup matches on FN_SRCPKVAL only, not on
        # source table/field -- confirm that is sufficient.
        source_column = self.sample_spec.source_column
        source_table_definition = source_column.table_definition
        source_pk_column_name = source_table_definition.pk_column_name
        source_table_name = source_table_definition.table_name
        source_connection = DatabaseConnection(
            source_table_definition.db_connection_name
        )

        nlp_table_definition = self.sample_spec.nlp_table_definition
        nlp_pk_column_name = nlp_table_definition.pk_column_name
        nlp_table_name = nlp_table_definition.table_name
        nlp_connection = DatabaseConnection(
            nlp_table_definition.db_connection_name
        )

        source_records = []
        for source_row in source_connection.fetchall(
            [source_pk_column_name], source_table_name
        ):
            source_pk_value = source_row[0]

            nlp_dict = nlp_connection.fetchone_as_dict(
                [nlp_pk_column_name],
                nlp_table_name,
                where=f"{FN_SRCPKVAL} = %s",
                params=[source_pk_value],
            )

            # An empty string marks "no NLP record for this source row".
            nlp_pk_value = nlp_dict[nlp_pk_column_name] if nlp_dict else ""

            source_record, _ = SourceRecord.objects.get_or_create(
                source_column=source_column,
                nlp_table_definition=nlp_table_definition,
                source_pk_value=source_pk_value,
                nlp_pk_value=nlp_pk_value,
            )
            source_records.append(source_record)

        # One batched add instead of one many-to-many insert per row.
        if source_records:
            self.source_records.add(*source_records)

294 

295 

class UserAnswer(models.Model):
    """
    The answer a user gave to a Question for a particular SourceRecord (which
    has an optional NLP record), within a given Assignment.

    - Note that in this analogy a question can have many answers.
    """

    source_record = models.ForeignKey(SourceRecord, on_delete=models.CASCADE)
    question = models.ForeignKey(Question, on_delete=models.CASCADE)
    # The chosen Option; set to NULL if that Option is deleted.
    decision = models.ForeignKey(Option, null=True, on_delete=models.SET_NULL)
    assignment = models.ForeignKey(Assignment, on_delete=models.CASCADE)