Coverage for crateweb/nlp_classification/models.py: 83%
124 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1from typing import Any
3from django.conf import settings
4from django.db import models
6from crate_anon.crateweb.nlp_classification.database_connection import (
7 DatabaseConnection,
8)
10from crate_anon.nlp_manager.constants import (
11 FN_SRCFIELD,
12 FN_SRCPKFIELD,
13 FN_SRCPKVAL,
14 FN_SRCTABLE,
15)
17from crate_anon.nlp_manager.regex_parser import (
18 FN_CONTENT,
19 FN_END,
20 FN_START,
21)
class Task(models.Model):
    """
    The top-level unit of validation work, e.g. "assessing CRP accuracy for
    Bob's study".

    Questions and Assignments both refer to a Task through a foreign key, and
    a single Task may have more than one Assignment.
    """

    # Human-readable label; also used as the object's string form.
    name = models.CharField(max_length=100)

    def __str__(self) -> Any:
        """Return the task's name."""
        label = self.name
        return label
class Option(models.Model):
    """
    One possible answer that can be attached to one or more Questions.

    Examples: "Yes", "No"
    """

    # Text of the choice; also used as the object's string form.
    description = models.CharField(max_length=100)

    def __str__(self) -> Any:
        """Return the option's description."""
        text = self.description
        return text
class Question(models.Model):
    """
    A question put to the user who is validating the NLP records.

    Example: "Does this text show a C-reactive protein (CRP) value?"
    (i.e. has NLP identified CRP at all, even if it didn't extract the right
    value)

    or, more specifically: "Does this text show a C-reactive protein (CRP)
    value AND that value matches the NLP output?".

    Yes/no answers make precision and recall easier to assess, but more than
    two choices are supported.
    """

    # Wording of the question; also used as the object's string form.
    title = models.CharField(max_length=100)
    # The task this question belongs to; deleting the task deletes it too.
    task = models.ForeignKey(Task, on_delete=models.CASCADE)
    # The answers a user may choose from.
    options = models.ManyToManyField(Option)

    def __str__(self) -> Any:
        """Return the question's title."""
        heading = self.title
        return heading
class TableDefinition(models.Model):
    """
    Identifies one table in one database.
    """

    # Name handed to DatabaseConnection to reach the database.
    db_connection_name = models.CharField(max_length=100)
    # Name of the table within that database.
    table_name = models.CharField(max_length=100)
    # Name of the table's primary key column.
    pk_column_name = models.CharField(max_length=100)

    def __str__(self) -> Any:
        """Return "<connection name>.<table name>"."""
        qualified_name = f"{self.db_connection_name}.{self.table_name}"
        return qualified_name
class Column(models.Model):
    """
    Identifies one column of a table described by a TableDefinition.

    Stored explicitly because the set of columns in the table is not
    otherwise known.
    """

    # The table that owns this column; deleting it deletes the column too.
    table_definition = models.ForeignKey(
        TableDefinition, on_delete=models.CASCADE
    )
    # Name of the column within that table.
    name = models.CharField(max_length=100)

    def __str__(self) -> Any:
        """Return "<table definition>.<column name>"."""
        owner = self.table_definition
        return f"{owner}.{self.name}"
class SampleSpec(models.Model):
    """
    Describes a sample: used to create SourceRecords across one or more
    source tables and their corresponding NLP records.

    Open question: size might be a maximum. What happens if there are fewer
    matching records in the sample?
    """

    # Source column the sample is drawn from.
    source_column = models.ForeignKey(Column, on_delete=models.CASCADE)
    # Table holding the NLP output for the sample.
    nlp_table_definition = models.ForeignKey(
        TableDefinition, on_delete=models.CASCADE
    )
    search_term = models.CharField(max_length=100)
    size = models.IntegerField()
    seed = models.PositiveIntegerField()  # default range 0-2147483647

    def __str__(self) -> Any:
        """Return a one-line summary of size, source column, seed and term."""
        what = f"{self.size} records from '{self.source_column}' "
        how = f"with seed {self.seed} and search term '{self.search_term}'"
        return what + how
class SourceRecord(models.Model):
    """
    An individual entry for a source table, with an optional NLP record.

    The Django row only stores where the data lives (column, table
    definitions and primary key values). The source text and the NLP row
    themselves are fetched on first access through DatabaseConnection and
    then cached on the instance.
    """

    source_column = models.ForeignKey(Column, on_delete=models.CASCADE)
    # Nullable: set to NULL if the referenced TableDefinition is deleted.
    nlp_table_definition = models.ForeignKey(
        TableDefinition,
        null=True,
        on_delete=models.SET_NULL,
        related_name="source_records",
    )
    source_pk_value = models.CharField(max_length=100)
    # Assignment.assign_source_records() stores "" here when no NLP record
    # was found for the source row.
    nlp_pk_value = models.CharField(max_length=100)

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialise the model and reset the per-instance caches."""
        super().__init__(*args, **kwargs)

        # In all three caches None means "not loaded yet". Once loaded they
        # hold a real (possibly empty) value so the lookup is never repeated.
        self._nlp_dict: dict[str, Any] = None
        self._source_text: str = None
        self._extra_nlp_column_names = None

    @property
    def extra_nlp_column_names(self) -> list[str]:
        """
        Names of the Columns registered against the NLP table definition.

        Loaded from the Column model on first access, then cached.
        """
        if self._extra_nlp_column_names is None:
            self._extra_nlp_column_names = [
                c.name
                for c in Column.objects.filter(
                    table_definition=self.nlp_table_definition
                )
            ]

        return self._extra_nlp_column_names

    @property
    def nlp_dict(self) -> dict[str, Any]:
        """
        The NLP row for this record as a column-name -> value mapping.

        Returns an empty dict when there is no NLP record. Fetched on first
        access, then cached, including the "no record" outcome.
        """
        if self._nlp_dict is None:
            self._nlp_dict = self._fetch_nlp_dict()

        return self._nlp_dict

    def _fetch_nlp_dict(self) -> dict[str, Any]:
        """Fetch the NLP row, or return {} when there is none to fetch."""
        table_definition = self.nlp_table_definition
        # The foreign key is nullable and "" marks "no NLP record", so in
        # either case there is nothing to query.
        if table_definition is None or not self.nlp_pk_value:
            return {}

        column_names = [
            FN_SRCFIELD,
            FN_SRCTABLE,
            FN_SRCPKFIELD,
            FN_SRCPKVAL,
            FN_CONTENT,
            FN_START,
            FN_END,
        ] + self.extra_nlp_column_names

        connection = self.get_nlp_database_connection()
        row = connection.fetchone_as_dict(
            column_names,
            table_definition.table_name,
            where=f"{table_definition.pk_column_name} = %s",
            params=[self.nlp_pk_value],
        )

        # Normalise a falsy "no row" result to {} so it can be cached and
        # safely used with "in" and truthiness checks.
        return row or {}

    @property
    def before(self) -> str:
        """
        Source text up to the FN_START value of the NLP row.

        The whole source text when there is no NLP record.
        """
        nlp = self.nlp_dict
        if nlp:
            return self.source_text[: nlp[FN_START]]

        return self.source_text

    @property
    def after(self) -> str:
        """
        Source text from the FN_END value of the NLP row onwards.

        Empty when there is no NLP record.
        """
        nlp = self.nlp_dict
        if nlp:
            return self.source_text[nlp[FN_END] :]

        return ""

    @property
    def match(self) -> str:
        """The FN_CONTENT value of the NLP row, or "" without one."""
        nlp = self.nlp_dict
        if nlp:
            return nlp[FN_CONTENT]

        return ""

    @property
    def extra_nlp_fields(self) -> dict[str, Any]:
        """
        The subset of the NLP row keyed by extra_nlp_column_names.

        Names missing from the row are skipped. Empty when there is no NLP
        record.
        """
        nlp = self.nlp_dict
        if not nlp:
            return {}

        return {k: nlp[k] for k in self.extra_nlp_column_names if k in nlp}

    @property
    def source_text(self) -> str:
        """
        The text of the source column for this record's source row.

        Fetched on first access, then cached. A missing row or a NULL value
        yields "" so that the slicing in before/after keeps working.
        """
        if self._source_text is None:
            source_column_name = self.source_column.name
            source_table = self.source_column.table_definition.table_name
            source_pk_column_name = (
                self.source_column.table_definition.pk_column_name
            )

            connection = self.get_source_database_connection()

            row = connection.fetchone_as_dict(
                [source_column_name],
                source_table,
                where=f"{source_pk_column_name} = %s",
                params=[self.source_pk_value],
            )
            text = row[source_column_name] if row else None
            # Store "" rather than None: None would trigger a re-query on
            # every access and cannot be sliced.
            self._source_text = "" if text is None else text

        return self._source_text

    def get_source_database_connection(self) -> DatabaseConnection:
        """Return a connection to the database holding the source table."""
        return DatabaseConnection(
            self.source_column.table_definition.db_connection_name
        )

    def get_nlp_database_connection(self) -> DatabaseConnection:
        """
        Return a connection to the database holding the NLP table.

        Raises AttributeError if nlp_table_definition is NULL.
        """
        return DatabaseConnection(self.nlp_table_definition.db_connection_name)

    def __str__(self) -> Any:
        """Return "Item <source table definition>.<pk column>=<pk value>"."""
        pk_column_name = self.source_column.table_definition.pk_column_name

        return (
            f"Item {self.source_column.table_definition}.{pk_column_name}="
            f"{self.source_pk_value}"
        )
class Assignment(models.Model):
    """
    Ties a user to a Task and a SampleSpec, along with the SourceRecords
    that have been assigned to them.
    """

    task = models.ForeignKey(Task, on_delete=models.CASCADE)
    sample_spec = models.ForeignKey(SampleSpec, on_delete=models.CASCADE)
    user = models.ForeignKey(
        settings.AUTH_USER_MODEL, on_delete=models.CASCADE
    )

    source_records = models.ManyToManyField(SourceRecord)

    def assign_source_records(self) -> None:
        """
        Create (or reuse) a SourceRecord for every row of the sample spec's
        source table and attach it to this assignment.

        For each source primary key, the NLP table is searched for a row
        whose FN_SRCPKVAL equals it. If one is found its primary key is
        stored on the SourceRecord, otherwise "" is stored.

        This method does not read the sample spec's size, seed or
        search_term.
        """
        source_table = self.sample_spec.source_column.table_definition
        source_pk_column = source_table.pk_column_name
        source_db = DatabaseConnection(source_table.db_connection_name)

        nlp_table = self.sample_spec.nlp_table_definition
        nlp_pk_column = nlp_table.pk_column_name
        nlp_db = DatabaseConnection(nlp_table.db_connection_name)

        source_rows = source_db.fetchall(
            [source_pk_column], source_table.table_name
        )
        for row in source_rows:
            source_pk = row[0]

            # NOTE(review): the match uses the source pk value alone, not
            # the source table/field -- confirm this is unambiguous when
            # the NLP table holds output from several sources.
            nlp_row = nlp_db.fetchone_as_dict(
                [nlp_pk_column],
                nlp_table.table_name,
                where=f"{FN_SRCPKVAL} = %s",
                params=[source_pk],
            )
            nlp_pk = nlp_row[nlp_pk_column] if nlp_row else ""

            # Only the record matters here, not whether it was just created.
            record, _ = SourceRecord.objects.get_or_create(
                source_column=self.sample_spec.source_column,
                nlp_table_definition=nlp_table,
                source_pk_value=source_pk,
                nlp_pk_value=nlp_pk,
            )

            self.source_records.add(record)
class UserAnswer(models.Model):
    """
    A user's answer to a Question about one SourceRecord (which has an
    optional NLP record).

    - Note that in this analogy a question can have many answers.
    """

    # The record the answer is about.
    source_record = models.ForeignKey(
        SourceRecord,
        on_delete=models.CASCADE,
    )
    # The question being answered.
    question = models.ForeignKey(
        Question,
        on_delete=models.CASCADE,
    )
    # The chosen Option; nullable, and set to NULL if that Option is deleted.
    decision = models.ForeignKey(
        Option,
        null=True,
        on_delete=models.SET_NULL,
    )
    # The assignment under which the answer was given.
    assignment = models.ForeignKey(
        Assignment,
        on_delete=models.CASCADE,
    )