Coverage for nlp_manager/regex_read_codes.py: 98%
131 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/nlp_manager/regex_read_codes.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Regular expressions to detect some Read codes (CTV3).**
28See https://en.wikipedia.org/wiki/Read_code.
30"""
import logging
from typing import List, Optional

from crate_anon.common.regex_helpers import (
    at_start_wb,
    escape_literal_string_for_regex,
    escape_literal_for_regex_allowing_flexible_whitespace,
    LEFT_BRACKET as LB,
    OPTIONAL_WHITESPACE,
    regex_or,
    RIGHT_BRACKET as RB,
)
45log = logging.getLogger(__name__)
48# =============================================================================
49# Represent a Read code
50# =============================================================================
class ReadCode:
    r"""
    Represents information about the way a quantity is represented as a Read
    code.

    An instance holds one five-character Read (CTV3) code plus the free-text
    phrases that may accompany it, and can build regex strings matching each
    phrase, optionally followed by the bracketed code.

    NOTE: Read codes are case-sensitive. (See
    https://www.gp-training.net/it/read-codes/.)

    It would be desirable to mark the Read code as case-sensitive, within a
    regex that is case-insensitive overall. Apparently Tcl supports this via
    the ``(?c)`` flag: https://www.regular-expressions.info/modifiers.html.

    However, others just support the "locally case-insensitive" flag, ``(?i)``.

    Python (via ``regex``) fails to parse the test regex ``(?i)te(?-i)st``,
    from https://www.regular-expressions.info/modifiers.html. It gives the
    error ``regex._regex_core.error: bad inline flags: cannot turn flags off at
    position 11``. No docs at https://pypi.org/project/regex/ or
    https://docs.python.org/3/library/re.html suggest otherwise.

    Since we absolutely want case-insensitive matching for the most part, I
    think we'll live with this limitation.
    """

    def __init__(
        self, read_code: str, phrases: Optional[List[str]] = None
    ) -> None:
        """
        Args:
            read_code:
                The Read (CTV3) code, a string of length 5.
            phrases:
                The associated possible phrases. ``None`` (the default) or an
                empty list means "no phrases". The list is copied, so later
                changes to the caller's list do not affect this object.

        Raises:
            AssertionError:
                If ``read_code`` is not a string of exactly five characters
                (assertions are skipped when Python runs with ``-O``).
        """
        assert isinstance(
            read_code, str
        ), f"read_code must be a str, not {type(read_code).__name__}"
        assert (
            len(read_code) == 5
        ), f"read_code must have exactly 5 characters: {read_code!r}"
        self.read_code = read_code
        # Copy rather than alias: the regexes built below must not change
        # just because the caller later mutates the list it passed in.
        self.phrases: List[str] = list(phrases) if phrases else []

    def __repr__(self) -> str:
        """
        Debugging representation showing the code and its phrases.
        """
        return (
            f"{type(self).__name__}("
            f"read_code={self.read_code!r}, phrases={self.phrases!r})"
        )

    def component_regex_strings(self) -> List[str]:
        """
        A list of regular expression strings representing this quantity, one
        per phrase (so an instance without phrases yields an empty list).

        Provides regexes for:

        .. code-block:: none

            phrase (readcode)
            phrase

        Both forms are covered by a single regex per phrase, because the
        bracketed Read code is an optional trailing group.
        """
        esc_read = escape_literal_string_for_regex(self.read_code)
        # NOTE(review): the literal spaces around the hyphen look as if they
        # rely on the final regex being compiled in verbose mode -- confirm
        # against the code that compiles these strings.
        optional_observation = r"(?:\s* - \s+ observation)?"
        # The same for every phrase, so build it once outside the loop.
        optional_code = f"(?:{OPTIONAL_WHITESPACE}{LB}{esc_read}{RB})?"
        components: List[str] = []
        for p in self.phrases:
            phrase = at_start_wb(
                escape_literal_for_regex_allowing_flexible_whitespace(p)
            )
            components.append(f"{phrase}{optional_observation}{optional_code}")
        return components

    def regex_str(self) -> str:
        """
        A single composite regex string representing this quantity: the
        alternation (via ``regex_or``) of all component regexes, with each
        component and the overall result wrapped in a non-capture group.
        """
        return regex_or(
            *self.component_regex_strings(),
            wrap_each_in_noncapture_group=True,
            wrap_result_in_noncapture_group=True,
        )
126# =============================================================================
127# Some known values used by our NLP parsers
128# =============================================================================
class ReadCodes:
    """
    Some known Read codes.

    From ``v3ReadCode_PBCL.xlsx``.

    Every attribute is a :class:`ReadCode`, written here as
    ``ReadCode(<five-character code>, [<phrase>, ...])``.
    """

    # -------------------------------------------------------------------------
    # Biochemistry
    # -------------------------------------------------------------------------

    ALBUMIN_PLASMA = ReadCode("XaIRc", ["Plasma albumin level"])
    ALBUMIN_SERUM = ReadCode("XE2eA", ["Serum albumin level"])
    ALKPHOS = ReadCode("44F3.", ["Alkaline phosphatase level"])
    ALKPHOS_PLASMA = ReadCode("XaIRj", ["Plasma alkaline phosphatase level"])
    ALKPHOS_SERUM = ReadCode("XE2px", ["Serum alkaline phosphatase level"])
    ALT = ReadCode("44G3.", ["ALT/SGPT serum level"])

    BILIRUBIN_PLASMA_TOTAL = ReadCode(
        "XaETf", ["Plasma total bilirubin level"]
    )
    BILIRUBIN_SERUM = ReadCode("44E..", ["Serum bilirubin level"])
    BILIRUBIN_SERUM_TOTAL = ReadCode("XaERu", ["Serum total bilirubin level"])
    BILIRUBIN_TOTAL = ReadCode("XE2qu", ["Total bilirubin level"])

    CHOLESTEROL_SERUM = ReadCode("XE2eD", ["Serum cholesterol level"])
    CHOLESTEROL_TOTAL_PLASMA = ReadCode(
        "XaIRd", ["Plasma total cholesterol level"]
    )
    CHOLESTEROL_TOTAL_SERUM = ReadCode(
        "XaJe9", ["Serum total cholesterol level"]
    )
    CREATININE = ReadCode("X771Q", ["Creatinine level"])
    CREATININE_PLASMA = ReadCode("XaETQ", ["Plasma creatinine level"])
    CREATININE_PLASMA_CORRECTED = ReadCode(
        "XaERX", ["Cor plasma creatinine level"]
    )
    CREATININE_SERUM = ReadCode("XE2q5", ["Serum creatinine level"])
    CREATININE_SERUM_CORRECTED = ReadCode(
        "XaERc", ["Cor serum creatinine level"]
    )
    CRP_PLASMA = ReadCode("XE2dy", ["Plasma C-reactive protein level"])
    CRP_SERUM = ReadCode("XaINL", ["Serum C reactive protein level"])

    GAMMA_GT = ReadCode("44G4.", ["Gamma-glutamyl transferase lev"])
    GAMMA_GT_PLASMA = ReadCode(
        "XaES4", ["Plasma gamma-glutamyl transferase level"]
    )
    GAMMA_GT_SERUM = ReadCode(
        "XaES3", ["Serum gamma-glutamyl transferase level"]
    )
    GLUCOSE = ReadCode("X772y", ["Glucose level"])
    GLUCOSE_BLOOD = ReadCode("X772z", ["Blood glucose level"])
    GLUCOSE_BLOOD_2H_POSTPRANDIAL = ReadCode(
        "44U7.", ["2 hour post-prand blood gluc"]
    )
    GLUCOSE_BLOOD_150_MIN = ReadCode(
        "XaEOS", ["150 minute blood glucose level"]
    )
    GLUCOSE_PLASMA_RANDOM = ReadCode("44g0.", ["Plasma random glucose level"])
    GLUCOSE_PLASMA_FASTING = ReadCode(
        "44g1.", ["Plasma fasting glucose level"]
    )
    GLUCOSE_PLASMA_30_MIN = ReadCode(
        "XaEOT", ["30 minute plasma glucose level"]
    )
    GLUCOSE_PLASMA_60_MIN = ReadCode(
        "XaEOU", ["60 minute plasma glucose level"]
    )
    GLUCOSE_PLASMA_90_MIN = ReadCode(
        "XaEPc", ["90 minute plasma glucose level"]
    )
    GLUCOSE_PLASMA_120_MIN = ReadCode(
        "XaEOV", ["120 minute plasma glucose level"]
    )
    GLUCOSE_PLASMA_2H_POSTPRANDIAL = ReadCode(
        "44g2.", ["Plasma 2-hr post-pran gluc lev"]
    )
    GLUCOSE_PLASMA_150_MIN = ReadCode(
        "XaEOW", ["150 min plasma glucose level"]
    )
    GLUCOSE_SERUM = ReadCode("44f..", ["Serum glucose level"])
    GLUCOSE_SERUM_RANDOM = ReadCode("44f0.", ["Serum random glucose level"])
    GLUCOSE_SERUM_FASTING = ReadCode("44f1.", ["Serum fasting glucose level"])
    GLUCOSE_SERUM_30_MIN = ReadCode("XaEOX", ["30 minute serum glucose level"])
    GLUCOSE_SERUM_60_MIN = ReadCode("XaEOY", ["60 minute serum glucose level"])
    GLUCOSE_SERUM_90_MIN = ReadCode("XaEPd", ["90 minute serum glucose level"])
    GLUCOSE_SERUM_120_MIN = ReadCode(
        "XaEOZ", ["120 minute serum glucose level"]
    )
    GLUCOSE_SERUM_2H_POSTPRANDIAL = ReadCode(
        "44f2.", ["Serum 2-hr post-prand gluc lev"]
    )
    GLUCOSE_SERUM_150_MIN = ReadCode(
        "XaERQ", ["150 minute serum glucose level"]
    )

    HBA1C = ReadCode("X772q", ["Haemoglobin A1c level"])
    HBA1C_DCCT = ReadCode("XaERp", ["HbA1c level (DCCT aligned)"])
    HBA1C_IFCC = ReadCode("XaPbt", ["HbA1c levl - IFCC standardised"])
    HDL_PLASMA = ReadCode("XaEVr", ["Plasma HDL cholesterol level"])
    HDL_PLASMA_RANDOM = ReadCode("44d2.", ["Plasma rndm HDL cholest level"])
    HDL_PLASMA_FASTING = ReadCode("44d3.", ["Plasma fast HDL cholest level"])
    HDL_SERUM = ReadCode("44P5.", ["Serum HDL cholesterol level"])
    HDL_SERUM_FASTING = ReadCode("44PB.", ["Serum fast HDL cholesterol lev"])
    HDL_SERUM_RANDOM = ReadCode("44PC.", ["Ser random HDL cholesterol lev"])

    LITHIUM_SERUM = ReadCode("XE25g", ["Serum lithium level"])
    LDL_PLASMA = ReadCode("XaEVs", ["Plasma LDL cholesterol level"])
    LDL_PLASMA_RANDOM = ReadCode("44d4.", ["Plasma rndm LDL cholest level"])
    LDL_PLASMA_FASTING = ReadCode("44d5.", ["Plasma fast LDL cholest level"])
    LDL_SERUM = ReadCode("44P6.", ["Serum LDL cholesterol level"])
    LDL_SERUM_FASTING = ReadCode("44PD.", ["Serum fast LDL cholesterol lev"])
    LDL_SERUM_RANDOM = ReadCode("44PE.", ["Ser random LDL cholesterol lev"])

    POTASSIUM = ReadCode("X771S", ["Potassium level"])
    POTASSIUM_BLOOD = ReadCode("XaDvZ", ["Blood potassium level"])
    POTASSIUM_PLASMA = ReadCode("XaIRl", ["Plasma potassium level"])
    POTASSIUM_SERUM = ReadCode("XE2pz", ["Serum potassium level"])

    TG = ReadCode("X772O", ["Triglyceride level"])
    TG_PLASMA = ReadCode("44e..", ["Plasma triglyceride level"])
    TG_PLASMA_RANDOM = ReadCode("44e0.", ["Plasma rndm triglyceride level"])
    TG_PLASMA_FASTING = ReadCode("44e1.", ["Plasma fast triglyceride level"])
    TG_SERUM = ReadCode("XE2q9", ["Serum triglyceride levels"])
    TG_SERUM_FASTING = ReadCode("44Q4.", ["Serum fasting triglyceride lev"])
    TG_SERUM_RANDOM = ReadCode("44Q5.", ["Serum random triglyceride lev"])
    TSH_PLASMA = ReadCode("XaELW", ["Plasma TSH level"])
    TSH_PLASMA_30_MIN = ReadCode("XaET7", ["30 minute plasma TSH level"])
    TSH_PLASMA_60_MIN = ReadCode("XaESa", ["60 minute plasma TSH level"])
    TSH_PLASMA_90_MIN = ReadCode("XaET2", ["90 minute plasma TSH level"])
    TSH_PLASMA_120_MIN = ReadCode("XaESb", ["120 minute plasma TSH level"])
    TSH_PLASMA_150_MIN = ReadCode("XaESc", ["150 minute plasma TSH level"])
    TSH_SERUM = ReadCode("XaELV", ["Serum TSH level"])
    TSH_SERUM_60_MIN = ReadCode("XaESX", ["60 minute serum TSH level"])
    TSH_SERUM_90_MIN = ReadCode("XaESY", ["90 minute serum TSH level"])
    TSH_SERUM_120_MIN = ReadCode("XaET1", ["120 minute serum TSH level"])
    TSH_SERUM_150_MIN = ReadCode("XaESZ", ["150 minute serum TSH level"])

    SODIUM = ReadCode("X771T", ["Sodium level"])
    SODIUM_BLOOD = ReadCode("XaDva", ["Blood sodium level"])
    SODIUM_PLASMA = ReadCode("XaIRf", ["Plasma sodium level"])
    SODIUM_SERUM = ReadCode("XE2q0", ["Serum sodium level"])

    UREA_BLOOD = ReadCode("X771P", ["Blood urea"])
    UREA_PLASMA = ReadCode("XaDvl", ["Plasma urea level"])
    UREA_SERUM = ReadCode("XM0lt", ["Serum urea level"])

    # -------------------------------------------------------------------------
    # Haematology
    # -------------------------------------------------------------------------

    BASOPHIL_COUNT = ReadCode("42L..", ["Basophil count"])

    EOSINOPHIL_COUNT = ReadCode("42K..", ["Eosinophil count"])
    ESR = ReadCode("XE2m7", ["Erythrocyte sedimentation rate"])

    HAEMATOCRIT = ReadCode("X76tb", ["Haematocrit"])
    HAEMOGLOBIN_CONCENTRATION = ReadCode(
        "Xa96v", ["Haemoglobin concentration"]
    )

    LYMPHOCYTE_COUNT = ReadCode("42M..", ["Lymphocyte count"])

    MONOCYTE_COUNT = ReadCode("42N..", ["Monocyte count"])

    NEUTROPHIL_COUNT = ReadCode("42J..", ["Neutrophil count"])

    PLATELET_COUNT = ReadCode("42P..", ["Platelet count"])
    POLYMORPH_COUNT = ReadCode("XaIao", ["Polymorph count"])  # = neutrophils

    RBC_COUNT = ReadCode("426..", ["Red blood cell count"])

    WBC_COUNT = ReadCode("XaIdY", ["Total white blood count"])
431# =============================================================================
432# Combiner function
433# =============================================================================
def regex_components_from_read_codes(*read_codes: ReadCode) -> List[str]:
    """
    Collects the component regex strings of every Read code object passed in.

    The result is one flat list: the components of the first Read code, then
    those of the second, and so on. With no arguments it is an empty list.
    """
    return [
        component
        for read_code in read_codes
        for component in read_code.component_regex_strings()
    ]
def any_read_code_of(*read_codes: ReadCode) -> str:
    """
    Builds one regex string that matches any of the given Read codes.

    All component regexes of the supplied Read code objects are combined with
    ``regex_or``; each alternative and the overall result are wrapped in
    non-capture groups.
    """
    return regex_or(
        *regex_components_from_read_codes(*read_codes),
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=True,
    )