Coverage for nlp_manager/parse_haematology.py: 100%
162 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/nlp_manager/parse_haematology.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Python regex-based NLP processors for haematology tests.**
28All inherit from
29:class:`crate_anon.nlp_manager.regex_parser.NumeratorOutOfDenominatorParser`
30and are constructed with these arguments:
32nlpdef:
33 a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
34cfgsection:
35 the name of a CRATE NLP config file section (from which we may
36 choose to get extra config information)
37commit:
38 force a COMMIT whenever we insert data? You should specify this
39 in multiprocess mode, or you may get database deadlocks.
41"""
43from abc import ABC
44import logging
45from typing import List, Optional, Tuple
47from crate_anon.common.regex_helpers import (
48 regex_or,
49 WORD_BOUNDARY,
50)
51from crate_anon.nlp_manager.nlp_definition import NlpDefinition
52from crate_anon.nlp_manager.regex_parser import (
53 make_simple_numeric_regex,
54 OPTIONAL_POC,
55 SimpleNumericalResultParser,
56 ValidatorBase,
57)
58from crate_anon.nlp_manager.regex_read_codes import (
59 ReadCodes,
60 regex_components_from_read_codes,
61)
62from crate_anon.nlp_manager.regex_units import (
63 BILLION_PER_L,
64 CELLS_PER_CUBIC_MM_OR_MICROLITRE,
65 G_PER_DL,
66 G_PER_L,
67 L_PER_L,
68 MG_PER_DL,
69 MG_PER_L,
70 MM_PER_H,
71 PERCENT,
72 TRILLION_PER_L,
73)
75log = logging.getLogger(__name__)
78# =============================================================================
79# Haemoglobin (Hb)
80# =============================================================================
class Haemoglobin(SimpleNumericalResultParser):
    """
    HAEMATOLOGY (FBC).

    Haemoglobin (Hb) concentration. The default unit is g/L; g/dL is also
    recognized.

    UK laboratories changed their haemoglobin reporting units from g/dL to
    g/L in 2013; see e.g.

    - http://www.pathology.leedsth.nhs.uk/pathology/Portals/0/PDFs/BP-2013-02%20Hb%20units.pdf
    - https://www.acb.org.uk/docs/default-source/committees/scientific/guidelines/acb/pathology-harmony-haematology.pdf

    *DANGER:* a bare "Hb 9" may have been written by somebody thinking in the
    old units (9 g/dL = 90 g/L), yet it will be read as 9 g/L. This ambiguity
    is hard to avoid.

    """  # noqa: E501

    # Variable name, output column, and unit -> multiplier table.
    NAME = "Haemoglobin"
    PREFERRED_UNIT_COLUMN = "value_g_L"
    UNIT_MAPPING = {
        G_PER_L: 1,  # preferred unit
        G_PER_DL: 10,  # older unit (e.g. 2000)
    }

    # Free-text names for the quantity.
    HAEMOGLOBIN_BASE = rf"""
        {WORD_BOUNDARY} (?: Ha?emoglobin | Hb | HGB ) {WORD_BOUNDARY}
    """
    # Read code alternatives are listed ahead of the free-text names.
    HAEMOGLOBIN = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.HAEMOGLOBIN_CONCENTRATION,
        ),
        HAEMOGLOBIN_BASE,
        wrap_result_in_noncapture_group=False,
        wrap_each_in_noncapture_group=True,
    )
    # Full "quantity ... number ... units" expression.
    REGEX = make_simple_numeric_regex(
        optional_ignorable_after_quantity=OPTIONAL_POC,
        quantity=HAEMOGLOBIN,
        units=regex_or(G_PER_L, G_PER_DL),
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # The arguments are described in the module docstring; everything
        # else handed to the superclass comes from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            variable=self.NAME,
            regex_str=self.REGEX,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a text with the values expected from it, in g/L.
        cases = [
            ("Haemoglobin (should fail)", []),  # should fail; no values
            ("Haemoglobin 90 (should succeed)", [90]),
            ("Hemoglobin = 60", [60]),
            ("Hb 6 g/dL", [60]),
            ("Hb 60 g/L", [60]),
            ("Hb <80", [80]),
            ("Hb <80 g/L", [80]),
            ("Hb was 62", [62]),
            ("Hb was 62 g/L", [62]),
            ("Hb was 62 (L) g/L", [62]),
            ("Haemoglobin | 7.6 (H) | g/dL", [76]),
            ("Hb-96", [96]),
            ("HGB, POC 96", [96]),
            ("Haemoglobin concentration (Xa96v) 96", [96]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)
class HaemoglobinValidator(ValidatorBase):
    """
    Validator for Haemoglobin (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-only regex
        # (Haemoglobin.HAEMOGLOBIN rather than the full Haemoglobin.REGEX).
        variable_name = Haemoglobin.NAME
        regex_strings = [Haemoglobin.HAEMOGLOBIN]
        return variable_name, regex_strings
175# =============================================================================
176# Haematocrit (Hct)
177# =============================================================================
class Haematocrit(SimpleNumericalResultParser):
    """
    HAEMATOLOGY (FBC).

    Haematocrit (Hct).
    A dimensionless quantity (but supports L/L notation).
    """

    # Free-text names for the quantity ("haematocrit", "hematocrit", "Hct").
    HAEMATOCRIT_BASE = rf"""
        {WORD_BOUNDARY} (?: Ha?ematocrit | Hct ) {WORD_BOUNDARY}
    """
    # Read code alternatives are listed ahead of the free-text names.
    HAEMATOCRIT = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.HAEMATOCRIT,
        ),
        HAEMATOCRIT_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Full "quantity ... number ... units" expression.
    REGEX = make_simple_numeric_regex(
        quantity=HAEMATOCRIT,
        units=L_PER_L,
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    NAME = "Haematocrit"
    PREFERRED_UNIT_COLUMN = "value_L_L"
    UNIT_MAPPING = {
        L_PER_L: 1,  # preferred unit
        # L/L is the only unit that REGEX recognizes, so there is nothing
        # else to convert from.
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # The arguments are described in the module docstring; everything
        # else handed to the superclass comes from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a text with the values expected from it.
        self.test_numerical_parser(
            [
                ("Haematocrit (should fail)", []),  # should fail; no values
                ("Haematocrit 0.4 (should succeed)", [0.4]),
                ("Hematocrit = 0.4", [0.4]),
                ("Hct 0.3 L/L", [0.3]),
                ("Haematocrit | 0.33 (H) | L/L", [0.33]),
                (
                    "my haematocrit was 0.3; his haematocrit was 0.4!",
                    [0.3, 0.4],
                ),
                ("Hct-0.48", [0.48]),
                ("Haematocrit (X76tb) 0.48", [0.48]),
            ],
            verbose=verbose,
        )
class HaematocritValidator(ValidatorBase):
    """
    Validator for Haematocrit (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-only regex
        # (Haematocrit.HAEMATOCRIT rather than the full Haematocrit.REGEX).
        variable_name = Haematocrit.NAME
        regex_strings = [Haematocrit.HAEMATOCRIT]
        return variable_name, regex_strings
259# =============================================================================
260# RBCs
261# =============================================================================
class RBC(SimpleNumericalResultParser):
    """
    HAEMATOLOGY (FBC).

    Red blood cell count.
    The default unit is 10^12/L; cells/mm^3 (= cells/μL) is also recognized.

    Typical lines from an FBC report:

    .. code-block:: none

        RBC, POC 4.84 10*12/L
        RBC, POC 9.99 (H) 10*12/L
    """

    # Variable name, output column, and unit -> multiplier table.
    NAME = "RBC"
    PREFERRED_UNIT_COLUMN = "value_trillion_per_l"
    UNIT_MAPPING = {
        TRILLION_PER_L: 1,  # preferred unit; 10^12/L or "per pL"
        CELLS_PER_CUBIC_MM_OR_MICROLITRE: 1e-6,
        # not BILLION_PER_L
    }

    # Free-text names for the quantity.
    RED_BLOOD_CELLS_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            # Red [blood] cell[s] [(RBC)] [count]:
            Red \b \s* (?: blood \s*)? \b cells? \b
            (?:\s* \(RBC\) )?
            (?:\s* count \b )?
            |
            # RBC(s):
            (?: RBCs? )
        )
    """
    # Beware: \( or \) next to \b becomes unhappy.
    RED_BLOOD_CELLS = regex_or(
        # Ordering is significant here (and so probably everywhere): put the
        # most specific alternatives, i.e. the Read codes, first. With the
        # order reversed you get failures such as:
        #
        #   Expected [6.2], got [426.0], when parsing
        #   'Red blood cell count (426..) 6.2'
        *regex_components_from_read_codes(
            ReadCodes.RBC_COUNT,
        ),
        RED_BLOOD_CELLS_BASE,
        wrap_result_in_noncapture_group=False,
        wrap_each_in_noncapture_group=True,
    )
    # Full "quantity ... number ... units" expression. 10^9/L is matched as
    # a unit but has no UNIT_MAPPING entry; the tests expect no value for it.
    REGEX = make_simple_numeric_regex(
        optional_ignorable_after_quantity=OPTIONAL_POC,
        quantity=RED_BLOOD_CELLS,
        units=regex_or(
            TRILLION_PER_L,  # good
            CELLS_PER_CUBIC_MM_OR_MICROLITRE,  # good
            BILLION_PER_L,  # bad
        ),
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # The arguments are described in the module docstring; everything
        # else handed to the superclass comes from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            variable=self.NAME,
            regex_str=self.REGEX,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a text with the values expected from it, in
        # units of 10^12/L.
        cases = [
            ("RBC (should fail)", []),  # should fail; no values
            ("RBC 6", [6]),
            ("RBC = 6", [6]),
            ("RBC 6 x 10^9/L", []),
            ("RBC 6 x 10 ^ 9 / L", []),
            ("RBC 6 x 10 ^ 12 / L", [6]),
            ("RBC 6 10*12/L", [6]),
            ("RBCs 6.2", [6.2]),
            ("red cells 6.2", [6.2]),
            ("red blood cells 6.2", [6.2]),
            ("red blood cell count 6.2", [6.2]),
            ("red blood cells 5000000/mm3", [5]),
            ("red blood cells 5000000 cell/mm3", [5]),
            ("red blood cells 5000000 cells/mm3", [5]),
            ("red blood cells 5000000 per cubic mm", [5]),
            ("red blood cells 5000000 per cmm", [5]),
            ("RBC – 6", [6]),  # en dash
            ("RBC—6", [6]),  # em dash
            ("RBC -- 6", [6]),  # double hyphen used as dash
            ("RBC - 6", [6]),
            ("RBC-6.5", [6.5]),
            ("RBC POC 4.84 10*12/L", [4.84]),
            ("RBC, POC 4.84 10*12/L", [4.84]),
            ("RBC, POC 4.84 (H) 10*12/L", [4.84]),
            ("red blood cells count 6.2", [6.2]),
            ("red blood cells (RBC) 6.2", [6.2]),
            ("Red blood cell count (426..) 6.2", [6.2]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)
class RBCValidator(ValidatorBase):
    """
    Validator for RBC (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-only regex
        # (RBC.RED_BLOOD_CELLS rather than the full RBC.REGEX).
        variable_name = RBC.NAME
        regex_strings = [RBC.RED_BLOOD_CELLS]
        return variable_name, regex_strings
387# =============================================================================
388# Erythrocyte sedimentation rate (ESR)
389# =============================================================================
class Esr(SimpleNumericalResultParser):
    """
    HAEMATOLOGY (ESR).

    Erythrocyte sedimentation rate (ESR), reported in mm/h.
    """

    # Variable name, output column, and unit -> multiplier table.
    NAME = "ESR"
    PREFERRED_UNIT_COLUMN = "value_mm_h"
    UNIT_MAPPING = {
        MM_PER_H: 1,  # preferred unit
        # not MG_PER_DL, MG_PER_L
    }

    # Free-text names for the quantity.
    ESR_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            Erythrocyte [\s]+ sed(?:\.|imentation)? [\s]+ rate
            | ESR
        )
        {WORD_BOUNDARY}
    """
    # Read code alternatives are listed ahead of the free-text names.
    ESR = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.ESR,
        ),
        ESR_BASE,
        wrap_result_in_noncapture_group=False,
        wrap_each_in_noncapture_group=True,
    )
    # Full "quantity ... number ... units" expression. mg/dL and mg/L are
    # matched as units but have no UNIT_MAPPING entry; the tests expect no
    # value for them.
    REGEX = make_simple_numeric_regex(
        optional_ignorable_after_quantity=OPTIONAL_POC,
        quantity=ESR,
        units=regex_or(
            MM_PER_H,  # good
            MG_PER_DL,  # bad
            MG_PER_L,  # bad
        ),
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # The arguments are described in the module docstring; everything
        # else handed to the superclass comes from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            variable=self.NAME,
            regex_str=self.REGEX,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a text with the values expected from it, in mm/h.
        cases = [
            ("ESR (should fail)", []),  # should fail; no values
            ("ESR 6 (should succeed)", [6]),
            ("ESR = 6", [6]),
            ("ESR 6 mm/h", [6]),
            ("ESR <10", [10]),
            ("ESR <10 mm/hr", [10]),
            ("ESR >100", [100]),
            ("ESR >100 mm/hour", [100]),
            ("ESR was 62", [62]),
            ("ESR was 62 mm/h", [62]),
            ("ESR was 62 (H) mm/h", [62]),
            ("ESR was 62 mg/dl (should fail, wrong units)", []),
            ("Erythrocyte sed. rate was 19", [19]),
            ("his erythrocyte sedimentation rate was 19", [19]),
            ("erythrocyte sedimentation rate was 19", [19]),
            ("ESR 1.9 mg/L", []),  # wrong units
            ("ESR 1.9 (H) mg/L", []),  # wrong units
            ("ESR | 1.9 (H) | mg/L", []),
            ("my ESR was 15, but his ESR was 89!", [15, 89]),
            ("ESR-18", [18]),
            ("Erythrocyte sedimentation rate (XE2m7) 18", [18]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)
class EsrValidator(ValidatorBase):
    """
    Validator for Esr (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-only regex
        # (Esr.ESR rather than the full Esr.REGEX).
        variable_name = Esr.NAME
        regex_strings = [Esr.ESR]
        return variable_name, regex_strings
485# =============================================================================
486# White blood cell count and differential
487# =============================================================================
488# Do NOT accept my handwritten abbreviations with slashed zeros, e.g.
489# L0 lymphocytes
490# N0 neutrophils
491# M0 monocytes
492# B0 basophils
493# E0 eosinophils
494# ... too likely that these are interpreted in wrong contexts, particularly
495# if we are not allowing units, like "M0 3": macrophages 3 x 10^9/L, or part
496# of "T2 N0 M0 ..." cancer staging?
class WbcBase(SimpleNumericalResultParser, ABC):
    """
    DO NOT USE DIRECTLY. Base class for white cell counts.
    The default unit is 10^9 / L; cells/mm^3 (= cells/μL) is also recognized.
    """

    PREFERRED_UNIT_COLUMN = "value_billion_per_l"
    UNIT_MAPPING = {
        BILLION_PER_L: 1,  # preferred unit: 10^9 / L
        CELLS_PER_CUBIC_MM_OR_MICROLITRE: 0.001,
        # ... 1000 cells/mm^3 -> 1 x 10^9 / L
        # Percentages are deliberately absent: relative differentials are
        # too hard to interpret reliably.
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        cell_type_regex_text: str,
        variable: str,
        commit: bool = False,
    ) -> None:
        """
        ``__init__`` function for :class:`WbcBase`.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                name of a CRATE NLP config file section, from which extra
                config information may be read
            cell_type_regex_text:
                regex text for the cell type, e.g. one representing
                "monocytes" or "basophils"
            variable:
                the value recorded as ``variable_name``
            commit:
                force a COMMIT whenever we insert data? Specify this in
                multiprocess mode, or you may get database deadlocks.
        """
        # Wrap the cell-type regex into a full numeric-result regex first.
        full_regex = self.make_wbc_regex(cell_type_regex_text)
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            variable=variable,
            regex_str=full_regex,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            take_absolute=True,
        )

    @staticmethod
    def make_wbc_regex(cell_type_regex_text: str) -> str:
        """
        Builds the regular expression (as text) for a count of the cell type
        described by ``cell_type_regex_text``.
        """
        acceptable_units = regex_or(
            BILLION_PER_L,  # good
            CELLS_PER_CUBIC_MM_OR_MICROLITRE,  # good
            PERCENT,  # bad, so we can ignore it
        )
        return make_simple_numeric_regex(
            quantity=cell_type_regex_text,
            units=acceptable_units,
            optional_ignorable_after_quantity=OPTIONAL_POC,
        )
568# -----------------------------------------------------------------------------
569# WBC
570# -----------------------------------------------------------------------------
class Wbc(WbcBase):
    """
    HAEMATOLOGY (FBC).

    White cell count (WBC, WCC).
    The default unit is 10^9 / L; cells/mm^3 (= cells/μL) is also recognized.
    """

    NAME = "WBC"

    # Free-text names for the quantity.
    WBC_BASE = r"""
        \b (?:
            (?:                 # White blood cells, white cell count, etc.
                White\b [\s]* (?:\bblood\b)? [\s]* \bcell[s]?\b
                [\s]* (?:\bcount\b)? [\s]*
                (?:     # optional suffix WBC, (WBC), (WBCC), (WCC), etc.
                    [\(]? (?: WBC | WBCC | WCC) [\)]?
                )?
            )
            | (?:               # just WBC(s), WBCC, WCC
                (?: WBC[s]? | WBCC | WCC )
            )
        ) \b
    """
    # Read code alternatives are listed ahead of the free-text names.
    WBC = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.WBC_COUNT,
        ),
        WBC_BASE,
        wrap_result_in_noncapture_group=False,
        wrap_each_in_noncapture_group=True,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # The arguments are described in the module docstring; the cell-type
        # regex and the variable name come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            cell_type_regex_text=self.WBC,
            variable=self.NAME,
            commit=commit,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a text with the values expected from it, in
        # units of 10^9/L.
        cases = [
            ("WBC (should fail)", []),  # should fail; no values
            ("WBC 6", [6]),
            ("WBC = 6", [6]),
            ("WBC 6 x 10^9/L", [6]),
            ("WBC 6 x 10 ^ 9 / L", [6]),
            ("WCC 6.2", [6.2]),
            ("white cells 6.2", [6.2]),
            ("white cells 6.2", [6.2]),
            ("white cells 9800/mm3", [9.8]),
            ("white cells 9800 cell/mm3", [9.8]),
            ("white cells 9800 cells/mm3", [9.8]),
            ("white cells 9800 per cubic mm", [9.8]),
            ("white cells 9800 per cmm", [9.8]),
            ("white cells 17,600/mm3", [17.6]),
            ("white cells 17,600/μL", [17.6]),
            ("white cells 17,600/microlitre", [17.6]),
            ("WBC – 6", [6]),  # en dash
            ("WBC—6", [6]),  # em dash
            ("WBC -- 6", [6]),  # double hyphen used as dash
            ("WBC - 6", [6]),
            ("WBC-6.5", [6.5]),
            ("WBC, POC 6.5", [6.5]),
            ("Total white blood count (XaIdY) 6.5", [6.5]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)
class WbcValidator(ValidatorBase):
    """
    Validator for Wbc (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its cell-type regex.
        variable_name = Wbc.NAME
        regex_strings = [Wbc.WBC]
        return variable_name, regex_strings
662# -----------------------------------------------------------------------------
663# Neutrophils
664# -----------------------------------------------------------------------------
class Neutrophils(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Neutrophil (polymorphonuclear leukocyte) count (absolute).
    Default units are 10^9 / L; also supports cells/mm^3 = cells/μL.
    """

    # Free-text names for the quantity.
    # NOTE(review): this regex accepts the abbreviation "N0", and the tests
    # below rely on that, although the comment preceding WbcBase advises
    # against accepting such slashed-zero abbreviations (the Lymphocytes,
    # Monocytes, Basophils and Eosinophils parsers reject theirs). Left
    # unchanged for backward compatibility -- confirm whether it is intended.
    NEUTROPHILS_BASE = r"""
        (?: \b absolute \s* )?
        \b (?: Neut(?:r(?:o(?:phil)?)?)?s? | N0 ) \b
        (?: \s* count \b )?
    """
    # Read code alternatives (neutrophil count, polymorph count) are listed
    # ahead of the free-text names.
    NEUTROPHILS = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.NEUTROPHIL_COUNT,
            ReadCodes.POLYMORPH_COUNT,
        ),
        NEUTROPHILS_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    NAME = "neutrophils"

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # The arguments are described in the module docstring; the cell-type
        # regex and the variable name come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            cell_type_regex_text=self.NEUTROPHILS,
            variable=self.NAME,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a text with the values expected from it, in
        # units of 10^9/L.
        self.test_numerical_parser(
            [
                ("neutrophils (should fail)", []),  # should fail; no values
                ("absolute neutrophil count 6", [6]),
                ("neuts = 6", [6]),
                ("N0 6 x 10^9/L", [6]),
                ("neutrophil count 6 x 10 ^ 9 / L", [6]),
                ("neutrs 6.2", [6.2]),
                ("neutrophil 6.2", [6.2]),
                ("neutrophils 6.2", [6.2]),
                ("n0 9800/mm3", [9.8]),
                ("absolute neutrophils 9800 cell/mm3", [9.8]),
                ("neutrophils count 9800 cells/mm3", [9.8]),
                ("neuts 9800 per cmm", [9.8]),
                ("n0 9800 per cubic mm", [9.8]),
                ("n0 17,600/mm3", [17.6]),
                ("neuts-17", [17]),
                ("Neutrophil count (42J..) 17", [17]),
                ("Polymorph count (XaIao) 17", [17]),
            ],
            verbose=verbose,
        )
class NeutrophilsValidator(ValidatorBase):
    """
    Validator for Neutrophils (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its cell-type regex.
        variable_name = Neutrophils.NAME
        regex_strings = [Neutrophils.NEUTROPHILS]
        return variable_name, regex_strings
742# -----------------------------------------------------------------------------
743# Lymphocytes
744# -----------------------------------------------------------------------------
class Lymphocytes(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Lymphocyte count (absolute).
    The default unit is 10^9 / L; cells/mm^3 (= cells/μL) is also recognized.
    """

    NAME = "lymphocytes"

    # Free-text names for the quantity.
    LYMPHOCYTES_BASE = r"""
        (?: \b absolute \s* )?
        \b Lymph(?:o(?:cyte)?)?s? \b
        (?: \s* count \b )?
    """
    # Read code alternatives are listed ahead of the free-text names.
    LYMPHOCYTES = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.LYMPHOCYTE_COUNT,
        ),
        LYMPHOCYTES_BASE,
        wrap_result_in_noncapture_group=False,
        wrap_each_in_noncapture_group=True,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # The arguments are described in the module docstring; the cell-type
        # regex and the variable name come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            cell_type_regex_text=self.LYMPHOCYTES,
            variable=self.NAME,
            commit=commit,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a text with the values expected from it, in
        # units of 10^9/L.
        cases = [
            ("lymphocytes (should fail)", []),  # should fail; no values
            ("absolute lymphocyte count 6", [6]),
            ("lymphs = 6", [6]),
            ("L0 6 x 10^9/L (should fail)", []),
            ("lymphocyte count 6 x 10 ^ 9 / L", [6]),
            ("lymphs 6.2", [6.2]),
            ("lymph 6.2", [6.2]),
            ("lympho 6.2", [6.2]),
            ("lymphos 9800/mm3", [9.8]),
            ("absolute lymphocytes 9800 cell/mm3", [9.8]),
            ("lymphocytes count 9800 cells/mm3", [9.8]),
            ("lymphocytes 9800 per cmm", [9.8]),
            ("lymphs-6.3", [6.3]),
            # We are not supporting "L0":
            ("l0 9800 per cubic mm (should fail)", []),
            ("l0 9800 per cmm (should fail)", []),
            ("l0 17,600/mm3 (should fail)", []),
            ("Lymphocyte count (42M..) 6.3", [6.3]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)
class LymphocytesValidator(ValidatorBase):
    """
    Validator for Lymphocytes (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its cell-type regex.
        variable_name = Lymphocytes.NAME
        regex_strings = [Lymphocytes.LYMPHOCYTES]
        return variable_name, regex_strings
822# -----------------------------------------------------------------------------
823# Monocytes
824# -----------------------------------------------------------------------------
class Monocytes(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Monocyte count (absolute).
    The default unit is 10^9 / L; cells/mm^3 (= cells/μL) is also recognized.
    """

    NAME = "monocytes"

    # Free-text names for the quantity.
    MONOCYTES_BASE = r"""
        (?: \b absolute \s* )?
        \b Mono(?:cyte)?s? \b
        (?: \s* count \b )?
    """
    # Read code alternatives are listed ahead of the free-text names.
    MONOCYTES = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.MONOCYTE_COUNT,
        ),
        MONOCYTES_BASE,
        wrap_result_in_noncapture_group=False,
        wrap_each_in_noncapture_group=True,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # The arguments are described in the module docstring; the cell-type
        # regex and the variable name come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            cell_type_regex_text=self.MONOCYTES,
            variable=self.NAME,
            commit=commit,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a text with the values expected from it, in
        # units of 10^9/L.
        cases = [
            ("monocytes (should fail)", []),  # should fail; no values
            ("absolute monocyte count 6", [6]),
            ("monos = 6", [6]),
            ("M0 6 x 10^9/L (should fail)", []),
            ("monocyte count 6 x 10 ^ 9 / L", [6]),
            ("monos 6.2", [6.2]),
            ("mono 6.2", [6.2]),
            ("monos 9800/mm3", [9.8]),
            ("absolute mono 9800 cell/mm3", [9.8]),
            ("monocytes count 9800 cells/mm3", [9.8]),
            ("monocytes 9800 per cmm", [9.8]),
            ("monocytes-5.2", [5.2]),
            # We are not supporting "M0":
            ("m0 9800 per cubic mm (should fail)", []),
            ("m0 17,600/mm3 (should fail)", []),
            ("Monocyte count (42N..) 5.2", [5.2]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)
class MonocytesValidator(ValidatorBase):
    """
    Validator for Monocytes (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its cell-type regex.
        variable_name = Monocytes.NAME
        regex_strings = [Monocytes.MONOCYTES]
        return variable_name, regex_strings
900# -----------------------------------------------------------------------------
901# Basophils
902# -----------------------------------------------------------------------------
class Basophils(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Basophil count (absolute).
    Default units are 10^9 / L; also supports cells/mm^3 = cells/μL.
    """

    # Free-text names for the quantity.
    BASOPHILS_BASE = r"""
        (?: \b absolute \s* )?
        \b Baso(?:phil)?s? \b
        (?: \s* count \b )?
    """
    # Read code alternatives are listed ahead of the free-text names.
    BASOPHILS = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.BASOPHIL_COUNT,
        ),
        BASOPHILS_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    NAME = "basophils"

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # The arguments are described in the module docstring; the cell-type
        # regex and the variable name come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            cell_type_regex_text=self.BASOPHILS,
            variable=self.NAME,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a text with the values expected from it, in
        # units of 10^9/L.
        self.test_numerical_parser(
            [
                ("basophils (should fail)", []),  # should fail; no values
                ("absolute basophil count 6", [6]),
                ("basos = 6", [6]),
                ("B0 6 x 10^9/L (should fail)", []),
                ("basophil count 6 x 10 ^ 9 / L", [6]),
                ("basos 6.2", [6.2]),
                ("baso 6.2", [6.2]),
                ("basos 9800/mm3", [9.8]),
                ("absolute basophil 9800 cell/mm3", [9.8]),
                ("basophils count 9800 cells/mm3", [9.8]),
                ("basophils 9800 per cmm", [9.8]),
                ("basophils-5.2", [5.2]),
                # We are not supporting "B0":
                ("b0 9800 per cubic mm (should fail)", []),
                ("b0 17,600/mm3 (should fail)", []),
                ("Basophil count (42L..) 5.2", [5.2]),
            ],
            verbose=verbose,
        )
class BasophilsValidator(ValidatorBase):
    """
    Validator for Basophils (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its cell-type regex.
        variable_name = Basophils.NAME
        regex_strings = [Basophils.BASOPHILS]
        return variable_name, regex_strings
978# -----------------------------------------------------------------------------
979# Eosinophils
980# -----------------------------------------------------------------------------
class Eosinophils(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Eosinophil count (absolute).
    The default unit is 10^9 / L; cells/mm^3 (= cells/μL) is also recognized.
    """

    NAME = "eosinophils"

    # Free-text names for the quantity.
    EOSINOPHILS_BASE = r"""
        (?: \b absolute \s* )?
        \b Eo(?:sin(?:o(?:phil)?)?)?s? \b
        (?: \s* count \b )?
    """
    # Read code alternatives are listed ahead of the free-text names.
    EOSINOPHILS = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.EOSINOPHIL_COUNT,
        ),
        EOSINOPHILS_BASE,
        wrap_result_in_noncapture_group=False,
        wrap_each_in_noncapture_group=True,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # The arguments are described in the module docstring; the cell-type
        # regex and the variable name come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            cell_type_regex_text=self.EOSINOPHILS,
            variable=self.NAME,
            commit=commit,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a text with the values expected from it, in
        # units of 10^9/L.
        cases = [
            ("eosinophils (should fail)", []),  # should fail; no values
            ("absolute eosinophil count 6", [6]),
            ("eos = 6", [6]),
            ("E0 6 x 10^9/L (should fail)", []),
            ("eosinophil count 6 x 10 ^ 9 / L", [6]),
            ("eosins 6.2", [6.2]),
            ("eosino 6.2", [6.2]),
            ("eosinos 9800/mm3", [9.8]),
            ("absolute eosinophil 9800 cell/mm3", [9.8]),
            ("eosinophils count 9800 cells/mm3", [9.8]),
            ("eosinophils 9800 per cmm", [9.8]),
            ("eosinophils-5.3", [5.3]),
            # We are not supporting "E0":
            ("e0 9800 per cubic mm (should fail)", []),
            ("e0 17,600/mm3 (should fail)", []),
            ("Eosinophil count (42K..) 5.2", [5.2]),
            ("Eosinophil count - observation (42K..) 5.2", [5.2]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)
class EosinophilsValidator(ValidatorBase):
    """
    Validator for Eosinophils (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its cell-type regex.
        variable_name = Eosinophils.NAME
        regex_strings = [Eosinophils.EOSINOPHILS]
        return variable_name, regex_strings
1057# -----------------------------------------------------------------------------
1058# Platelet count
1059# -----------------------------------------------------------------------------
class Platelets(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Platelet count.
    The default unit is 10^9 / L; cells/mm^3 (= cells/μL) is also recognized.

    Platelets are of course not white blood cells, but they are reported in
    the same units (10^9 / L), so this parser reuses the same base class.
    Typical values 150–450 ×10^9 / L (or 150,000–450,000 per μL).
    """

    NAME = "platelets"

    # Free-text names for the quantity.
    PLATELETS_BASE = r"""
        \b (?: Platelets? | plts? ) \b  # platelet(s), plt(s)
        (?: \s* count \b )?             # optional "count"
    """
    # Read code alternatives are listed ahead of the free-text names.
    PLATELETS = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.PLATELET_COUNT,
        ),
        PLATELETS_BASE,
        wrap_result_in_noncapture_group=False,
        wrap_each_in_noncapture_group=True,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # The arguments are described in the module docstring; the cell-type
        # regex and the variable name come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            cell_type_regex_text=self.PLATELETS,
            variable=self.NAME,
            commit=commit,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a text with the values expected from it, in
        # units of 10^9/L.
        cases = [
            ("platelets (should fail)", []),  # should fail; no values
            ("platelet count 150", [150]),
            ("plt = 150", [150]),
            ("PLT 150 x 10^9/L", [150]),
            ("platelet count 150 x 10 ^ 9 / L", [150]),
            ("plt 400", [400]),
            ("plts 400", [400]),
            ("plt 400000/mm3", [400]),
            ("plt count 400000/μL", [400]),
            ("plts 400000 per microliter", [400]),
            ("Platelet count (42P..) 150", [150]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)
class PlateletsValidator(ValidatorBase):
    """
    Validator for Platelets (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its cell-type regex.
        variable_name = Platelets.NAME
        regex_strings = [Platelets.PLATELETS]
        return variable_name, regex_strings
1133# =============================================================================
1134# All classes in this module
1135# =============================================================================
# Every (parser class, validator class) pair defined in this module, kept in
# alphabetical order of the parser class name.
ALL_HAEMATOLOGY_NLP_AND_VALIDATORS = [
    (Basophils, BasophilsValidator),
    (Eosinophils, EosinophilsValidator),
    (Esr, EsrValidator),
    (Haematocrit, HaematocritValidator),
    (Haemoglobin, HaemoglobinValidator),
    (Lymphocytes, LymphocytesValidator),
    (Monocytes, MonocytesValidator),
    (Neutrophils, NeutrophilsValidator),
    (Platelets, PlateletsValidator),
    (RBC, RBCValidator),
    (Wbc, WbcValidator),
]
1149]