Coverage for nlp_manager/regex_parser.py: 89%
286 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/nlp_manager/regex_parser.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Shared elements for regex-based NLP work.**
28"""
30from abc import abstractmethod, ABC
31import logging
32from typing import Any, Dict, Generator, List, Optional, Tuple
34from sqlalchemy import Column, Integer, Float, String, Text
36from crate_anon.common.regex_helpers import (
37 LEFT_BRACKET as LB,
38 RIGHT_BRACKET as RB,
39)
40from crate_anon.nlp_manager.constants import (
41 MAX_SQL_FIELD_LEN,
42 ProcessorConfigKeys,
43 SqlTypeDbIdentifier,
44)
45from crate_anon.nlp_manager.base_nlp_parser import BaseNlpParser
46from crate_anon.nlp_manager.nlp_definition import NlpDefinition
47from crate_anon.nlp_manager.number import to_float, to_pos_float
48from crate_anon.nlp_manager.regex_func import (
49 compile_regex,
50 compile_regex_dict,
51 get_regex_dict_match,
52)
53from crate_anon.nlp_manager.regex_numbers import (
54 SIGNED_FLOAT,
55 IGNORESIGN_INTEGER,
56)
57from crate_anon.nlp_manager.regex_units import (
58 OUT_OF_SEPARATOR,
59 SCORE,
60)
62log = logging.getLogger(__name__)
65# =============================================================================
66# Generic entities
67# =============================================================================
69# -----------------------------------------------------------------------------
70# Blood results
71# -----------------------------------------------------------------------------
73OPTIONAL_RESULTS_IGNORABLES = r"""
74 (?: # OPTIONAL_RESULTS_IGNORABLES
75 \s | \| | \: # whitespace, bar, colon
76 | \bHH?\b | \(HH?\) # H/HH at a word boundary; (H)/(HH)
77 | \bLL?\b | \(LL?\) # L/LL etc.
78 | \* | \(\*\) # *, (*)
79 | — | -- # em dash, double hyphen-minus
80 | –\s+ | -\s+ | ‐\s+ # en dash/hyphen-minus/Unicode hyphen; whitespace
81 )* # ... any of those, repeated 0 or more times
82"""
83# - you often get | characters when people copy/paste tables
84# - blood test abnormality markers can look like e.g.
85# 17 (H), 17 (*), 17 HH
86# Re parentheses:
87# - you can also see things like "CRP (5)"
88# ... but we'll handle that
89# - However, if there's a right parenthesis only, that's less good, e.g.
90# "Present: Nicola Adams (NA). 1.0. Minutes of the last meeting."
91# ... which we don't want to be interpreted as "sodium 1.0".
92# HOW BEST TO DO THIS?
93# - https://stackoverflow.com/questions/546433/regular-expression-to-match-outer-brackets # noqa: E501
94# https://stackoverflow.com/questions/7898310/using-regex-to-balance-match-parenthesis # noqa: E501
95# - ... simplest is perhaps: base ignorables, or those with brackets, as above
96# - ... even better than a nested thing is just a list of alternatives
98OPTIONAL_POC = r"""
99 (?: ,? \s+ POC )? # OPTIONAL_POC: point-of-care testing, "[,] POC"
100"""
101# ... e.g. "Glucose, POC"; "Potassium, POC".
102# Seen in CUH for
103#
104# sodium, POC
105# potassium, POC
106# creatinine, POC
107# urea, POC
108# glucose, POC
109# lactate, POC
110# bilirubin, POC
111# HCT, POC
112# alkaline phosphatase, POC
113# alanine transferase, POC
114#
115# HGB, POC
116# WBC, POC
117# PLT, POC
118# MCV, POC
119# MCH, POC
120# neutrophil count, POC
121# lymphocyte count, POC
123# -----------------------------------------------------------------------------
124# Tense indicators
125# -----------------------------------------------------------------------------
127IS = "is"
128WAS = "was"
129TENSE_INDICATOR = rf"(?: \b {IS} \b | \b {WAS} \b )"
131# Standardized result values; see MAX_TENSE_TEXT_LENGTH
132PAST = "past"
133PRESENT = "present"
134EVER = "ever" # e.g. for "never"
136TENSE_LOOKUP = compile_regex_dict(
137 {
138 IS: PRESENT,
139 WAS: PAST,
140 }
141)
143# -----------------------------------------------------------------------------
144# Mathematical relations
145# -----------------------------------------------------------------------------
146# ... don't use unnamed groups here; EQ is also used as a return value
148LT = r"(?: < | less \s+ than | under )"
149LE = "<="
150EQ = r"(?: = | equals | equal \s+ to )"
151GE = ">="
152GT = r"(?: > | (?:more|greater) \s+ than | over )"
153# OF = "\b of \b" # as in: "a BMI of 30"... but too likely to be mistaken for a target? # noqa: E501
155RELATION = rf"(?: {LE} | {LT} | {EQ} | {GE} | {GT} )"
156# ... ORDER MATTERS: greedier things first, i.e.
157# - LE before LT
158# - GE before GT
RELATION_LOOKUP = compile_regex_dict(
    {
        # To standardize the output, so (for example) "=" and "equals" can both
        # map to "=".
        #
        # ORDER MATTERS here as well: dictionaries preserve insertion order,
        # so the two-character operators are listed before their
        # one-character prefixes (LE before LT, GE before GT). That way, a
        # lookup that tries the patterns in turn and accepts a match at the
        # start of the text cannot standardize "<=" to "<" (or ">=" to ">").
        LE: "<=",
        LT: "<",
        EQ: "=",
        GE: ">=",
        GT: ">",
    }
)
172# -----------------------------------------------------------------------------
173# Punctuation
174# -----------------------------------------------------------------------------
176APOSTROPHE = "['’]" # ASCII apostrophe; right single quote (U+2019)
179# =============================================================================
180# Regex assembly functions
181# =============================================================================
184# =============================================================================
185# Functions to handle processed data
186# =============================================================================
def common_tense(
    tense_text: Optional[str], relation_text: Optional[str]
) -> Tuple[Optional[str], Optional[str]]:
    """
    Standardizes the "tense" and "relation" text fragments captured by a
    regex match.

    - The tense is looked up from ``tense_text`` if that is non-empty, and
      otherwise from ``relation_text`` (so that, for example, "CRP was 72"
      can be read as EQ in the PAST).
    - The relation is always looked up from ``relation_text``, with ``"="``
      supplied to the lookup as its fallback argument.

    Args:
        tense_text: putative tense information
        relation_text: putative relationship (equals, less than, etc.)

    Returns:
        tuple: ``tense, relation``; either may be ``None``.
    """
    # Explicit tense text takes priority; relation text is the fallback.
    tense_source = tense_text or relation_text
    if tense_source:
        tense = get_regex_dict_match(tense_source, TENSE_LOOKUP)[1]
    else:
        tense = None

    relation = get_regex_dict_match(relation_text, RELATION_LOOKUP, "=")[1]
    return tense, relation
217# =============================================================================
218# Constants for generic processors
219# =============================================================================
# Output column (field) names used by the generic processors below.
FN_VARIABLE_NAME = "variable_name"
FN_CONTENT = "_content"
FN_START = "_start"
FN_END = "_end"
FN_VARIABLE_TEXT = "variable_text"
FN_RELATION_TEXT = "relation_text"
FN_RELATION = "relation"
FN_VALUE_TEXT = "value_text"
FN_UNITS = "units"
FN_TENSE_TEXT = "tense_text"
FN_TENSE = "tense"

# Help text for those columns (passed as the SQLAlchemy column "comment").
HELP_VARIABLE_NAME = "Variable name"
HELP_CONTENT = "Matching text contents"
HELP_START = "Start position (of matching string within whole text)"
HELP_END = "End position (of matching string within whole text)"
HELP_VARIABLE_TEXT = "Text that matched the variable name"
HELP_RELATION_TEXT = (
    "Text that matched the mathematical relationship between variable and "
    "value (e.g. '=', '<=', 'less than')"
)
HELP_RELATION = (
    "Standardized mathematical relationship between variable and value "
    "(e.g. '=', '<=')"
)
HELP_VALUE_TEXT = "Matched numerical value, as text"
HELP_UNITS = "Matched units, as text"
HELP_TARGET_UNIT = "Numerical value in preferred units, if known"
HELP_TENSE_TEXT = f"Tense text, if known (e.g. '{IS}', '{WAS}')"
HELP_TENSE = f"Calculated tense, if known (e.g. '{PAST}', '{PRESENT}')"

# Maximum lengths for the String columns. The lengths of the standardized
# "relation" and "tense" columns are derived from the longest value that the
# corresponding lookup dictionary can produce.
MAX_RELATION_TEXT_LENGTH = 50
MAX_RELATION_LENGTH = max(len(x) for x in RELATION_LOOKUP.values())
MAX_VALUE_TEXT_LENGTH = 50
MAX_UNITS_LENGTH = 50
MAX_TENSE_TEXT_LENGTH = 50
MAX_TENSE_LENGTH = max(len(x) for x in TENSE_LOOKUP.values())
260# =============================================================================
261# Generic processors
262# =============================================================================
264# -----------------------------------------------------------------------------
265# NumericalResultParser
266# -----------------------------------------------------------------------------
class NumericalResultParser(BaseNlpParser):
    """
    DO NOT USE DIRECTLY. Base class for generic numerical results, where
    a SINGLE variable is produced.

    Subclasses must implement :meth:`parse`. This class also provides
    self-test helpers: :meth:`test_numerical_parser`, :meth:`detailed_test`
    and :meth:`detailed_test_multiple`.
    """

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: str,
        variable: str,
        target_unit: str,
        regex_str_for_debugging: str,
        commit: bool = False,
    ) -> None:
        r"""
        Init function for NumericalResultParser.

        Args:
            nlpdef:
                A :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`.
                May be ``None`` for debugging only; in that case the table
                name is derived from the class name and preferred units are
                assumed.

            cfg_processor_name:
                Config section name in the :ref:`NLP config file <nlp_config>`.

            variable:
                Used by subclasses as the record value for ``variable_name``.

            target_unit:
                Fieldname used for the primary output quantity.

            regex_str_for_debugging:
                String form of regex, for debugging.

            commit:
                Force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.

        Subclasses will extend this method.
        """
        # NB This docstring was associated with Sphinx errors!
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=variable,
        )
        self.variable = variable
        self.target_unit = target_unit
        self.regex_str_for_debugging = regex_str_for_debugging

        if nlpdef is None:  # only None for debugging!
            self.tablename = self.classname().lower()
            self.assume_preferred_unit = True
        else:
            self.tablename = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )
            self.assume_preferred_unit = self._cfgsection.opt_bool(
                ProcessorConfigKeys.ASSUME_PREFERRED_UNIT, default=True
            )

        # Sanity checks
        assert (
            len(self.variable) <= MAX_SQL_FIELD_LEN
        ), f"Variable name too long (max {MAX_SQL_FIELD_LEN} characters)"

    def get_regex_str_for_debugging(self) -> str:
        """
        Returns the string version of the regex, for debugging.
        """
        return self.regex_str_for_debugging

    def set_tablename(self, tablename: str) -> None:
        """
        In case a friend class wants to override.
        """
        self.tablename = tablename

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        # docstring in superclass
        return {
            self.tablename: [
                Column(
                    FN_VARIABLE_NAME,
                    SqlTypeDbIdentifier,
                    comment=HELP_VARIABLE_NAME,
                ),
                Column(FN_CONTENT, Text, comment=HELP_CONTENT),
                Column(FN_START, Integer, comment=HELP_START),
                Column(FN_END, Integer, comment=HELP_END),
                Column(FN_VARIABLE_TEXT, Text, comment=HELP_VARIABLE_TEXT),
                Column(
                    FN_RELATION_TEXT,
                    String(MAX_RELATION_TEXT_LENGTH),
                    comment=HELP_RELATION_TEXT,
                ),
                Column(
                    FN_RELATION,
                    String(MAX_RELATION_LENGTH),
                    comment=HELP_RELATION,
                ),
                Column(FN_VALUE_TEXT, Text, comment=HELP_VALUE_TEXT),
                Column(FN_UNITS, String(MAX_UNITS_LENGTH), comment=HELP_UNITS),
                # The name of the primary numerical column varies by parser.
                Column(self.target_unit, Float, comment=HELP_TARGET_UNIT),
                Column(
                    FN_TENSE_TEXT,
                    String(MAX_TENSE_TEXT_LENGTH),
                    comment=HELP_TENSE_TEXT,
                ),
                Column(FN_TENSE, String(MAX_TENSE_LENGTH), comment=HELP_TENSE),
            ]
        }

    @abstractmethod
    def parse(
        self, text: str
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        raise NotImplementedError

    def test_numerical_parser(
        self,
        test_expected_list: List[Tuple[str, List[float]]],
        add_test_no_plain_number: bool = True,
        verbose: bool = False,
    ) -> None:
        """
        Checks the primary numerical output of the parser against expected
        values.

        Args:
            test_expected_list:
                list of tuples ``test_string, expected_values``. The parser
                will parse ``test_string`` and compare the result (each value
                of the target unit) to ``expected_values``, which is a list of
                numerical (``float``), and can be an empty list.
            add_test_no_plain_number:
                also check that a bare number (``"999"``), with no quantity
                named, yields no results? The caller's list is not modified.
            verbose:
                show the regex string too

        Raises:
            :exc:`AssertionError` if a comparison fails

        Compare also :meth:`detailed_test`.
        """
        log.info(f"Testing parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex string:\n{self.regex_str_for_debugging}")
        if add_test_no_plain_number:
            test_expected_list = test_expected_list + [
                ("999", [])  # no quantity specified
            ]  # use "+ [...]", not append(), so as not to modify for caller
        for test_string, expected_values in test_expected_list:
            full_result = list(self.parse(test_string))
            # Each result is a tuple: tablename, resultdict.
            actual_values = [x[self.target_unit] for _, x in full_result]
            assert actual_values == expected_values, (
                f"Parser {self.classname()}: Expected {expected_values!r}, "
                f"got {actual_values!r}, when parsing {test_string!r}; "
                f"full result:\n{full_result!r}"
            )
        log.info("... OK")

    def detailed_test(
        self, text: str, expected: List[Dict[str, Any]], verbose: bool = False
    ) -> None:
        """
        Runs a more detailed check. Whereas :func:`test_numerical_parser` tests
        the primary numerical results, this function tests other key/value
        pairs returned by the parser.

        Args:
            text:
                text to parse
            expected:
                list of ``resultdict`` dictionaries (each mapping column names
                to values).

                - The parser should return one result dictionary for
                  every entry in ``expected``.
                - It's fine for the ``resultdict`` not to include all the
                  columns returned for the parser. However, for any column that
                  is present, the parser must provide the corresponding value.

            verbose:
                be verbose

        Raises:
            :exc:`ValueError` if the number of results differs from
            ``len(expected)``, if an expected key is missing from a result,
            or if a value differs from the expected value
        """
        full_result = list(self.parse(text))
        if len(full_result) != len(expected):
            raise ValueError(
                f"Parser {self.classname()}: expected {len(expected)} results "
                f"but got {len(full_result)} when parsing {text!r}; "
                f"full result:\n{full_result!r}"
            )
        if verbose:
            log.info(f"detailed_test: {text!r} -> {full_result!r}")
        for i, text_result in enumerate(full_result):
            _, result = text_result
            expected_dict = expected[i]
            for k, expected_value in expected_dict.items():
                if k not in result:
                    raise ValueError(
                        f"Parser {self.classname()}: Expected value dict "
                        f"had key {k!r} but this is absent from result "
                        f"{result!r}"
                    )
                observed_value = result[k]
                if observed_value != expected_value:
                    raise ValueError(
                        f"Parser {self.classname()}: expected {k} = "
                        f"{expected_value!r}, got {observed_value!r}, "
                        f"when parsing {text!r}; full result:\n"
                        f"{full_result!r}"
                    )

    def detailed_test_multiple(
        self,
        tests: List[Tuple[str, List[Dict[str, Any]]]],
        verbose: bool = False,
    ) -> None:
        """
        Runs :meth:`detailed_test` for each of several test strings.

        Args:
            tests:
                list of tuples ``test_string, expected``. The parser will parse
                ``test_string`` and compare the result(s) to ``expected``. This
                is list of dictionaries with keys that can be like ``values``,
                ``tense``, etc. Each dictionary value is the corresponding
                expected value.
            verbose:
                show the regex string too

        Raises:
            :exc:`ValueError` if a comparison fails (raised by
            :meth:`detailed_test`)
        """
        log.info(f"Detailed tests for parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex string:\n{self.regex_str_for_debugging}")
        for test_string, expected_dict_list in tests:
            self.detailed_test(
                test_string, expected_dict_list, verbose=verbose
            )
        log.info("... OK")
509# -----------------------------------------------------------------------------
510# SimpleNumericalResultParser
511# -----------------------------------------------------------------------------
# Group number 0 is the whole of a regex match.
GROUP_NUMBER_WHOLE_EXPRESSION = 0

# Names of the named capture groups created by make_simple_numeric_regex()
# and read back by SimpleNumericalResultParser.parse().
GROUP_NAME_QUANTITY = "quantity"
GROUP_NAME_RELATION = "relation"
GROUP_NAME_TENSE = "tense"
GROUP_NAME_UNITS = "units"
GROUP_NAME_VALUE = "value"
def make_simple_numeric_regex(
    quantity: str,
    units: str,
    value: str = SIGNED_FLOAT,
    tense_indicator: str = TENSE_INDICATOR,
    relation: str = RELATION,
    optional_results_ignorables: str = OPTIONAL_RESULTS_IGNORABLES,
    optional_ignorable_after_quantity: str = "",
    units_optional: bool = True,
) -> str:
    r"""
    Makes a regex with named groups to handle simple numerical results.

    Copes with formats like:

    .. code-block:: none

        sodium 132 mM
        sodium (mM) 132
        sodium (132 mM)

    ... and lots more.

    Args:
        quantity:
            Regex for the quantity (e.g. for "sodium" or "Na").
        units:
            Regex for units.
        value:
            Regex for the numerical value (e.g. our ``SIGNED_FLOAT`` regex).
        tense_indicator:
            Regex for tense indicator.
        relation:
            Regex for mathematical relationship (e.g. equals, less than).
        optional_results_ignorables:
            Regex for junk to ignore in between the other things.
            Should include its own "optionality" (e.g. ``*``).
        optional_ignorable_after_quantity:
            Regex for additional things that can be ignored right after the
            quantity. Should include its own "optionality" (e.g. ``?``).
        units_optional:
            The units are allowed to be omitted. Usually true.

    Returns:
        The regex, as an (uncompiled) string. It contains its own ``#``
        comments and free-form whitespace.

    The resulting regex groups are named, not numbered:

    .. code-block:: none

        0:          Whole thing; integer, as in: m.group(0)
        'quantity': Quantity
        'tense':    Tense (optional)
        'relation': Relation (optional)
        'value':    Value
        'units':    Units (optional)

    ... as used by :class:`SimpleNumericalResultParser`.

    Just to check re overlap:

    .. code-block:: python

        import regex
        s1 = r"(?P<quantity>Sodium)\s+(?P<value>\d+)\s+(?P<units>mM)"
        s2 = r"(?P<quantity>Sodium)\s+\((?P<units>mM)\)\s+(?P<value>\d+)"
        s = f"{s1}|{s2}"
        r = regex.compile(s)
        t1 = "Sodium 132 mM"
        t2 = "Sodium (mM) 127"
        m1 = r.match(t1)
        m2 = r.match(t2)

        print(m1.group(0))  # Sodium 132 mM
        print(m1.group("quantity"))  # Sodium
        print(m1.group("value"))  # 132
        print(m1.group("units"))  # mM

        print(m2.group(0))  # Sodium (mM) 127
        print(m2.group("quantity"))  # Sodium
        print(m2.group("value"))  # 127
        print(m2.group("units"))  # mM

    ... so it's fine in that multiple groups can have the same name.

    """

    def group(groupname: str, contents: str, optional: bool = False) -> str:
        """
        Wraps ``contents`` in a named capture group, optionally followed by
        ``?`` to make the whole group optional.
        """
        opt_str = "?" if optional else ""
        return f"(?P<{groupname}> {contents} ){opt_str}"

    def bracketed(s: str) -> str:
        """
        Surrounds ``s`` with the left/right bracket regexes, allowing
        whitespace just inside the brackets.
        """
        return rf"{LB} \s* {s} \s* {RB}"

    # Building blocks. The same group names are deliberately reused across
    # the alternatives assembled below (see the docstring re overlap).
    group_quantity = group(GROUP_NAME_QUANTITY, quantity)
    group_tense_optional = group(GROUP_NAME_TENSE, tense_indicator, True)
    group_relation_optional = group(GROUP_NAME_RELATION, relation, True)
    group_units = group(GROUP_NAME_UNITS, units)
    group_units_bracketed = bracketed(group_units)
    group_value = group(GROUP_NAME_VALUE, value)
    group_value_bracketed = bracketed(group_value)
    value_units_all_bracketed = bracketed(rf"{group_value} \s+ {group_units}")
    # These two only affect the final "units" element of the third
    # alternative: the descriptor goes into a regex comment, and the "?"
    # (if any) makes the units group optional.
    units_optional_descriptor = "optional" if units_optional else "required"
    qmark_if_units_optional = "?" if units_optional else ""

    # NOTE(review): the summary comment at the top of the template says
    # "quantity (units value)", but the corresponding alternative below
    # matches "(value units)". The template text is part of the returned
    # string, so it is left untouched here.
    return rf"""
        # - Either: quantity [tense] [relation] value [units]
        # or: quantity (units value)
        # or: quantity (units) [tense] [relation] value
        # Quantity:
        {group_quantity}
        # Ignorable:
        {optional_ignorable_after_quantity}
        {optional_results_ignorables}
        (?:
            (?:
                # (units) ... [tense] ... [relation] ... value
                # Units, in brackets:
                {group_units_bracketed}
                # Tense indicator (optional):
                {group_tense_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Relation (optional):
                {group_relation_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Value:
                {group_value}
            )
            |
            (?:
                # (value units)
                {value_units_all_bracketed}
            )
            |
            (?:
                # [tense] ... [relation] ... value|(value) ... [units]
                # Tense indicator (optional):
                {group_tense_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Relation (optional):
                {group_relation_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Value or (value):
                (?:
                    {group_value} |
                    {group_value_bracketed}
                )
                # Ignorable:
                {optional_results_ignorables}
                # Units ({units_optional_descriptor}):
                {group_units}{qmark_if_units_optional}
            )
        )
    """
class SimpleNumericalResultParser(NumericalResultParser, ABC):
    """
    Base class for simple single-format numerical results. Use this when not
    only do you have a single variable to produce, but you have a single regex
    (in a standard format) that can produce it.
    """

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: str,
        regex_str: str,
        variable: str,
        target_unit: str,
        units_to_factor: Dict[str, float],
        take_absolute: bool = False,
        commit: bool = False,
        debug: bool = False,
    ) -> None:
        """
        Args:

            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`

            cfg_processor_name:
                config section suffix in the :ref:`NLP config file
                <nlp_config>`

            regex_str:
                Regular expression, in string format.

                :meth:`parse` reads these NAMED capture groups from each
                match (as produced by :func:`make_simple_numeric_regex`):

                - ``quantity`` (the text naming the variable)
                - ``tense``
                - ``relation``
                - ``value``
                - ``units``

            variable:
                used as the record value for ``variable_name``

            target_unit:
                fieldname used for the primary output quantity

            units_to_factor:
                dictionary, mapping

                - FROM (regex for units; compiled here)
                - TO EITHER a float (multiple) to multiply those units by, to
                  get the preferred unit
                - OR a function taking a text parameter and returning a float
                  value in preferred unit

                Any units present in the regex but absent from
                ``units_to_factor`` will lead the result to be ignored. For
                example, this allows you to ignore a relative neutrophil count
                ("neutrophils 2.2%") while detecting absolute neutrophil counts
                ("neutrophils 2.2"), or ignoring "docusate sodium 100mg" but
                detecting "sodium 140 mM".

            take_absolute:
                Convert negative values to positive ones? Typical text
                requiring this option might look like:

                .. code-block:: none

                    CRP-4
                    CRP-106
                    CRP -97
                    Blood results for today as follows: Na- 142, K-4.1, ...

                ... occurring in 23 out of 8054 hits for CRP of one test set in
                our data.

                For many quantities, we know that they cannot be negative, so
                this is just a notation rather than a minus sign. We have to
                account for it, or it'll distort our values. Preferable to
                account for it here rather than later; see manual.

            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.

            debug:
                print the regex?

        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            variable=variable,
            target_unit=target_unit,
            regex_str_for_debugging=regex_str,
            commit=commit,
        )
        if debug:
            log.debug(f"Regex for {self.classname()}: {regex_str}")
        self.compiled_regex = compile_regex(regex_str)
        self.units_to_factor = compile_regex_dict(units_to_factor)
        self.take_absolute = take_absolute

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        if not text:
            return
        for m in self.compiled_regex.finditer(text):
            startpos = m.start()
            endpos = m.end()
            matching_text = m.group(GROUP_NUMBER_WHOLE_EXPRESSION)

            variable_text = m.group(GROUP_NAME_QUANTITY)
            tense_text = m.group(GROUP_NAME_TENSE)
            relation_text = m.group(GROUP_NAME_RELATION)
            value_text = m.group(GROUP_NAME_VALUE)
            units = m.group(GROUP_NAME_UNITS)

            # If units are known (or we're choosing to assume preferred units
            # if none are specified), calculate an absolute value
            value_in_target_units = None
            if units:
                matched_unit, multiple_or_fn = get_regex_dict_match(
                    units, self.units_to_factor
                )
                if not matched_unit:
                    # None of our units match. But there is a unit, and the
                    # regex matched. So this is a BAD unit. Skip the value.
                    continue
                # Otherwise: we did match a unit.
                if callable(multiple_or_fn):
                    value_in_target_units = multiple_or_fn(value_text)
                else:
                    numeric_value = to_float(value_text)
                    if numeric_value is None:
                        # Don't attempt None * factor (a TypeError). Leave
                        # the numerical column empty, as the no-units branch
                        # below already does, and keep the text columns.
                        log.warning(
                            f"Parser {self.classname()}: could not convert "
                            f"{value_text!r} to a number"
                        )
                    else:
                        value_in_target_units = numeric_value * multiple_or_fn
            elif self.assume_preferred_unit:  # unit is None or empty
                value_in_target_units = to_float(value_text)

            if value_in_target_units is not None and self.take_absolute:
                value_in_target_units = abs(value_in_target_units)

            tense, relation = common_tense(tense_text, relation_text)

            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_in_target_units,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result
847# -----------------------------------------------------------------------------
848# NumeratorOutOfDenominatorParser
849# -----------------------------------------------------------------------------
852class NumeratorOutOfDenominatorParser(BaseNlpParser, ABC):
853 """
854 Base class for X-out-of-Y numerical results, e.g. for MMSE/ACE.
856 - Integer denominator, expected to be positive.
857 - Otherwise similar to :class:`SimpleNumericalResultParser`.
858 """
    def __init__(
        self,
        nlpdef: NlpDefinition,
        cfg_processor_name: str,
        variable_name: str,  # e.g. "MMSE"
        variable_regex_str: str,  # e.g. regex for MMSE
        expected_denominator: int,
        numerator_text_fieldname: str = "numerator_text",
        numerator_fieldname: str = "numerator",
        denominator_text_fieldname: str = "denominator_text",
        denominator_fieldname: str = "denominator",
        correct_numerator_fieldname: Optional[str] = None,  # default below
        take_absolute: bool = True,
        commit: bool = False,
        debug: bool = False,
    ) -> None:
        """
        Builds and compiles the regex used by this parser. Its numbered
        capture groups are:

        1. the variable (from ``variable_regex_str``, e.g. to find "MMSE")
        2. tense indicator (optional)
        3. relation (optional)
        4. numerator
        5. denominator (optional)

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the suffix (name) of a CRATE NLP config file processor section
                (from which we may choose to get extra config information)
            variable_name:
                becomes the content of the ``variable_name`` output column
            variable_regex_str:
                regex for the text that states the variable
            expected_denominator:
                the integer value that's expected as the "out of Y" part. For
                example, an MMSE is out of 30; an ACE-III total is out of 100.
                If the text just says "MMSE 17", we will infer "17 out of 30";
                so, for the MMSE, ``expected_denominator`` should be 30.
            numerator_text_fieldname:
                field (column) name in which to store the text retrieved as the
                numerator
            numerator_fieldname:
                field (column) name in which to store the numerical value
                retrieved as the numerator
            denominator_text_fieldname:
                field (column) name in which to store the text retrieved as the
                denominator
            denominator_fieldname:
                field (column) name in which to store the numerical value
                retrieved as the denominator
            correct_numerator_fieldname:
                field (column) name in which we store the principal validated
                numerator. For example, if an MMSE processor sees "17" or
                "17/30", this field will end up containing 17; but if it sees
                "17/100", it will remain NULL. If not specified, defaults to
                ``out_of_<expected_denominator>``.
            take_absolute:
                Convert negative values to positive ones?
                As for :class:`SimpleNumericalResultParser`.
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
            debug:
                print the regex?

        """
        # NOTE(review): these attributes are set before super().__init__() is
        # called -- presumably deliberately, in case the superclass
        # initializer uses them; confirm before reordering.
        self.variable_name = variable_name
        assert expected_denominator > 0
        self.expected_denominator = expected_denominator
        self.numerator_text_fieldname = numerator_text_fieldname
        self.numerator_fieldname = numerator_fieldname
        self.denominator_text_fieldname = denominator_text_fieldname
        self.denominator_fieldname = denominator_fieldname
        # Default column name, e.g. "out_of_30" for an expected denominator
        # of 30.
        self.correct_numerator_fieldname = (
            correct_numerator_fieldname or f"out_of_{expected_denominator}"
        )
        self.take_absolute = take_absolute

        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=variable_name,
        )
        if nlpdef is None:  # only None for debugging!
            self.tablename = self.classname().lower()
        else:
            self.tablename = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )

        # The group numbers in the regex comments below are the ones that
        # parse() reads back via m.group(1) ... m.group(5).
        regex_str = rf"""
            ( {variable_regex_str} ) # 1. group for variable (thing being measured)
            {OPTIONAL_RESULTS_IGNORABLES}
            {SCORE}? # optional "score" or similar
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {TENSE_INDICATOR} )? # 2. optional group for tense indicator
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {RELATION} )? # 3. optional group for relation
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {SIGNED_FLOAT} ) # 4. group for numerator
            (?: # optional "/ denominator"
                \s* {OUT_OF_SEPARATOR} \s*
                ( {IGNORESIGN_INTEGER} ) # 5. group for denominator
            )?
        """  # noqa: E501
        if debug:
            log.debug(f"Regex for {self.classname()}: {regex_str}")
        self.regex_str = regex_str
        self.compiled_regex = compile_regex(regex_str)
def dest_tables_columns(self) -> Dict[str, List[Column]]:
    # docstring in superclass
    #
    # A single destination table (self.tablename) is described. Its columns
    # are assembled in themed groups below and concatenated in a fixed
    # order: identification/position, relation, numerator/denominator, tense.
    tablename = self.tablename

    # What was matched, and where in the source text.
    identification_columns = [
        Column(
            FN_VARIABLE_NAME,
            SqlTypeDbIdentifier,
            comment=HELP_VARIABLE_NAME,
        ),
        Column(FN_CONTENT, Text, comment=HELP_CONTENT),
        Column(FN_START, Integer, comment=HELP_START),
        Column(FN_END, Integer, comment=HELP_END),
        Column(FN_VARIABLE_TEXT, Text, comment=HELP_VARIABLE_TEXT),
    ]

    # Relation: raw text and its standardized form.
    relation_columns = [
        Column(
            FN_RELATION_TEXT,
            String(MAX_RELATION_TEXT_LENGTH),
            comment=HELP_RELATION_TEXT,
        ),
        Column(
            FN_RELATION,
            String(MAX_RELATION_LENGTH),
            comment=HELP_RELATION,
        ),
    ]

    # Numerator and denominator, each as text and as a number, plus the
    # numerator that parse() only fills in when the denominator checks out.
    # Column names come from instance attributes.
    fraction_columns = [
        Column(
            self.numerator_text_fieldname,
            String(MAX_VALUE_TEXT_LENGTH),
            comment="Numerator, as text",
        ),
        Column(self.numerator_fieldname, Float, comment="Numerator"),
        Column(
            self.denominator_text_fieldname,
            String(MAX_VALUE_TEXT_LENGTH),
            comment="Denominator, as text",
        ),
        Column(self.denominator_fieldname, Float, comment="Denominator"),
        Column(
            self.correct_numerator_fieldname,
            Float,
            comment="Numerator, if denominator is as expected (units "
            "are correct)",
        ),
    ]

    # Tense: raw text and its standardized form.
    tense_columns = [
        Column(
            FN_TENSE_TEXT,
            String(MAX_TENSE_TEXT_LENGTH),
            comment=HELP_TENSE_TEXT,
        ),
        Column(FN_TENSE, String(MAX_TENSE_LENGTH), comment=HELP_TENSE),
    ]

    all_columns = (
        identification_columns
        + relation_columns
        + fraction_columns
        + tense_columns
    )
    return {tablename: all_columns}
def parse(
    self, text: str, debug: bool = False
) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
    # docstring in superclass
    #
    # Yields one ``(tablename, row_dict)`` pair for every match of
    # self.compiled_regex in ``text``; with ``debug`` set, each match and
    # its resulting row are also logged.
    for m in self.compiled_regex.finditer(text):
        # Capture groups 1-5: variable, tense indicator, relation,
        # numerator, denominator. Optional groups are None if absent.
        (
            variable_text,
            tense_text,
            relation_text,
            numerator_text,
            denominator_text,
        ) = m.group(1, 2, 3, 4, 5)

        # Choose the numerator conversion according to self.take_absolute.
        numerator_converter = to_pos_float if self.take_absolute else to_float
        numerator = numerator_converter(numerator_text)
        denominator = to_float(denominator_text)

        if numerator is None:
            log.critical("bug - numerator is None, should be impossible")
            continue

        if denominator is None:
            # No denominator: accept the numerator as long as it does not
            # exceed the expected denominator.
            plausible = numerator <= self.expected_denominator
        else:
            # Denominator present: it must equal the expected one, and the
            # numerator must not exceed it.
            plausible = numerator <= denominator == self.expected_denominator
        correct_numerator = numerator if plausible else None

        tense, relation = common_tense(tense_text, relation_text)

        result = {
            FN_VARIABLE_NAME: self.variable_name,
            FN_CONTENT: m.group(0),  # the whole matching text
            FN_START: m.start(),
            FN_END: m.end(),
            FN_VARIABLE_TEXT: variable_text,
            FN_RELATION_TEXT: relation_text,
            FN_RELATION: relation,
            self.numerator_text_fieldname: numerator_text,
            self.numerator_fieldname: numerator,
            self.denominator_text_fieldname: denominator_text,
            self.denominator_fieldname: denominator,
            self.correct_numerator_fieldname: correct_numerator,
            FN_TENSE_TEXT: tense_text,
            FN_TENSE: tense,
        }
        if debug:
            log.debug(f"Match {m} for {text!r} -> {result}")
        yield self.tablename, result
def test_numerator_denominator_parser(
    self,
    test_expected_list: List[
        Tuple[str, List[Tuple[float, Optional[float]]]]
    ],
    verbose: bool = False,
) -> None:
    """
    Test the parser.

    Args:
        test_expected_list:
            list of tuples ``test_string, expected_values``. The parser
            will parse ``test_string`` and compare the result (each value
            of the target unit) to ``expected_values``, which is a list of
            tuples ``numerator, denominator``, and can be an empty list.
            The denominator may be ``None`` (``parse()`` explicitly handles
            a missing denominator).
        verbose:
            print the regex?

    Raises:
        :exc:`AssertionError` if a comparison fails
    """
    log.info(f"Testing parser: {self.classname()}")
    if verbose:
        log.debug(f"... regex:\n{self.regex_str}")
    for test_string, expected_values in test_expected_list:
        # Parse once; the full rows are kept for the failure message.
        full_results = list(self.parse(test_string))
        actual_values = [
            (row[self.numerator_fieldname], row[self.denominator_fieldname])
            for _, row in full_results
        ]
        # Raise explicitly rather than using "assert": assert statements
        # are removed under "python -O", which would make this self-test
        # report success without checking anything.
        if actual_values != expected_values:
            raise AssertionError(
                f"Parser {self.classname()}: Expected {expected_values}, "
                f"got {actual_values}, when "
                f"parsing {test_string!r}; full result:\n{full_results!r}"
            )
    log.info("... OK")
1116# =============================================================================
1117# Validator base class (for testing regex NLP classes)
1118# =============================================================================
class ValidatorBase(BaseNlpParser):
    r"""
    DO NOT USE DIRECTLY. Base class for **validating** regex parser
    sensitivity.

    The validator will find fields that refer to the variable, whether or not
    they meet the other criteria of the actual NLP processors (i.e. whether or
    not they contain a valid value). More explanation below.

    Suppose we're validating C-reactive protein (CRP). Key concepts:

    - source (true state of the world): Pr present, Ab absent
    - software decision: Y yes, N no
    - signal detection theory classification:

      - hit = Pr & Y = true positive
      - miss = Pr & N = false negative
      - false alarm = Ab & Y = false positive
      - correct rejection = Ab & N = true negative

    - common SDT metrics:

      - positive predictive value, PPV = P(Pr | Y) = precision (\*)
      - negative predictive value, NPV = P(Ab | N)
      - sensitivity = P(Y | Pr) = recall (\*) = true positive rate
      - specificity = P(N | Ab) = true negative rate

      (\*) common names used in the NLP context.

    - other common classifier metric:

      .. code-block:: none

        F_beta score = (1 + beta^2) * precision * recall /
                       ((beta^2 * precision) + recall)

      ... which measures performance when you value recall beta times as much
      as precision (thus, for example, the F1 score when beta = 1). See
      https://en.wikipedia.org/wiki/F1_score

    Working from source to NLP, we can see there are a few types of "absent":

    - X. unselected database field containing text

      - Q. field contains "CRP", "C-reactive protein", etc.; something
        that a human (or as a proxy: a machine) would judge as
        containing a textual reference to CRP.

        - Pr. Present: a human would judge that a CRP value is present,
          e.g. "today her CRP is 7, which I am not concerned about."

          - H. Hit: software reports the value.
          - M. Miss: software misses the value.
            (Maybe: "his CRP was twenty-one".)

        - Ab1. Absent: reference to CRP, but no numerical information,
          e.g. "her CRP was normal".

          - FA1. False alarm: software reports a numerical value.
            (Maybe: "my CRP was 7 hours behind my boss's deadline")
          - CR1. Correct rejection: software doesn't report a value.

      - Ab2. field contains no reference to CRP at all.

        - FA2. False alarm: software reports a numerical value.
          (A bit harder to think of examples... but imagine a bug
          that gives a hit for "number of carp: 7". Or an alternative
          abbreviation meaning, e.g. "took part in a cardiac
          rehabilitation programme (CRP) 4 hours/week".)

        - CR2. Correct rejection: software doesn't report a value.

    From NLP backwards to source:

    - Y. Software says value present.

      - H. Hit: value is present.
      - FA. False alarm: value is absent.

    - N. Software says value absent.

      - CR. Correct rejection: value is absent.
      - M. Miss: value is present.

    The key metrics are:

    - precision = positive predictive value = P(Pr | Y)

      ... relatively easy to check; find all the "Y" records and check
      manually that they're correct.

    - sensitivity = recall = P(Y | Pr)

      ... Here, we want a sample that is enriched for "symptom actually
      present", for human reasons. For example, if 0.1% of text entries
      refer to CRP, then to assess 100 "Pr" samples we would have to
      review 100,000 text records, 99,900 of which are completely
      irrelevant. So we want an automated way of finding "Pr" records.
      That's what the validator classes do.

    You can enrich for "Pr" records with SQL, e.g.

    .. code-block:: sql

        SELECT textfield FROM sometable WHERE (
            textfield LIKE '%CRP%'
            OR textfield LIKE '%C-reactive protein%');

    or similar, but really we want the best "CRP detector" possible. That is
    probably to use a regex, either in SQL (... ``WHERE textfield REGEX
    'myregex'``) or using these validator classes. (The main NLP regexes don't
    distinguish between "CRP present, no valid value" and "CRP absent",
    because regexes either match or don't.)

    Each validator class implements the core variable-finding part of its
    corresponding NLP regex class, but without the value or units. For example,
    the CRP class looks for things like "CRP is 6" or "CRP 20 mg/L", whereas
    the CRP validator looks for things like "CRP".

    """

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`

            cfg_processor_name:
                config section suffix in the :ref:`NLP config file
                <nlp_config>`

            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
        """
        # The subclass supplies the variable name and its regexes; these are
        # needed before the superclass is initialized because the friendly
        # name is derived from the variable name.
        (
            validated_variable,
            regex_str_list,
        ) = self.get_variablename_regexstrlist()
        vname = f"{validated_variable}_validator"
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=vname,
        )
        self.regex_str_list = regex_str_list  # for debugging only
        self.compiled_regex_list = [compile_regex(r) for r in regex_str_list]
        self.variable = vname  # written to the variable-name output field
        self.NAME = self.variable

        if nlpdef is None:  # only None for debugging!
            # No config to read: fall back to a table named after the class.
            self.tablename = self.classname().lower()
        else:
            self.tablename = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )

    @classmethod
    @abstractmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        To be overridden.

        Returns:
            tuple: ``(validated_variable_name, regex_str_list)``, where:

            regex_str_list:
                List of regular expressions, each in string format.

                This class operates with compiled regexes having this group
                format (capture groups in this sequence):

                - variable

            validated_variable:
                used to set our ``variable`` attribute and thus the value of
                the field ``variable_name`` in the NLP output; for example, if
                ``validated_variable == 'crp'``, then the ``variable_name``
                field will be set to ``crp_validator``.

        """
        raise NotImplementedError

    def set_tablename(self, tablename: str) -> None:
        """
        In case a friend class wants to override.
        """
        self.tablename = tablename

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        # docstring in superclass
        # A single table: which variable matched, the matching text, and
        # where in the source text it was found.
        return {
            self.tablename: [
                Column(
                    FN_VARIABLE_NAME,
                    SqlTypeDbIdentifier,
                    comment=HELP_VARIABLE_NAME,
                ),
                Column(FN_CONTENT, Text, comment=HELP_CONTENT),
                Column(FN_START, Integer, comment=HELP_START),
                Column(FN_END, Integer, comment=HELP_END),
            ]
        }

    def parse(
        self, text: str
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        # Every regex is tried in turn, and every match of every regex yields
        # its own row, so text matched by several regexes is reported once
        # per regex.
        for compiled_regex in self.compiled_regex_list:
            for m in compiled_regex.finditer(text):
                yield self.tablename, {
                    FN_VARIABLE_NAME: self.variable,
                    FN_CONTENT: m.group(0),  # the whole matching text
                    FN_START: m.start(),
                    FN_END: m.end(),
                }

    def test_validator(
        self, test_expected_list: List[Tuple[str, bool]], verbose: bool = False
    ) -> None:
        """
        Test the validator's regexes against known strings.

        Args:
            test_expected_list:
                list of tuples ``test_string, expected_match``. The 'bool'
                part is: should at least one regex match anywhere in the
                string? Note that "match anywhere" is the "search" function,
                whereas "match" matches at the beginning:

                https://docs.python.org/3/library/re.html#search-vs-match
            verbose:
                log each regex first?

        Raises:
            :exc:`AssertionError` if a comparison fails
        """
        log.info(f"Testing validator: {self.classname()}")
        if verbose:
            n = len(self.regex_str_list)
            for i, r in enumerate(self.regex_str_list, start=1):
                log.debug(f"... regex #{i}/{n}: {r}\n")
        for test_string, expected_match in test_expected_list:
            results = [
                r.search(test_string) for r in self.compiled_regex_list
            ]
            actual_match = any(results)
            # Raise explicitly rather than using "assert": assert statements
            # are removed under "python -O", which would make this self-test
            # report success without checking anything.
            if actual_match != expected_match:
                raise AssertionError(
                    f"Validator {self.classname()}: Expected 'at least one "
                    f"regex should match somewhere (search)' to be "
                    f"{expected_match}, "
                    f"got {actual_match}, when parsing {test_string!r}; "
                    f"full results = {results}"
                )
        log.info("... OK")

    def test(self, verbose: bool = False) -> None:
        # Default: nothing to test; this only logs that fact.
        log.info(f"... no tests implemented for validator {self.classname()}")
1382# =============================================================================
1383# More general testing
1384# =============================================================================
def learning_alternative_regex_groups() -> None:
    """
    Function to learn about regex syntax.

    Compiles a pattern with nested and alternative capture groups, applies
    it to a few sample strings, and logs each match object together with
    its captured groups.
    """
    regex_str = r"""
        (
            (?:
                \s*
                (?: (a) | (b) | (c) | (d) )
                \s*
            )*
            ( fish )?
        )
    """
    pattern = compile_regex(regex_str)
    samples = ("a", "b", "a c", "d", "e", "a fish", "c c c")
    for sample in samples:
        m = pattern.match(sample)
        log.info(f"Match: {m}; groups: {m.groups()}")
    # So:
    # - groups can overlap
    # - groups are ordered by their opening bracket
    # - matches are filled in neatly