Coverage for nlp_manager/parse_clinical.py: 91%
240 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/nlp_manager/parse_clinical.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Python regex-based NLP processors for clinical assessment data.**
28Most inherit from
29:class:`crate_anon.nlp_manager.regex_parser.SimpleNumericalResultParser` and
30are constructed with these arguments:
32nlpdef:
33 a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
34cfg_processor_name:
35 the name of a CRATE NLP config file section (from which we may
36 choose to get extra config information)
37commit:
38 force a COMMIT whenever we insert data? You should specify this
39 in multiprocess mode, or you may get database deadlocks.
41± these:
43debug:
44 show debugging information
46"""
48import logging
49from typing import Any, Dict, Generator, List, Optional, Tuple
51from sqlalchemy import Column, Integer, Float, String, Text
53from crate_anon.common.regex_helpers import WORD_BOUNDARY
54from crate_anon.nlp_manager.constants import ProcessorConfigKeys
55from crate_anon.nlp_manager.nlp_definition import NlpDefinition
56from crate_anon.nlp_manager.regex_parser import (
57 BaseNlpParser,
58 common_tense,
59 compile_regex,
60 FN_CONTENT,
61 FN_END,
62 FN_RELATION,
63 FN_RELATION_TEXT,
64 FN_START,
65 FN_TENSE,
66 FN_TENSE_TEXT,
67 FN_UNITS,
68 FN_VALUE_TEXT,
69 FN_VARIABLE_NAME,
70 FN_VARIABLE_TEXT,
71 HELP_CONTENT,
72 HELP_END,
73 HELP_RELATION,
74 HELP_RELATION_TEXT,
75 HELP_START,
76 HELP_TENSE,
77 HELP_TENSE_TEXT,
78 HELP_UNITS,
79 HELP_VALUE_TEXT,
80 HELP_VARIABLE_TEXT,
81 make_simple_numeric_regex,
82 MAX_RELATION_LENGTH,
83 MAX_RELATION_TEXT_LENGTH,
84 MAX_TENSE_LENGTH,
85 MAX_TENSE_TEXT_LENGTH,
86 MAX_UNITS_LENGTH,
87 MAX_VALUE_TEXT_LENGTH,
88 NumericalResultParser,
89 OPTIONAL_RESULTS_IGNORABLES,
90 RELATION,
91 SimpleNumericalResultParser,
92 TENSE_INDICATOR,
93 to_float,
94 to_pos_float,
95 ValidatorBase,
96)
97from crate_anon.nlp_manager.regex_numbers import SIGNED_FLOAT
98from crate_anon.nlp_manager.regex_units import (
99 assemble_units,
100 CM,
101 FEET,
102 INCHES,
103 KG,
104 kg_from_st_lb_oz,
105 KG_PER_SQ_M,
106 LB,
107 M,
108 m_from_ft_in,
109 m_from_m_cm,
110 MM_HG,
111 STONES,
112)
114log = logging.getLogger(__name__)
117# =============================================================================
118# Anthropometrics
119# =============================================================================
121# -----------------------------------------------------------------------------
122# Height
123# -----------------------------------------------------------------------------
class Height(NumericalResultParser):
    """
    CLINICAL EXAMINATION.

    Height parser. Recognizes metric expressions (metres, centimetres, or
    metres followed by centimetres, e.g. "1.8m") and imperial expressions
    (feet, inches, or feet followed by inches, e.g. "5 ft 2 in"). The numeric
    result is stored in the column named by ``PREFERRED_UNIT_COLUMN``.
    """

    NAME = "Height"
    PREFERRED_UNIT_COLUMN = "value_m"

    # Text that introduces a height measurement (capture group 1 of REGEX).
    HEIGHT = r"(?: \b height \b)"

    # The capture-group numbers written inside the fragments below are the
    # ones that parse() reads; keep the two in step.
    METRIC_HEIGHT = rf"""
        ( # capture group 4
            (?:
                ( {SIGNED_FLOAT} ) # capture group 5
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {M} ) # capture group 6
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {SIGNED_FLOAT} ) # capture group 7
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {CM} ) # capture group 8
            )
            | (?:
                ( {SIGNED_FLOAT} ) # capture group 9
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {M} ) # capture group 10
            )
            | (?:
                ( {SIGNED_FLOAT} ) # capture group 11
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {CM} ) # capture group 12
            )
        )
    """
    IMPERIAL_HEIGHT = rf"""
        ( # capture group 13
            (?:
                ( {SIGNED_FLOAT} ) # capture group 14
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {FEET} ) # capture group 15
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {SIGNED_FLOAT} ) # capture group 16
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {INCHES} ) # capture group 17
            )
            | (?:
                ( {SIGNED_FLOAT} ) # capture group 18
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {FEET} ) # capture group 19
            )
            | (?:
                ( {SIGNED_FLOAT} ) # capture group 20
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {INCHES} ) # capture group 21
            )
        )
    """
    REGEX = rf"""
        ( {HEIGHT} ) # group 1 for "height" or equivalent
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )? # optional group 2 for tense
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )? # optional group 3 for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        (?:
            {METRIC_HEIGHT}
            | {IMPERIAL_HEIGHT}
        )
    """

    COMPILED_REGEX = compile_regex(REGEX)

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
        debug: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                NLP definition, passed through to the superclass
            cfg_processor_name:
                config processor name, passed through to the superclass
            commit:
                passed through to the superclass
            debug:
                if true, print this parser's regex string after setup
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            regex_str_for_debugging=self.REGEX,
        )
        if debug:
            print(f"Regex for {self.classname()}: {self.REGEX}")

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Parser for Height. Specialized for complex unit conversion.

        Yields one ``tablename, result_dict`` pair for every match of
        ``COMPILED_REGEX`` in ``text``.
        """
        for match in self.COMPILED_REGEX.finditer(text):
            if debug:
                log.info(f"Match {match} for {text!r}")

            # Groups 1-3: variable name, tense indicator, relation.
            variable_text, tense_text, relation_text = match.group(1, 2, 3)
            # Groups 4-12: the metric alternative and its sub-alternatives.
            (
                metric_text,
                both_metres,
                both_metres_units,
                both_cm,
                both_cm_units,
                only_metres,
                only_metres_units,
                only_cm,
                only_cm_units,
            ) = match.group(4, 5, 6, 7, 8, 9, 10, 11, 12)
            # Groups 13-21: the imperial alternative and its sub-alternatives.
            (
                imperial_text,
                both_feet,
                both_feet_units,
                both_inches,
                both_inches_units,
                only_feet,
                only_feet_units,
                only_inches,
                only_inches_units,
            ) = match.group(13, 14, 15, 16, 17, 18, 19, 20, 21)

            value_m = None
            units = None
            if metric_text:
                value_text = metric_text
                if both_metres and both_cm:
                    value_m = m_from_m_cm(
                        metres=to_pos_float(both_metres),
                        centimetres=to_pos_float(both_cm),
                    )
                    units = assemble_units([both_metres_units, both_cm_units])
                elif only_metres:
                    value_m = to_pos_float(only_metres)
                    units = only_metres_units
                elif only_cm:
                    value_m = m_from_m_cm(centimetres=to_pos_float(only_cm))
                    units = only_cm_units
            elif imperial_text:
                value_text = imperial_text
                if both_feet and both_inches:
                    value_m = m_from_ft_in(
                        feet=to_pos_float(both_feet),
                        inches=to_pos_float(both_inches),
                    )
                    units = assemble_units(
                        [both_feet_units, both_inches_units]
                    )
                elif only_feet:
                    value_m = m_from_ft_in(feet=to_pos_float(only_feet))
                    units = only_feet_units
                elif only_inches:
                    value_m = m_from_ft_in(inches=to_pos_float(only_inches))
                    units = only_inches_units
            else:
                value_text = None

            tense, relation = common_tense(tense_text, relation_text)

            yield self.tablename, {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: match.group(0),  # the whole matched text
                FN_START: match.start(),
                FN_END: match.end(),
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_m,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }

    def test(self, verbose: bool = False) -> None:
        """
        Self-test: run sample strings through the parser and compare the
        extracted values (and, for one example, the units) with expectations.
        """
        five_foot_eight = m_from_ft_in(feet=5, inches=8)
        self.test_numerical_parser(
            [
                ("Height", []),  # should fail; no values
                ("her height was 1.6m", [1.6]),
                ("Height = 1.23 m", [1.23]),
                ("her height is 1.5m", [1.5]),
                ("""Height 5'8" """, [five_foot_eight]),
                ("Height 5 ft 8 in", [five_foot_eight]),
                ("Height 5 feet 8 inches", [five_foot_eight]),
            ],
            verbose=verbose,
        )
        self.detailed_test(
            "Height 5 ft 11 in",
            [
                {
                    self.target_unit: m_from_ft_in(feet=5, inches=11),
                    FN_UNITS: "ft in",
                }
            ],
            verbose=verbose,
        )
        # todo: Height NLP: deal with "tall" and plain "is", e.g.
        #   she is 6'2"; she is 1.5m tall
class HeightValidator(ValidatorBase):
    """
    Validator counterpart of :class:`Height`.
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the parser's variable name together with a one-element list
        holding the regex string for the "height" keyword.
        """
        variable_name = Height.NAME
        regex_strings = [Height.HEIGHT]
        return variable_name, regex_strings
348# -----------------------------------------------------------------------------
349# Weight (mass)
350# -----------------------------------------------------------------------------
class Weight(NumericalResultParser):
    """
    CLINICAL EXAMINATION.

    Weight parser. Recognizes metric expressions (e.g. "57kg") and imperial
    expressions (stones, pounds, or stones followed by pounds, e.g.
    "10 st 2 lb"). Units are mandatory. The numeric result is stored in the
    column named by ``PREFERRED_UNIT_COLUMN``.
    """

    NAME = "Weight"
    PREFERRED_UNIT_COLUMN = "value_kg"

    # Text that introduces a weight measurement (capture group 1 of REGEX).
    WEIGHT = r"(?: \b weigh[ts] \b )"  # weight, weighs

    # The capture-group numbers written inside the fragments below are the
    # ones that parse() reads; keep the two in step.
    METRIC_WEIGHT = rf"""
        ( # capture group 4
            ( {SIGNED_FLOAT} ) # capture group 5
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {KG} ) # capture group 6
        )
    """
    IMPERIAL_WEIGHT = rf"""
        ( # capture group 7
            (?:
                ( {SIGNED_FLOAT} ) # capture group 8
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {STONES} ) # capture group 9
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {SIGNED_FLOAT} ) # capture group 10
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {LB} ) # capture group 11
            )
            | (?:
                ( {SIGNED_FLOAT} ) # capture group 12
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {STONES} ) # capture group 13
            )
            | (?:
                ( {SIGNED_FLOAT} ) # capture group 14
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {LB} ) # capture group 15
            )
        )
    """
    REGEX = rf"""
        ( {WEIGHT} ) # group 1 for "weight" or equivalent
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )? # optional group 2 for tense
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )? # optional group 3 for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        (?:
            {METRIC_WEIGHT}
            | {IMPERIAL_WEIGHT}
        )
    """

    COMPILED_REGEX = compile_regex(REGEX)

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
        debug: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                NLP definition, passed through to the superclass
            cfg_processor_name:
                config processor name, passed through to the superclass
            commit:
                passed through to the superclass
            debug:
                if true, print this parser's regex string after setup
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            regex_str_for_debugging=self.REGEX,
        )
        if debug:
            print(f"Regex for {self.classname()}: {self.REGEX}")

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Parser for Weight. Specialized for complex unit conversion.

        Yields one ``tablename, result_dict`` pair for every match of
        ``COMPILED_REGEX`` in ``text``.
        """
        for match in self.COMPILED_REGEX.finditer(text):
            if debug:
                log.info(f"Match {match} for {text!r}")

            # Groups 1-3: variable name, tense indicator, relation.
            variable_text, tense_text, relation_text = match.group(1, 2, 3)
            # Groups 4-6: the metric alternative.
            metric_text, kg_text, kg_units = match.group(4, 5, 6)
            # Groups 7-15: the imperial alternative and its sub-alternatives.
            (
                imperial_text,
                both_stones,
                both_stones_units,
                both_pounds,
                both_pounds_units,
                only_stones,
                only_stones_units,
                only_pounds,
                only_pounds_units,
            ) = match.group(7, 8, 9, 10, 11, 12, 13, 14, 15)

            # Values are deliberately kept as signed floats throughout: text
            # such as "weight -0.3 kg" genuinely occurs, for weight changes.
            value_kg = None
            units = None
            if metric_text:
                value_text = metric_text
                value_kg = to_float(kg_text)
                units = kg_units
            elif imperial_text:
                value_text = imperial_text
                if both_stones and both_pounds:
                    value_kg = kg_from_st_lb_oz(
                        stones=to_float(both_stones),
                        pounds=to_float(both_pounds),
                    )
                    units = assemble_units(
                        [both_stones_units, both_pounds_units]
                    )
                elif only_stones:
                    value_kg = kg_from_st_lb_oz(stones=to_float(only_stones))
                    units = only_stones_units
                elif only_pounds:
                    value_kg = kg_from_st_lb_oz(pounds=to_float(only_pounds))
                    units = only_pounds_units
            else:
                value_text = None

            tense, relation = common_tense(tense_text, relation_text)

            yield self.tablename, {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: match.group(0),  # the whole matched text
                FN_START: match.start(),
                FN_END: match.end(),
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_kg,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }

    def test(self, verbose: bool = False) -> None:
        """
        Self-test: run sample strings through the parser and compare the
        extracted values (and, for one example, the units) with expectations.
        """
        two_hundred_pounds = kg_from_st_lb_oz(pounds=200)
        self.test_numerical_parser(
            [
                ("Weight", []),  # should fail; no values
                ("her weight was 60.2kg", [60.2]),
                ("her weight was 60.2", []),  # needs units
                ("Weight = 52.3kg", [52.3]),
                ("Weight: 80.8kgs", [80.8]),
                ("she weighs 61kg", [61]),
                ("she weighs 61 kg", [61]),
                ("she weighs 61 kgs", [61]),
                ("she weighs 61 kilo", [61]),
                ("she weighs 61 kilos", [61]),
                ("she weighs 8 stones ", [kg_from_st_lb_oz(stones=8)]),
                ("she weighs 200 lb", [two_hundred_pounds]),
                ("she weighs 200 pounds", [two_hundred_pounds]),
                (
                    "she weighs 6 st 12 lb",
                    [kg_from_st_lb_oz(stones=6, pounds=12)],
                ),
                ("change in weight -0.4kg", [-0.4]),
                # ASCII hyphen (hyphen-minus), separated from the number:
                ("change in weight - 0.4kg", [0.4]),
                ("change in weight ‐ 0.4kg", [0.4]),  # Unicode hyphen
                ("change in weight −0.4kg", [-0.4]),  # Unicode minus
                ("change in weight –0.4kg", [-0.4]),  # en dash
                ("change in weight —0.4kg", [0.4]),  # em dash
            ],
            verbose=verbose,
        )
        self.detailed_test(
            "Weight: 80.8kgs",
            [
                {
                    self.target_unit: 80.8,
                    FN_UNITS: "kgs",
                }
            ],
            verbose=verbose,
        )
class WeightValidator(ValidatorBase):
    """
    Validator counterpart of :class:`Weight`.
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the parser's variable name together with a one-element list
        holding the regex string for the "weight"/"weighs" keyword.
        """
        variable_name = Weight.NAME
        regex_strings = [Weight.WEIGHT]
        return variable_name, regex_strings
562# -----------------------------------------------------------------------------
563# Body mass index (BMI)
564# -----------------------------------------------------------------------------
class Bmi(SimpleNumericalResultParser):
    """
    CLINICAL EXAMINATION.

    Body mass index (BMI), in kg / m^2.
    """

    NAME = "BMI"
    PREFERRED_UNIT_COLUMN = "value_kg_per_sq_m"

    # Matches "BMI" or "body mass index" as whole words.
    BMI = rf"""
        {WORD_BOUNDARY}
        (?: BMI | body \s+ mass \s+ index )
        {WORD_BOUNDARY}
    """
    REGEX = make_simple_numeric_regex(quantity=BMI, units=KG_PER_SQ_M)
    # Unit regex -> multiplication factor; the only unit is the preferred one.
    UNIT_MAPPING = {KG_PER_SQ_M: 1}
    # deal with "a BMI of 30"?

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                NLP definition, passed through to the superclass
            cfg_processor_name:
                config processor name, passed through to the superclass
            commit:
                passed through to the superclass
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            variable=self.NAME,
            regex_str=self.REGEX,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test: run sample strings through the parser and compare the
        extracted values with expectations.
        """
        examples = [
            ("BMI", []),  # should fail; no values
            ("body mass index was 30", [30]),
            ("his BMI (30) is too high", [30]),
            ("BMI 25 kg/sq m", [25]),
            ("BMI was 18.4 kg/m^-2", [18.4]),
            ("ACE 79", []),
            ("BMI-23", [23]),
        ]
        self.test_numerical_parser(examples, verbose=verbose)
class BmiValidator(ValidatorBase):
    """
    Validator counterpart of :class:`Bmi`.
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the parser's variable name together with a one-element list
        holding the regex string for "BMI" / "body mass index".
        """
        variable_name = Bmi.NAME
        regex_strings = [Bmi.BMI]
        return variable_name, regex_strings
631# =============================================================================
632# Bedside investigations: BP
633# =============================================================================
class Bp(BaseNlpParser):
    """
    CLINICAL EXAMINATION.

    Blood pressure, in mmHg. (Systolic and diastolic.)
    """

    # This parser emits two values per match (systolic and diastolic), and
    # its matching logic is a little more involved than that of
    # :class:`crate_anon.nlp_manager.regex_parser.NumeratorOutOfDenominatorParser`,  # noqa: E501
    # so it derives from the base NLP parser class directly.

    NAME = "BP"

    # Names of the two result columns.
    FN_SYSTOLIC_BP_MMHG = "systolic_bp_mmhg"
    FN_DIASTOLIC_BP_MMHG = "diastolic_bp_mmhg"

    UNIT_MAPPING = {
        MM_HG: 1,  # preferred unit
    }

    BP = r"(?: \b blood \s+ pressure \b | \b B\.?P\.? \b )"
    SYSTOLIC_BP = rf"(?: \b systolic \s+ {BP} | \b S\.?B\.?P\.? \b )"
    DIASTOLIC_BP = rf"(?: \b diastolic \s+ {BP} | \b D\.?B\.?P\.? \b )"

    TWO_NUMBER_BP = rf"""
        ( {SIGNED_FLOAT} )
        \s* (?: \b over \b | \/ ) \s*
        ( {SIGNED_FLOAT} )
    """
    ONE_NUMBER_BP = SIGNED_FLOAT

    COMPILED_BP = compile_regex(BP)
    COMPILED_SBP = compile_regex(SYSTOLIC_BP)
    COMPILED_DBP = compile_regex(DIASTOLIC_BP)
    COMPILED_ONE_NUMBER_BP = compile_regex(ONE_NUMBER_BP)
    COMPILED_TWO_NUMBER_BP = compile_regex(TWO_NUMBER_BP)
    REGEX = rf"""
        ( # group for "BP" or equivalent
            {SYSTOLIC_BP} # ... from more to less specific
            | {DIASTOLIC_BP}
            | {BP}
        )
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )? # optional group for tense indicator
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )? # optional group for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        (
            {SIGNED_FLOAT} # systolic
            (?:
                \s* (?: \b over \b | \/ ) \s* # /
                {SIGNED_FLOAT} # diastolic
            )?
        )
        {OPTIONAL_RESULTS_IGNORABLES}
        ( # group for units
            {MM_HG}
        )?
    """
    COMPILED_REGEX = compile_regex(REGEX)

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                NLP definition, passed through to the superclass; if it is
                ``None`` (debugging only), the destination table name is the
                lower-cased class name rather than a config value
            cfg_processor_name:
                config processor name, passed through to the superclass
            commit:
                passed through to the superclass
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=self.NAME,
        )
        if nlpdef is not None:
            self.tablename = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )
        else:
            # only None for debugging!
            self.tablename = self.classname().lower()

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        """
        Returns a one-entry mapping from this parser's destination table name
        to the list of columns that :meth:`parse` fills in.
        """
        columns = [
            Column(FN_CONTENT, Text, comment=HELP_CONTENT),
            Column(FN_START, Integer, comment=HELP_START),
            Column(FN_END, Integer, comment=HELP_END),
            Column(FN_VARIABLE_TEXT, Text, comment=HELP_VARIABLE_TEXT),
            Column(
                FN_RELATION_TEXT,
                String(MAX_RELATION_TEXT_LENGTH),
                comment=HELP_RELATION_TEXT,
            ),
            Column(
                FN_RELATION,
                String(MAX_RELATION_LENGTH),
                comment=HELP_RELATION,
            ),
            Column(
                FN_VALUE_TEXT,
                String(MAX_VALUE_TEXT_LENGTH),
                comment=HELP_VALUE_TEXT,
            ),
            Column(FN_UNITS, String(MAX_UNITS_LENGTH), comment=HELP_UNITS),
            Column(
                self.FN_SYSTOLIC_BP_MMHG,
                Float,
                comment="Systolic blood pressure in mmHg",
            ),
            Column(
                self.FN_DIASTOLIC_BP_MMHG,
                Float,
                comment="Diastolic blood pressure in mmHg",
            ),
            Column(
                FN_TENSE_TEXT,
                String(MAX_TENSE_TEXT_LENGTH),
                comment=HELP_TENSE_TEXT,
            ),
            Column(FN_TENSE, String(MAX_TENSE_LENGTH), comment=HELP_TENSE),
        ]
        return {self.tablename: columns}

    def _sbp_dbp_from_text(
        self, variable_text: str, value_text: str
    ) -> Tuple[Optional[float], Optional[float]]:
        """
        Works out ``sbp, dbp`` from the matched variable name and value text.

        The variable name is tested from most to least specific: an explicit
        systolic name gives a systolic value only, an explicit diastolic name
        gives a diastolic value only, and a plain "BP" name gives both, but
        only if the value text is a two-number expression. Anything else
        leaves the corresponding value(s) as ``None``.
        """
        sbp = None
        dbp = None
        if self.COMPILED_SBP.match(variable_text):
            if self.COMPILED_ONE_NUMBER_BP.match(value_text):
                sbp = to_pos_float(value_text)
        elif self.COMPILED_DBP.match(variable_text):
            if self.COMPILED_ONE_NUMBER_BP.match(value_text):
                dbp = to_pos_float(value_text)
        elif self.COMPILED_BP.match(variable_text):
            pair = self.COMPILED_TWO_NUMBER_BP.match(value_text)
            if pair:
                sbp = to_pos_float(pair.group(1))
                dbp = to_pos_float(pair.group(2))
        return sbp, dbp

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Parser for BP. Specialized because we're fetching two numbers.

        Yields one ``tablename, result_dict`` pair for every match of
        ``COMPILED_REGEX`` in ``text`` from which at least one of the
        systolic/diastolic values could be extracted.
        """
        for match in self.COMPILED_REGEX.finditer(text):
            if debug:
                log.info(f"Match {match} for {text!r}")

            (
                variable_text,
                tense_text,
                relation_text,
                value_text,
                units,
            ) = match.group(1, 2, 3, 4, 5)

            sbp, dbp = self._sbp_dbp_from_text(variable_text, value_text)
            if sbp is None and dbp is None:
                # Not an error: e.g. "BP 110" matches the regex but is
                # deliberately ignored.
                continue

            tense, relation = common_tense(tense_text, relation_text)

            yield self.tablename, {
                FN_CONTENT: match.group(0),  # the whole matched text
                FN_START: match.start(),
                FN_END: match.end(),
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.FN_SYSTOLIC_BP_MMHG: sbp,
                self.FN_DIASTOLIC_BP_MMHG: dbp,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }

    def test_bp_parser(
        self,
        test_expected_list: List[Tuple[str, List[Tuple[float, float]]]],
        verbose: bool = False,
    ) -> None:
        """
        Called by :func:`test`.

        Args:
            test_expected_list:
                tuple ``source_text, expected_values`` where
                ``expected_values`` is a list of tuples like ``sbp, dbp``.
            verbose:
                be verbose?

        Raises:
            AssertionError: if the parsed values differ from those expected
        """
        log.info(f"Testing parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex:\n{self.REGEX}")
        sbp_key = self.FN_SYSTOLIC_BP_MMHG
        dbp_key = self.FN_DIASTOLIC_BP_MMHG
        for test_string, expected_values in test_expected_list:
            actual_values = [
                (row[sbp_key], row[dbp_key])
                for _, row in self.parse(test_string)
            ]
            # The message (including the re-parse for the full result) is
            # only evaluated if the assertion fails.
            assert actual_values == expected_values, (
                f"Parser {self.classname()}: Expected {expected_values}, "
                f"got {actual_values}, when "
                f"parsing {test_string!r}; "
                f"full result={list(self.parse(test_string))!r}"
            )
        log.info("... OK")

    def test(self, verbose: bool = False) -> None:
        """
        Self-test: run sample strings through the parser and compare the
        extracted ``sbp, dbp`` pairs with expectations.
        """
        examples = [
            ("BP", []),  # should fail; no values
            ("his blood pressure was 120/80", [(120, 80)]),
            ("BP 120/80 mmhg", [(120, 80)]),
            ("systolic BP 120", [(120, None)]),
            ("diastolic BP 80", [(None, 80)]),
            ("BP-130/70", [(130, 70)]),
            ("BP 110 /80", [(110, 80)]),
            ("BP 110 /80 -", [(110, 80)]),  # real example
            ("BP 120 / 70 -", [(120, 70)]),  # real example
            ("BP :115 / 70 -", [(115, 70)]),  # real example
            ("B.P 110", []),  # real example
        ]
        self.test_bp_parser(examples, verbose=verbose)
        # 1. Unsure if best to take abs value.
        #    One reason not to might be if people express changes, e.g.
        #    "BP change -40/-10", but I very much doubt it.
        #    Went with abs value using to_pos_float().
        # 2. "BP 110" - too unreliable; not definitely a blood pressure.
class BpValidator(ValidatorBase):
    """
    Validator counterpart of :class:`Bp`.
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the parser's variable name together with a one-element list
        holding the parser's full regex string.
        """
        # NOTE(review): the Height, Weight and Bmi validators in this module
        # pass only the variable-name regex (e.g. Height.HEIGHT), whereas this
        # one passes the complete Bp.REGEX (name, value and units) -- confirm
        # that this difference is intended.
        variable_name = Bp.NAME
        regex_strings = [Bp.REGEX]
        return variable_name, regex_strings
898# =============================================================================
899# All classes in this module
900# =============================================================================
# Every parser class defined in this module, each paired with its validator
# class, as (parser, validator) tuples.
ALL_CLINICAL_NLP_AND_VALIDATORS = [
    (Bmi, BmiValidator),
    (Bp, BpValidator),
    (Height, HeightValidator),
    (Weight, WeightValidator),
]