Coverage for nlp_manager/parse_clinical.py: 91%

240 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/nlp_manager/parse_clinical.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Python regex-based NLP processors for clinical assessment data.** 

27 

28Most inherit from 

29:class:`crate_anon.nlp_manager.regex_parser.SimpleNumericalResultParser` and 

30are constructed with these arguments: 

31 

32nlpdef: 

33 a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition` 

34cfg_processor_name: 

35 the name of a CRATE NLP config file section (from which we may 

36 choose to get extra config information) 

37commit: 

38 force a COMMIT whenever we insert data? You should specify this 

39 in multiprocess mode, or you may get database deadlocks. 

40 

41± these: 

42 

43debug: 

44 show debugging information 

45 

46""" 

47 

48import logging 

49from typing import Any, Dict, Generator, List, Optional, Tuple 

50 

51from sqlalchemy import Column, Integer, Float, String, Text 

52 

53from crate_anon.common.regex_helpers import WORD_BOUNDARY 

54from crate_anon.nlp_manager.constants import ProcessorConfigKeys 

55from crate_anon.nlp_manager.nlp_definition import NlpDefinition 

56from crate_anon.nlp_manager.regex_parser import ( 

57 BaseNlpParser, 

58 common_tense, 

59 compile_regex, 

60 FN_CONTENT, 

61 FN_END, 

62 FN_RELATION, 

63 FN_RELATION_TEXT, 

64 FN_START, 

65 FN_TENSE, 

66 FN_TENSE_TEXT, 

67 FN_UNITS, 

68 FN_VALUE_TEXT, 

69 FN_VARIABLE_NAME, 

70 FN_VARIABLE_TEXT, 

71 HELP_CONTENT, 

72 HELP_END, 

73 HELP_RELATION, 

74 HELP_RELATION_TEXT, 

75 HELP_START, 

76 HELP_TENSE, 

77 HELP_TENSE_TEXT, 

78 HELP_UNITS, 

79 HELP_VALUE_TEXT, 

80 HELP_VARIABLE_TEXT, 

81 make_simple_numeric_regex, 

82 MAX_RELATION_LENGTH, 

83 MAX_RELATION_TEXT_LENGTH, 

84 MAX_TENSE_LENGTH, 

85 MAX_TENSE_TEXT_LENGTH, 

86 MAX_UNITS_LENGTH, 

87 MAX_VALUE_TEXT_LENGTH, 

88 NumericalResultParser, 

89 OPTIONAL_RESULTS_IGNORABLES, 

90 RELATION, 

91 SimpleNumericalResultParser, 

92 TENSE_INDICATOR, 

93 to_float, 

94 to_pos_float, 

95 ValidatorBase, 

96) 

97from crate_anon.nlp_manager.regex_numbers import SIGNED_FLOAT 

98from crate_anon.nlp_manager.regex_units import ( 

99 assemble_units, 

100 CM, 

101 FEET, 

102 INCHES, 

103 KG, 

104 kg_from_st_lb_oz, 

105 KG_PER_SQ_M, 

106 LB, 

107 M, 

108 m_from_ft_in, 

109 m_from_m_cm, 

110 MM_HG, 

111 STONES, 

112) 

113 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

115 

116 

117# ============================================================================= 

118# Anthropometrics 

119# ============================================================================= 

120 

121# ----------------------------------------------------------------------------- 

122# Height 

123# ----------------------------------------------------------------------------- 

124 

125 

class Height(NumericalResultParser):
    """
    CLINICAL EXAMINATION.

    Height parser. Recognizes metric expressions (e.g. "1.8m") and imperial
    expressions (e.g. "5 ft 2 in"); the numerical result is stored in metres.
    """

    # The capture group numbers quoted in the regex comments below are the
    # group numbers within the final, assembled REGEX.
    METRIC_HEIGHT = rf"""
        (                                   # capture group 4
            (?:
                ( {SIGNED_FLOAT} )          # capture group 5
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {M} )                     # capture group 6
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {SIGNED_FLOAT} )          # capture group 7
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {CM} )                    # capture group 8
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 9
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {M} )                     # capture group 10
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 11
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {CM} )                    # capture group 12
            )
        )
    """
    IMPERIAL_HEIGHT = rf"""
        (                                   # capture group 13
            (?:
                ( {SIGNED_FLOAT} )          # capture group 14
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {FEET} )                  # capture group 15
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {SIGNED_FLOAT} )          # capture group 16
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {INCHES} )                # capture group 17
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 18
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {FEET} )                  # capture group 19
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 20
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {INCHES} )                # capture group 21
            )
        )
    """
    HEIGHT = r"(?: \b height \b)"
    REGEX = rf"""
        ( {HEIGHT} )                        # group 1 for "height" or equivalent
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )?              # optional group 2 for tense
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )?                     # optional group 3 for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        (?:
            {METRIC_HEIGHT}
            | {IMPERIAL_HEIGHT}
        )
    """

    COMPILED_REGEX = compile_regex(REGEX)
    NAME = "Height"
    PREFERRED_UNIT_COLUMN = "value_m"

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
        debug: bool = False,
    ) -> None:
        """
        Args:
            nlpdef, cfg_processor_name, commit:
                passed unchanged to the superclass constructor (see the
                module docstring)
            debug:
                if true, print the regex string after construction
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            regex_str_for_debugging=self.REGEX,
        )
        if not debug:
            return
        print(f"Regex for {self.classname()}: {self.REGEX}")

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Parser for Height. Specialized for complex unit conversion.

        Yields one ``tablename, result_dict`` tuple per regex match in
        ``text``.
        """
        # Fetch groups 1-21 in a single call; the order of the names below
        # must follow the group numbering documented in the regex strings.
        wanted_groups = range(1, 22)
        for m in self.COMPILED_REGEX.finditer(text):  # NB 'm' is NOT metres
            if debug:
                log.info(f"Match {m} for {text!r}")
            (
                variable_text,
                tense_text,
                relation_text,
                metric_text,
                both_m,
                both_m_units,
                both_cm,
                both_cm_units,
                only_m,
                only_m_units,
                only_cm,
                only_cm_units,
                imperial_text,
                both_ft,
                both_ft_units,
                both_in,
                both_in_units,
                only_ft,
                only_ft_units,
                only_in,
                only_in_units,
            ) = m.group(*wanted_groups)

            value_text = None
            value_m = None
            units = None
            if metric_text:
                value_text = metric_text
                if both_m and both_cm:
                    # e.g. metres followed by centimetres
                    value_m = m_from_m_cm(
                        metres=to_pos_float(both_m),
                        centimetres=to_pos_float(both_cm),
                    )
                    units = assemble_units([both_m_units, both_cm_units])
                elif only_m:
                    value_m = to_pos_float(only_m)
                    units = only_m_units
                elif only_cm:
                    value_m = m_from_m_cm(centimetres=to_pos_float(only_cm))
                    units = only_cm_units
            elif imperial_text:
                value_text = imperial_text
                if both_ft and both_in:
                    # e.g. feet followed by inches
                    value_m = m_from_ft_in(
                        feet=to_pos_float(both_ft),
                        inches=to_pos_float(both_in),
                    )
                    units = assemble_units([both_ft_units, both_in_units])
                elif only_ft:
                    value_m = m_from_ft_in(feet=to_pos_float(only_ft))
                    units = only_ft_units
                elif only_in:
                    value_m = m_from_ft_in(inches=to_pos_float(only_in))
                    units = only_in_units

            tense, relation = common_tense(tense_text, relation_text)

            row = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: m.group(0),  # the whole matching text
                FN_START: m.start(),
                FN_END: m.end(),
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_m,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            yield self.tablename, row

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        five_foot_eight = m_from_ft_in(feet=5, inches=8)
        value_cases = [
            ("Height", []),  # should fail; no values
            ("her height was 1.6m", [1.6]),
            ("Height = 1.23 m", [1.23]),
            ("her height is 1.5m", [1.5]),
            ("""Height 5'8" """, [five_foot_eight]),
            ("Height 5 ft 8 in", [five_foot_eight]),
            ("Height 5 feet 8 inches", [five_foot_eight]),
        ]
        self.test_numerical_parser(value_cases, verbose=verbose)

        expected_detail = {
            self.target_unit: m_from_ft_in(feet=5, inches=11),
            FN_UNITS: "ft in",
        }
        self.detailed_test(
            "Height 5 ft 11 in", [expected_detail], verbose=verbose
        )
        # todo: Height NLP: deal with "tall" and plain "is", e.g.
        # she is 6'2"; she is 1.5m tall

336 

337 

class HeightValidator(ValidatorBase):
    """
    Validator for Height (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Returns the Height variable name together with the regex string(s)
        # that recognize the word "height" itself.
        variable_name = Height.NAME
        regex_strings = [Height.HEIGHT]
        return variable_name, regex_strings

346 

347 

348# ----------------------------------------------------------------------------- 

349# Weight (mass) 

350# ----------------------------------------------------------------------------- 

351 

352 

class Weight(NumericalResultParser):
    """
    CLINICAL EXAMINATION.

    Weight parser. Recognizes metric expressions (e.g. "57kg") and imperial
    expressions (e.g. "10 st 2 lb"). Units must be present in the text.
    """

    # The capture group numbers quoted in the regex comments below are the
    # group numbers within the final, assembled REGEX.
    METRIC_WEIGHT = rf"""
        (                                   # capture group 4
            ( {SIGNED_FLOAT} )              # capture group 5
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {KG} )                        # capture group 6
        )
    """
    IMPERIAL_WEIGHT = rf"""
        (                                   # capture group 7
            (?:
                ( {SIGNED_FLOAT} )          # capture group 8
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {STONES} )                # capture group 9
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {SIGNED_FLOAT} )          # capture group 10
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {LB} )                    # capture group 11
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 12
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {STONES} )                # capture group 13
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 14
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {LB} )                    # capture group 15
            )
        )
    """
    WEIGHT = r"(?: \b weigh[ts] \b )"  # weight, weighs
    REGEX = rf"""
        ( {WEIGHT} )                        # group 1 for "weight" or equivalent
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )?              # optional group 2 for tense
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )?                     # optional group 3 for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        (?:
            {METRIC_WEIGHT}
            | {IMPERIAL_WEIGHT}
        )
    """

    COMPILED_REGEX = compile_regex(REGEX)
    NAME = "Weight"
    PREFERRED_UNIT_COLUMN = "value_kg"

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
        debug: bool = False,
    ) -> None:
        """
        Args:
            nlpdef, cfg_processor_name, commit:
                passed unchanged to the superclass constructor (see the
                module docstring)
            debug:
                if true, print the regex string after construction
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            regex_str_for_debugging=self.REGEX,
        )
        if not debug:
            return
        print(f"Regex for {self.classname()}: {self.REGEX}")

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Parser for Weight. Specialized for complex unit conversion.

        Yields one ``tablename, result_dict`` tuple per regex match in
        ``text``.
        """
        # Fetch groups 1-15 in a single call; the order of the names below
        # must follow the group numbering documented in the regex strings.
        wanted_groups = range(1, 16)
        for m in self.COMPILED_REGEX.finditer(text):
            if debug:
                log.info(f"Match {m} for {text!r}")
            (
                variable_text,
                tense_text,
                relation_text,
                metric_text,
                kg_text,
                kg_units,
                imperial_text,
                both_st,
                both_st_units,
                both_lb,
                both_lb_units,
                only_st,
                only_st_units,
                only_lb,
                only_lb_units,
            ) = m.group(*wanted_groups)

            # Values are converted with to_float() and therefore keep their
            # sign: you definitely see things like "weight -0.3 kg" for
            # weight changes.
            value_text = None
            value_kg = None
            units = None
            if metric_text:
                value_text = metric_text
                value_kg = to_float(kg_text)
                units = kg_units
            elif imperial_text:
                value_text = imperial_text
                if both_st and both_lb:
                    value_kg = kg_from_st_lb_oz(
                        stones=to_float(both_st),
                        pounds=to_float(both_lb),
                    )
                    units = assemble_units([both_st_units, both_lb_units])
                elif only_st:
                    value_kg = kg_from_st_lb_oz(stones=to_float(only_st))
                    units = only_st_units
                elif only_lb:
                    value_kg = kg_from_st_lb_oz(pounds=to_float(only_lb))
                    units = only_lb_units

            tense, relation = common_tense(tense_text, relation_text)

            row = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: m.group(0),  # the whole matching text
                FN_START: m.start(),
                FN_END: m.end(),
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_kg,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            yield self.tablename, row

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        two_hundred_lb = kg_from_st_lb_oz(pounds=200)
        value_cases = [
            ("Weight", []),  # should fail; no values
            ("her weight was 60.2kg", [60.2]),
            ("her weight was 60.2", []),  # needs units
            ("Weight = 52.3kg", [52.3]),
            ("Weight: 80.8kgs", [80.8]),
            ("she weighs 61kg", [61]),
            ("she weighs 61 kg", [61]),
            ("she weighs 61 kgs", [61]),
            ("she weighs 61 kilo", [61]),
            ("she weighs 61 kilos", [61]),
            ("she weighs 8 stones ", [kg_from_st_lb_oz(stones=8)]),
            ("she weighs 200 lb", [two_hundred_lb]),
            ("she weighs 200 pounds", [two_hundred_lb]),
            (
                "she weighs 6 st 12 lb",
                [kg_from_st_lb_oz(stones=6, pounds=12)],
            ),
            ("change in weight -0.4kg", [-0.4]),
            # ASCII hyphen (hyphen-minus), separated from the number:
            ("change in weight - 0.4kg", [0.4]),
            ("change in weight ‐ 0.4kg", [0.4]),  # Unicode hyphen
            # ("failme", [999]),
            ("change in weight −0.4kg", [-0.4]),  # Unicode minus
            ("change in weight –0.4kg", [-0.4]),  # en dash
            ("change in weight —0.4kg", [0.4]),  # em dash
        ]
        self.test_numerical_parser(value_cases, verbose=verbose)

        expected_detail = {
            self.target_unit: 80.8,
            FN_UNITS: "kgs",
        }
        self.detailed_test(
            "Weight: 80.8kgs", [expected_detail], verbose=verbose
        )

550 

551 

class WeightValidator(ValidatorBase):
    """
    Validator for Weight (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Returns the Weight variable name together with the regex string(s)
        # that recognize the words "weight"/"weighs" themselves.
        variable_name = Weight.NAME
        regex_strings = [Weight.WEIGHT]
        return variable_name, regex_strings

560 

561 

562# ----------------------------------------------------------------------------- 

563# Body mass index (BMI) 

564# ----------------------------------------------------------------------------- 

565 

566 

class Bmi(SimpleNumericalResultParser):
    """
    CLINICAL EXAMINATION.

    Body mass index (BMI), in kg / m^2.
    """

    BMI = rf"""
        {WORD_BOUNDARY}
        (?: BMI | body \s+ mass \s+ index )
        {WORD_BOUNDARY}
    """
    REGEX = make_simple_numeric_regex(quantity=BMI, units=KG_PER_SQ_M)
    NAME = "BMI"
    PREFERRED_UNIT_COLUMN = "value_kg_per_sq_m"
    UNIT_MAPPING = {
        KG_PER_SQ_M: 1,  # preferred unit
    }
    # deal with "a BMI of 30"?

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef, cfg_processor_name, commit:
                passed unchanged to the superclass constructor (see the
                module docstring)
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            variable=self.NAME,
            regex_str=self.REGEX,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        value_cases = [
            ("BMI", []),  # should fail; no values
            ("body mass index was 30", [30]),
            ("his BMI (30) is too high", [30]),
            ("BMI 25 kg/sq m", [25]),
            ("BMI was 18.4 kg/m^-2", [18.4]),
            ("ACE 79", []),
            ("BMI-23", [23]),
        ]
        self.test_numerical_parser(value_cases, verbose=verbose)

619 

620 

class BmiValidator(ValidatorBase):
    """
    Validator for Bmi (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Returns the BMI variable name together with the regex string(s)
        # that recognize "BMI"/"body mass index" themselves.
        variable_name = Bmi.NAME
        regex_strings = [Bmi.BMI]
        return variable_name, regex_strings

629 

630 

631# ============================================================================= 

632# Bedside investigations: BP 

633# ============================================================================= 

634 

635 

class Bp(BaseNlpParser):
    """
    CLINICAL EXAMINATION.

    Blood pressure, in mmHg. (Systolic and diastolic.)
    """

    # This parser produces two variables (SBP and DBP) from one match, and
    # its logic is a little more complex than a simple "numerator out of
    # denominator" parser, so it subclasses BaseNlpParser directly.

    BP = r"(?: \b blood \s+ pressure \b | \b B\.?P\.? \b )"
    SYSTOLIC_BP = rf"(?: \b systolic \s+ {BP} | \b S\.?B\.?P\.? \b )"
    DIASTOLIC_BP = rf"(?: \b diastolic \s+ {BP} | \b D\.?B\.?P\.? \b )"

    TWO_NUMBER_BP = rf"""
        ( {SIGNED_FLOAT} )
        \s* (?: \b over \b | \/ ) \s*
        ( {SIGNED_FLOAT} )
    """
    ONE_NUMBER_BP = SIGNED_FLOAT

    COMPILED_BP = compile_regex(BP)
    COMPILED_SBP = compile_regex(SYSTOLIC_BP)
    COMPILED_DBP = compile_regex(DIASTOLIC_BP)
    COMPILED_ONE_NUMBER_BP = compile_regex(ONE_NUMBER_BP)
    COMPILED_TWO_NUMBER_BP = compile_regex(TWO_NUMBER_BP)
    REGEX = rf"""
        (                               # group for "BP" or equivalent
            {SYSTOLIC_BP}               # ... from more to less specific
            | {DIASTOLIC_BP}
            | {BP}
        )
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )?          # optional group for tense indicator
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )?                 # optional group for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        (
            {SIGNED_FLOAT}              # systolic
            (?:
                \s* (?: \b over \b | \/ ) \s*   # /
                {SIGNED_FLOAT}          # diastolic
            )?
        )
        {OPTIONAL_RESULTS_IGNORABLES}
        (                               # group for units
            {MM_HG}
        )?
    """
    COMPILED_REGEX = compile_regex(REGEX)

    FN_SYSTOLIC_BP_MMHG = "systolic_bp_mmhg"
    FN_DIASTOLIC_BP_MMHG = "diastolic_bp_mmhg"

    NAME = "BP"
    UNIT_MAPPING = {
        MM_HG: 1,  # preferred unit
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef, cfg_processor_name, commit:
                passed unchanged to the superclass constructor (see the
                module docstring)
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=self.NAME,
        )
        if nlpdef is not None:
            # Normal operation: the destination table comes from the config.
            self.tablename = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )
        else:
            # nlpdef is only None for debugging!
            self.tablename = self.classname().lower()

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        # docstring in superclass
        columns = [
            Column(FN_CONTENT, Text, comment=HELP_CONTENT),
            Column(FN_START, Integer, comment=HELP_START),
            Column(FN_END, Integer, comment=HELP_END),
            Column(FN_VARIABLE_TEXT, Text, comment=HELP_VARIABLE_TEXT),
            Column(
                FN_RELATION_TEXT,
                String(MAX_RELATION_TEXT_LENGTH),
                comment=HELP_RELATION_TEXT,
            ),
            Column(
                FN_RELATION,
                String(MAX_RELATION_LENGTH),
                comment=HELP_RELATION,
            ),
            Column(
                FN_VALUE_TEXT,
                String(MAX_VALUE_TEXT_LENGTH),
                comment=HELP_VALUE_TEXT,
            ),
            Column(FN_UNITS, String(MAX_UNITS_LENGTH), comment=HELP_UNITS),
            Column(
                self.FN_SYSTOLIC_BP_MMHG,
                Float,
                comment="Systolic blood pressure in mmHg",
            ),
            Column(
                self.FN_DIASTOLIC_BP_MMHG,
                Float,
                comment="Diastolic blood pressure in mmHg",
            ),
            Column(
                FN_TENSE_TEXT,
                String(MAX_TENSE_TEXT_LENGTH),
                comment=HELP_TENSE_TEXT,
            ),
            Column(FN_TENSE, String(MAX_TENSE_LENGTH), comment=HELP_TENSE),
        ]
        return {self.tablename: columns}

    def _interpret_bp_values(
        self, variable_text: str, value_text: str
    ) -> Tuple[Optional[float], Optional[float]]:
        """
        Works out systolic and diastolic values from the matched text.

        Args:
            variable_text: the text that matched the "BP" (or equivalent) group
            value_text: the text that matched the value group

        Returns:
            tuple ``sbp, dbp``; either or both may be ``None``.
        """
        # Check from more to less specific: SBP, then DBP, then plain BP.
        if self.COMPILED_SBP.match(variable_text):
            if self.COMPILED_ONE_NUMBER_BP.match(value_text):
                return to_pos_float(value_text), None
            return None, None
        if self.COMPILED_DBP.match(variable_text):
            if self.COMPILED_ONE_NUMBER_BP.match(value_text):
                return None, to_pos_float(value_text)
            return None, None
        if self.COMPILED_BP.match(variable_text):
            pair = self.COMPILED_TWO_NUMBER_BP.match(value_text)
            if pair:
                return to_pos_float(pair.group(1)), to_pos_float(pair.group(2))
        return None, None

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Parser for BP. Specialized because we're fetching two numbers.

        Yields one ``tablename, result_dict`` tuple per interpretable match
        in ``text``.
        """
        for m in self.COMPILED_REGEX.finditer(text):
            if debug:
                log.info(f"Match {m} for {text!r}")
            (
                variable_text,
                tense_text,
                relation_text,
                value_text,
                units,
            ) = m.group(1, 2, 3, 4, 5)

            sbp, dbp = self._interpret_bp_values(variable_text, value_text)
            if sbp is None and dbp is None:
                # This is OK; e.g. "BP 110", which we will ignore.
                continue

            tense, relation = common_tense(tense_text, relation_text)

            row = {
                FN_CONTENT: m.group(0),  # the whole matching text
                FN_START: m.start(),
                FN_END: m.end(),
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.FN_SYSTOLIC_BP_MMHG: sbp,
                self.FN_DIASTOLIC_BP_MMHG: dbp,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            yield self.tablename, row

    def test_bp_parser(
        self,
        test_expected_list: List[Tuple[str, List[Tuple[float, float]]]],
        verbose: bool = False,
    ) -> None:
        """
        Called by :func:`test`.

        Args:
            test_expected_list:
                tuple ``source_text, expected_values`` where
                ``expected_values`` is a list of tuples like ``sbp, dbp``.
            verbose:
                be verbose?
        """
        log.info(f"Testing parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex:\n{self.REGEX}")
        sbp_key = self.FN_SYSTOLIC_BP_MMHG
        dbp_key = self.FN_DIASTOLIC_BP_MMHG
        for test_string, expected_values in test_expected_list:
            actual_values = [
                (row[sbp_key], row[dbp_key])
                for _, row in self.parse(test_string)
            ]
            # The failure message (including the re-parse) is only built if
            # the comparison fails.
            assert actual_values == expected_values, (
                "Parser {name}: Expected {expected}, got {actual}, when "
                "parsing {test_string}; full result={full}".format(
                    name=self.classname(),
                    expected=expected_values,
                    actual=actual_values,
                    test_string=repr(test_string),
                    full=repr(list(self.parse(test_string))),
                )
            )
        log.info("... OK")

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        bp_cases = [
            ("BP", []),  # should fail; no values
            ("his blood pressure was 120/80", [(120, 80)]),
            ("BP 120/80 mmhg", [(120, 80)]),
            ("systolic BP 120", [(120, None)]),
            ("diastolic BP 80", [(None, 80)]),
            ("BP-130/70", [(130, 70)]),
            ("BP 110 /80", [(110, 80)]),
            ("BP 110 /80 -", [(110, 80)]),  # real example
            ("BP 120 / 70 -", [(120, 70)]),  # real example
            ("BP :115 / 70 -", [(115, 70)]),  # real example
            ("B.P 110", []),  # real example
        ]
        self.test_bp_parser(bp_cases, verbose=verbose)
        # 1. Unsure if best to take abs value.
        #    One reason not to might be if people express changes, e.g.
        #    "BP change -40/-10", but I very much doubt it.
        #    Went with abs value using to_pos_float().
        # 2. "BP 110" - too unreliable; not definitely a blood pressure.

886 

887 

class BpValidator(ValidatorBase):
    """
    Validator for Bp (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Returns the BP variable name together with the regex string(s)
        # used for validation.
        # NOTE(review): this passes the full Bp.REGEX (including the value
        # part), whereas the other validators in this module pass only the
        # variable-name regex -- confirm that this is intentional.
        variable_name = Bp.NAME
        regex_strings = [Bp.REGEX]
        return variable_name, regex_strings

896 

897 

898# ============================================================================= 

899# All classes in this module 

900# ============================================================================= 

901 

# Each entry pairs an NLP processor class with its corresponding validator
# class; all of these classes are defined in this module.
ALL_CLINICAL_NLP_AND_VALIDATORS = [
    (Bmi, BmiValidator),
    (Bp, BpValidator),
    (Height, HeightValidator),
    (Weight, WeightValidator),
]