Coverage for nlp_manager/regex_parser.py: 89%

286 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/nlp_manager/regex_parser.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Shared elements for regex-based NLP work.** 

27 

28""" 

29 

30from abc import abstractmethod, ABC 

31import logging 

32from typing import Any, Dict, Generator, List, Optional, Tuple 

33 

34from sqlalchemy import Column, Integer, Float, String, Text 

35 

36from crate_anon.common.regex_helpers import ( 

37 LEFT_BRACKET as LB, 

38 RIGHT_BRACKET as RB, 

39) 

40from crate_anon.nlp_manager.constants import ( 

41 MAX_SQL_FIELD_LEN, 

42 ProcessorConfigKeys, 

43 SqlTypeDbIdentifier, 

44) 

45from crate_anon.nlp_manager.base_nlp_parser import BaseNlpParser 

46from crate_anon.nlp_manager.nlp_definition import NlpDefinition 

47from crate_anon.nlp_manager.number import to_float, to_pos_float 

48from crate_anon.nlp_manager.regex_func import ( 

49 compile_regex, 

50 compile_regex_dict, 

51 get_regex_dict_match, 

52) 

53from crate_anon.nlp_manager.regex_numbers import ( 

54 SIGNED_FLOAT, 

55 IGNORESIGN_INTEGER, 

56) 

57from crate_anon.nlp_manager.regex_units import ( 

58 OUT_OF_SEPARATOR, 

59 SCORE, 

60) 

61 

62log = logging.getLogger(__name__) 

63 

64 

65# ============================================================================= 

66# Generic entities 

67# ============================================================================= 

68 

69# ----------------------------------------------------------------------------- 

70# Blood results 

71# ----------------------------------------------------------------------------- 

72 

# Regex fragment for "junk" that may sit between the parts of a result and
# should be skipped. It is written in verbose style (embedded whitespace and
# "#" comments), so it is only meaningful when compiled in verbose mode.
# The trailing "*" means the fragment carries its own optionality.
OPTIONAL_RESULTS_IGNORABLES = r"""
    (?:  # OPTIONAL_RESULTS_IGNORABLES
        \s | \| | \:          # whitespace, bar, colon
        | \bHH?\b | \(HH?\)   # H/HH at a word boundary; (H)/(HH)
        | \bLL?\b | \(LL?\)   # L/LL etc.
        | \* | \(\*\)         # *, (*)
        | — | --              # em dash, double hyphen-minus
        | –\s+ | -\s+ | ‐\s+  # en dash/hyphen-minus/Unicode hyphen; whitespace
    )*                        # ... any of those, repeated 0 or more times
"""
# - you often get | characters when people copy/paste tables
# - blood test abnormality markers can look like e.g.
#       17 (H), 17 (*), 17 HH
# Re parentheses:
# - you can also see things like "CRP (5)"
#   ... but we'll handle that
# - However, if there's a right parenthesis only, that's less good, e.g.
#   "Present: Nicola Adams (NA). 1.0. Minutes of the last meeting."
#   ... which we don't want to be interpreted as "sodium 1.0".
#   HOW BEST TO DO THIS?
# - https://stackoverflow.com/questions/546433/regular-expression-to-match-outer-brackets  # noqa: E501
#   https://stackoverflow.com/questions/7898310/using-regex-to-balance-match-parenthesis  # noqa: E501
# - ... simplest is perhaps: base ignorables, or those with brackets, as above
# - ... even better than a nested thing is just a list of alternatives

# Optional ", POC" suffix after a quantity name. The trailing "?" means the
# fragment carries its own optionality.
OPTIONAL_POC = r"""
    (?: ,? \s+ POC )?  # OPTIONAL_POC: point-of-care testing, "[,] POC"
"""
# ... e.g. "Glucose, POC"; "Potassium, POC".
# Seen in CUH for
#
#   sodium, POC
#   potassium, POC
#   creatinine, POC
#   urea, POC
#   glucose, POC
#   lactate, POC
#   bilirubin, POC
#   HCT, POC
#   alkaline phosphatase, POC
#   alanine transferase, POC
#
#   HGB, POC
#   WBC, POC
#   PLT, POC
#   MCV, POC
#   MCH, POC
#   neutrophil count, POC
#   lymphocyte count, POC

123# ----------------------------------------------------------------------------- 

124# Tense indicators 

125# ----------------------------------------------------------------------------- 

126 

# Words recognized as tense indicators in the source text.
IS = "is"
WAS = "was"
# Either tense word, as a whole word (verbose-style regex fragment).
TENSE_INDICATOR = rf"(?: \b {IS} \b | \b {WAS} \b )"

# Standardized result values. The longest value that TENSE_LOOKUP can return
# determines MAX_TENSE_LENGTH (MAX_TENSE_TEXT_LENGTH bounds the raw matched
# text instead).
PAST = "past"
PRESENT = "present"
EVER = "ever"  # e.g. for "never"

# Maps tense regexes to the standardized tense values above. Note that EVER
# is not produced by this lookup.
TENSE_LOOKUP = compile_regex_dict(
    {
        IS: PRESENT,
        WAS: PAST,
    }
)

142 

143# ----------------------------------------------------------------------------- 

144# Mathematical relations 

145# ----------------------------------------------------------------------------- 

# ... don't use unnamed groups here; EQ is also used as a return value
# NOTE(review): the fragments below use non-capturing groups "(?: ... )";
# presumably the warning above is about not adding *capturing* groups, which
# would disturb the groups of regexes built from these pieces -- confirm.

LT = r"(?: < | less \s+ than | under )"
LE = "<="
EQ = r"(?: = | equals | equal \s+ to )"
GE = ">="
GT = r"(?: > | (?:more|greater) \s+ than | over )"
# OF = "\b of \b"  # as in: "a BMI of 30"... but too likely to be mistaken for a target?  # noqa: E501

# Any relation. Alternation tries options left to right, so the two-character
# operators must be offered before their one-character prefixes.
RELATION = rf"(?: {LE} | {LT} | {EQ} | {GE} | {GT} )"
# ... ORDER MATTERS: greedier things first, i.e.
# - LE before LT
# - GE before GT

# Maps relation regexes to standardized relation symbols. The longest symbol
# here determines MAX_RELATION_LENGTH.
RELATION_LOOKUP = compile_regex_dict(
    {
        # To standardize the output, so (for example) "=" and "equals" can both
        # map to "=".
        LT: "<",
        LE: "<=",
        EQ: "=",
        GE: ">=",
        GT: ">",
    }
)

# -----------------------------------------------------------------------------
# Punctuation
# -----------------------------------------------------------------------------

# Character class matching either apostrophe form.
APOSTROPHE = "['’]"  # ASCII apostrophe; right single quote (U+2019)

177 

178 

179# ============================================================================= 

180# Regex assembly functions 

181# ============================================================================= 

182 

183 

184# ============================================================================= 

185# Functions to handle processed data 

186# ============================================================================= 

187 

188 

def common_tense(
    tense_text: Optional[str], relation_text: Optional[str]
) -> Tuple[Optional[str], Optional[str]]:
    """
    Standardizes the "tense" and "relation" text fragments found by a regex.

    - For example, this helps to impute that "CRP was 72" means that the
      relation was EQ in the PAST.

    Args:
        tense_text: putative tense information
        relation_text: putative relationship (equals, less than, etc.)

    Returns:
        tuple: ``tense, relation``. The tense is ``None`` if neither argument
        has any text; otherwise it is whatever the tense lookup gives. The
        relation lookup is called with ``"="`` as its default.
    """
    # The tense is sought in the tense text if there is any; failing that, in
    # the relation text (if there is any).
    tense_source = tense_text or relation_text
    if tense_source:
        _matched, tense = get_regex_dict_match(tense_source, TENSE_LOOKUP)
    else:
        tense = None

    _matched, relation = get_regex_dict_match(
        relation_text, RELATION_LOOKUP, "="
    )
    return tense, relation

215 

216 

217# ============================================================================= 

218# Constants for generic processors 

219# ============================================================================= 

220 

# Output column (field) names used by the generic processors in this module.
FN_VARIABLE_NAME = "variable_name"
FN_CONTENT = "_content"
FN_START = "_start"
FN_END = "_end"
FN_VARIABLE_TEXT = "variable_text"
FN_RELATION_TEXT = "relation_text"
FN_RELATION = "relation"
FN_VALUE_TEXT = "value_text"
FN_UNITS = "units"
FN_TENSE_TEXT = "tense_text"
FN_TENSE = "tense"

# Column comments (help text) corresponding to the fields above.
HELP_VARIABLE_NAME = "Variable name"
HELP_CONTENT = "Matching text contents"
HELP_START = "Start position (of matching string within whole text)"
HELP_END = "End position (of matching string within whole text)"
HELP_VARIABLE_TEXT = "Text that matched the variable name"
HELP_RELATION_TEXT = (
    "Text that matched the mathematical relationship between variable and "
    "value (e.g. '=', '<=', 'less than')"
)
HELP_RELATION = (
    "Standardized mathematical relationship between variable and value "
    "(e.g. '=', '<=')"
)
HELP_VALUE_TEXT = "Matched numerical value, as text"
HELP_UNITS = "Matched units, as text"
HELP_TARGET_UNIT = "Numerical value in preferred units, if known"
HELP_TENSE_TEXT = f"Tense text, if known (e.g. '{IS}', '{WAS}')"
HELP_TENSE = f"Calculated tense, if known (e.g. '{PAST}', '{PRESENT}')"

# Lengths for string columns. The "relation" and "tense" columns hold
# standardized values, so their lengths are derived from the longest value
# that the corresponding lookup can produce.
MAX_RELATION_TEXT_LENGTH = 50
MAX_RELATION_LENGTH = max(len(x) for x in RELATION_LOOKUP.values())
MAX_VALUE_TEXT_LENGTH = 50
MAX_UNITS_LENGTH = 50
MAX_TENSE_TEXT_LENGTH = 50
MAX_TENSE_LENGTH = max(len(x) for x in TENSE_LOOKUP.values())

258 

259 

260# ============================================================================= 

261# Generic processors 

262# ============================================================================= 

263 

264# ----------------------------------------------------------------------------- 

265# NumericalResultParser 

266# ----------------------------------------------------------------------------- 

267 

268 

class NumericalResultParser(BaseNlpParser):
    """
    DO NOT USE DIRECTLY. Base class for generic numerical results, where
    a SINGLE variable is produced.
    """

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: str,
        variable: str,
        target_unit: str,
        regex_str_for_debugging: str,
        commit: bool = False,
    ) -> None:
        r"""
        Init function for NumericalResultParser.

        Args:
            nlpdef:
                A :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`.
                May be ``None``, but only for debugging; in that case the
                table name is derived from the class name and the preferred
                unit is assumed.

            cfg_processor_name:
                Config section name in the :ref:`NLP config file <nlp_config>`.

            variable:
                Used by subclasses as the record value for ``variable_name``.

            target_unit:
                Fieldname used for the primary output quantity.

            regex_str_for_debugging:
                String form of regex, for debugging.

            commit:
                Force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.

        Subclasses will extend this method.
        """
        # NB This docstring was associated with Sphinx errors!
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=variable,
        )
        self.variable = variable
        self.target_unit = target_unit
        self.regex_str_for_debugging = regex_str_for_debugging

        if nlpdef is None:  # only None for debugging!
            self.tablename = self.classname().lower()
            self.assume_preferred_unit = True
        else:
            self.tablename = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )
            self.assume_preferred_unit = self._cfgsection.opt_bool(
                ProcessorConfigKeys.ASSUME_PREFERRED_UNIT, default=True
            )

        # Sanity checks
        # (Kept as an assertion so that the exception type seen by existing
        # callers does not change.)
        assert (
            len(self.variable) <= MAX_SQL_FIELD_LEN
        ), f"Variable name too long (max {MAX_SQL_FIELD_LEN} characters)"

    def get_regex_str_for_debugging(self) -> str:
        """
        Returns the string version of the regex, for debugging.
        """
        return self.regex_str_for_debugging

    def set_tablename(self, tablename: str) -> None:
        """
        In case a friend class wants to override.
        """
        self.tablename = tablename

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        # docstring in superclass
        # A single destination table, with one Float column named after the
        # target unit for the primary numerical result.
        return {
            self.tablename: [
                Column(
                    FN_VARIABLE_NAME,
                    SqlTypeDbIdentifier,
                    comment=HELP_VARIABLE_NAME,
                ),
                Column(FN_CONTENT, Text, comment=HELP_CONTENT),
                Column(FN_START, Integer, comment=HELP_START),
                Column(FN_END, Integer, comment=HELP_END),
                Column(FN_VARIABLE_TEXT, Text, comment=HELP_VARIABLE_TEXT),
                Column(
                    FN_RELATION_TEXT,
                    String(MAX_RELATION_TEXT_LENGTH),
                    comment=HELP_RELATION_TEXT,
                ),
                Column(
                    FN_RELATION,
                    String(MAX_RELATION_LENGTH),
                    comment=HELP_RELATION,
                ),
                Column(FN_VALUE_TEXT, Text, comment=HELP_VALUE_TEXT),
                Column(FN_UNITS, String(MAX_UNITS_LENGTH), comment=HELP_UNITS),
                Column(self.target_unit, Float, comment=HELP_TARGET_UNIT),
                Column(
                    FN_TENSE_TEXT,
                    String(MAX_TENSE_TEXT_LENGTH),
                    comment=HELP_TENSE_TEXT,
                ),
                Column(FN_TENSE, String(MAX_TENSE_LENGTH), comment=HELP_TENSE),
            ]
        }

    @abstractmethod
    def parse(
        self, text: str
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        raise NotImplementedError

    def test_numerical_parser(
        self,
        test_expected_list: List[Tuple[str, List[float]]],
        add_test_no_plain_number: bool = True,
        verbose: bool = False,
    ) -> None:
        """
        Checks the primary numerical value(s) produced for a set of test
        strings.

        Args:
            test_expected_list:
                list of tuples ``test_string, expected_values``. The parser
                will parse ``test_string`` and compare the result (each value
                of the target unit) to ``expected_values``, which is a list of
                numerical (``float``), and can be an empty list.
            add_test_no_plain_number:
                also check that the plain number ``"999"`` (no quantity
                specified) produces no results
            verbose:
                show the regex string too

        Raises:
            :exc:`AssertionError` if a comparison fails

        Compare also :meth:`detailed_test`.
        """
        log.info(f"Testing parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex string:\n{self.regex_str_for_debugging}")
        if add_test_no_plain_number:
            test_expected_list = test_expected_list + [
                ("999", [])  # no quantity specified
            ]  # use "+ [...]", not append(), so as not to modify for caller
        for test_string, expected_values in test_expected_list:
            full_result = list(self.parse(test_string))
            actual_values = [
                resultdict[self.target_unit] for _, resultdict in full_result
            ]
            assert actual_values == expected_values, (
                f"Parser {self.classname()}: Expected {expected_values!r}, "
                f"got {actual_values!r}, when parsing {test_string!r}; "
                f"full result:\n{full_result!r}"
            )
        log.info("... OK")

    def detailed_test(
        self, text: str, expected: List[Dict[str, Any]], verbose: bool = False
    ) -> None:
        """
        Runs a more detailed check. Whereas :func:`test_numerical_parser` tests
        the primary numerical results, this function tests other key/value
        pairs returned by the parser.

        Args:
            text:
                text to parse
            expected:
                list of ``resultdict`` dictionaries (each mapping column names
                to values).

                - The parser should return one result dictionary for
                  every entry in ``expected``.
                - It's fine for the ``resultdict`` not to include all the
                  columns returned for the parser. However, for any column that
                  is present, the parser must provide the corresponding value.

            verbose:
                be verbose

        Raises:
            :exc:`ValueError` if the number of results differs from the number
            expected, if an expected key is missing from a result, or if a
            value differs from that expected
        """
        full_result = list(self.parse(text))
        if len(full_result) != len(expected):
            raise ValueError(
                f"Parser {self.classname()}: expected {len(expected)} results "
                f"but got {len(full_result)} when parsing {text!r}; "
                f"full result:\n{full_result!r}"
            )
        if verbose:
            log.info(f"detailed_test: {text!r} -> {full_result!r}")
        # The lengths are known to be equal here, so zip() pairs every result
        # with its expectation.
        for (_, result), expected_dict in zip(full_result, expected):
            for k, expected_value in expected_dict.items():
                if k not in result:
                    raise ValueError(
                        f"Parser {self.classname()}: Expected value dict "
                        f"had key {k!r} but this is absent from result "
                        f"{result!r}"
                    )
                observed_value = result[k]
                if observed_value != expected_value:
                    raise ValueError(
                        f"Parser {self.classname()}: expected {k} = "
                        f"{expected_value!r}, got {observed_value!r}, "
                        f"when parsing {text!r}; full result:\n"
                        f"{full_result!r}"
                    )

    def detailed_test_multiple(
        self,
        tests: List[Tuple[str, List[Dict[str, Any]]]],
        verbose: bool = False,
    ) -> None:
        """
        Runs :meth:`detailed_test` on each of several test strings.

        Args:
            tests:
                list of tuples ``test_string, expected``. The parser will parse
                ``test_string`` and compare the result(s) to ``expected``. This
                is a list of dictionaries with keys that can be like
                ``values``, ``tense``, etc. Each dictionary value is the
                corresponding expected value.
            verbose:
                show the regex string too

        Raises:
            :exc:`ValueError` if a comparison fails
        """
        log.info(f"Detailed tests for parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex string:\n{self.regex_str_for_debugging}")
        for test_string, expected_dict_list in tests:
            self.detailed_test(
                test_string, expected_dict_list, verbose=verbose
            )
        log.info("... OK")

507 

508 

509# ----------------------------------------------------------------------------- 

510# SimpleNumericalResultParser 

511# ----------------------------------------------------------------------------- 

512 

# Regex group number for the whole match, as in m.group(0).
GROUP_NUMBER_WHOLE_EXPRESSION = 0

# Names of the named groups produced by make_simple_numeric_regex() and read
# back by SimpleNumericalResultParser.parse().
GROUP_NAME_QUANTITY = "quantity"
GROUP_NAME_RELATION = "relation"
GROUP_NAME_TENSE = "tense"
GROUP_NAME_UNITS = "units"
GROUP_NAME_VALUE = "value"

520 

521 

def make_simple_numeric_regex(
    quantity: str,
    units: str,
    value: str = SIGNED_FLOAT,
    tense_indicator: str = TENSE_INDICATOR,
    relation: str = RELATION,
    optional_results_ignorables: str = OPTIONAL_RESULTS_IGNORABLES,
    optional_ignorable_after_quantity: str = "",
    units_optional: bool = True,
) -> str:
    r"""
    Makes a regex with named groups to handle simple numerical results.

    Copes with formats like:

    .. code-block:: none

        sodium 132 mM
        sodium (mM) 132
        sodium (132 mM)

    ... and lots more.

    Args:
        quantity:
            Regex for the quantity (e.g. for "sodium" or "Na").
        units:
            Regex for units.
        value:
            Regex for the numerical value (e.g. our ``SIGNED_FLOAT`` regex).
        tense_indicator:
            Regex for tense indicator.
        relation:
            Regex for mathematical relationship (e.g. equals, less than).
        optional_results_ignorables:
            Regex for junk to ignore in between the other things.
            Should include its own "optionality" (e.g. ``*``).
        optional_ignorable_after_quantity:
            Regex for additional things that can be ignored right after the
            quantity. Should include its own "optionality" (e.g. ``?``).
        units_optional:
            The units are allowed to be omitted. Usually true. This applies
            only to the "quantity [tense] [relation] value [units]" form; the
            two bracketed-units forms always require units.

    Returns:
        The regex, as a string in verbose style (it contains insignificant
        whitespace and ``#`` comments).

    The resulting regex groups are named, not numbered:

    .. code-block:: none

        0:          Whole thing; integer, as in: m.group(0)
        'quantity': Quantity
        'tense':    Tense (optional)
        'relation': Relation (optional)
        'value':    Value
        'units':    Units (optional)

    ... as used by :class:`SimpleNumericalResultParser`.

    Just to check re overlap:

    .. code-block:: python

        import regex
        s1 = r"(?P<quantity>Sodium)\s+(?P<value>\d+)\s+(?P<units>mM)"
        s2 = r"(?P<quantity>Sodium)\s+\((?P<units>mM)\)\s+(?P<value>\d+)"
        s = f"{s1}|{s2}"
        r = regex.compile(s)
        t1 = "Sodium 132 mM"
        t2 = "Sodium (mM) 127"
        m1 = r.match(t1)
        m2 = r.match(t2)

        print(m1.group(0))  # Sodium 132 mM
        print(m1.group("quantity"))  # Sodium
        print(m1.group("value"))  # 132
        print(m1.group("units"))  # mM

        print(m2.group(0))  # Sodium (mM) 127
        print(m2.group("quantity"))  # Sodium
        print(m2.group("value"))  # 127
        print(m2.group("units"))  # mM

    ... so it's fine in that multiple groups can have the same name.

    """

    def group(groupname: str, contents: str, optional: bool = False) -> str:
        # Wraps a regex in a named group, optionally followed by "?".
        opt_str = "?" if optional else ""
        return f"(?P<{groupname}> {contents} ){opt_str}"

    def bracketed(s: str) -> str:
        # Wraps a regex in literal brackets, allowing inner whitespace.
        return rf"{LB} \s* {s} \s* {RB}"

    group_quantity = group(GROUP_NAME_QUANTITY, quantity)
    group_tense_optional = group(GROUP_NAME_TENSE, tense_indicator, True)
    group_relation_optional = group(GROUP_NAME_RELATION, relation, True)
    group_units = group(GROUP_NAME_UNITS, units)
    group_units_bracketed = bracketed(group_units)
    group_value = group(GROUP_NAME_VALUE, value)
    group_value_bracketed = bracketed(group_value)
    value_units_all_bracketed = bracketed(rf"{group_value} \s+ {group_units}")
    units_optional_descriptor = "optional" if units_optional else "required"
    qmark_if_units_optional = "?" if units_optional else ""

    # NOTE(review): in the first alternative below, nothing ignorable is
    # allowed between the bracketed units and the optional tense indicator, so
    # text such as "sodium (mM) is 132" appears not to be matched by that
    # branch. Left unchanged here; confirm whether that is intended.
    return rf"""
        # - Either: quantity [tense] [relation] value [units]
        #   or:     quantity (value units)
        #   or:     quantity (units) [tense] [relation] value
        # Quantity:
        {group_quantity}
        # Ignorable:
        {optional_ignorable_after_quantity}
        {optional_results_ignorables}
        (?:
            (?:
                # (units) ... [tense] ... [relation] ... value
                # Units, in brackets:
                {group_units_bracketed}
                # Tense indicator (optional):
                {group_tense_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Relation (optional):
                {group_relation_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Value:
                {group_value}
            )
            |
            (?:
                # (value units)
                {value_units_all_bracketed}
            )
            |
            (?:
                # [tense] ... [relation] ... value|(value) ... [units]
                # Tense indicator (optional):
                {group_tense_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Relation (optional):
                {group_relation_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Value or (value):
                (?:
                    {group_value} |
                    {group_value_bracketed}
                )
                # Ignorable:
                {optional_results_ignorables}
                # Units ({units_optional_descriptor}):
                {group_units}{qmark_if_units_optional}
            )
        )
    """

677 

678 

class SimpleNumericalResultParser(NumericalResultParser, ABC):
    """
    Base class for simple single-format numerical results. Use this when not
    only do you have a single variable to produce, but you have a single regex
    (in a standard format) that can produce it.
    """

    def __init__(
        self,
        nlpdef: NlpDefinition,
        cfg_processor_name: str,
        regex_str: str,
        variable: str,
        target_unit: str,
        units_to_factor: Dict[str, Any],
        take_absolute: bool = False,
        commit: bool = False,
        debug: bool = False,
    ) -> None:
        """
        Args:

            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`

            cfg_processor_name:
                config section suffix in the :ref:`NLP config file
                <nlp_config>`

            regex_str:
                Regular expression, in string format. It is compiled here.

                This class reads the following NAMED groups from each match
                (see :func:`make_simple_numeric_regex`):

                - ``quantity`` (the text naming the variable)
                - ``tense``
                - ``relation``
                - ``value``
                - ``units``

            variable:
                used as the record value for ``variable_name``

            target_unit:
                fieldname used for the primary output quantity

            units_to_factor:
                dictionary, mapping

                - FROM a regex string for units (these are compiled here)
                - TO EITHER a float (multiple) to multiply those units by, to
                  get the preferred unit
                - OR a function taking a text parameter and returning a float
                  value in preferred unit

                Any units present in the regex but absent from
                ``units_to_factor`` will lead the result to be ignored. For
                example, this allows you to ignore a relative neutrophil count
                ("neutrophils 2.2%") while detecting absolute neutrophil counts
                ("neutrophils 2.2"), or ignoring "docusate sodium 100mg" but
                detecting "sodium 140 mM".

            take_absolute:
                Convert negative values to positive ones? Typical text
                requiring this option might look like:

                .. code-block:: none

                    CRP-4
                    CRP-106
                    CRP -97
                    Blood results for today as follows: Na- 142, K-4.1, ...

                ... occurring in 23 out of 8054 hits for CRP of one test set in
                our data.

                For many quantities, we know that they cannot be negative, so
                this is just a notation rather than a minus sign. We have to
                account for it, or it'll distort our values. Preferable to
                account for it here rather than later; see manual.

            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.

            debug:
                print the regex?

        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            variable=variable,
            target_unit=target_unit,
            regex_str_for_debugging=regex_str,
            commit=commit,
        )
        if debug:
            log.debug(f"Regex for {self.classname()}: {regex_str}")
        self.compiled_regex = compile_regex(regex_str)
        self.units_to_factor = compile_regex_dict(units_to_factor)
        self.take_absolute = take_absolute

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        if not text:
            return
        for m in self.compiled_regex.finditer(text):
            startpos = m.start()
            endpos = m.end()
            # groups = repr(m.groups())  # all matching groups
            matching_text = m.group(GROUP_NUMBER_WHOLE_EXPRESSION)
            # matching_text = text[startpos:endpos]  # same thing

            variable_text = m.group(GROUP_NAME_QUANTITY)
            tense_text = m.group(GROUP_NAME_TENSE)
            relation_text = m.group(GROUP_NAME_RELATION)
            value_text = m.group(GROUP_NAME_VALUE)
            units = m.group(GROUP_NAME_UNITS)

            # If units are known (or we're choosing to assume preferred units
            # if none are specified), calculate an absolute value
            value_in_target_units = None
            if units:
                matched_unit, multiple_or_fn = get_regex_dict_match(
                    units, self.units_to_factor
                )
                if not matched_unit:
                    # None of our units match. But there is a unit, and the
                    # regex matched. So this is a BAD unit. Skip the value.
                    continue
                # Otherwise: we did match a unit.
                if callable(multiple_or_fn):
                    value_in_target_units = multiple_or_fn(value_text)
                else:
                    # Guard against a failed conversion: presumably to_float()
                    # can return None for text it cannot convert (its
                    # definition is not visible here). Multiplying None would
                    # raise TypeError; instead, leave the value as None, as
                    # the no-units branch below would.
                    numeric_value = to_float(value_text)
                    if numeric_value is not None:
                        value_in_target_units = numeric_value * multiple_or_fn
            elif self.assume_preferred_unit:  # unit is None or empty
                value_in_target_units = to_float(value_text)

            if value_in_target_units is not None and self.take_absolute:
                value_in_target_units = abs(value_in_target_units)

            tense, relation = common_tense(tense_text, relation_text)

            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_in_target_units,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result

845 

846 

847# ----------------------------------------------------------------------------- 

848# NumeratorOutOfDenominatorParser 

849# ----------------------------------------------------------------------------- 

850 

851 

852class NumeratorOutOfDenominatorParser(BaseNlpParser, ABC): 

853 """ 

854 Base class for X-out-of-Y numerical results, e.g. for MMSE/ACE. 

855 

856 - Integer denominator, expected to be positive. 

857 - Otherwise similar to :class:`SimpleNumericalResultParser`. 

858 """ 

859 

860 def __init__( 

861 self, 

862 nlpdef: NlpDefinition, 

863 cfg_processor_name: str, 

864 variable_name: str, # e.g. "MMSE" 

865 variable_regex_str: str, # e.g. regex for MMSE 

866 expected_denominator: int, 

867 numerator_text_fieldname: str = "numerator_text", 

868 numerator_fieldname: str = "numerator", 

869 denominator_text_fieldname: str = "denominator_text", 

870 denominator_fieldname: str = "denominator", 

871 correct_numerator_fieldname: str = None, # default below 

872 take_absolute: bool = True, 

873 commit: bool = False, 

874 debug: bool = False, 

875 ) -> None: 

876 """ 

877 This class operates with compiled regexes having this group format: 

878 - quantity_regex_str: e.g. to find "MMSE" 

879 

880 Args: 

881 nlpdef: 

882 a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition` 

883 cfg_processor_name: 

884 the suffix (name) of a CRATE NLP config file processor section 

885 (from which we may choose to get extra config information) 

886 variable_name: 

887 becomes the content of the ``variable_name`` output column 

888 variable_regex_str: 

889 regex for the text that states the variable 

890 expected_denominator: 

891 the integer value that's expected as the "out of Y" part. For 

892 example, an MMSE is out of 30; an ACE-III total is out of 100. 

893 If the text just says "MMSE 17", we will infer "17 out of 30"; 

894 so, for the MMSE, ``expected_denominator`` should be 30. 

895 numerator_text_fieldname: 

896 field (column) name in which to store the text retrieved as the 

897 numerator 

898 numerator_fieldname: 

899 field (column) name in which to store the numerical value 

900 retrieved as the numerator 

901 denominator_text_fieldname: 

902 field (column) name in which to store the text retrieved as the 

903 denominator 

904 denominator_fieldname: 

905 field (column) name in which to store the numerical value 

906 retrieved as the denominator 

907 correct_numerator_fieldname: 

908 field (column) name in which we store the principal validated 

909 numerator. For example, if an MMSE processor sees "17" or 

910 "17/30", this field will end up containing 17; but if it sees 

911 "17/100", it will remain NULL. 

912 take_absolute: 

913 Convert negative values to positive ones? 

914 As for :class:`SimpleNumericalResultParser`. 

915 commit: 

916 force a COMMIT whenever we insert data? You should specify this 

917 in multiprocess mode, or you may get database deadlocks. 

918 debug: 

919 print the regex? 

920 

921 """ 

922 self.variable_name = variable_name 

923 assert expected_denominator > 0 

924 self.expected_denominator = expected_denominator 

925 self.numerator_text_fieldname = numerator_text_fieldname 

926 self.numerator_fieldname = numerator_fieldname 

927 self.denominator_text_fieldname = denominator_text_fieldname 

928 self.denominator_fieldname = denominator_fieldname 

929 self.correct_numerator_fieldname = ( 

930 correct_numerator_fieldname or f"out_of_{expected_denominator}" 

931 ) 

932 self.take_absolute = take_absolute 

933 

934 super().__init__( 

935 nlpdef=nlpdef, 

936 cfg_processor_name=cfg_processor_name, 

937 commit=commit, 

938 friendly_name=variable_name, 

939 ) 

940 if nlpdef is None: # only None for debugging! 

941 self.tablename = self.classname().lower() 

942 else: 

943 self.tablename = self._cfgsection.opt_str( 

944 ProcessorConfigKeys.DESTTABLE, required=True 

945 ) 

946 

947 regex_str = rf""" 

948 ( {variable_regex_str} ) # 1. group for variable (thing being measured) 

949 {OPTIONAL_RESULTS_IGNORABLES} 

950 {SCORE}? # optional "score" or similar 

951 {OPTIONAL_RESULTS_IGNORABLES} 

952 ( {TENSE_INDICATOR} )? # 2. optional group for tense indicator 

953 {OPTIONAL_RESULTS_IGNORABLES} 

954 ( {RELATION} )? # 3. optional group for relation 

955 {OPTIONAL_RESULTS_IGNORABLES} 

956 ( {SIGNED_FLOAT} ) # 4. group for numerator 

957 (?: # optional "/ denominator" 

958 \s* {OUT_OF_SEPARATOR} \s* 

959 ( {IGNORESIGN_INTEGER} ) # 5. group for denominator 

960 )? 

961 """ # noqa: E501 

962 if debug: 

963 log.debug(f"Regex for {self.classname()}: {regex_str}") 

964 self.regex_str = regex_str 

965 self.compiled_regex = compile_regex(regex_str) 

966 

def dest_tables_columns(self) -> Dict[str, List[Column]]:
    # Overrides the superclass method; the contract is documented there.
    # This parser writes to exactly one table (self.tablename).

    # Columns describing the match itself and any relation operator found.
    match_columns = [
        Column(
            FN_VARIABLE_NAME, SqlTypeDbIdentifier, comment=HELP_VARIABLE_NAME
        ),
        Column(FN_CONTENT, Text, comment=HELP_CONTENT),
        Column(FN_START, Integer, comment=HELP_START),
        Column(FN_END, Integer, comment=HELP_END),
        Column(FN_VARIABLE_TEXT, Text, comment=HELP_VARIABLE_TEXT),
        Column(
            FN_RELATION_TEXT,
            String(MAX_RELATION_TEXT_LENGTH),
            comment=HELP_RELATION_TEXT,
        ),
        Column(
            FN_RELATION, String(MAX_RELATION_LENGTH), comment=HELP_RELATION
        ),
    ]

    # Columns whose names were supplied to the constructor: the "X out of
    # Y" score as raw text and as numbers, plus the validated numerator.
    score_columns = [
        Column(
            self.numerator_text_fieldname,
            String(MAX_VALUE_TEXT_LENGTH),
            comment="Numerator, as text",
        ),
        Column(self.numerator_fieldname, Float, comment="Numerator"),
        Column(
            self.denominator_text_fieldname,
            String(MAX_VALUE_TEXT_LENGTH),
            comment="Denominator, as text",
        ),
        Column(self.denominator_fieldname, Float, comment="Denominator"),
        Column(
            self.correct_numerator_fieldname,
            Float,
            comment=(
                "Numerator, if denominator is as expected "
                "(units are correct)"
            ),
        ),
    ]

    # Columns describing any tense indicator found.
    tense_columns = [
        Column(
            FN_TENSE_TEXT,
            String(MAX_TENSE_TEXT_LENGTH),
            comment=HELP_TENSE_TEXT,
        ),
        Column(FN_TENSE, String(MAX_TENSE_LENGTH), comment=HELP_TENSE),
    ]

    return {self.tablename: match_columns + score_columns + tense_columns}

1018 

def parse(
    self, text: str, debug: bool = False
) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
    # Overrides the superclass method; the contract is documented there.
    # Yields one (tablename, row) pair per match of self.compiled_regex.
    for m in self.compiled_regex.finditer(text):
        # Capture groups, in the order laid out in the regex built by the
        # constructor: variable, tense, relation, numerator, denominator.
        (
            variable_text,
            tense_text,
            relation_text,
            numerator_text,
            denominator_text,
        ) = m.group(1, 2, 3, 4, 5)

        convert_numerator = to_pos_float if self.take_absolute else to_float
        numerator = convert_numerator(numerator_text)
        denominator = to_float(denominator_text)

        if numerator is None:
            # The numerator group is mandatory in the regex, so this
            # should never happen; skip the match rather than emit it.
            log.critical("bug - numerator is None, should be impossible")
            continue

        if denominator is None:
            # Bare score (e.g. "MMSE 17"): accept it if it could be a
            # score out of the expected denominator.
            plausible = numerator <= self.expected_denominator
        else:
            # Explicit "X out of Y": Y must be the expected denominator,
            # and X must not exceed it.
            plausible = numerator <= denominator == self.expected_denominator
        correct_numerator = numerator if plausible else None

        tense, relation = common_tense(tense_text, relation_text)
        startpos, endpos = m.span()

        result = {
            FN_VARIABLE_NAME: self.variable_name,
            FN_CONTENT: m.group(),  # the whole matching text
            FN_START: startpos,
            FN_END: endpos,
            FN_VARIABLE_TEXT: variable_text,
            FN_RELATION_TEXT: relation_text,
            FN_RELATION: relation,
            self.numerator_text_fieldname: numerator_text,
            self.numerator_fieldname: numerator,
            self.denominator_text_fieldname: denominator_text,
            self.denominator_fieldname: denominator,
            self.correct_numerator_fieldname: correct_numerator,
            FN_TENSE_TEXT: tense_text,
            FN_TENSE: tense,
        }
        if debug:
            log.debug(f"Match {m} for {text!r} -> {result}")
        yield self.tablename, result

1074 

def test_numerator_denominator_parser(
    self,
    test_expected_list: List[Tuple[str, List[Tuple[float, float]]]],
    verbose: bool = False,
) -> None:
    """
    Check the parser against a list of known inputs and outputs.

    Args:
        test_expected_list:
            list of tuples ``test_string, expected_values``. Each
            ``test_string`` is parsed, and the ``(numerator, denominator)``
            pair from every result is collected, in order, and compared
            with ``expected_values`` (a list of such pairs; may be empty).
        verbose:
            log the regex (at debug level) before testing?

    Raises:
        :exc:`AssertionError` if a comparison fails
    """
    log.info(f"Testing parser: {self.classname()}")
    if verbose:
        log.debug(f"... regex:\n{self.regex_str}")
    for test_string, expected_values in test_expected_list:
        actual_values = [
            (row[self.numerator_fieldname], row[self.denominator_fieldname])
            for _table, row in self.parse(test_string)
        ]
        # The message is built only on failure, so the second parse (for
        # the full result dump) costs nothing when the test passes.
        assert actual_values == expected_values, (
            f"Parser {self.classname()}: Expected {expected_values}, "
            f"got {actual_values}, when parsing {test_string!r}; "
            f"full result:\n{list(self.parse(test_string))!r}"
        )
    log.info("... OK")

1114 

1115 

1116# ============================================================================= 

1117# Validator base class (for testing regex NLP classes) 

1118# ============================================================================= 

1119 

1120 

class ValidatorBase(BaseNlpParser):
    r"""
    DO NOT USE DIRECTLY. Base class for **validating** regex parser
    sensitivity.

    The validator will find fields that refer to the variable, whether or not
    they meet the other criteria of the actual NLP processors (i.e. whether or
    not they contain a valid value). More explanation below.

    Suppose we're validating C-reactive protein (CRP). Key concepts:

    - source (true state of the world): Pr present, Ab absent
    - software decision: Y yes, N no
    - signal detection theory classification:

      - hit = Pr & Y = true positive
      - miss = Pr & N = false negative
      - false alarm = Ab & Y = false positive
      - correct rejection = Ab & N = true negative

    - common SDT metrics:

      - positive predictive value, PPV = P(Pr | Y) = precision (\*)
      - negative predictive value, NPV = P(Ab | N)
      - sensitivity = P(Y | Pr) = recall (\*) = true positive rate
      - specificity = P(N | Ab) = true negative rate

      (\*) common names used in the NLP context.

    - other common classifier metric:

      .. code-block:: none

        F_beta score = (1 + beta^2) * precision * recall /
                       ((beta^2 * precision) + recall)

      ... which measures performance when you value recall beta times as much
      as precision (thus, for example, the F1 score when beta = 1). See
      https://en.wikipedia.org/wiki/F1_score

    Working from source to NLP, we can see there are a few types of "absent":

    - X. unselected database field containing text

      - Q. field contains "CRP", "C-reactive protein", etc.; something
        that a human (or as a proxy: a machine) would judge as
        containing a textual reference to CRP.

        - Pr. Present: a human would judge that a CRP value is present,
          e.g. "today her CRP is 7, which I am not concerned about."

          - H. Hit: software reports the value.
          - M. Miss: software misses the value.
            (Maybe: "his CRP was twenty-one".)

        - Ab1. Absent: reference to CRP, but no numerical information,
          e.g. "her CRP was normal".

          - FA1. False alarm: software reports a numerical value.
            (Maybe: "my CRP was 7 hours behind my boss's deadline")
          - CR1. Correct rejection: software doesn't report a value.

      - Ab2. field contains no reference to CRP at all.

        - FA2. False alarm: software reports a numerical value.
          (A bit harder to think of examples... but imagine a bug
          that gives a hit for "number of carp: 7". Or an alternative
          abbreviation meaning, e.g. "took part in a cardiac
          rehabilitation programme (CRP) 4 hours/week".)

        - CR2. Correct rejection: software doesn't report a value.

    From NLP backwards to source:

    - Y. Software says value present.

      - H. Hit: value is present.
      - FA. False alarm: value is absent.

    - N. Software says value absent.

      - CR. Correct rejection: value is absent.
      - M. Miss: value is present.

    The key metrics are:

    - precision = positive predictive value = P(Pr | Y)

      ... relatively easy to check; find all the "Y" records and check
      manually that they're correct.

    - sensitivity = recall = P(Y | Pr)

      ... Here, we want a sample that is enriched for "symptom actually
      present", for human reasons. For example, if 0.1% of text entries
      refer to CRP, then to assess 100 "Pr" samples we would have to
      review 100,000 text records, 99,900 of which are completely
      irrelevant. So we want an automated way of finding "Pr" records.
      That's what the validator classes do.

    You can enrich for "Pr" records with SQL, e.g.

    .. code-block:: sql

        SELECT textfield FROM sometable WHERE (
            textfield LIKE '%CRP%'
            OR textfield LIKE '%C-reactive protein%');

    or similar, but really we want the best "CRP detector" possible. That is
    probably to use a regex, either in SQL (... ``WHERE textfield REGEX
    'myregex'``) or using these validator classes. (The main NLP regexes don't
    distinguish between "CRP present, no valid value" and "CRP absent",
    because regexes either match or don't.)

    Each validator class implements the core variable-finding part of its
    corresponding NLP regex class, but without the value or units. For example,
    the CRP class looks for things like "CRP is 6" or "CRP 20 mg/L", whereas
    the CRP validator looks for things like "CRP".

    """

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`

            cfg_processor_name:
                config section suffix in the :ref:`NLP config file
                <nlp_config>`

            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
        """
        # Ask the concrete subclass what it validates first: the friendly
        # name passed to the superclass is derived from the variable name.
        validated_variable, regex_strings = (
            self.get_variablename_regexstrlist()
        )
        validator_name = f"{validated_variable}_validator"
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=validator_name,
        )
        self.regex_str_list = regex_strings  # for debugging only
        self.compiled_regex_list = [
            compile_regex(regex_str) for regex_str in regex_strings
        ]
        self.variable = validator_name
        self.NAME = self.variable

        # With no NLP definition (debugging only!), name the table after the
        # class; otherwise the config section must name the destination.
        self.tablename = (
            self.classname().lower()
            if nlpdef is None
            else self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )
        )

    @classmethod
    @abstractmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        To be overridden by every concrete validator.

        Returns:
            tuple: ``(validated_variable_name, regex_str_list)``, where:

            regex_str_list:
                List of regular expressions, each in string format.

                This class operates with compiled regexes having this group
                format (capture groups in this sequence):

                - variable

            validated_variable:
                used to set our ``variable`` attribute and thus the value of
                the field ``variable_name`` in the NLP output; for example, if
                ``validated_variable == 'crp'``, then the ``variable_name``
                field will be set to ``crp_validator``.

        """
        raise NotImplementedError

    def set_tablename(self, tablename: str) -> None:
        """
        Replace the destination table name (in case a friend class wants to
        override it).
        """
        self.tablename = tablename

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        # Overrides the superclass method; the contract is documented there.
        # A validator records only where the variable was mentioned.
        columns = [
            Column(
                FN_VARIABLE_NAME,
                SqlTypeDbIdentifier,
                comment=HELP_VARIABLE_NAME,
            ),
            Column(FN_CONTENT, Text, comment=HELP_CONTENT),
            Column(FN_START, Integer, comment=HELP_START),
            Column(FN_END, Integer, comment=HELP_END),
        ]
        return {self.tablename: columns}

    def parse(
        self, text: str
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # Overrides the superclass method; the contract is documented there.
        # Every regex is applied in turn, so overlapping matches from
        # different regexes each produce their own row.
        for regex in self.compiled_regex_list:
            for match in regex.finditer(text):
                start, end = match.span()
                row = {
                    FN_VARIABLE_NAME: self.variable,
                    FN_CONTENT: match.group(),  # the whole matching text
                    FN_START: start,
                    FN_END: end,
                }
                yield self.tablename, row

    def test_validator(
        self, test_expected_list: List[Tuple[str, bool]], verbose: bool = False
    ) -> None:
        """
        Check the validator's regexes against known strings.

        The 'bool' part of test_expected_list is: should it match any?
        ... noting that "match anywhere" is the "search" function, whereas
        "match" matches at the beginning:

        https://docs.python.org/3/library/re.html#re.Pattern.match

        Raises:
            :exc:`AssertionError` if a comparison fails
        """
        log.info(f"Testing validator: {self.classname()}")
        if verbose:
            total = len(self.regex_str_list)
            for index, regex_str in enumerate(self.regex_str_list, start=1):
                log.debug(f"... regex #{index}/{total}: {regex_str}\n")
        for test_string, expected_match in test_expected_list:
            # Keep the individual results so a failure can show them all.
            results = [
                regex.search(test_string)
                for regex in self.compiled_regex_list
            ]
            actual_match = any(results)
            assert actual_match == expected_match, (
                f"Validator {self.classname()}: Expected 'at least one regex "
                f"should match somewhere (search)' to be {expected_match}, "
                f"got {actual_match}, when parsing {test_string!r}; "
                f"full results = {results}"
            )
        log.info("... OK")

    def test(self, verbose: bool = False) -> None:
        # Default: nothing to test; ``verbose`` is accepted but unused here.
        log.info(f"... no tests implemented for validator {self.classname()}")

1381 

1382# ============================================================================= 

1383# More general testing 

1384# ============================================================================= 

1385 

1386 

def learning_alternative_regex_groups() -> None:
    """
    Demonstrate how capture groups behave inside repeated alternatives, by
    logging the match and its groups for a handful of sample strings.
    """
    # NOTE(review): the layout whitespace in this pattern is only harmless
    # if compile_regex() compiles in verbose mode -- confirm.
    pattern = r"""
        (
            (?:
                \s*
                (?: (a) | (b) | (c) | (d) )
                \s*
            )*
            ( fish )?
        )
    """
    regex = compile_regex(pattern)
    samples = ("a", "b", "a c", "d", "e", "a fish", "c c c")
    for sample in samples:
        # Every element of the pattern is optional, so match() is expected
        # to succeed (possibly with an empty match) for each sample.
        m = regex.match(sample)
        log.info(f"Match: {m}; groups: {m.groups()}")
    # So:
    # - groups can overlap
    # - groups are ordered by their opening bracket
    # - matches are filled in neatly

1410 """