Coverage for nlp_manager/parse_haematology.py: 100%

162 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/nlp_manager/parse_haematology.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Python regex-based NLP processors for haematology tests.** 

27 

28All inherit from 

29:class:`crate_anon.nlp_manager.regex_parser.SimpleNumericalResultParser` 

30and are constructed with these arguments: 

31 

32nlpdef: 

33 a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition` 

34cfg_processor_name: 

35 the name of a CRATE NLP config file section (from which we may 

36 choose to get extra config information) 

37commit: 

38 force a COMMIT whenever we insert data? You should specify this 

39 in multiprocess mode, or you may get database deadlocks. 

40 

41""" 

42 

43from abc import ABC 

44import logging 

45from typing import List, Optional, Tuple 

46 

47from crate_anon.common.regex_helpers import ( 

48 regex_or, 

49 WORD_BOUNDARY, 

50) 

51from crate_anon.nlp_manager.nlp_definition import NlpDefinition 

52from crate_anon.nlp_manager.regex_parser import ( 

53 make_simple_numeric_regex, 

54 OPTIONAL_POC, 

55 SimpleNumericalResultParser, 

56 ValidatorBase, 

57) 

58from crate_anon.nlp_manager.regex_read_codes import ( 

59 ReadCodes, 

60 regex_components_from_read_codes, 

61) 

62from crate_anon.nlp_manager.regex_units import ( 

63 BILLION_PER_L, 

64 CELLS_PER_CUBIC_MM_OR_MICROLITRE, 

65 G_PER_DL, 

66 G_PER_L, 

67 L_PER_L, 

68 MG_PER_DL, 

69 MG_PER_L, 

70 MM_PER_H, 

71 PERCENT, 

72 TRILLION_PER_L, 

73) 

74 

# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)

76 

77 

78# ============================================================================= 

79# Haemoglobin (Hb) 

80# ============================================================================= 

81 

82 

class Haemoglobin(SimpleNumericalResultParser):
    """
    HAEMATOLOGY (FBC).

    Haemoglobin (Hb). Results are reported in g/L (the default unit);
    values given in g/dL are also recognized and multiplied by 10.

    UK reporting for haemoglobin switched in 2013 from g/dL to g/L; see
    e.g.

    - http://www.pathology.leedsth.nhs.uk/pathology/Portals/0/PDFs/BP-2013-02%20Hb%20units.pdf
    - https://www.acb.org.uk/docs/default-source/committees/scientific/guidelines/acb/pathology-harmony-haematology.pdf

    The *DANGER* remains that a unitless "Hb 9" may have been written by
    someone assuming old-style units (9 g/dL = 90 g/L), yet it will be
    interpreted as 9 g/L. This problem is hard to avoid.

    """  # noqa: E501

    NAME = "Haemoglobin"
    PREFERRED_UNIT_COLUMN = "value_g_L"
    UNIT_MAPPING = {
        G_PER_L: 1,  # preferred unit
        G_PER_DL: 10,  # older unit (e.g. 2000)
    }

    # Free-text names for the quantity.
    HAEMOGLOBIN_BASE = rf"""
        {WORD_BOUNDARY} (?: Ha?emoglobin | Hb | HGB ) {WORD_BOUNDARY}
    """
    # Read code alternatives are listed first, then the free-text names.
    HAEMOGLOBIN = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.HAEMOGLOBIN_CONCENTRATION,
        ),
        HAEMOGLOBIN_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Full regex: quantity, optional "POC" text, then value and units.
    REGEX = make_simple_numeric_regex(
        quantity=HAEMOGLOBIN,
        units=regex_or(G_PER_L, G_PER_DL),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # Constructor arguments are described in the module docstring.
        parser_settings = dict(
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            take_absolute=True,
        )
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            **parser_settings,
        )

    def test(self, verbose: bool = False) -> None:
        # Self-test; the method itself is documented in the superclass.
        # Each case is (text to parse, expected values in g/L).
        cases = [
            ("Haemoglobin (should fail)", []),  # should fail; no values
            ("Haemoglobin 90 (should succeed)", [90]),
            ("Hemoglobin = 60", [60]),
            ("Hb 6 g/dL", [60]),
            ("Hb 60 g/L", [60]),
            ("Hb <80", [80]),
            ("Hb <80 g/L", [80]),
            ("Hb was 62", [62]),
            ("Hb was 62 g/L", [62]),
            ("Hb was 62 (L) g/L", [62]),
            ("Haemoglobin | 7.6 (H) | g/dL", [76]),
            ("Hb-96", [96]),
            ("HGB, POC 96", [96]),
            ("Haemoglobin concentration (Xa96v) 96", [96]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)

163 

164 

class HaemoglobinValidator(ValidatorBase):
    """
    Validator for Haemoglobin (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-name regex.
        variable_name = Haemoglobin.NAME
        regex_strings = [Haemoglobin.HAEMOGLOBIN]
        return variable_name, regex_strings

173 

174 

175# ============================================================================= 

176# Haematocrit (Hct) 

177# ============================================================================= 

178 

179 

class Haematocrit(SimpleNumericalResultParser):
    """
    HAEMATOLOGY (FBC).

    Haematocrit (Hct).
    The quantity is dimensionless, but an explicit L/L unit is accepted.
    """

    NAME = "Haematocrit"
    PREFERRED_UNIT_COLUMN = "value_L_L"
    UNIT_MAPPING = {
        L_PER_L: 1,  # preferred unit
    }

    # Free-text names for the quantity.
    HAEMATOCRIT_BASE = rf"""
        {WORD_BOUNDARY} (?: Ha?ematocrit | Hct ) {WORD_BOUNDARY}
    """
    # Read code alternatives are listed first, then the free-text names.
    HAEMATOCRIT = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.HAEMATOCRIT,
        ),
        HAEMATOCRIT_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Full regex: quantity, optional "POC" text, then value and units.
    REGEX = make_simple_numeric_regex(
        quantity=HAEMATOCRIT,
        units=L_PER_L,
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # Constructor arguments are described in the module docstring.
        parser_settings = dict(
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            take_absolute=True,
        )
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            **parser_settings,
        )

    def test(self, verbose: bool = False) -> None:
        # Self-test; the method itself is documented in the superclass.
        # Each case is (text to parse, expected values in L/L).
        cases = [
            ("Haematocrit (should fail)", []),  # should fail; no values
            ("Haematocrit 0.4 (should succeed)", [0.4]),
            ("Hematocrit = 0.4", [0.4]),
            ("Hct 0.3 L/L", [0.3]),
            ("Haematocrit | 0.33 (H) | L/L", [0.33]),
            (
                "my haematocrit was 0.3; his haematocrit was 0.4!",
                [0.3, 0.4],
            ),
            ("Hct-0.48", [0.48]),
            ("Haematocrit (X76tb) 0.48", [0.48]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)

247 

248 

class HaematocritValidator(ValidatorBase):
    """
    Validator for Haematocrit (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-name regex.
        variable_name = Haematocrit.NAME
        regex_strings = [Haematocrit.HAEMATOCRIT]
        return variable_name, regex_strings

257 

258 

259# ============================================================================= 

260# RBCs 

261# ============================================================================= 

262 

263 

class RBC(SimpleNumericalResultParser):
    """
    HAEMATOLOGY (FBC).

    Red blood cell count.
    Results are reported in 10^12/L (the default unit); counts given in
    cells/mm^3 = cells/μL are also recognized and converted.

    A typical excerpt from a FBC report:

    .. code-block:: none

        RBC, POC 4.84 10*12/L
        RBC, POC 9.99 (H) 10*12/L
    """

    NAME = "RBC"
    PREFERRED_UNIT_COLUMN = "value_trillion_per_l"
    UNIT_MAPPING = {
        TRILLION_PER_L: 1,  # preferred unit; 10^12/L or "per pL"
        CELLS_PER_CUBIC_MM_OR_MICROLITRE: 1e-6,
        # BILLION_PER_L is deliberately not mapped (see REGEX below).
    }

    # Free-text names for the quantity.
    RED_BLOOD_CELLS_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            # Red [blood] cell[s] [(RBC)] [count]:
            Red \b \s* (?: blood \s*)? \b cells? \b
            (?:\s* \(RBC\) )?
            (?:\s* count \b )?
            |
            # RBC(s):
            (?: RBCs? )
        )
    """
    # Beware: \( or \) next to \b becomes unhappy.

    # Ordering matters in this alternation (and so probably everywhere):
    # go from more to less specific, i.e. Read codes first. Otherwise, e.g.:
    #
    #   Expected [6.2], got [426.0], when parsing
    #   'Red blood cell count (426..) 6.2'
    RED_BLOOD_CELLS = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.RBC_COUNT,
        ),
        RED_BLOOD_CELLS_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Full regex: quantity, optional "POC" text, then value and units.
    # BILLION_PER_L is recognized as a unit but has no entry in
    # UNIT_MAPPING; the test cases expect no result for such values.
    REGEX = make_simple_numeric_regex(
        quantity=RED_BLOOD_CELLS,
        units=regex_or(
            TRILLION_PER_L,  # good
            CELLS_PER_CUBIC_MM_OR_MICROLITRE,  # good
            BILLION_PER_L,  # bad
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # Constructor arguments are described in the module docstring.
        parser_settings = dict(
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            take_absolute=True,
        )
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            **parser_settings,
        )

    def test(self, verbose: bool = False) -> None:
        # Self-test; the method itself is documented in the superclass.
        # Each case is (text to parse, expected values in 10^12/L).
        cases = [
            ("RBC (should fail)", []),  # should fail; no values
            ("RBC 6", [6]),
            ("RBC = 6", [6]),
            ("RBC 6 x 10^9/L", []),
            ("RBC 6 x 10 ^ 9 / L", []),
            ("RBC 6 x 10 ^ 12 / L", [6]),
            ("RBC 6 10*12/L", [6]),
            ("RBCs 6.2", [6.2]),
            ("red cells 6.2", [6.2]),
            ("red blood cells 6.2", [6.2]),
            ("red blood cell count 6.2", [6.2]),
            ("red blood cells 5000000/mm3", [5]),
            ("red blood cells 5000000 cell/mm3", [5]),
            ("red blood cells 5000000 cells/mm3", [5]),
            ("red blood cells 5000000 per cubic mm", [5]),
            ("red blood cells 5000000 per cmm", [5]),
            ("RBC – 6", [6]),  # en dash
            ("RBC—6", [6]),  # em dash
            ("RBC -- 6", [6]),  # double hyphen used as dash
            ("RBC - 6", [6]),
            ("RBC-6.5", [6.5]),
            ("RBC POC 4.84 10*12/L", [4.84]),
            ("RBC, POC 4.84 10*12/L", [4.84]),
            ("RBC, POC 4.84 (H) 10*12/L", [4.84]),
            ("red blood cells count 6.2", [6.2]),
            ("red blood cells (RBC) 6.2", [6.2]),
            ("Red blood cell count (426..) 6.2", [6.2]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)

375 

376 

class RBCValidator(ValidatorBase):
    """
    Validator for RBC (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-name regex.
        variable_name = RBC.NAME
        regex_strings = [RBC.RED_BLOOD_CELLS]
        return variable_name, regex_strings

385 

386 

387# ============================================================================= 

388# Erythrocyte sedimentation rate (ESR) 

389# ============================================================================= 

390 

391 

class Esr(SimpleNumericalResultParser):
    """
    HAEMATOLOGY (ESR).

    Erythrocyte sedimentation rate (ESR), reported in mm/h.
    """

    NAME = "ESR"
    PREFERRED_UNIT_COLUMN = "value_mm_h"
    UNIT_MAPPING = {
        MM_PER_H: 1,  # preferred unit
        # MG_PER_DL and MG_PER_L are deliberately not mapped (see REGEX).
    }

    # Free-text names for the quantity.
    ESR_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            Erythrocyte [\s]+ sed(?:\.|imentation)? [\s]+ rate
            | ESR
        )
        {WORD_BOUNDARY}
    """
    # Read code alternatives are listed first, then the free-text names.
    ESR = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.ESR,
        ),
        ESR_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Full regex: quantity, optional "POC" text, then value and units.
    # The mg/dL and mg/L units are recognized but have no entry in
    # UNIT_MAPPING; the test cases expect no result for such values.
    REGEX = make_simple_numeric_regex(
        quantity=ESR,
        units=regex_or(
            MM_PER_H,  # good
            MG_PER_DL,  # bad
            MG_PER_L,  # bad
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # Constructor arguments are described in the module docstring.
        parser_settings = dict(
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            take_absolute=True,
        )
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            **parser_settings,
        )

    def test(self, verbose: bool = False) -> None:
        # Self-test; the method itself is documented in the superclass.
        # Each case is (text to parse, expected values in mm/h).
        cases = [
            ("ESR (should fail)", []),  # should fail; no values
            ("ESR 6 (should succeed)", [6]),
            ("ESR = 6", [6]),
            ("ESR 6 mm/h", [6]),
            ("ESR <10", [10]),
            ("ESR <10 mm/hr", [10]),
            ("ESR >100", [100]),
            ("ESR >100 mm/hour", [100]),
            ("ESR was 62", [62]),
            ("ESR was 62 mm/h", [62]),
            ("ESR was 62 (H) mm/h", [62]),
            ("ESR was 62 mg/dl (should fail, wrong units)", []),
            ("Erythrocyte sed. rate was 19", [19]),
            ("his erythrocyte sedimentation rate was 19", [19]),
            ("erythrocyte sedimentation rate was 19", [19]),
            ("ESR 1.9 mg/L", []),  # wrong units
            ("ESR 1.9 (H) mg/L", []),  # wrong units
            ("ESR | 1.9 (H) | mg/L", []),
            ("my ESR was 15, but his ESR was 89!", [15, 89]),
            ("ESR-18", [18]),
            ("Erythrocyte sedimentation rate (XE2m7) 18", [18]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)

473 

474 

class EsrValidator(ValidatorBase):
    """
    Validator for Esr (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-name regex.
        variable_name = Esr.NAME
        regex_strings = [Esr.ESR]
        return variable_name, regex_strings

483 

484 

485# ============================================================================= 

486# White blood cell count and differential 

487# ============================================================================= 

488# Do NOT accept my handwritten abbreviations with slashed zeros, e.g. 

489# L0 lymphocytes 

490# N0 neutrophils 

491# M0 monocytes 

492# B0 basophils 

493# E0 eosinophils 

494# ... too likely that these are interpreted in wrong contexts, particularly 

495# if we are not allowing units, like "M0 3": macrophages 3 x 10^9/L, or part 

496# of "T2 N0 M0 ..." cancer staging? 

497 

498 

class WbcBase(SimpleNumericalResultParser, ABC):
    """
    DO NOT USE DIRECTLY. Base class for white cell count parsers.
    Results are reported in 10^9 / L (the default unit); counts given in
    cells/mm^3 = cells/μL are also recognized and converted.
    """

    PREFERRED_UNIT_COLUMN = "value_billion_per_l"
    UNIT_MAPPING = {
        BILLION_PER_L: 1,  # preferred unit: 10^9 / L
        # 1000 cells/mm^3 -> 1 x 10^9 / L:
        CELLS_PER_CUBIC_MM_OR_MICROLITRE: 0.001,
        # Percentages are NOT mapped (too hard to interpret relative
        # differentials reliably).
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        cell_type_regex_text: str,
        variable: str,
        commit: bool = False,
    ) -> None:
        """
        ``__init__`` function for :class:`WbcBase`.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we may
                choose to get extra config information)
            cell_type_regex_text:
                regex text that matches the name of the cell type, e.g.
                "monocytes" or "basophils"
            variable:
                used as the record value for ``variable_name``
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
        """
        parser_settings = dict(
            regex_str=self.make_wbc_regex(cell_type_regex_text),
            variable=variable,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            take_absolute=True,
        )
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            **parser_settings,
        )

    @staticmethod
    def make_wbc_regex(cell_type_regex_text: str) -> str:
        """
        Builds the full regular expression (as text) for a cell count, given
        regex text that matches the name of the cell type.
        """
        recognized_units = regex_or(
            BILLION_PER_L,  # good
            CELLS_PER_CUBIC_MM_OR_MICROLITRE,  # good
            PERCENT,  # bad, so we can ignore it
        )
        return make_simple_numeric_regex(
            quantity=cell_type_regex_text,
            units=recognized_units,
            optional_ignorable_after_quantity=OPTIONAL_POC,
        )

566 

567 

568# ----------------------------------------------------------------------------- 

569# WBC 

570# ----------------------------------------------------------------------------- 

571 

572 

class Wbc(WbcBase):
    """
    HAEMATOLOGY (FBC).

    White cell count (WBC, WCC).
    Results are reported in 10^9 / L (the default unit); counts given in
    cells/mm^3 = cells/μL are also recognized and converted.
    """

    NAME = "WBC"

    # Free-text names for the quantity.
    WBC_BASE = r"""
        \b (?:
            (?: # White blood cells, white cell count, etc.
                White\b [\s]* (?:\bblood\b)? [\s]* \bcell[s]?\b
                [\s]* (?:\bcount\b)? [\s]*
                (?: # optional suffix WBC, (WBC), (WBCC), (WCC), etc.
                    [\(]? (?: WBC | WBCC | WCC) [\)]?
                )?
            )
            | (?: # just WBC(s), WBCC, WCC
                (?: WBC[s]? | WBCC | WCC )
            )
        ) \b
    """
    # Read code alternatives are listed first, then the free-text names.
    WBC = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.WBC_COUNT,
        ),
        WBC_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # Constructor arguments are described in the module docstring.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            cell_type_regex_text=self.WBC,
            variable=self.NAME,
            commit=commit,
        )

    def test(self, verbose: bool = False) -> None:
        # Self-test; the method itself is documented in the superclass.
        # Each case is (text to parse, expected values in 10^9/L).
        cases = [
            ("WBC (should fail)", []),  # should fail; no values
            ("WBC 6", [6]),
            ("WBC = 6", [6]),
            ("WBC 6 x 10^9/L", [6]),
            ("WBC 6 x 10 ^ 9 / L", [6]),
            ("WCC 6.2", [6.2]),
            ("white cells 6.2", [6.2]),
            ("white cells 6.2", [6.2]),
            ("white cells 9800/mm3", [9.8]),
            ("white cells 9800 cell/mm3", [9.8]),
            ("white cells 9800 cells/mm3", [9.8]),
            ("white cells 9800 per cubic mm", [9.8]),
            ("white cells 9800 per cmm", [9.8]),
            ("white cells 17,600/mm3", [17.6]),
            ("white cells 17,600/μL", [17.6]),
            ("white cells 17,600/microlitre", [17.6]),
            ("WBC – 6", [6]),  # en dash
            ("WBC—6", [6]),  # em dash
            ("WBC -- 6", [6]),  # double hyphen used as dash
            ("WBC - 6", [6]),
            ("WBC-6.5", [6.5]),
            ("WBC, POC 6.5", [6.5]),
            ("Total white blood count (XaIdY) 6.5", [6.5]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)

650 

651 

class WbcValidator(ValidatorBase):
    """
    Validator for Wbc (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-name regex.
        variable_name = Wbc.NAME
        regex_strings = [Wbc.WBC]
        return variable_name, regex_strings

660 

661 

662# ----------------------------------------------------------------------------- 

663# Neutrophils 

664# ----------------------------------------------------------------------------- 

665 

666 

class Neutrophils(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Neutrophil (polymorphonuclear leukocyte) count (absolute).
    Results are reported in 10^9 / L (the default unit); counts given in
    cells/mm^3 = cells/μL are also recognized and converted.
    """

    NAME = "neutrophils"

    # Free-text names for the quantity.
    # NOTE(review): this pattern accepts "N0", and the test cases below rely
    # on that, although the section comment preceding WbcBase advises against
    # accepting the slashed-zero abbreviations. Confirm this is intended.
    NEUTROPHILS_BASE = r"""
        (?: \b absolute \s* )?
        \b (?: Neut(?:r(?:o(?:phil)?)?)?s? | N0 ) \b
        (?: \s* count \b )?
    """
    # Read code alternatives are listed first, then the free-text names.
    NEUTROPHILS = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.NEUTROPHIL_COUNT,
            ReadCodes.POLYMORPH_COUNT,
        ),
        NEUTROPHILS_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # Constructor arguments are described in the module docstring.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            cell_type_regex_text=self.NEUTROPHILS,
            variable=self.NAME,
            commit=commit,
        )

    def test(self, verbose: bool = False) -> None:
        # Self-test; the method itself is documented in the superclass.
        # Each case is (text to parse, expected values in 10^9/L).
        cases = [
            ("neutrophils (should fail)", []),  # should fail; no values
            ("absolute neutrophil count 6", [6]),
            ("neuts = 6", [6]),
            ("N0 6 x 10^9/L", [6]),
            ("neutrophil count 6 x 10 ^ 9 / L", [6]),
            ("neutrs 6.2", [6.2]),
            ("neutrophil 6.2", [6.2]),
            ("neutrophils 6.2", [6.2]),
            ("n0 9800/mm3", [9.8]),
            ("absolute neutrophils 9800 cell/mm3", [9.8]),
            ("neutrophils count 9800 cells/mm3", [9.8]),
            ("neuts 9800 per cmm", [9.8]),
            ("n0 9800 per cubic mm", [9.8]),
            ("n0 17,600/mm3", [17.6]),
            ("neuts-17", [17]),
            ("Neutrophil count (42J..) 17", [17]),
            ("Polymorph count (XaIao) 17", [17]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)

730 

731 

class NeutrophilsValidator(ValidatorBase):
    """
    Validator for Neutrophils (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-name regex.
        variable_name = Neutrophils.NAME
        regex_strings = [Neutrophils.NEUTROPHILS]
        return variable_name, regex_strings

740 

741 

742# ----------------------------------------------------------------------------- 

743# Lymphocytes 

744# ----------------------------------------------------------------------------- 

745 

746 

class Lymphocytes(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Lymphocyte count (absolute).
    Results are reported in 10^9 / L (the default unit); counts given in
    cells/mm^3 = cells/μL are also recognized and converted.
    """

    NAME = "lymphocytes"

    # Free-text names for the quantity.
    LYMPHOCYTES_BASE = r"""
        (?: \b absolute \s* )?
        \b Lymph(?:o(?:cyte)?)?s? \b
        (?: \s* count \b )?
    """
    # Read code alternatives are listed first, then the free-text names.
    LYMPHOCYTES = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.LYMPHOCYTE_COUNT,
        ),
        LYMPHOCYTES_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # Constructor arguments are described in the module docstring.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            cell_type_regex_text=self.LYMPHOCYTES,
            variable=self.NAME,
            commit=commit,
        )

    def test(self, verbose: bool = False) -> None:
        # Self-test; the method itself is documented in the superclass.
        # Each case is (text to parse, expected values in 10^9/L).
        cases = [
            ("lymphocytes (should fail)", []),  # should fail; no values
            ("absolute lymphocyte count 6", [6]),
            ("lymphs = 6", [6]),
            ("L0 6 x 10^9/L (should fail)", []),
            ("lymphocyte count 6 x 10 ^ 9 / L", [6]),
            ("lymphs 6.2", [6.2]),
            ("lymph 6.2", [6.2]),
            ("lympho 6.2", [6.2]),
            ("lymphos 9800/mm3", [9.8]),
            ("absolute lymphocytes 9800 cell/mm3", [9.8]),
            ("lymphocytes count 9800 cells/mm3", [9.8]),
            ("lymphocytes 9800 per cmm", [9.8]),
            ("lymphs-6.3", [6.3]),
            # We are not supporting "L0":
            ("l0 9800 per cubic mm (should fail)", []),
            ("l0 9800 per cmm (should fail)", []),
            ("l0 17,600/mm3 (should fail)", []),
            ("Lymphocyte count (42M..) 6.3", [6.3]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)

810 

811 

class LymphocytesValidator(ValidatorBase):
    """
    Validator for Lymphocytes (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-name regex.
        variable_name = Lymphocytes.NAME
        regex_strings = [Lymphocytes.LYMPHOCYTES]
        return variable_name, regex_strings

820 

821 

822# ----------------------------------------------------------------------------- 

823# Monocytes 

824# ----------------------------------------------------------------------------- 

825 

826 

class Monocytes(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Monocyte count (absolute).
    Results are reported in 10^9 / L (the default unit); counts given in
    cells/mm^3 = cells/μL are also recognized and converted.
    """

    NAME = "monocytes"

    # Free-text names for the quantity.
    MONOCYTES_BASE = r"""
        (?: \b absolute \s* )?
        \b Mono(?:cyte)?s? \b
        (?: \s* count \b )?
    """
    # Read code alternatives are listed first, then the free-text names.
    MONOCYTES = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.MONOCYTE_COUNT,
        ),
        MONOCYTES_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # Constructor arguments are described in the module docstring.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            cell_type_regex_text=self.MONOCYTES,
            variable=self.NAME,
            commit=commit,
        )

    def test(self, verbose: bool = False) -> None:
        # Self-test; the method itself is documented in the superclass.
        # Each case is (text to parse, expected values in 10^9/L).
        cases = [
            ("monocytes (should fail)", []),  # should fail; no values
            ("absolute monocyte count 6", [6]),
            ("monos = 6", [6]),
            ("M0 6 x 10^9/L (should fail)", []),
            ("monocyte count 6 x 10 ^ 9 / L", [6]),
            ("monos 6.2", [6.2]),
            ("mono 6.2", [6.2]),
            ("monos 9800/mm3", [9.8]),
            ("absolute mono 9800 cell/mm3", [9.8]),
            ("monocytes count 9800 cells/mm3", [9.8]),
            ("monocytes 9800 per cmm", [9.8]),
            ("monocytes-5.2", [5.2]),
            # We are not supporting "M0":
            ("m0 9800 per cubic mm (should fail)", []),
            ("m0 17,600/mm3 (should fail)", []),
            ("Monocyte count (42N..) 5.2", [5.2]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)

888 

889 

class MonocytesValidator(ValidatorBase):
    """
    Validator for Monocytes (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-name regex.
        variable_name = Monocytes.NAME
        regex_strings = [Monocytes.MONOCYTES]
        return variable_name, regex_strings

898 

899 

900# ----------------------------------------------------------------------------- 

901# Basophils 

902# ----------------------------------------------------------------------------- 

903 

904 

class Basophils(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Basophil count (absolute).
    Default units are 10^9 / L; also supports cells/mm^3 = cells/μL.
    """

    # Free-text names for the quantity.
    BASOPHILS_BASE = r"""
        (?: \b absolute \s* )?
        \b Baso(?:phil)?s? \b
        (?: \s* count \b )?
    """
    # Read code alternatives are listed first, then the free-text names.
    BASOPHILS = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.BASOPHIL_COUNT,
        ),
        BASOPHILS_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    NAME = "basophils"

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # Constructor arguments are described in the module docstring.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            cell_type_regex_text=self.BASOPHILS,
            variable=self.NAME,
        )

    def test(self, verbose: bool = False) -> None:
        # Self-test; the method itself is documented in the superclass.
        # ``verbose`` is annotated as ``bool`` to match the ``test()``
        # signature used by the other parsers in this module.
        # Each case is (text to parse, expected values in 10^9/L).
        self.test_numerical_parser(
            [
                ("basophils (should fail)", []),  # should fail; no values
                ("absolute basophil count 6", [6]),
                ("basos = 6", [6]),
                ("B0 6 x 10^9/L (should fail)", []),
                ("basophil count 6 x 10 ^ 9 / L", [6]),
                ("basos 6.2", [6.2]),
                ("baso 6.2", [6.2]),
                ("basos 9800/mm3", [9.8]),
                ("absolute basophil 9800 cell/mm3", [9.8]),
                ("basophils count 9800 cells/mm3", [9.8]),
                ("basophils 9800 per cmm", [9.8]),
                ("basophils-5.2", [5.2]),
                # We are not supporting "B0":
                ("b0 9800 per cubic mm (should fail)", []),
                ("b0 17,600/mm3 (should fail)", []),
                ("Basophil count (42L..) 5.2", [5.2]),
            ],
            verbose=verbose,
        )

966 

967 

class BasophilsValidator(ValidatorBase):
    """
    Validator for Basophils (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-name regex.
        variable_name = Basophils.NAME
        regex_strings = [Basophils.BASOPHILS]
        return variable_name, regex_strings

976 

977 

978# ----------------------------------------------------------------------------- 

979# Eosinophils 

980# ----------------------------------------------------------------------------- 

981 

982 

class Eosinophils(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Eosinophil count (absolute).
    Results are reported in 10^9 / L (the default unit); counts given in
    cells/mm^3 = cells/μL are also recognized and converted.
    """

    NAME = "eosinophils"

    # Free-text names for the quantity.
    EOSINOPHILS_BASE = r"""
        (?: \b absolute \s* )?
        \b Eo(?:sin(?:o(?:phil)?)?)?s? \b
        (?: \s* count \b )?
    """
    # Read code alternatives are listed first, then the free-text names.
    EOSINOPHILS = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.EOSINOPHIL_COUNT,
        ),
        EOSINOPHILS_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # Constructor arguments are described in the module docstring.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            cell_type_regex_text=self.EOSINOPHILS,
            variable=self.NAME,
            commit=commit,
        )

    def test(self, verbose: bool = False) -> None:
        # Self-test; the method itself is documented in the superclass.
        # Each case is (text to parse, expected values in 10^9/L).
        cases = [
            ("eosinophils (should fail)", []),  # should fail; no values
            ("absolute eosinophil count 6", [6]),
            ("eos = 6", [6]),
            ("E0 6 x 10^9/L (should fail)", []),
            ("eosinophil count 6 x 10 ^ 9 / L", [6]),
            ("eosins 6.2", [6.2]),
            ("eosino 6.2", [6.2]),
            ("eosinos 9800/mm3", [9.8]),
            ("absolute eosinophil 9800 cell/mm3", [9.8]),
            ("eosinophils count 9800 cells/mm3", [9.8]),
            ("eosinophils 9800 per cmm", [9.8]),
            ("eosinophils-5.3", [5.3]),
            # We are not supporting "E0":
            ("e0 9800 per cubic mm (should fail)", []),
            ("e0 17,600/mm3 (should fail)", []),
            ("Eosinophil count (42K..) 5.2", [5.2]),
            ("Eosinophil count - observation (42K..) 5.2", [5.2]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)

1045 

1046 

class EosinophilsValidator(ValidatorBase):
    """
    Validator for Eosinophils (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-name regex.
        variable_name = Eosinophils.NAME
        regex_strings = [Eosinophils.EOSINOPHILS]
        return variable_name, regex_strings

1055 

1056 

1057# ----------------------------------------------------------------------------- 

1058# Platelet count 

1059# ----------------------------------------------------------------------------- 

1060 

1061 

class Platelets(WbcBase):
    """
    HAEMATOLOGY (FBC).

    Platelet count.
    Results are reported in 10^9 / L (the default unit); counts given in
    cells/mm^3 = cells/μL are also recognized and converted.

    Platelets are not white blood cells, of course, but this parser can
    share the same base class because platelet counts are expressed in the
    same units, 10^9 / L.
    Typical values 150–450 ×10^9 / L (or 150,000–450,000 per μL).
    """

    NAME = "platelets"

    # Free-text names for the quantity.
    PLATELETS_BASE = r"""
        \b (?: Platelets? | plts? ) \b # platelet(s), plt(s)
        (?: \s* count \b )? # optional "count"
    """
    # Read code alternatives are listed first, then the free-text names.
    PLATELETS = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.PLATELET_COUNT,
        ),
        PLATELETS_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # Constructor arguments are described in the module docstring.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            cell_type_regex_text=self.PLATELETS,
            variable=self.NAME,
            commit=commit,
        )

    def test(self, verbose: bool = False) -> None:
        # Self-test; the method itself is documented in the superclass.
        # Each case is (text to parse, expected values in 10^9/L).
        cases = [
            ("platelets (should fail)", []),  # should fail; no values
            ("platelet count 150", [150]),
            ("plt = 150", [150]),
            ("PLT 150 x 10^9/L", [150]),
            ("platelet count 150 x 10 ^ 9 / L", [150]),
            ("plt 400", [400]),
            ("plts 400", [400]),
            ("plt 400000/mm3", [400]),
            ("plt count 400000/μL", [400]),
            ("plts 400000 per microliter", [400]),
            ("Platelet count (42P..) 150", [150]),
        ]
        self.test_numerical_parser(cases, verbose=verbose)

1121 

1122 

class PlateletsValidator(ValidatorBase):
    """
    Validator for Platelets (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with its quantity-name regex.
        variable_name = Platelets.NAME
        regex_strings = [Platelets.PLATELETS]
        return variable_name, regex_strings

1131 

1132 

1133# ============================================================================= 

1134# All classes in this module 

1135# ============================================================================= 

1136 

# Each entry pairs an NLP parser class defined in this module with its
# corresponding validator class, as (parser, validator).
ALL_HAEMATOLOGY_NLP_AND_VALIDATORS = [
    (Basophils, BasophilsValidator),
    (Eosinophils, EosinophilsValidator),
    (Esr, EsrValidator),
    (Haematocrit, HaematocritValidator),
    (Haemoglobin, HaemoglobinValidator),
    (Lymphocytes, LymphocytesValidator),
    (Monocytes, MonocytesValidator),
    (Neutrophils, NeutrophilsValidator),
    (Platelets, PlateletsValidator),
    (RBC, RBCValidator),
    (Wbc, WbcValidator),
]