Coverage for nlp_manager/parse_substance_misuse.py: 97%

124 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/nlp_manager/parse_substance_misuse.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Python regex-based NLP processors for substance misuse.** 

27 

28""" 

29 

30import logging 

31from typing import Any, Dict, Generator, List, Optional, Tuple 

32 

33from crate_anon.common.regex_helpers import ( 

34 at_wb_start_end, 

35 noncapture_group, 

36 optional_named_capture_group, 

37 optional_noncapture_group, 

38 regex_or, 

39 WORD_BOUNDARY, 

40) 

41from crate_anon.nlp_manager.nlp_definition import NlpDefinition 

42from crate_anon.nlp_manager.number import to_float 

43from crate_anon.nlp_manager.regex_func import ( 

44 compile_regex, 

45 compile_regex_dict, 

46 get_regex_dict_match, 

47 get_regex_dict_search, 

48) 

49from crate_anon.nlp_manager.regex_parser import ( 

50 common_tense, 

51 EVER, 

52 FN_CONTENT, 

53 FN_END, 

54 FN_RELATION, 

55 FN_RELATION_TEXT, 

56 FN_START, 

57 FN_TENSE, 

58 FN_TENSE_TEXT, 

59 FN_UNITS, 

60 FN_VALUE_TEXT, 

61 FN_VARIABLE_NAME, 

62 FN_VARIABLE_TEXT, 

63 GROUP_NAME_QUANTITY, 

64 GROUP_NAME_RELATION, 

65 GROUP_NAME_TENSE, 

66 GROUP_NAME_UNITS, 

67 GROUP_NAME_VALUE, 

68 GROUP_NUMBER_WHOLE_EXPRESSION, 

69 make_simple_numeric_regex, 

70 NumericalResultParser, 

71 PAST, 

72 PRESENT, 

73 ValidatorBase, 

74) 

75from crate_anon.nlp_manager.regex_units import ( 

76 ALCOHOL, 

77 DAYS_PER_WEEK, 

78 UK_ALCOHOL_UNITS_PER_DAY, 

79 UK_ALCOHOL_UNITS_PER_MONTH, 

80 UK_ALCOHOL_UNITS_PER_WEEK, 

81 UK_ALCOHOL_UNITS_PER_YEAR, 

82 WEEKS_PER_MONTH_APPROX, 

83 WEEKS_PER_YEAR_APPROX, 

84) 

85 

# Module-level logger, named after this module so that log output can be
# filtered per module by the logging configuration.
log = logging.getLogger(__name__)

87 

88 

89# ============================================================================= 

90# Alcohol 

91# ============================================================================= 

92 

93 

class AlcoholUnits(NumericalResultParser):
    """
    SUBSTANCE MISUSE.

    Alcohol consumption, specified explicitly as (UK) units per day, week,
    month, or year, or via non-numeric references to not drinking any.

    - Output is in UK units per week. A UK unit is 10 ml of ethanol [#f1]_ [#f2]_.
      UK NHS guidelines used to be "per week" and remain broadly week-based [#f1]_.
    - It doesn't attempt any understanding of other alcohol descriptions (e.g.
      "pints of beer", "glasses of wine", "bottles of vodka") so is expected to
      apply where a clinician has converted a (potentially mixed) alcohol
      description to a units-per-week calculation.

    Two regexes are used (see :meth:`parse`): one for numeric "X units per
    <period>" statements and one for "no alcohol" statements, which are
    reported as zero units per week.

    .. [#f1] https://www.nhs.uk/live-well/alcohol-advice/calculating-alcohol-units/,
        accessed 2023-01-18.
    .. [#f2] https://en.wikipedia.org/wiki/Unit_of_alcohol
    """  # noqa: E501

    # There are no relevant Read codes for alcohol consumption in
    # v3ReadCode_PBCL.xlsx.

    # -------------------------------------------------------------------------
    # Regex building for tense-related statements
    # -------------------------------------------------------------------------

    # All these are verbose regexes, so don't omit \s+ for whitespace!
    PAST_ADVERBS = (
        "formerly",
        "once",
        "peak",
        "previously",
        "was",
    )
    PAST_ADVERBS_RE = noncapture_group(regex_or(*PAST_ADVERBS))
    DOES_NOT = r"does\s*n[o'’]t"  # does not, doesn't
    PRESENT_ADVERBS = (
        r"at \s+ present",
        r"currently",
        r"has \s+ been",
        r"now",
        r"nowadays",
        r"presently",
        r"these \s+ days",
        DOES_NOT,
    )
    PRESENT_ADVERBS_RE = noncapture_group(regex_or(*PRESENT_ADVERBS))
    TEMPORAL_WORDS = tuple(
        at_wb_start_end(x) for x in PAST_ADVERBS + PRESENT_ADVERBS
    )
    TEMPORAL = noncapture_group(regex_or(*TEMPORAL_WORDS))
    OPT_TEMPORAL = optional_noncapture_group(regex_or(*TEMPORAL_WORDS))

    NEVER = "never"
    # "Never" is both temporal and negating and thus fiddly. We do *not*
    # include it in standard temporal words, or a statement about "has never
    # drunk >100 u/w" would be misinterpreted as positive.

    # -------------------------------------------------------------------------
    # Regex building for drinking alcohol (and when)
    # -------------------------------------------------------------------------

    DRINKING_PAST = (
        # Past infinitive: she used to drink
        r"\b used \s+ to \s+ drink \b",
        # Imperfect tense: she [adverb] drank
        rf"\b (?: {PAST_ADVERBS_RE} \s+ )? drank \b",
        # Perfect tense: has drunk
        rf"\b has (?: {PAST_ADVERBS_RE} \s+ )? drunk \b",
        # Past continuous tense: he was [adverb] drinking
        # Also abbreviated past continuous tense: previously drinking
        rf"\b {PAST_ADVERBS_RE} \s+ drinking \b",
    )
    # We don't allow the adverbs by themselves, to avoid something that isn't
    # explicitly about alcohol or drinking, e.g. "[insulin] currently 6
    # units/day".
    DRINKING_PRESENT = (
        # Present tense: he [adverb] drinks
        rf"\b (?: {PRESENT_ADVERBS_RE} \s+)? drinks \b",
        # Present continuous tense: he is [adverb] drinking
        rf"\b (?: is \s+)? (?: {PRESENT_ADVERBS_RE} \s+)? drinking \b",
    )
    DRINKING_PAST_PRESENT = DRINKING_PAST + DRINKING_PRESENT
    DRINKING = noncapture_group(regex_or(*DRINKING_PAST_PRESENT))
    OPT_DRINKING = optional_noncapture_group(regex_or(*DRINKING_PAST_PRESENT))
    ALCOHOL_PM_CONSUMPTION = rf"{ALCOHOL} (?: \s+ consumption \b)?"
    ALC = noncapture_group(ALCOHOL_PM_CONSUMPTION)
    OPT_ALC = optional_noncapture_group(ALCOHOL_PM_CONSUMPTION)

    # BRK: requires some sort of wordbreak or whitespace, but also disposes of
    # junk like some punctuation (e.g. "previously: none" versus "previously
    # none") and words like "at" (e.g. in "drinking at X units/week").
    BRK = noncapture_group(
        regex_or(
            r"\s* : \s*",  # colon +/- whitespace
            r"\s* \b at \b \s*",  # "at" +/- whitespace
            r"\s+",  # whitespace
            WORD_BOUNDARY,  # other word break
        )
    )

    # Move from more to less specific, or the less specific will capture first.
    ALCOHOL_DRINKING = rf"""
        {WORD_BOUNDARY}
        # Alcohol drinking:
        (?:
            # 1. ... DRINKING ... [ALC] ...
            {OPT_TEMPORAL} {BRK}
            {DRINKING} {BRK}
            {OPT_TEMPORAL} {BRK}
            {OPT_ALC} {BRK}
            {OPT_TEMPORAL}
        |
            # 2. ... ALC ... [DRINKING] ...
            {OPT_TEMPORAL} {BRK}
            {ALC} {BRK}
            {OPT_TEMPORAL} {BRK}
            {OPT_DRINKING} {BRK}
            {OPT_TEMPORAL}
        )
        {WORD_BOUNDARY}
    """

    # Map each tense-bearing regex fragment to its tense constant. Insertion
    # order is kept: past entries first, then present entries.
    _drinking_tense_dict = {}  # type: Dict[str, str]
    for _past in DRINKING_PAST + PAST_ADVERBS:
        _drinking_tense_dict[_past] = PAST
    for _present in DRINKING_PRESENT + PRESENT_ADVERBS:
        _drinking_tense_dict[_present] = PRESENT
    TENSE_PAST_PRESENT_LOOKUP = compile_regex_dict(_drinking_tense_dict)
    TENSE_NEVER_LOOKUP = compile_regex_dict({NEVER: EVER})

    # -------------------------------------------------------------------------
    # Regex building for "drinking alcohol at X units per week"
    # -------------------------------------------------------------------------

    # A temporal suffix allows e.g. "drinking X units/week previously".
    GROUP_NAME_SUFFIX = "suffix"
    group_suffix = r"\b \s*" + optional_named_capture_group(
        TEMPORAL, GROUP_NAME_SUFFIX
    )
    REGEX_ALCOHOL_UNITS = (
        make_simple_numeric_regex(
            quantity=ALCOHOL_DRINKING,
            units=regex_or(
                UK_ALCOHOL_UNITS_PER_DAY,
                UK_ALCOHOL_UNITS_PER_WEEK,
                UK_ALCOHOL_UNITS_PER_MONTH,  # perhaps unusual!
                UK_ALCOHOL_UNITS_PER_YEAR,  # perhaps unusual!
            ),
            units_optional=False,
        )
        + group_suffix
    )

    # -------------------------------------------------------------------------
    # Regex building for "no alcohol" statements
    # -------------------------------------------------------------------------

    ABSTINENT = r"\b abstin[ae]nt \b"  # "abstinent", or typo "abstinant"
    NONE = noncapture_group(
        WORD_BOUNDARY
        + noncapture_group(
            regex_or(
                "0",
                rf"{ABSTINENT} (?: \s+ from \b )?",
                NEVER,
                "no",
                "none",
                "zero",
            )
        )
        + WORD_BOUNDARY
    )
    TEETOTAL = noncapture_group(
        r"\b te[ea][-]?total(?:l?er)? \b",
    )
    DOES_NOT_DRINK = noncapture_group(
        regex_or(
            rf"\b {DOES_NOT} \s+ drink \b",
            rf"\b has \s+ {NEVER} \s+ drunk \b",
        )
    )
    OPT_TEMPORAL_AND_OR_DRINKING_BRK = (
        f"{OPT_TEMPORAL} {BRK} {OPT_DRINKING} {BRK} {OPT_TEMPORAL} {BRK}"
    )
    NO_ALCOHOL = rf"""
        {WORD_BOUNDARY}
        # "No alcohol" statements.
        # Temporal modifiers might be found in all sorts of places.
        (?:
            # 1. [DRINKING] ... ALC ... [DRINKING] ... NONE ...
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {ALC} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {NONE} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
        |
            # 2. NONE ... ALC (e.g. "never alcohol")
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {NONE} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {ALC} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
        |
            # 3. "has never drunk... alcohol", etc.
            {DOES_NOT_DRINK} {BRK} {ALC} {BRK}
        |
            # 4. "teetotal" with typos
            {TEETOTAL}
            # ... but not just "drinking... none" (could be water etc.)
        )
        {WORD_BOUNDARY}
    """

    # -------------------------------------------------------------------------
    # Other class variables
    # -------------------------------------------------------------------------

    NAME = "AlcoholUnits"
    PREFERRED_UNIT_COLUMN = "value_uk_units_per_week"
    # Multiplier applied to the parsed number to convert it to units per week.
    UNIT_MAPPING = {
        UK_ALCOHOL_UNITS_PER_WEEK: 1,  # preferred unit
        UK_ALCOHOL_UNITS_PER_DAY: DAYS_PER_WEEK,  # 1 unit/day -> 7 units/week
        UK_ALCOHOL_UNITS_PER_MONTH: 1 / WEEKS_PER_MONTH_APPROX,
        UK_ALCOHOL_UNITS_PER_YEAR: 1 / WEEKS_PER_YEAR_APPROX,
    }

    # -------------------------------------------------------------------------
    # Init
    # -------------------------------------------------------------------------

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Set up the parser and compile its regexes.

        Args:
            nlpdef: passed through to the parent class.
            cfg_processor_name: passed through to the parent class.
            commit: passed through to the parent class.
        """
        # see documentation above
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            regex_str_for_debugging=self.REGEX_ALCOHOL_UNITS,
            commit=commit,
        )
        self.compiled_regex_alcohol = compile_regex(self.REGEX_ALCOHOL_UNITS)
        self.units_to_factor = compile_regex_dict(self.UNIT_MAPPING)
        self.compiled_regex_no_alcohol = compile_regex(self.NO_ALCOHOL)

    # -------------------------------------------------------------------------
    # Parse
    # -------------------------------------------------------------------------

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Parse for two regexes which operate slightly differently.

        Yields nothing for empty text; otherwise yields the results of
        :meth:`parse_alcohol_units` followed by those of
        :meth:`parse_alcohol_none`.
        """
        if not text:
            return
        yield from self.parse_alcohol_units(text, debug)
        yield from self.parse_alcohol_none(text, debug)

    def parse_alcohol_units(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        We amend SimpleNumericalResultParser.parse() to deal with tense a bit
        better (e.g. "used to drink"). Comments from that version not repeated.
        That version also shortened a bit since we guarantee some aspects of
        the flags.

        Yields ``(tablename, result_dict)`` for each match of the numeric
        alcohol regex, with the value converted to units per week.
        """
        for m in self.compiled_regex_alcohol.finditer(text):
            startpos = m.start()
            endpos = m.end()
            matching_text = m.group(GROUP_NUMBER_WHOLE_EXPRESSION)
            variable_text = m.group(GROUP_NAME_QUANTITY)
            tense_text = m.group(GROUP_NAME_TENSE)
            relation_text = m.group(GROUP_NAME_RELATION)
            value_text = m.group(GROUP_NAME_VALUE)
            units = m.group(GROUP_NAME_UNITS)
            # The suffix group is optional, so this is None when absent.
            suffix_text = m.group(self.GROUP_NAME_SUFFIX)

            value_in_target_units = None
            if units:
                matched_unit, multiple_or_fn = get_regex_dict_match(
                    units, self.units_to_factor
                )
                if not matched_unit:
                    continue
                # MODIFIED: no need to check callable(multiple_or_fn); always
                # no
                value_in_target_units = to_float(value_text) * multiple_or_fn
            # MODIFIED: no need to check self.assume_preferred_unit (we never
            # assume that here)

            # MODIFIED: no need to check self.take_absolute (always yes)
            if value_in_target_units is not None:
                value_in_target_units = abs(value_in_target_units)

            tense, relation = common_tense(tense_text, relation_text)

            # MODIFIED: Extra bit here to detect tense information in a
            # different place:
            for temporal_info in (variable_text, suffix_text):
                if tense:
                    break
                # _get_tense() returns None for missing (None/empty) text.
                tense = self._get_tense(temporal_info)
                if tense:
                    tense_text = temporal_info

            # Back to the previous code:
            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_in_target_units,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result

    def parse_alcohol_none(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Deal with references to not drinking any alcohol (except those referred
        to as e.g. "0 units per week", which will be picked up by the
        units-per-week function -- that will be rare!).

        Yields ``(tablename, result_dict)`` for each match of the "no alcohol"
        regex, with the target-unit value fixed at zero.
        """
        for m in self.compiled_regex_no_alcohol.finditer(text):
            startpos = m.start()
            endpos = m.end()
            matching_text = m.group(GROUP_NUMBER_WHOLE_EXPRESSION)
            tense = self._get_tense(matching_text)
            tense_text = matching_text if tense else None

            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: matching_text,
                FN_RELATION_TEXT: None,
                FN_RELATION: None,
                FN_VALUE_TEXT: matching_text,
                FN_UNITS: None,
                self.target_unit: 0,  # zero units
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result

    def _get_tense(self, text: Optional[str]) -> Optional[str]:
        """
        Find a tense indicator in ``text`` and return the corresponding tense
        value from the lookup dictionaries (``EVER``, ``PAST`` or
        ``PRESENT``), or ``None`` if there is none.

        Args:
            text: text to search; may be ``None`` or empty (e.g. an optional
                regex group that did not participate in the match), in which
                case ``None`` is returned without searching.
        """
        if not text:
            # Nothing to search. Also keeps None away from the regex search.
            return None
        # We deal with "never" first because otherwise "never drank" may hit
        # "[optional_stuff] drank" and be classified as the past tense.
        _, tense = get_regex_dict_search(text, self.TENSE_NEVER_LOOKUP)
        if not tense:
            _, tense = get_regex_dict_search(
                text, self.TENSE_PAST_PRESENT_LOOKUP
            )
        return tense

    # -------------------------------------------------------------------------
    # Test
    # -------------------------------------------------------------------------

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class
        # Test via e.g.:
        #   pytest -k SubstanceMisuseTests  # self-tests
        #   crate_run_crate_nlp_demo - --processors AlcoholUnits  # interactive
        no_results = []
        six_no_tense = [{self.target_unit: 6, FN_TENSE: None}]
        six_past = [{self.target_unit: 6, FN_TENSE: PAST}]
        six_present = [{self.target_unit: 6, FN_TENSE: PRESENT}]
        six_per_day_present = [
            {self.target_unit: 6 * DAYS_PER_WEEK, FN_TENSE: PRESENT}
        ]
        six_per_month_present = [
            {self.target_unit: 6 / WEEKS_PER_MONTH_APPROX, FN_TENSE: PRESENT}
        ]
        six_per_year_present = [
            {self.target_unit: 6 / WEEKS_PER_YEAR_APPROX, FN_TENSE: PRESENT}
        ]
        under_6_present = [
            {self.target_unit: 6, FN_RELATION: "<", FN_TENSE: PRESENT}
        ]
        over_200_present = [
            {self.target_unit: 200, FN_RELATION: ">", FN_TENSE: PRESENT}
        ]
        no_alcohol_no_tense = [{self.target_unit: 0, FN_TENSE: None}]
        no_alcohol_past = [{self.target_unit: 0, FN_TENSE: PAST}]
        no_alcohol_present = [{self.target_unit: 0, FN_TENSE: PRESENT}]
        no_alcohol_ever = [{self.target_unit: 0, FN_TENSE: EVER}]
        self.detailed_test_multiple(
            [
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # No results expected:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol", no_results),
                ("He used to drink like a fish", no_results),
                ("[e.g. insulin] currently 6 units per week", no_results),
                ("[e.g. insulin] previously 6 units per week", no_results),
                ("[could be insulin] peak 6 u/w", no_results),
                ("[!] methylalcohol 6 u/w", no_results),
                ("[not starts with no] Alcohol: not explored", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Value with no tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol 6 u/w", six_no_tense),
                ("Alcohol - 6 u/w", six_no_tense),
                ("EtOH = 6 u/w", six_no_tense),
                ("EtOH = 6 u/wk", six_no_tense),
                ("Alcohol (units/week): 6", six_no_tense),
                ("Ethanol 6 units/week", six_no_tense),
                ("[not international but] alcohol 6 IU/week", six_no_tense),
                ("alcohol 6 I.U./week", six_no_tense),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Past tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: was 6 u/w", six_past),  # other tenses fail (= good)
                ("Alcohol: formerly 6 u/w", six_past),
                ("Alcohol: previously 6 u/w", six_past),
                ("Alcohol: once 6 u/w", six_past),
                ("Alcohol: peak 6 u/w", six_past),
                ("Used to drink 6 u/w", six_past),
                ("Peak drinking 6 u/w", six_past),
                ("Peak alcohol consumption: 6 u/w", six_past),
                ("Drank 6 u/w", six_past),
                ("Formerly drank 6 u/w", six_past),
                ("Previously drank 6 u/w", six_past),
                ("Was drinking 6 u/w", six_past),
                ("Was previously drinking 6 u/w", six_past),
                ("Was formerly drinking 6 u/w", six_past),
                ("Alcohol: formerly 6 u/w", six_past),
                ("Alcohol: previously 6 u/w", six_past),
                ("Alcohol: 6 u/w previously", six_past),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Present tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Drinks 6 units per week", six_present),
                ("Drinks 6 alcohol units per week", six_present),
                ("Drinks 6 UK units per week", six_present),
                ("Drinks 6 UK alcohol units per week", six_present),
                ("[silly] Drinks 6 UK alcohol IU per week", six_present),
                ("Drinks 6 units/d", six_per_day_present),
                ("Drinks 6 units/dy", six_per_day_present),
                ("Drinks 6 units/day", six_per_day_present),
                ("Currently drinks 6 units per week", six_present),
                ("These days drinks 6 units per week", six_present),
                ("Now drinks 6 units per week", six_present),
                ("Nowadays drinks 6 units per week", six_present),
                ("Drinking 6 units per week", six_present),
                ("Currently drinking 6 units per week", six_present),
                ("Presently drinking 6 units per week", six_present),
                ("Alcohol: currently 6 u/w", six_present),
                ("Alcohol: presently 6 u/w", six_present),
                ("In terms of alcohol she drinks 6 units/week", six_present),
                ("Has been drinking 6 units per week", six_present),
                ("Drinks 6 units per month", six_per_month_present),
                ("Drinks 6 units per year", six_per_year_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Inequalities:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: presently less than 6 u/w", under_6_present),
                ("Alcohol: presently under 6 u/w", under_6_present),
                ("Alcohol: presently >200 u/w", over_200_present),
                ("Alcohol: currently more than 200 u/w", over_200_present),
                ("Alcohol: currently over 200 u/w", over_200_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- no tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: none", no_alcohol_no_tense),
                ("Teetotal", no_alcohol_no_tense),
                ("Tee-total", no_alcohol_no_tense),  # typo
                ("Teetotaller", no_alcohol_no_tense),
                ("Teetotaler", no_alcohol_no_tense),  # typo
                ("Abstinent from alcohol", no_alcohol_no_tense),
                ("Alcohol: abstinent", no_alcohol_no_tense),
                ("Alcohol: abstinant", no_alcohol_no_tense),  # typo
                ("Alcohol: zero", no_alcohol_no_tense),
                ("Alcohol: 0", no_alcohol_no_tense),
                ("Alcohol: no", no_alcohol_no_tense),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- past tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: was abstinent", no_alcohol_past),
                ("Alcohol: previously abstinent", no_alcohol_past),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- present tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: has been abstinent", no_alcohol_present),
                ("Alcohol: currently abstinent", no_alcohol_present),
                ("Alcohol: currently none", no_alcohol_present),
                ("Drinks no alcohol", no_alcohol_present),
                ("Drinks zero alcohol", no_alcohol_present),
                ("Does not drink alcohol", no_alcohol_present),
                ("Doesn't drink alcohol", no_alcohol_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- ever:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Has never drunk alcohol", no_alcohol_ever),
                ("Never drank alcohol", no_alcohol_ever),
                ("Alcohol: never", no_alcohol_ever),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Vague references to not drinking, not interpreted:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Has not drunk alcohol", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Potential teetotal statements, but very tricky to be sure:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Doesn't drink [coffee]", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Distractors:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Lemonade, which he does not drink.", no_results),
            ],
            verbose=verbose,
        )

629 

630 

class AlcoholUnitsValidator(ValidatorBase):
    """
    Validator for AlcoholUnits (see help for explanation).

    Supplies the variable name of :class:`AlcoholUnits` together with a
    single, deliberately broad regex string.
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Return ``(variable_name, [regex_string])`` for this validator.
        """
        # We're very broad here: any alcohol term, any drink/drank/drunk
        # word, "abstinent", or "teetotal" (including its typos).
        broad_pattern = regex_or(
            ALCOHOL,
            r"\b dr[iau]nk ",  # drink/drank/drunk plus any ending
            AlcoholUnits.ABSTINENT,
            AlcoholUnits.TEETOTAL,
        )
        return AlcoholUnits.NAME, [broad_pattern]

647 

648 

649# ============================================================================= 

650# All classes in this module 

651# ============================================================================= 

652 

# Each entry pairs an NLP parser class with its validator class.
ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS = [
    (
        AlcoholUnits,
        AlcoholUnitsValidator,
    ),
]