Coverage for nlp_manager/parse_substance_misuse.py: 97%
124 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/nlp_manager/parse_substance_misuse.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Python regex-based NLP processors for substance misuse.**
28"""
30import logging
31from typing import Any, Dict, Generator, List, Optional, Tuple
33from crate_anon.common.regex_helpers import (
34 at_wb_start_end,
35 noncapture_group,
36 optional_named_capture_group,
37 optional_noncapture_group,
38 regex_or,
39 WORD_BOUNDARY,
40)
41from crate_anon.nlp_manager.nlp_definition import NlpDefinition
42from crate_anon.nlp_manager.number import to_float
43from crate_anon.nlp_manager.regex_func import (
44 compile_regex,
45 compile_regex_dict,
46 get_regex_dict_match,
47 get_regex_dict_search,
48)
49from crate_anon.nlp_manager.regex_parser import (
50 common_tense,
51 EVER,
52 FN_CONTENT,
53 FN_END,
54 FN_RELATION,
55 FN_RELATION_TEXT,
56 FN_START,
57 FN_TENSE,
58 FN_TENSE_TEXT,
59 FN_UNITS,
60 FN_VALUE_TEXT,
61 FN_VARIABLE_NAME,
62 FN_VARIABLE_TEXT,
63 GROUP_NAME_QUANTITY,
64 GROUP_NAME_RELATION,
65 GROUP_NAME_TENSE,
66 GROUP_NAME_UNITS,
67 GROUP_NAME_VALUE,
68 GROUP_NUMBER_WHOLE_EXPRESSION,
69 make_simple_numeric_regex,
70 NumericalResultParser,
71 PAST,
72 PRESENT,
73 ValidatorBase,
74)
75from crate_anon.nlp_manager.regex_units import (
76 ALCOHOL,
77 DAYS_PER_WEEK,
78 UK_ALCOHOL_UNITS_PER_DAY,
79 UK_ALCOHOL_UNITS_PER_MONTH,
80 UK_ALCOHOL_UNITS_PER_WEEK,
81 UK_ALCOHOL_UNITS_PER_YEAR,
82 WEEKS_PER_MONTH_APPROX,
83 WEEKS_PER_YEAR_APPROX,
84)
# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)
89# =============================================================================
90# Alcohol
91# =============================================================================
class AlcoholUnits(NumericalResultParser):
    """
    SUBSTANCE MISUSE.

    Alcohol consumption, specified explicitly as (UK) units per day, week,
    month, or year, or via non-numeric references to not drinking any.

    - Output is in UK units per week. A UK unit is 10 ml of ethanol [#f1]_ [#f2]_.
      UK NHS guidelines used to be "per week" and remain broadly week-based [#f1]_.
    - It doesn't attempt any understanding of other alcohol descriptions (e.g.
      "pints of beer", "glasses of wine", "bottles of vodka") so is expected to
      apply where a clinician has converted a (potentially mixed) alcohol
      description to a units-per-week calculation.

    .. [#f1] https://www.nhs.uk/live-well/alcohol-advice/calculating-alcohol-units/,
           accessed 2023-01-18.
    .. [#f2] https://en.wikipedia.org/wiki/Unit_of_alcohol
    """  # noqa: E501

    # There are no relevant Read codes for alcohol consumption in
    # v3ReadCode_PBCL.xlsx.

    # -------------------------------------------------------------------------
    # Regex building for tense-related statements
    # -------------------------------------------------------------------------

    # All these are verbose regexes, so don't omit \s+ for whitespace!
    PAST_ADVERBS = (
        "formerly",
        "once",
        "peak",
        "previously",
        "was",
    )
    PAST_ADVERBS_RE = noncapture_group(regex_or(*PAST_ADVERBS))
    DOES_NOT = r"does\s*n[o'’]t"  # does not, doesn't
    PRESENT_ADVERBS = (
        r"at \s+ present",
        r"currently",
        r"has \s+ been",
        r"now",
        r"nowadays",
        r"presently",
        r"these \s+ days",
        DOES_NOT,
    )
    PRESENT_ADVERBS_RE = noncapture_group(regex_or(*PRESENT_ADVERBS))
    TEMPORAL_WORDS = tuple(
        at_wb_start_end(x) for x in PAST_ADVERBS + PRESENT_ADVERBS
    )
    TEMPORAL = noncapture_group(regex_or(*TEMPORAL_WORDS))
    OPT_TEMPORAL = optional_noncapture_group(regex_or(*TEMPORAL_WORDS))

    NEVER = "never"
    # "Never" is both temporal and negating and thus fiddly. We do *not*
    # include it in standard temporal words, or a statement about "has never
    # drunk >100 u/w" would be misinterpreted as positive.

    # -------------------------------------------------------------------------
    # Regex building for drinking alcohol (and when)
    # -------------------------------------------------------------------------

    DRINKING_PAST = (
        # Past infinitive: she used to drink
        r"\b used \s+ to \s+ drink \b",
        # Imperfect tense: she [adverb] drank
        rf"\b (?: {PAST_ADVERBS_RE} \s+ )? drank \b",
        # Perfect tense: has [adverb] drunk
        # (The \s+ after "has" is essential: literal spaces are ignored in a
        # verbose regex, so without it "has drunk" could never match.)
        rf"\b has \s+ (?: {PAST_ADVERBS_RE} \s+ )? drunk \b",
        # Past continuous tense: he was [adverb] drinking
        # Also abbreviated past continuous tense: previously drinking
        rf"\b {PAST_ADVERBS_RE} \s+ drinking \b",
    )
    # We don't allow the adverbs by themselves, to avoid something that isn't
    # explicitly about alcohol or drinking, e.g. "[insulin] currently 6
    # units/day".
    DRINKING_PRESENT = (
        # Present tense: he [adverb] drinks
        rf"\b (?: {PRESENT_ADVERBS_RE} \s+)? drinks \b",
        # Present continuous tense: he is [adverb] drinking
        rf"\b (?: is \s+)? (?: {PRESENT_ADVERBS_RE} \s+)? drinking \b",
    )
    DRINKING_PAST_PRESENT = DRINKING_PAST + DRINKING_PRESENT
    DRINKING = noncapture_group(regex_or(*DRINKING_PAST_PRESENT))
    OPT_DRINKING = optional_noncapture_group(regex_or(*DRINKING_PAST_PRESENT))
    ALCOHOL_PM_CONSUMPTION = rf"{ALCOHOL} (?: \s+ consumption \b)?"
    ALC = noncapture_group(ALCOHOL_PM_CONSUMPTION)
    OPT_ALC = optional_noncapture_group(ALCOHOL_PM_CONSUMPTION)

    # BRK: requires some sort of wordbreak or whitespace, but also disposes of
    # junk like some punctuation (e.g. "previously: none" versus "previously
    # none") and words like "at" (e.g. in "drinking at X units/week").
    BRK = noncapture_group(
        regex_or(
            r"\s* : \s*",  # colon +/- whitespace
            r"\s* \b at \b \s*",  # "at" +/- whitespace
            r"\s+",  # whitespace
            WORD_BOUNDARY,  # other word break
        )
    )

    # Move from more to less specific, or the less specific will capture first.
    ALCOHOL_DRINKING = rf"""
        {WORD_BOUNDARY}
        # Alcohol drinking:
        (?:
            # 1. ... DRINKING ... [ALC] ...
            {OPT_TEMPORAL} {BRK}
            {DRINKING} {BRK}
            {OPT_TEMPORAL} {BRK}
            {OPT_ALC} {BRK}
            {OPT_TEMPORAL}
        |
            # 2. ... ALC ... [DRINKING] ...
            {OPT_TEMPORAL} {BRK}
            {ALC} {BRK}
            {OPT_TEMPORAL} {BRK}
            {OPT_DRINKING} {BRK}
            {OPT_TEMPORAL}
        )
        {WORD_BOUNDARY}
    """

    # Map each tense-indicating regex string to its tense constant. Past
    # entries are inserted first, so they are also checked first when the
    # compiled dictionary is searched in insertion order.
    _drinking_tense_dict = {}  # type: Dict[str, str]
    for _past in DRINKING_PAST + PAST_ADVERBS:
        _drinking_tense_dict[_past] = PAST
    for _present in DRINKING_PRESENT + PRESENT_ADVERBS:
        _drinking_tense_dict[_present] = PRESENT
    TENSE_PAST_PRESENT_LOOKUP = compile_regex_dict(_drinking_tense_dict)
    TENSE_NEVER_LOOKUP = compile_regex_dict({NEVER: EVER})

    # -------------------------------------------------------------------------
    # Regex building for "drinking alcohol at X units per week"
    # -------------------------------------------------------------------------

    # A temporal suffix allows e.g. "drinking X units/week previously".
    GROUP_NAME_SUFFIX = "suffix"
    group_suffix = r"\b \s*" + optional_named_capture_group(
        TEMPORAL, GROUP_NAME_SUFFIX
    )
    REGEX_ALCOHOL_UNITS = (
        make_simple_numeric_regex(
            quantity=ALCOHOL_DRINKING,
            units=regex_or(
                UK_ALCOHOL_UNITS_PER_DAY,
                UK_ALCOHOL_UNITS_PER_WEEK,
                UK_ALCOHOL_UNITS_PER_MONTH,  # perhaps unusual!
                UK_ALCOHOL_UNITS_PER_YEAR,  # perhaps unusual!
            ),
            units_optional=False,
        )
        + group_suffix
    )

    # -------------------------------------------------------------------------
    # Regex building for "no alcohol" statements
    # -------------------------------------------------------------------------

    ABSTINENT = r"\b abstin[ae]nt \b"  # "abstinent", or typo "abstinant"
    NONE = noncapture_group(
        WORD_BOUNDARY
        + noncapture_group(
            regex_or(
                "0",
                rf"{ABSTINENT} (?: \s+ from \b )?",
                NEVER,
                "no",
                "none",
                "zero",
            )
        )
        + WORD_BOUNDARY
    )
    TEETOTAL = noncapture_group(
        r"\b te[ea][-]?total(?:l?er)? \b",
    )
    DOES_NOT_DRINK = noncapture_group(
        regex_or(
            rf"\b {DOES_NOT} \s+ drink \b",
            rf"\b has \s+ {NEVER} \s+ drunk \b",
        )
    )
    OPT_TEMPORAL_AND_OR_DRINKING_BRK = (
        f"{OPT_TEMPORAL} {BRK} {OPT_DRINKING} {BRK} {OPT_TEMPORAL} {BRK}"
    )
    NO_ALCOHOL = rf"""
        {WORD_BOUNDARY}
        # "No alcohol" statements.
        # Temporal modifiers might be found in all sorts of places.
        (?:
            # 1. [DRINKING] ... ALC ... [DRINKING] ... NONE ...
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {ALC} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {NONE} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
        |
            # 2. NONE ... ALC (e.g. "never alcohol")
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {NONE} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {ALC} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
        |
            # 3. "has never drunk... alcohol", etc.
            {DOES_NOT_DRINK} {BRK} {ALC} {BRK}
        |
            # 4. "teetotal" with typos
            {TEETOTAL}
            # ... but not just "drinking... none" (could be water etc.)
        )
        {WORD_BOUNDARY}
    """

    # -------------------------------------------------------------------------
    # Other class variables
    # -------------------------------------------------------------------------

    NAME = "AlcoholUnits"
    PREFERRED_UNIT_COLUMN = "value_uk_units_per_week"
    # Multipliers converting each recognized unit to units per week.
    UNIT_MAPPING = {
        UK_ALCOHOL_UNITS_PER_WEEK: 1,  # preferred unit
        UK_ALCOHOL_UNITS_PER_DAY: DAYS_PER_WEEK,  # 1 unit/day -> 7 units/week
        UK_ALCOHOL_UNITS_PER_MONTH: 1 / WEEKS_PER_MONTH_APPROX,
        UK_ALCOHOL_UNITS_PER_YEAR: 1 / WEEKS_PER_YEAR_APPROX,
    }

    # -------------------------------------------------------------------------
    # Init
    # -------------------------------------------------------------------------

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            regex_str_for_debugging=self.REGEX_ALCOHOL_UNITS,
            commit=commit,
        )
        # Compile once per instance: the numeric "X units per <period>" regex,
        # the unit-to-multiplier lookup, and the "no alcohol" regex.
        self.compiled_regex_alcohol = compile_regex(self.REGEX_ALCOHOL_UNITS)
        self.units_to_factor = compile_regex_dict(self.UNIT_MAPPING)
        self.compiled_regex_no_alcohol = compile_regex(self.NO_ALCOHOL)

    # -------------------------------------------------------------------------
    # Parse
    # -------------------------------------------------------------------------

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Parse for two regexes which operate slightly differently.

        Yields ``(tablename, result_dict)`` tuples: first all numeric
        "units per period" matches, then all "no alcohol" matches. Yields
        nothing for empty text.
        """
        if not text:
            return
        yield from self.parse_alcohol_units(text, debug)
        yield from self.parse_alcohol_none(text, debug)

    def parse_alcohol_units(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        We amend SimpleNumericalResultParser.parse() to deal with tense a bit
        better (e.g. "used to drink"). Comments from that version not repeated.
        That version also shortened a bit since we guarantee some aspects of
        the flags.

        Yields one ``(tablename, result_dict)`` tuple per numeric match, with
        the value converted to UK units per week.
        """
        for m in self.compiled_regex_alcohol.finditer(text):
            startpos = m.start()
            endpos = m.end()
            matching_text = m.group(GROUP_NUMBER_WHOLE_EXPRESSION)
            variable_text = m.group(GROUP_NAME_QUANTITY)
            tense_text = m.group(GROUP_NAME_TENSE)
            relation_text = m.group(GROUP_NAME_RELATION)
            value_text = m.group(GROUP_NAME_VALUE)
            units = m.group(GROUP_NAME_UNITS)
            # None if there was no temporal suffix after the units:
            suffix_text = m.group(self.GROUP_NAME_SUFFIX)

            value_in_target_units = None
            if units:
                matched_unit, multiple_or_fn = get_regex_dict_match(
                    units, self.units_to_factor
                )
                if not matched_unit:
                    continue
                # MODIFIED: no need to check callable(multiple_or_fn); always
                # no
                value_in_target_units = to_float(value_text) * multiple_or_fn
            # MODIFIED: no need to check self.assume_preferred_unit (we never
            # assume that here)

            # MODIFIED: no need to check self.take_absolute (always yes)
            if value_in_target_units is not None:
                value_in_target_units = abs(value_in_target_units)

            tense, relation = common_tense(tense_text, relation_text)

            # MODIFIED: Extra bit here to detect tense information in a
            # different place:
            for temporal_info in (variable_text, suffix_text):
                if tense:
                    break
                tense = self._get_tense(temporal_info)
                if tense:
                    tense_text = temporal_info

            # Back to the previous code:
            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_in_target_units,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result

    def parse_alcohol_none(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Deal with references to not drinking any alcohol (except those referred
        to as e.g. "0 units per week", which will be picked up by the
        units-per-week function -- that will be rare!).

        Yields one ``(tablename, result_dict)`` tuple per match, always with a
        value of zero units and no relation/units text.
        """
        for m in self.compiled_regex_no_alcohol.finditer(text):
            startpos = m.start()
            endpos = m.end()
            matching_text = m.group(GROUP_NUMBER_WHOLE_EXPRESSION)
            # There are no separate groups here, so the tense is sought in the
            # whole matched text.
            tense = self._get_tense(matching_text)
            tense_text = matching_text if tense else None

            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: matching_text,
                FN_RELATION_TEXT: None,
                FN_RELATION: None,
                FN_VALUE_TEXT: matching_text,
                FN_UNITS: None,
                self.target_unit: 0,  # zero units
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result

    def _get_tense(self, text: Optional[str]) -> Optional[str]:
        """
        Find a tense indicator in the text and return the corresponding tense
        constant (``EVER``, ``PAST``, or ``PRESENT``), or ``None`` if no
        indicator is found or there is no text to search.
        """
        if not text:
            # E.g. the optional temporal suffix group did not match.
            return None
        # We deal with "never" first because otherwise "never drank" may hit
        # "[optional_stuff] drank" and be classified as the past tense.
        _, tense = get_regex_dict_search(text, self.TENSE_NEVER_LOOKUP)
        if not tense:
            _, tense = get_regex_dict_search(
                text, self.TENSE_PAST_PRESENT_LOOKUP
            )
        return tense

    # -------------------------------------------------------------------------
    # Test
    # -------------------------------------------------------------------------

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class
        # Test via e.g.:
        #   pytest -k SubstanceMisuseTests  # self-tests
        #   crate_run_crate_nlp_demo - --processors AlcoholUnits  # interactive
        no_results = []
        six_no_tense = [{self.target_unit: 6, FN_TENSE: None}]
        six_past = [{self.target_unit: 6, FN_TENSE: PAST}]
        six_present = [{self.target_unit: 6, FN_TENSE: PRESENT}]
        six_per_day_present = [
            {self.target_unit: 6 * DAYS_PER_WEEK, FN_TENSE: PRESENT}
        ]
        six_per_month_present = [
            {self.target_unit: 6 / WEEKS_PER_MONTH_APPROX, FN_TENSE: PRESENT}
        ]
        six_per_year_present = [
            {self.target_unit: 6 / WEEKS_PER_YEAR_APPROX, FN_TENSE: PRESENT}
        ]
        under_6_present = [
            {self.target_unit: 6, FN_RELATION: "<", FN_TENSE: PRESENT}
        ]
        over_200_present = [
            {self.target_unit: 200, FN_RELATION: ">", FN_TENSE: PRESENT}
        ]
        no_alcohol_no_tense = [{self.target_unit: 0, FN_TENSE: None}]
        no_alcohol_past = [{self.target_unit: 0, FN_TENSE: PAST}]
        no_alcohol_present = [{self.target_unit: 0, FN_TENSE: PRESENT}]
        no_alcohol_ever = [{self.target_unit: 0, FN_TENSE: EVER}]
        self.detailed_test_multiple(
            [
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # No results expected:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol", no_results),
                ("He used to drink like a fish", no_results),
                ("[e.g. insulin] currently 6 units per week", no_results),
                ("[e.g. insulin] previously 6 units per week", no_results),
                ("[could be insulin] peak 6 u/w", no_results),
                ("[!] methylalcohol 6 u/w", no_results),
                ("[not starts with no] Alcohol: not explored", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Value with no tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol 6 u/w", six_no_tense),
                ("Alcohol - 6 u/w", six_no_tense),
                ("EtOH = 6 u/w", six_no_tense),
                ("EtOH = 6 u/wk", six_no_tense),
                ("Alcohol (units/week): 6", six_no_tense),
                ("Ethanol 6 units/week", six_no_tense),
                ("[not international but] alcohol 6 IU/week", six_no_tense),
                ("alcohol 6 I.U./week", six_no_tense),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Past tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: was 6 u/w", six_past),  # other tenses fail (= good)
                ("Alcohol: formerly 6 u/w", six_past),
                ("Alcohol: previously 6 u/w", six_past),
                ("Alcohol: once 6 u/w", six_past),
                ("Alcohol: peak 6 u/w", six_past),
                ("Used to drink 6 u/w", six_past),
                ("Peak drinking 6 u/w", six_past),
                ("Peak alcohol consumption: 6 u/w", six_past),
                ("Drank 6 u/w", six_past),
                ("Formerly drank 6 u/w", six_past),
                ("Previously drank 6 u/w", six_past),
                ("Has drunk 6 u/w", six_past),  # perfect tense
                ("Has previously drunk 6 u/w", six_past),  # perfect tense
                ("Was drinking 6 u/w", six_past),
                ("Was previously drinking 6 u/w", six_past),
                ("Was formerly drinking 6 u/w", six_past),
                ("Alcohol: 6 u/w previously", six_past),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Present tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Drinks 6 units per week", six_present),
                ("Drinks 6 alcohol units per week", six_present),
                ("Drinks 6 UK units per week", six_present),
                ("Drinks 6 UK alcohol units per week", six_present),
                ("[silly] Drinks 6 UK alcohol IU per week", six_present),
                ("Drinks 6 units/d", six_per_day_present),
                ("Drinks 6 units/dy", six_per_day_present),
                ("Drinks 6 units/day", six_per_day_present),
                ("Currently drinks 6 units per week", six_present),
                ("These days drinks 6 units per week", six_present),
                ("Now drinks 6 units per week", six_present),
                ("Nowadays drinks 6 units per week", six_present),
                ("Drinking 6 units per week", six_present),
                ("Currently drinking 6 units per week", six_present),
                ("Presently drinking 6 units per week", six_present),
                ("Alcohol: currently 6 u/w", six_present),
                ("Alcohol: presently 6 u/w", six_present),
                ("In terms of alcohol she drinks 6 units/week", six_present),
                ("Has been drinking 6 units per week", six_present),
                ("Drinks 6 units per month", six_per_month_present),
                ("Drinks 6 units per year", six_per_year_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Inequalities:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: presently less than 6 u/w", under_6_present),
                ("Alcohol: presently under 6 u/w", under_6_present),
                ("Alcohol: presently >200 u/w", over_200_present),
                ("Alcohol: currently more than 200 u/w", over_200_present),
                ("Alcohol: currently over 200 u/w", over_200_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- no tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: none", no_alcohol_no_tense),
                ("Teetotal", no_alcohol_no_tense),
                ("Tee-total", no_alcohol_no_tense),  # typo
                ("Teetotaller", no_alcohol_no_tense),
                ("Teetotaler", no_alcohol_no_tense),  # typo
                ("Abstinent from alcohol", no_alcohol_no_tense),
                ("Alcohol: abstinent", no_alcohol_no_tense),
                ("Alcohol: abstinant", no_alcohol_no_tense),  # typo
                ("Alcohol: zero", no_alcohol_no_tense),
                ("Alcohol: 0", no_alcohol_no_tense),
                ("Alcohol: no", no_alcohol_no_tense),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- past tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: was abstinent", no_alcohol_past),
                ("Alcohol: previously abstinent", no_alcohol_past),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- present tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: has been abstinent", no_alcohol_present),
                ("Alcohol: currently abstinent", no_alcohol_present),
                ("Alcohol: currently none", no_alcohol_present),
                ("Drinks no alcohol", no_alcohol_present),
                ("Drinks zero alcohol", no_alcohol_present),
                ("Does not drink alcohol", no_alcohol_present),
                ("Doesn't drink alcohol", no_alcohol_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- ever:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Has never drunk alcohol", no_alcohol_ever),
                ("Never drank alcohol", no_alcohol_ever),
                ("Alcohol: never", no_alcohol_ever),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Vague references to not drinking, not interpreted:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Has not drunk alcohol", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Potential teetotal statements, but very tricky to be sure:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Doesn't drink [coffee]", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Distractors:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Lemonade, which he does not drink.", no_results),
            ],
            verbose=verbose,
        )
class AlcoholUnitsValidator(ValidatorBase):
    """
    Validator for AlcoholUnits (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Return the AlcoholUnits variable name together with a one-element
        list holding a deliberately broad regex string.
        """
        # Any mention of alcohol, of drink/drank/drunk (with any ending, as
        # there is no trailing word boundary), of abstinence, or of being
        # teetotal.
        broad_regex = regex_or(
            ALCOHOL,
            r"\b dr[iau]nk ",
            AlcoholUnits.ABSTINENT,
            AlcoholUnits.TEETOTAL,
        )
        return AlcoholUnits.NAME, [broad_regex]
649# =============================================================================
650# All classes in this module
651# =============================================================================
# Each entry pairs an NLP processor class with its validator class.
ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS = [
    (AlcoholUnits, AlcoholUnitsValidator),
]