Coverage for nlp_manager/regex_units.py: 85%
136 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/nlp_manager/regex_units.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Regular expressions to detect physical units.**
28"""
30from typing import List, Optional, Tuple
32from crate_anon.nlp_manager.regex_numbers import (
33 BILLION,
34 MULTIPLY_OR_SPACE,
35 PLAIN_INTEGER,
36 POWER,
37 TRILLION,
38)
41# =============================================================================
42# Physical units
43# =============================================================================
# Separator between a score and its maximum: a slash, or the words "out of".
OUT_OF_SEPARATOR = r"(?: \/ | \b out \s+ of \b )"
def per(
    numerator: str,
    denominator: str,
    include_power_minus1: bool = True,
    numerator_optional: bool = False,
) -> str:
    """
    Builds regex text for a ratio of units, "X per Y"; for example
    "millimoles per litre" or "cells per cubic millimetre".

    Args:
        numerator: regex text for the numerator; may be blank
        denominator: regex text for the denominator
        include_power_minus1: also accept the "n d -1" notation for "n/d"
        numerator_optional: if true, the numerator need not be present

    Returns:
        regex text: a non-capturing group of the accepted alternatives
    """
    if not numerator:
        # Blank numerator: nothing precedes the separator.
        lead = ""
    elif numerator_optional:
        # The trailing whitespace lives inside the optional group, so that
        # no stray whitespace is left over when the numerator is absent.
        lead = rf"(?: {numerator} \s* )?"
    else:
        # Mandatory numerator followed by optional whitespace. "\s* \b" is
        # used elsewhere rather than "\s+" so that a blank numerator works.
        lead = rf"{numerator} \s*"

    alternatives = [rf"{lead} (?: \/ | \b per \b) \s* {denominator}"]
    if include_power_minus1:
        alternatives.append(rf"{lead} \b {denominator} \s* -1")
    joined = " | ".join(alternatives)
    return rf"(?: {joined} )"
def _out_of_str(n_as_regex: str) -> str:
    """
    Builds regex text for "out of N", written either as "/ N" or "out of N".

    Args:
        n_as_regex: regex text that matches the "N"

    Returns:
        regex text: separator, optional whitespace, N, then a word boundary
    """
    return r"(?: {separator} \s* {n} \b)".format(
        separator=OUT_OF_SEPARATOR,
        n=n_as_regex,
    )
def out_of(n: int) -> str:
    """
    Builds regex text for "out of N" for one specific number.

    Args:
        n: the number N, inserted into the regex as its decimal text

    Returns:
        regex text
    """
    n_text = str(n)
    return _out_of_str(n_text)
def out_of_anything() -> str:
    """
    Builds regex text for "out of N" where N is matched by ``PLAIN_INTEGER``.

    Returns:
        regex text
    """
    return _out_of_str(n_as_regex=PLAIN_INTEGER)
def power(x: str, n: int, allow_no_operator: bool = False) -> str:
    """
    Builds regex text for "x to the power n".

    Args:
        x: regex text for the base
        n: the exponent
        allow_no_operator: if true, the power operator (like ``^`` or ``**``)
            may be omitted

    Returns:
        regex text
    """
    # A trailing "?" on the operator is what makes it optional.
    operator_quantifier = "?" if allow_no_operator else ""
    return rf"(?: {x} \s* {POWER}{operator_quantifier} \s* {n})"
def units_times(*args: str) -> str:
    """
    Builds regex text joining all the inputs with an optional multiplication
    separator between each adjacent pair.

    Intended for units that are notionally multiplied together.

    Args:
        *args: regex text for each unit, in order

    Returns:
        regex text
    """
    optional_multiply = MULTIPLY_OR_SPACE + "?"
    return r"(?: {} )".format(optional_multiply.join(args))
def units_by_dimension(
    *args: Tuple[str, int],  # specify type of *one* arg!
    allow_no_operator: bool = False,
) -> str:
    """
    Returns regex text for a unit where we specify them by their dimensions.

    Args:
        *args: each is a tuple ``unit, power``; the power must not be zero
        allow_no_operator: make the operator (like ``^`` or ``**``) optional?

    Returns:
        regex text matching the "product of powers" notation and, when
        exactly two units are given with a positive then a negative
        exponent, also the "x per y" notation
    """
    multiply = " " + MULTIPLY_OR_SPACE + " "
    power_elements: List[str] = []
    for unit, exponent in args:
        # A zero exponent is rejected as a programming error.
        assert exponent != 0
        power_elements.append(
            power(unit, exponent, allow_no_operator=allow_no_operator)
        )
    joined_power_elements = multiply.join(power_elements)
    options = [rf"(?: {joined_power_elements} )"]
    # noinspection PyChainedComparisons
    if len(args) == 2 and args[0][1] > 0 and args[1][1] < 0:
        # x per y: the "-1" notation is already covered by the power style.
        options.append(per(args[0][0], args[1][0], include_power_minus1=False))
    return r"(?: {} )".format(r" | ".join(options))
174# -----------------------------------------------------------------------------
175# Distance
176# -----------------------------------------------------------------------------
# Metric lengths: abbreviation or full word, British or American spelling,
# singular or plural.
M = r"(?: met(?:re|er)s? | m )"  # metre(s), meter(s), m
CM = r"(?: cm | centimet(?:re|er)s? )"  # cm, centimetre(s), centimeter(s)
MM = r"(?: mm | millimet(?:re|er)s? )"  # mm, millimetre(s), millimeter(s)

# Feet: feet, foot, ft; or a symbol -- apostrophe, right single quote
# (U+2019), prime (U+2032).
FEET = r"""(?: f(?:ee|oo)?t | \' | ’ | ′ )"""

# Inches: in, ins, inch, inches ("inchs" is a typo but clear, so accepted);
# or a symbol -- straight double quote, right double quote (U+201D), double
# prime (U+2033).
INCHES = r"""(?: in(?:ch(?:e)?)?s? | \" | ” | ″)"""
189# -----------------------------------------------------------------------------
190# Mass
191# -----------------------------------------------------------------------------
# Micrograms; "ug" is accepted because you won't stop people using it.
MCG = r"(?: mcg | microgram(?:me)?s? | [μu]g )"
# mg, milligram(s), milligramme(s)
MG = r"(?: mg | milligram(?:me)?s? )"
# g, gram(s), gramme(s)
G = r"(?: gram(?:me)?s? | g )"
# kg, kgs, kilo(s), kilogram(s), kilogramme(s)
KG = r"(?: kgs? | kilo(?:gram(?:me)?)?s? )"
# pound(s), lb(s)
LB = r"(?: pounds? | lbs? )"
# stone(s), st, st.
STONES = r"(?: stones? | st\.? )"
200# -----------------------------------------------------------------------------
201# Volume
202# -----------------------------------------------------------------------------
# Litre: L, litre(s), liter(s). The prefixed volumes below reuse this pattern.
L = r"(?: lit(?:re|er)s? | L )"
# 10^-1: "d" or "deci" prefix
DL = rf"(?: d(?:eci)?{L} )"
# 10^-3: "m" or "milli" prefix
ML = rf"(?: m(?:illi)?{L} )"
# 10^-6: microL, microliter(s), microlitre(s), μL, uL
MICROLITRE = rf"(?: micro{L} | [μu]L )"
# 10^-9: nanoL, nanoliter(s), nanolitre(s), nL
NANOLITRE = rf"(?: nano{L} | nL )"
# 10^-12: picoL, picoliter(s), picolitre(s), pL
PICOLITRE = rf"(?: pico{L} | pL )"
# 10^-15: femtoL, femtoliter(s), femtolitre(s), fL
FEMTOLITRE = rf"(?: femto{L} | fL )"
# Cubic millimetres, in three notations:
#   - "cubic mm", etc.
#   - mm^3, mm3, mm 3, etc. (the power operator is optional)
#   - "cmm", added 2018-09-07 having seen this in the wild (albeit in
#     urinary results).
CUBIC_MM = r"""(?: (?:\b cubic \s+ {mm}) | {mm_cubed} | (?: \b cmm \b ) )""".format(
    mm=MM,
    mm_cubed=power(MM, 3, allow_no_operator=True),
)

# A microlitre is of course the same as a cubic millimetre:
CUBIC_MM_OR_MICROLITRE = rf"(?: {MICROLITRE} | {CUBIC_MM} )"
221# -----------------------------------------------------------------------------
222# Inverse (reciprocal) volume
223# -----------------------------------------------------------------------------
# "Per cubic millimetre" with a blank numerator.
PER_CUBIC_MM = per(numerator="", denominator=CUBIC_MM, numerator_optional=True)
227# -----------------------------------------------------------------------------
228# Time
229# -----------------------------------------------------------------------------
# Time units, each bounded by word boundaries.
#
# DAY and WEEK follow the same "optional middle, fixed last letter" shape as
# YEAR. The earlier forms, d(?:y?|ay?)? and w(?:k?|eek?)?, also accepted the
# unintended fragments "da" and "wee".
HOUR = r"(?: \b h(?:rs?|ours?)? \b)"  # h, hr, hrs, hour, hours
DAY = r"(?: \b d(?:a?y)? \b )"  # d, dy, day
WEEK = r"(?: \b w(?:(?:ee)?k)? \b)"  # w, wk, week
MONTH = r"(?:\b month \b)"  # month
YEAR = r"(?:\b y(?:(?:ea)?r)? \b)"  # y, yr, year
DAYS_PER_WEEK = 7

# Weeks per month. The mean month (across a normal 4-year cycle, ignoring
# century non-leap years) is 30.4375 days. In R:
#
#   n <- c(28, rep(30, 4), rep(31, 7))  # normal year: mean 30.41667
#   l <- c(29, rep(30, 4), rep(31, 7))  # leap year: mean 30.5
#   fouryearcycle <- c(n, n, n, l)  # mean 30.4375
#   century <- c(rep(n, 76), rep(l, 24))  # mean 30.43667
#   mean(n) / 7  # 4.345238
#   mean(fouryearcycle) / 7  # 4.348214
#   mean(century) / 7  # 4.348095
#
# The Google answer for weeks per month is 4.34524, i.e. a normal year.
# But let's not be spuriously precise:
WEEKS_PER_MONTH_APPROX = 4.35
WEEKS_PER_YEAR_APPROX = 52
253# -----------------------------------------------------------------------------
254# Proportions
255# -----------------------------------------------------------------------------
# Either "%" or some subset of "percent": the letters "pct" are mandatory and
# the other characters (plus one whitespace character after "per") optional.
PERCENT = r"(?:%|pe?r?\s?ce?n?t)"
261# -----------------------------------------------------------------------------
262# Arbitrary count things
263# -----------------------------------------------------------------------------
# cell, cells (as whole words)
CELLS = r"(?:\b cells? \b)"

# U, IU, I.U., unit, units... (IU for international units)
UNITS = r"(?: (?:I\.?)? U(?:nits?|\.)? )"
# Micro-units: "micro", "μ" or "u" prefix
MICROUNITS = rf"(?: (?:micro|μ|u) {UNITS} )"
# Milli-units: "m" or "milli" prefix
MILLIUNITS = rf"(?: m(?:illi)? {UNITS} )"

UK = r"(?: U(?:nited\s+|\.\s*)? K(?:ingdom|\.)? )"
ALCOHOL = r"(?: \b(?:alcohol|ethanol|EtOH)\b )"
# U, unit, units, UK units, UK alcohol units...
# "IU" is matched too (via UNITS). Alcohol units are not international units,
# but RS used that term, so whether correct or in error, that is sufficient
# reason to include it.
# NOTE(review): the ALCOHOL part is a capturing group, unlike the
# non-capturing groups used elsewhere here -- confirm whether any caller
# relies on it before changing.
UK_ALCOHOL_UNITS = rf"(?: (?: {UK} \s+)? ({ALCOHOL} \s+)? {UNITS} )"
# Alcohol consumption rates: units per day/week/month/year. The "-1" power
# notation is not accepted for these.
UK_ALCOHOL_UNITS_PER_DAY = per(
    numerator=UK_ALCOHOL_UNITS, denominator=DAY, include_power_minus1=False
)
UK_ALCOHOL_UNITS_PER_WEEK = per(
    numerator=UK_ALCOHOL_UNITS, denominator=WEEK, include_power_minus1=False
)
UK_ALCOHOL_UNITS_PER_MONTH = per(
    numerator=UK_ALCOHOL_UNITS, denominator=MONTH, include_power_minus1=False
)
UK_ALCOHOL_UNITS_PER_YEAR = per(
    numerator=UK_ALCOHOL_UNITS, denominator=YEAR, include_power_minus1=False
)
# score, scored
SCORE = r"(?:scored?)"
293# -----------------------------------------------------------------------------
294# Moles
295# -----------------------------------------------------------------------------
# mol, mole, mols, moles (as whole words)
MOLES = r"(?:\b mole?s? \b)"
# Micromoles: "micro", "μ" or "u" prefix
MICROMOLES = r"(?: (?:micro|μ|u)mole?s? )"
# Millimoles: "m" or "milli" prefix
MILLIMOLES = r"(?: m(?:illi)?mole?s? )"

# Equivalents, with the same prefixes as above
MICROEQ = r"(?: (?:micro|μ|u)Eq )"
MILLIEQ = r"(?: m(?:illi)?Eq )"
304# -----------------------------------------------------------------------------
305# Concentration (molarity)
306# -----------------------------------------------------------------------------
# μM, uM, micromolar
MICROMOLAR = r"(?:[μu]M | micromolar)"
# mM. NB: when matched case-insensitively this is confusable with
# millimetres ("mm").
MILLIMOLAR = r"(?:mM)"
# Amount of substance per litre.
MICROEQ_PER_L = per(numerator=MICROEQ, denominator=L)
MICROMOLES_PER_L = per(numerator=MICROMOLES, denominator=L)
MILLIEQ_PER_L = per(numerator=MILLIEQ, denominator=L)
MILLIMOLES_PER_L = per(numerator=MILLIMOLES, denominator=L)
316# -----------------------------------------------------------------------------
317# Concentration (mass)
318# -----------------------------------------------------------------------------
# Mass (or, for L/L, volume) per unit volume.
G_PER_DL = per(numerator=G, denominator=DL)
G_PER_L = per(numerator=G, denominator=L)
L_PER_L = per(numerator=L, denominator=L)
MG_PER_DL = per(numerator=MG, denominator=DL)
MG_PER_L = per(numerator=MG, denominator=L)
326# -----------------------------------------------------------------------------
327# Concentration (arbitrary count and dimensionless things)
328# -----------------------------------------------------------------------------
# Counts per litre
BILLION_PER_L = per(numerator=BILLION, denominator=L)
TRILLION_PER_L = per(numerator=TRILLION, denominator=L)

# Cell counts per small volume; the word "cells" may be omitted
CELLS_PER_CUBIC_MM = per(
    numerator=CELLS, denominator=CUBIC_MM, numerator_optional=True
)
CELLS_PER_CUBIC_MM_OR_MICROLITRE = per(
    numerator=CELLS,
    denominator=CUBIC_MM_OR_MICROLITRE,
    numerator_optional=True,
)

# Units per volume
MICROUNITS_PER_ML = per(numerator=MICROUNITS, denominator=ML)
MILLIUNITS_PER_L = per(numerator=MILLIUNITS, denominator=L)
UNITS_PER_L = per(numerator=UNITS, denominator=L)

# Dimensionless ratio of amounts
MILLIMOLES_PER_MOL = per(numerator=MILLIMOLES, denominator=MOLES)
344# -----------------------------------------------------------------------------
345# Speed
346# -----------------------------------------------------------------------------
# Millimetres per hour
MM_PER_H = per(numerator=MM, denominator=HOUR)
350# -----------------------------------------------------------------------------
351# Pressure
352# -----------------------------------------------------------------------------
# mmHg, mm Hg. The spelled-out "millimetres of mercury" is not matched; the
# likelihood of meeting it is thought to be quite small.
MM_HG = r"(?: mm \s* Hg )"
357# -----------------------------------------------------------------------------
358# Area and related
359# -----------------------------------------------------------------------------
# Square metres. The pattern carries its own inline "#" comments, so it
# presumably relies on being compiled in verbose mode -- confirm against the
# callers.
SQ_M = r"""
    (?: # square metres
        (?: sq(?:uare)? \s+ {m} ) # sq m, square metres, etc.
        | (?: {m} \s+ sq(?:uared?)? ) # m sq, metres square(d), etc.
        | {m_sq} # m ^ 2, etc.
    )
""".format(
    m=M,
    m_sq=power(M, 2),
)

# BMI units: "kg per square metre" or "kg times m to the power -2".
KG_PER_SQ_M = r"(?: {kg_per_sqm} | {kg_sqm_pow_minus2} )".format(
    kg_per_sqm=per(KG, SQ_M, include_power_minus1=False),
    kg_sqm_pow_minus2=units_times(KG, power(M, -2)),
)
378# =============================================================================
379# Generic conversion functions
380# =============================================================================
def kg_from_st_lb_oz(
    stones: float = 0, pounds: float = 0, ounces: float = 0
) -> Optional[float]:
    """
    Converts an Imperial mass (stones, pounds, ounces) to kilograms.

    Args:
        stones: number of stones
        pounds: number of pounds
        ounces: number of ounces

    Returns:
        mass in kg, or ``None`` if the arithmetic raises ``TypeError`` or
        ``ValueError`` (for example, when an argument is ``None``)
    """
    # Avoirdupois: "goods of weight"; aveir de peis (OFr.; see OED).
    # https://en.wikipedia.org/wiki/Pound_(mass)
    lb_per_stone = 14
    oz_per_lb = 16
    kg_per_lb = 0.45359237
    try:
        return kg_per_lb * (
            (stones * lb_per_stone) + pounds + (ounces / oz_per_lb)
        )
    except (TypeError, ValueError):
        return None
def m_from_ft_in(feet: float = 0, inches: float = 0) -> Optional[float]:
    """
    Converts an Imperial length (feet, inches) to metres.

    Args:
        feet: number of feet
        inches: number of inches

    Returns:
        length in m, or ``None`` if the arithmetic raises ``TypeError`` or
        ``ValueError`` (for example, when an argument is ``None``)
    """
    inches_per_foot = 12
    mm_per_inch = 25.4
    mm_per_m = 1000
    try:
        return ((feet * inches_per_foot) + inches) * mm_per_inch / mm_per_m
    except (TypeError, ValueError):
        return None
def m_from_m_cm(metres: float = 0, centimetres: float = 0) -> Optional[float]:
    """
    Combines a metres part and a centimetres part into a length in metres.

    Args:
        metres: number of metres
        centimetres: number of centimetres

    Returns:
        length in m, or ``None`` if the arithmetic raises ``TypeError`` or
        ``ValueError`` (for example, when an argument is ``None``)
    """
    cm_per_m = 100
    try:
        return metres + (centimetres / cm_per_m)
    except (TypeError, ValueError):
        return None
def assemble_units(components: List[Optional[str]]) -> str:
    """
    Joins unit fragments with single spaces, skipping any that are falsy
    (``None`` or empty); e.g. ``["ft", "in"]`` becomes ``"ft in"``.

    Args:
        components: unit fragments, some of which may be missing

    Returns:
        the space-separated units
    """
    return " ".join(filter(None, components))
def factor_millimolar_from_mg_per_dl(molecular_mass_g_per_mol: float) -> float:
    """
    Returns the conversion factor that you should multiply a "mg/dL" number by
    to get a "mM" (mmol/L) number.

    Principle (each name is the numerical value expressed in that unit):

    .. code-block:: none

        mmol_per_L
            = 1000 * mol_per_L
            = 1000 * (g_per_L / g_per_mol)
            = 1000 * ((10 * g_per_dL) / g_per_mol)
            = 1000 * ((10 * 0.001 * mg_per_dL) / g_per_mol)
            = (1000 * 10 * 0.001 / g_per_mol) * mg_per_dL
            = (10 / g_per_mol) * mg_per_dL

    Example:
        glucose, molecular mass 180.156 g/mol
        => conversion factor is (10 / 180.156)
        90 mg/dL -> (10 / 180.156) * 90 mM = 5.0 mM

    Args:
        molecular_mass_g_per_mol: molecular mass in g/mol

    Returns:
        conversion factor

    """
    # 1000 (mol -> mmol) * 10 (per dL -> per L) * 0.001 (mg -> g)
    combined_scale = 10
    return combined_scale / molecular_mass_g_per_mol
def factor_micromolar_from_mg_per_dl(molecular_mass_g_per_mol: float) -> float:
    """
    Returns the conversion factor that you should multiply a "mg/dL" number by
    to get a "μM" (μmol/L) number. This is 1000 times the factor for mM.

    Args:
        molecular_mass_g_per_mol: molecular mass in g/mol

    Returns:
        conversion factor

    """
    millimolar_factor = factor_millimolar_from_mg_per_dl(
        molecular_mass_g_per_mol
    )
    return 1000 * millimolar_factor
def millimolar_from_mg_per_dl(
    mg_per_dl: float, molecular_mass_g_per_mol: float
) -> float:
    """
    Converts a concentration in mg/dL to one in mM (mmol/L).

    Args:
        mg_per_dl: concentration in mg/dL
        molecular_mass_g_per_mol: molecular mass of the substance, in g/mol

    Returns:
        concentration in mM = mmol/L

    """
    factor = factor_millimolar_from_mg_per_dl(molecular_mass_g_per_mol)
    return mg_per_dl * factor
def micromolar_from_mg_per_dl(
    mg_per_dl: float, molecular_mass_g_per_mol: float
) -> float:
    """
    Converts a concentration in mg/dL to one in μM (μmol/L).

    Args:
        mg_per_dl: concentration in mg/dL
        molecular_mass_g_per_mol: molecular mass of the substance, in g/mol

    Returns:
        concentration in μM = μmol/L

    """
    factor = factor_micromolar_from_mg_per_dl(molecular_mass_g_per_mol)
    return mg_per_dl * factor