Coverage for nlp_manager/regex_units.py: 85%

136 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/nlp_manager/regex_units.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Regular expressions to detect physical units.** 

27 

28""" 

29 

30from typing import List, Optional, Tuple 

31 

32from crate_anon.nlp_manager.regex_numbers import ( 

33 BILLION, 

34 MULTIPLY_OR_SPACE, 

35 PLAIN_INTEGER, 

36 POWER, 

37 TRILLION, 

38) 

39 

40 

41# ============================================================================= 

42# Physical units 

43# ============================================================================= 

44 

# Regex fragment for the separator between a score and its maximum: either a
# slash or the words "out of".
# NOTE(review): the fragment is spaced out for readability, so it presumably
# relies on being compiled in verbose mode -- confirm at the call sites.
OUT_OF_SEPARATOR = r"(?: \/ | \b out \s+ of \b )"

46 

47 

def per(
    numerator: str,
    denominator: str,
    include_power_minus1: bool = True,
    numerator_optional: bool = False,
) -> str:
    """
    Builds regex text matching "X per Y", e.g. "millimoles per litre" or
    "cells per cubic millimetre".

    The separator may be a slash or the word "per"; optionally the
    "X Y -1" notation is accepted as well.

    Args:
        numerator: regex text for the numerator; may be blank
        denominator: regex text for the denominator
        include_power_minus1: also accept the "n d -1" way of writing "n/d"
        numerator_optional: if true (and a numerator is supplied), the
            numerator may be missing from the text being matched

    Returns:
        regex text wrapped in a non-capturing group
    """
    if not numerator:
        # Blank numerator: nothing precedes the separator.
        prefix = ""
    elif numerator_optional:
        # The trailing whitespace lives inside the optional group, so that a
        # missing numerator leaves no stray whitespace requirement behind.
        prefix = rf"(?: {numerator} \s* )?"
    else:
        # Numerator followed by optional whitespace. "\s*" (with "\b" in the
        # power form below) is used instead of "\s+" so that a blank
        # numerator remains possible.
        prefix = rf"{numerator} \s*"

    alternatives = [rf"{prefix} (?: \/ | \b per \b) \s* {denominator}"]
    if include_power_minus1:
        alternatives.append(rf"{prefix} \b {denominator} \s* -1")

    body = " | ".join(alternatives)
    return f"(?: {body} )"

84 

85 

def _out_of_str(n_as_regex: str) -> str:
    """
    Builds regex text matching "out of N" (or "/ N").

    Args:
        n_as_regex: regex text that matches the "N" part

    Returns:
        regex text wrapped in a non-capturing group
    """
    # Forms covered:
    #   / n
    #   out of n
    template = r"(?: {separator} \s* {n} \b)"
    return template.format(separator=OUT_OF_SEPARATOR, n=n_as_regex)

96 

97 

def out_of(n: int) -> str:
    """
    Builds regex text matching "out of N" for one specific number.

    Args:
        n: the number N, inserted into the regex in its ``str()`` form

    Returns:
        regex text
    """
    n_text = str(n)
    return _out_of_str(n_text)

106 

107 

def out_of_anything() -> str:
    """
    Builds regex text matching "out of N" where N is any plain integer.

    Returns:
        regex text
    """
    any_integer = PLAIN_INTEGER
    return _out_of_str(any_integer)

114 

115 

def power(x: str, n: int, allow_no_operator: bool = False) -> str:
    """
    Builds regex text matching "x to the power n".

    Args:
        x: regex text for the base
        n: the exponent, inserted literally
        allow_no_operator: if true, the power operator (like ``^`` or ``**``)
            may be omitted in the text being matched

    Returns:
        regex text wrapped in a non-capturing group
    """
    # A "?" straight after the operator makes that operator optional.
    operator_quantifier = "?" if allow_no_operator else ""
    return rf"(?: {x} \s* {POWER}{operator_quantifier} \s* {n})"

131 

132 

def units_times(*args: str) -> str:
    """
    Joins the given regex fragments, putting an optional multiplication
    operator (or space) between neighbours.

    Intended for units that are notionally multiplied together.

    Args:
        *args: regex text for each unit, in order

    Returns:
        regex text wrapped in a non-capturing group
    """
    optional_multiply = f"{MULTIPLY_OR_SPACE}?"
    return "(?: " + optional_multiply.join(args) + " )"

143 

144 

def units_by_dimension(
    *args: Tuple[str, int],  # specify type of *one* arg!
    allow_no_operator: bool = False,
) -> str:
    """
    Builds regex text for a unit described by its dimensions.

    Each dimension becomes a "unit to the power n" fragment; the fragments are
    joined by a multiplication operator or space. When exactly two dimensions
    are given, the first with a positive and the second with a negative
    exponent, the "x per y" spelling is accepted as an alternative.

    Args:
        *args: each is a tuple ``unit, power``; the power must be non-zero
        allow_no_operator: make the power operator (like ``^`` or ``**``)
            optional?

    Returns:
        regex text wrapped in a non-capturing group
    """
    separator = f" {MULTIPLY_OR_SPACE} "
    powered = []  # type: List[str]
    for unit, exponent in args:
        assert exponent != 0
        powered.append(
            power(unit, exponent, allow_no_operator=allow_no_operator)
        )
    alternatives = ["(?: " + separator.join(powered) + " )"]

    # noinspection PyChainedComparisons
    is_simple_ratio = len(args) == 2 and args[0][1] > 0 and args[1][1] < 0
    if is_simple_ratio:
        # x per y
        numerator_unit = args[0][0]
        denominator_unit = args[1][0]
        alternatives.append(
            per(numerator_unit, denominator_unit, include_power_minus1=False)
        )
    return "(?: " + " | ".join(alternatives) + " )"

172 

173 

174# ----------------------------------------------------------------------------- 

175# Distance 

176# ----------------------------------------------------------------------------- 

177 

# Metric lengths: abbreviation or full word, British or American spelling.
M = r"(?: met(?:re|er)s? | m )"  # m, metre(s), meter(s)
CM = r"(?: cm | centimet(?:re|er)s? )"  # cm, centimetre(s), centimeter(s)
MM = r"(?: mm | millimet(?:re|er)s? )"  # mm, millimetre(s), millimeter(s)

# Imperial lengths, including the quote-mark shorthand (as in 5' 10").
FEET = r"""(?: f(?:ee|oo)?t | \' | ’ | ′ )"""
# ... feet, foot, ft
# ... apostrophe, right single quote (U+2019), prime (U+2032)
INCHES = r"""(?: in(?:ch(?:e)?)?s? | \" | ” | ″)"""
# ... in, ins, inch, inches, [inchs = typo but clear]
# ... ", right double quote (U+201D), double prime (U+2033)

188 

189# ----------------------------------------------------------------------------- 

190# Mass 

191# ----------------------------------------------------------------------------- 

192 

# Metric masses: abbreviation or full word, "gram" or "gramme" spelling.
MCG = r"(?: mcg | microgram(?:me)?s? | [μu]g )"  # you won't stop people using ug...  # noqa: E501
MG = r"(?: mg | milligram(?:me)?s? )"  # mg, milligram, milligrams, milligramme, milligrammes  # noqa: E501
G = r"(?: gram(?:me)?s? | g )"  # g, gram, grams, gramme, grammes
KG = r"(?: kgs? | kilo(?:gram(?:me)?)?s? )"  # kg, kgs, kilos ... kilogrammes etc.  # noqa: E501
# Imperial masses.
LB = r"(?: pounds? | lbs? )"  # pound(s), lb(s)
STONES = r"(?: stones? | st\.? )"  # stone(s), st, st.

199 

200# ----------------------------------------------------------------------------- 

201# Volume 

202# ----------------------------------------------------------------------------- 

203 

# Litre and its sub-multiples; each prefixed form embeds the litre pattern L.
L = r"(?: lit(?:re|er)s? | L )"  # L, litre(s), liter(s)
DL = rf"(?: d(?:eci)?{L} )"  # 10^-1
ML = rf"(?: m(?:illi)?{L} )"  # 10^-3
MICROLITRE = rf"(?: micro{L} | [μu]L )"  # 10^-6: microL, microliter(s), microlitre(s), μL, uL  # noqa: E501
NANOLITRE = rf"(?: nano{L} | nL )"  # 10^-9: nanoL, nanoliter(s), nanolitre(s), nL  # noqa: E501
PICOLITRE = rf"(?: pico{L} | pL )"  # 10^-12: picoL, picoliter(s), picolitre(s), pL  # noqa: E501
FEMTOLITRE = rf"(?: femto{L} | fL )"  # 10^-15: femtoL, femtoliter(s), femtolitre(s), fL  # noqa: E501
# Earlier form of CUBIC_MM, kept for reference (it lacks the "cmm" option):
# CUBIC_MM = r"""(?: (?:\b cubic \s+ {mm}) | {mm_cubed} )""".format(
CUBIC_MM = (
    r"""(?: (?:\b cubic \s+ {mm}) | {mm_cubed} | (?: \b cmm \b ) )"""
).format(mm=MM, mm_cubed=power(MM, 3, allow_no_operator=True))
# cubic mm, etc. | mm^3, mm3, mm 3, etc. | cmm
# "cmm" added 2018-09-07 having seen this in the wild (albeit urinary results).

# A microlitre is of course the same as a cubic millimetre:
CUBIC_MM_OR_MICROLITRE = rf"(?: {MICROLITRE} | {CUBIC_MM} )"

# -----------------------------------------------------------------------------
# Inverse (reciprocal) volume
# -----------------------------------------------------------------------------

# Built with a blank numerator, so per() emits only the separator and the
# denominator: "/ cubic mm", "per cubic mm", or the "cubic mm -1" form.
PER_CUBIC_MM = per("", CUBIC_MM, numerator_optional=True)

226 

227# ----------------------------------------------------------------------------- 

228# Time 

229# ----------------------------------------------------------------------------- 

230 

# Time units, each bounded by "\b" so that they match whole words only.
HOUR = r"(?: \b h(?:rs?|ours?)? \b)"  # h, hr, hrs, hour, hours
DAY = r"(?: \b d(?:y?|ay?)? \b )"  # d, dy, da, day (no plural form)
WEEK = r"(?: \b w(?:k?|eek?)? \b)"  # w, wk, wee, week (no plural form)
MONTH = r"(?:\b month \b)"  # month
YEAR = r"(?:\b y(?:(?:ea)?r)? \b)"  # y, yr, year

DAYS_PER_WEEK = 7

# The mean month (across a normal 4-year cycle ignoring century non-leap years)
# is 30.4375 days. Worked out in R:
#   n <- c(28, rep(30, 4), rep(31, 7))  # mean 30.41667
#   l <- c(29, rep(30, 4), rep(31, 7))  # mean 30.5
#   fouryearcycle <- c(n, n, n, l)  # mean 30.4375
#   century <- c(rep(n, 76), rep(l, 24))  # mean 30.43667
#   mean(n) / 7  # 4.345238
#   mean(fouryearcycle) / 7  # 4.348214
#   mean(century) / 7  # 4.348095
# ... the Google answer for weeks per month is 4.34524, i.e. a normal year.
# But let's not be spuriously precise:
WEEKS_PER_MONTH_APPROX = 4.35
WEEKS_PER_YEAR_APPROX = 52

252 

253# ----------------------------------------------------------------------------- 

254# Proportions 

255# ----------------------------------------------------------------------------- 

256 

PERCENT = r"""(?:%|pe?r?\s?ce?n?t)"""
# "%" or some subset of "percent" -- for the latter, must have "pct", other
# characters optional; one optional whitespace character is allowed before the
# "c" (so "per cent" matches too).

260 

261# ----------------------------------------------------------------------------- 

262# Arbitrary count things 

263# ----------------------------------------------------------------------------- 

264 

CELLS = r"(?:\b cells? \b)"  # cell, cells

UNITS = r"(?: (?:I\.?)? U(?:nits?|\.)? )"  # U, IU, I.U., unit, units...
# (IU for international units)
MICROUNITS = rf"(?: (?:micro|μ|u) {UNITS} )"  # micro/μ/u prefix, then UNITS
MILLIUNITS = rf"(?: m(?:illi)? {UNITS} )"  # m/milli prefix, then UNITS

UK = r"(?: U(?:nited\s+|\.\s*)? K(?:ingdom|\.)? )"  # UK, U.K., United Kingdom
ALCOHOL = r"(?: \b(?:alcohol|ethanol|EtOH)\b )"
# NOTE(review): "({ALCOHOL} \s+)?" below is a capturing group, whereas the
# neighbouring fragments use non-capturing "(?: ... )" groups throughout. It
# may be unintentional, but changing it would renumber groups for any user of
# this pattern -- check the callers before touching it.
UK_ALCOHOL_UNITS = rf"(?: (?: {UK} \s+)? ({ALCOHOL} \s+)? {UNITS} )"
# U, unit, units, UK units, UK alcohol units...
# I thought not "IU" as they are not international units; however, RS used that
# term, so whether correct or in error, that's sufficient for me to include it!

# Alcohol units per unit time. The "-1" power notation is switched off, so
# only the "/" and "per" separators are accepted.
UK_ALCOHOL_UNITS_PER_DAY = per(
    UK_ALCOHOL_UNITS, DAY, include_power_minus1=False
)
UK_ALCOHOL_UNITS_PER_WEEK = per(
    UK_ALCOHOL_UNITS, WEEK, include_power_minus1=False
)
UK_ALCOHOL_UNITS_PER_MONTH = per(
    UK_ALCOHOL_UNITS, MONTH, include_power_minus1=False
)
UK_ALCOHOL_UNITS_PER_YEAR = per(
    UK_ALCOHOL_UNITS, YEAR, include_power_minus1=False
)

SCORE = r"(?:scored?)"  # score(d)

292 

293# ----------------------------------------------------------------------------- 

294# Moles 

295# ----------------------------------------------------------------------------- 

296 

MOLES = r"(?:\b mole?s? \b)"  # mol, mole, mols, moles
MICROMOLES = r"(?: (?:micro|μ|u)mole?s? )"  # micromol(e)(s), μmol(e)(s), umol(e)(s)  # noqa: E501
MILLIMOLES = r"(?: m(?:illi)?mole?s? )"  # mmol(e)(s), millimol(e)(s)

# "Eq" presumably stands for equivalents -- confirm against the callers.
MICROEQ = r"(?: (?:micro|μ|u)Eq )"  # microEq, μEq, uEq
MILLIEQ = r"(?: m(?:illi)?Eq )"  # mEq, milliEq

303 

304# ----------------------------------------------------------------------------- 

305# Concentration (molarity) 

306# ----------------------------------------------------------------------------- 

307 

MICROMOLAR = r"(?:[μu]M | micromolar)"  # μM, uM, micromolar
MILLIMOLAR = r"(?:mM)"  # NB case-insensitive... confusable with millimetres

# Amount of substance per litre; per() also accepts the "x L -1" notation here.
MICROEQ_PER_L = per(MICROEQ, L)
MICROMOLES_PER_L = per(MICROMOLES, L)
MILLIEQ_PER_L = per(MILLIEQ, L)
MILLIMOLES_PER_L = per(MILLIMOLES, L)

# -----------------------------------------------------------------------------
# Concentration (mass)
# -----------------------------------------------------------------------------

G_PER_DL = per(G, DL)
G_PER_L = per(G, L)
L_PER_L = per(L, L)  # volume per volume (dimensionless)
MG_PER_DL = per(MG, DL)
MG_PER_L = per(MG, L)

# -----------------------------------------------------------------------------
# Concentration (arbitrary count and dimensionless things)
# -----------------------------------------------------------------------------

BILLION_PER_L = per(BILLION, L)
TRILLION_PER_L = per(TRILLION, L)

# The word "cell(s)" is optional in these two: text may give just the
# "per volume" part.
CELLS_PER_CUBIC_MM = per(CELLS, CUBIC_MM, numerator_optional=True)
CELLS_PER_CUBIC_MM_OR_MICROLITRE = per(
    CELLS, CUBIC_MM_OR_MICROLITRE, numerator_optional=True
)

MICROUNITS_PER_ML = per(MICROUNITS, ML)
MILLIUNITS_PER_L = per(MILLIUNITS, L)
UNITS_PER_L = per(UNITS, L)

MILLIMOLES_PER_MOL = per(MILLIMOLES, MOLES)

343 

344# ----------------------------------------------------------------------------- 

345# Speed 

346# ----------------------------------------------------------------------------- 

347 

MM_PER_H = per(MM, HOUR)  # millimetres per hour

# -----------------------------------------------------------------------------
# Pressure
# -----------------------------------------------------------------------------

MM_HG = r"(?: mm \s* Hg )"  # mmHg, mm Hg
# ... likelihood of "millimetres of mercury" quite small?

356 

357# ----------------------------------------------------------------------------- 

358# Area and related 

359# ----------------------------------------------------------------------------- 

360 

# Square metres. This pattern spans several lines and carries inline "#"
# comments, so it only works when compiled in verbose mode.
SQ_M = r"""
    (?: # square metres
        (?: sq(?:uare)? \s+ {m} ) # sq m, square metres, etc.
        | (?: {m} \s+ sq(?:uared?)? ) # m sq, metres square(d), etc.
        | {m_sq} # m ^ 2, etc.
    )
""".format(
    m=M, m_sq=power(M, 2)
)

# BMI: kilograms per square metre, written either as a ratio ("kg / sq m",
# "kg per m^2") or as a product with a negative power ("kg m^-2").
KG_PER_SQ_M = r"(?: {kg_per_sqm} | {kg_sqm_pow_minus2} )".format(
    kg_per_sqm=per(KG, SQ_M, include_power_minus1=False),
    kg_sqm_pow_minus2=units_times(KG, power(M, -2)),
)

376 

377 

378# ============================================================================= 

379# Generic conversion functions 

380# ============================================================================= 

381 

382 

def kg_from_st_lb_oz(
    stones: float = 0, pounds: float = 0, ounces: float = 0
) -> Optional[float]:
    """
    Converts an Imperial mass (stones, pounds, ounces) to kilograms.

    Args:
        stones: number of stones
        pounds: number of pounds
        ounces: number of ounces

    Returns:
        mass in kg, or ``None`` if the arithmetic raises ``TypeError`` or
        ``ValueError`` (e.g. one of the inputs is ``None``)

    """
    # https://en.wikipedia.org/wiki/Pound_(mass)
    # Have you the peas? "Goods of weight"; aveir de peis (OFr.; see OED).
    pounds_per_stone = 14
    ounces_per_pound = 16
    kg_per_pound = 0.45359237  # 1 avoirdupois pound, in kg
    try:
        from_stones = stones * pounds_per_stone
        from_ounces = ounces / ounces_per_pound
        return kg_per_pound * (from_stones + pounds + from_ounces)
    except (TypeError, ValueError):
        return None

403 

404 

def m_from_ft_in(feet: float = 0, inches: float = 0) -> Optional[float]:
    """
    Converts an Imperial length (feet, inches) to metres.

    Args:
        feet: number of feet
        inches: number of inches

    Returns:
        length in m, or ``None`` if the arithmetic raises ``TypeError`` or
        ``ValueError`` (e.g. one of the inputs is ``None``)

    """
    inches_per_foot = 12
    mm_per_inch = 25.4
    mm_per_m = 1000
    try:
        all_inches = (feet * inches_per_foot) + inches
        return all_inches * mm_per_inch / mm_per_m
    except (TypeError, ValueError):
        return None

420 

421 

def m_from_m_cm(metres: float = 0, centimetres: float = 0) -> Optional[float]:
    """
    Combines a metres/centimetres pair into a single value in metres.

    Args:
        metres: number of metres
        centimetres: number of centimetres

    Returns:
        length in m, or ``None`` if the arithmetic raises ``TypeError`` or
        ``ValueError`` (e.g. one of the inputs is ``None``)
    """
    cm_per_m = 100
    try:
        extra_metres = centimetres / cm_per_m
        return metres + extra_metres
    except (TypeError, ValueError):
        return None

430 

431 

def assemble_units(components: List[Optional[str]]) -> str:
    """
    Joins unit names with single spaces, skipping any that are ``None`` or
    blank; e.g. ``["ft", "in"]`` becomes ``"ft in"``.

    Args:
        components: unit names, any of which may be ``None`` or empty

    Returns:
        the non-blank components, space-separated
    """
    return " ".join(filter(None, components))

438 

439 

def factor_millimolar_from_mg_per_dl(molecular_mass_g_per_mol: float) -> float:
    """
    Returns the conversion factor that you should multiply a "mg/dL" number by
    to get a "mM" (mmol/L) number.

    Principle (each name stands for the numerical value in that unit):

    .. code-block:: none

        mmol_per_L
            = 1000 * mol_per_L
            = 1000 * (g_per_L / g_per_mol)
            = 1000 * ((10 * g_per_dL) / g_per_mol)
            = 1000 * ((10 * 0.001 * mg_per_dL) / g_per_mol)
            = (1000 * 10 * 0.001 / g_per_mol) * mg_per_dL
            = (10 / g_per_mol) * mg_per_dL

    Example:
        glucose, molecular mass 180.156 g/mol
        => conversion factor is (10 / 180.156)
        90 mg/dL -> (10 / 180.156) * 90 mM = 5.0 mM

    Args:
        molecular_mass_g_per_mol: molecular mass in g/mol

    Returns:
        conversion factor

    """
    # Net effect of mg -> g, dL -> L and mol -> mmol (see derivation above).
    combined_scale = 10
    return combined_scale / molecular_mass_g_per_mol

470 

471 

def factor_micromolar_from_mg_per_dl(molecular_mass_g_per_mol: float) -> float:
    """
    Returns the conversion factor that you should multiply a "mg/dL" number by
    to get a "μM" (μmol/L) number.

    This is 1000 times the mg/dL-to-mM factor.

    Args:
        molecular_mass_g_per_mol: molecular mass in g/mol

    Returns:
        conversion factor

    """
    millimolar_factor = factor_millimolar_from_mg_per_dl(
        molecular_mass_g_per_mol
    )
    return 1000 * millimolar_factor

485 

486 

def millimolar_from_mg_per_dl(
    mg_per_dl: float, molecular_mass_g_per_mol: float
) -> float:
    """
    Converts a concentration from mg/dL to mM (mmol/L).

    Args:
        mg_per_dl: value in mg/dL
        molecular_mass_g_per_mol: molecular mass in g/mol

    Returns:
        value in mM = mmol/L

    """
    factor = factor_millimolar_from_mg_per_dl(molecular_mass_g_per_mol)
    return mg_per_dl * factor

504 

505 

def micromolar_from_mg_per_dl(
    mg_per_dl: float, molecular_mass_g_per_mol: float
) -> float:
    """
    Converts a concentration from mg/dL to μM (μmol/L).

    Args:
        mg_per_dl: value in mg/dL
        molecular_mass_g_per_mol: molecular mass in g/mol

    Returns:
        value in μM = μmol/L

    """
    factor = factor_micromolar_from_mg_per_dl(molecular_mass_g_per_mol)
    return mg_per_dl * factor