Coverage for nlp_manager/regex_numbers.py: 96%
28 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/nlp_manager/regex_numbers.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Constants and functions to assist in making regular expressions relating to
27numbers (e.g. integers, floating-point, scientific notation...).**
29"""
31# =============================================================================
32# Helper functions
33# =============================================================================
def _negative_lookahead(x: str) -> str:
    """
    Wrap a regex fragment in a negative lookahead assertion.

    Args:
        x: regex fragment that must NOT match at the current position.

    Returns:
        The string ``(?! x )``. The padding spaces are part of the returned
        text, so the result is only equivalent to ``(?!x)`` when the final
        pattern is compiled in verbose mode.
    """
    # "(?! ... )" succeeds only where "..." does not match next.
    template = "(?! {} )"
    return template.format(x)
def _negative_lookbehind(x: str) -> str:
    """
    Wrap a regex fragment in a negative lookbehind assertion.

    Args:
        x: regex fragment that must NOT immediately precede the current
            position.

    Returns:
        The string ``(?<! x )``. The padding spaces are part of the returned
        text, so the result is only equivalent to ``(?<!x)`` when the final
        pattern is compiled in verbose mode.
    """
    # "(?<! ... )" succeeds only where "..." does not match just before.
    template = "(?<! {} )"
    return template.format(x)
52# =============================================================================
53# Mathematical operations
54# =============================================================================
# Characters accepted as a multiplication sign: x, *, ×, ⋅
_MULTIPLY_CHARS = r"x\*×⋅"
MULTIPLY = "[" + _MULTIPLY_CHARS + "]"
# As MULTIPLY, but a whitespace character is accepted too.
MULTIPLY_OR_SPACE = "[" + _MULTIPLY_CHARS + r"\s]"

# Building blocks for the "raise to a power" operators.
_CARET = r"\^"
_DOUBLE_ASTERISK = r"\*\*"
POWER = rf"(?: {_CARET} | {_DOUBLE_ASTERISK} )"  # ^, **
POWER_INC_E = rf"(?: e | {_CARET} | {_DOUBLE_ASTERISK} )"  # e, ^, **
# e, ^, **, * -- the lone asterisk is seen e.g. in CUH: "10*9/L" for "×10^9/L"
POWER_INC_E_ASTERISK = rf"(?: e | {_CARET} | {_DOUBLE_ASTERISK} | \*)"

PLUS_SIGN = r"\+"  # "+" is a regex metacharacter, so it must be escaped
MINUS_SIGN = r"[-−–]"  # ASCII hyphen-minus, Unicode minus, or en dash
SIGN = "(?: {} | {} )".format(PLUS_SIGN, MINUS_SIGN)  # either sign
67# NO_MINUS_SIGN = _negative_lookahead(MINUS_SIGN)
68# NO_PRECEDING_MINUS_SIGN = _negative_lookbehind(MINUS_SIGN)
69# NO_PRECEDING_MINUS_SIGN_OR_DIGIT = _negative_lookbehind(fr"{MINUS_SIGN} | \d") # noqa: E501
# Negative lookbehind: the current position must not be preceded by a minus
# sign, by a digit (optionally followed by a comma), or by a decimal point.
# NOTE(review): the alternatives differ in width ("\d,?" is one or two
# characters); the standard-library "re" module rejects variable-width
# lookbehind, so this presumably targets the third-party "regex" module --
# confirm against the callers.
NO_PRECEDING_MINUS_SIGN_OR_DIGITCOMMA_OR_DOT = _negative_lookbehind(
    " | ".join((MINUS_SIGN, r"\d,?", r"\."))
)
75# =============================================================================
76# Quantities
77# =============================================================================
def times_ten_to_power(n: int) -> str:
    """
    Build a regex fragment for "×10^n"-style notation for a given power.

    Args:
        n: the exponent; it is interpolated into the pattern as-is.

    Returns:
        A non-capturing group made of an optional multiplication sign, "10",
        a power operator (e, ^, ** or *) and then *n*, with optional
        whitespace allowed between those parts.
    """
    tokens = (
        MULTIPLY + "?",  # the multiplication sign is optional
        r"\s*",
        "10",
        r"\s*",
        POWER_INC_E_ASTERISK,
        r"\s*",
        format(n),
    )
    # Tokens are separated by single spaces; there is no space before the
    # closing bracket.
    return "(?: " + " ".join(tokens) + ")"
# Ready-made fragments for "×10^9" and "×10^12" notation respectively.
BILLION, TRILLION = (times_ten_to_power(exponent) for exponent in (9, 12))
91# =============================================================================
92# Number components
93# =============================================================================
94# Don't create components that are entirely optional; they're hard to test!
PLAIN_INTEGER = r"\d+"  # one or more digits

# Plain integer allowing commas as a thousands separator. Two alternatives:
#   (1) a number with thousands separators, or
#   (2) a plain number.
# Based on https://stackoverflow.com/questions/5917082, then modified a
# little, because that fails with Python's regex module:
#   (a) the "\d+" grabs things like "12,000" and thinks "aha, 12", so the
#       "thousands" alternative has to come first; and
#   (b) that alternative has to contain at least one comma/thousands
#       grouping (or it will treat "9800" as "980").
# NOTE: PUT THE ONE THAT NEEDS TO BE GREEDIER FIRST, i.e. the one with
# thousands separators.
PLAIN_INTEGER_W_THOUSAND_COMMAS = (
    rf"(?: (?: \d{{1,3}} (?:,\d{{3}})+ ) | {PLAIN_INTEGER} )"
)

# Decimal point and further digits.
FLOATING_POINT_GROUP = rf"(?: \. {PLAIN_INTEGER} )"
# Exponent part of scientific notation: "E", an optional sign, then digits.
SCIENTIFIC_NOTATION_EXPONENT = "(?: E {sign}? {digits} )".format(
    sign=SIGN,  # optional
    digits=r"\d+",
)
113# ... Scientific notation does NOT offer non-integer exponents.
114# Specifically, float("-3.4e-27") is fine, but float("-3.4e-27.1") isn't.
116# NO_FOLLOWING_SCIENTIFIC_NOTATION_EXPONENT = _negative_lookahead(
117# SCIENTIFIC_NOTATION_EXPONENT)
120# =============================================================================
121# Number types
122# =============================================================================
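123# Beware of the sign-ignoring (IGNORESIGN_*) types. You may not want a sign,
124# but if you use one of them, "-3" will be read as "3". The UNSIGNED_* types
124# instead refuse to match a number directly preceded by a minus sign.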
126# Beware this one. You may not want a sign, but if you use this, "-3" will be
127# read as "3".
# Integer with no sign handling: any leading sign is simply not consumed.
IGNORESIGN_INTEGER = PLAIN_INTEGER_W_THOUSAND_COMMAS
# Integer with an optional leading plus or minus sign.
SIGNED_INTEGER = rf"(?: {SIGN}? {PLAIN_INTEGER_W_THOUSAND_COMMAS} )"
# Integer with an optional plus sign, guarded by the negative lookbehind so
# that it does not start straight after a minus sign, digit(+comma) or dot.
UNSIGNED_INTEGER = (
    rf"(?: {NO_PRECEDING_MINUS_SIGN_OR_DIGITCOMMA_OR_DOT} "
    rf"{PLUS_SIGN}? {PLAIN_INTEGER_W_THOUSAND_COMMAS} )"
)
# Integer part plus an optional decimal part; any leading sign is not
# consumed.
IGNORESIGN_FLOAT = (
    rf"(?: {PLAIN_INTEGER_W_THOUSAND_COMMAS} {FLOATING_POINT_GROUP}? )"
)
# Optional plus/minus sign, integer part, optional decimal part.
SIGNED_FLOAT = (
    rf"(?: {SIGN}? {PLAIN_INTEGER_W_THOUSAND_COMMAS} "
    rf"{FLOATING_POINT_GROUP}? )"
)
# Optional plus sign, integer part, optional decimal part; guarded by the
# negative lookbehind against a preceding minus sign, digit(+comma) or dot.
UNSIGNED_FLOAT = (
    rf"(?: {NO_PRECEDING_MINUS_SIGN_OR_DIGITCOMMA_OR_DOT} "
    rf"{PLUS_SIGN}? {PLAIN_INTEGER_W_THOUSAND_COMMAS} "
    rf"{FLOATING_POINT_GROUP}? )"
)
# Most permissive number: optional sign, integer part, optional decimal part,
# optional scientific-notation exponent.
LIBERAL_NUMBER = (
    rf"(?: {SIGN}? {PLAIN_INTEGER_W_THOUSAND_COMMAS} "
    rf"{FLOATING_POINT_GROUP}? {SCIENTIFIC_NOTATION_EXPONENT}? )"
)