Coverage for nlp_manager/regex_numbers.py: 96%

28 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

"""
crate_anon/nlp_manager/regex_numbers.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Constants and functions to assist in making regular expressions relating to
numbers (e.g. integers, floating-point, scientific notation...).**

"""

30 

# =============================================================================
# Helper functions
# =============================================================================

34 

35 

def _negative_lookahead(x: str) -> str:
    """
    Regex for: x does not occur here.

    Args:
        x: regex fragment that must NOT match starting at the current
            position

    Returns:
        ``x`` wrapped in a ``(?! ... )`` negative lookahead assertion.

    The fragment is padded with a space on each side. Those spaces are only
    insignificant when the final pattern is compiled in verbose
    (free-spacing) mode; otherwise they would be matched literally.
    """
    # (?! something ) is a negative lookahead assertion
    return rf"(?! {x} )"

42 

43 

def _negative_lookbehind(x: str) -> str:
    """
    Regex for: x does not immediately precede what's here.

    Args:
        x: regex fragment that must NOT match immediately before the current
            position

    Returns:
        ``x`` wrapped in a ``(?<! ... )`` negative lookbehind assertion.

    The fragment is padded with a space on each side. Those spaces are only
    insignificant when the final pattern is compiled in verbose
    (free-spacing) mode; otherwise they would be matched literally.
    """
    # (?<! something ) is a negative lookbehind assertion
    return rf"(?<! {x} )"

50 

51 

# =============================================================================
# Mathematical operations
# =============================================================================

55 

# Regex fragments for arithmetic symbols. The grouped alternations contain
# padding spaces, so they rely on the final pattern being compiled in verbose
# (free-spacing) mode.
MULTIPLY = r"[x\*×⋅]"  # x, *, ×, ⋅
MULTIPLY_OR_SPACE = r"[x\*×⋅\s]"  # x, *, ×, ⋅, space
POWER = r"(?: \^ | \*\* )"  # ^, **
POWER_INC_E = r"(?: e | \^ | \*\* )"  # e, ^, **
# "**" is listed before "*" so that the two-character operator is tried first.
POWER_INC_E_ASTERISK = r"(?: e | \^ | \*\* | \*)"  # e, ^, **, *
# ... e.g. in CUH: "10*9/L" for "×10^9/L"

PLUS_SIGN = r"\+"  # don't forget to escape it
MINUS_SIGN = r"[-−–]"  # any of: ASCII hyphen-minus, Unicode minus, en dash
SIGN = rf"(?: {PLUS_SIGN} | {MINUS_SIGN} )"

66 

# NO_MINUS_SIGN = _negative_lookahead(MINUS_SIGN)
# NO_PRECEDING_MINUS_SIGN = _negative_lookbehind(MINUS_SIGN)
# NO_PRECEDING_MINUS_SIGN_OR_DIGIT = _negative_lookbehind(fr"{MINUS_SIGN} | \d")  # noqa: E501

# Assertion: what immediately precedes is none of
#   - a minus sign;
#   - a digit, optionally followed by a comma;
#   - a decimal point.
# NOTE(review): the "\d,?" alternative is variable-width. The standard-library
# "re" module rejects variable-width lookbehind, so this presumably targets
# the third-party "regex" module -- confirm against the callers.
NO_PRECEDING_MINUS_SIGN_OR_DIGITCOMMA_OR_DOT = _negative_lookbehind(
    rf"{MINUS_SIGN} | \d,? | \."
)

73 

74 

# =============================================================================
# Quantities
# =============================================================================

78 

79 

def times_ten_to_power(n: int) -> str:
    """
    For a power *n*, returns a regex to capture "10^n" and similar notations.

    Args:
        n: the exponent; it is interpolated into the pattern as its plain
            decimal text

    Returns:
        a non-capturing group that matches, in order: an optional
        multiplication symbol, ``10``, a power operator (``e``, ``^``, ``**``
        or ``*``), and then *n*, with optional whitespace between the parts.

    The pattern contains padding spaces, so it relies on being compiled in
    verbose (free-spacing) mode.
    """
    return rf"(?: {MULTIPLY}? \s* 10 \s* {POWER_INC_E_ASTERISK} \s* {n})"

85 

86 

BILLION = times_ten_to_power(9)  # "x10^9" and similar notations
TRILLION = times_ten_to_power(12)  # "x10^12" and similar notations

89 

90 

# =============================================================================
# Number components
# =============================================================================
# Don't create components that are entirely optional; they're hard to test!

95 

PLAIN_INTEGER = r"\d+"
# Numbers with commas: https://stackoverflow.com/questions/5917082
# ... then modified a little, because that fails with Python's regex module;
# (a) the "\d+" grabs things like "12,000" and thinks "aha, 12", so we have to
#     fix that by putting the "thousands" bit first; then
# (b) that has to be modified to contain at least one comma/thousands grouping
#     (or it will treat "9800" as "980").

PLAIN_INTEGER_W_THOUSAND_COMMAS = r"(?: (?: \d{1,3} (?:,\d{3})+ ) | \d+ )"
# ... plain integer allowing commas as a thousands separator
#     (1) a number with thousands separators, or
#     (2) a plain number
# ... NOTE: PUT THE ONE THAT NEEDS TO BE GREEDIER FIRST, i.e. the one with
#     thousands separators

FLOATING_POINT_GROUP = r"(?: \. \d+ )"  # decimal point and further digits

# "E", an optional sign, then one or more digits.
# NOTE(review): only an upper-case "E" is written here; matching a lower-case
# "e" presumably relies on the pattern being compiled case-insensitively --
# confirm against the callers.
SCIENTIFIC_NOTATION_EXPONENT = rf"(?: E {SIGN}? \d+ )"
# ... Scientific notation does NOT offer non-integer exponents.
# Specifically, float("-3.4e-27") is fine, but float("-3.4e-27.1") isn't.

# NO_FOLLOWING_SCIENTIFIC_NOTATION_EXPONENT = _negative_lookahead(
#     SCIENTIFIC_NOTATION_EXPONENT)

118 

119 

# =============================================================================
# Number types
# =============================================================================
# Beware of the IGNORESIGN types. You may not want a sign, but if you use an
# IGNORESIGN type, "-3" will be read as "3". (The UNSIGNED types instead
# refuse to match a number that is immediately preceded by a minus sign.)

125 

# Beware this one. You may not want a sign, but if you use this, "-3" will be
# read as "3".
IGNORESIGN_INTEGER = PLAIN_INTEGER_W_THOUSAND_COMMAS
# Integer with an optional leading plus or minus sign.
SIGNED_INTEGER = r"(?: {sign}? {integer} )".format(
    sign=SIGN,  # optional
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
)
# Integer with an optional leading plus sign; the lookbehind stops it from
# matching when a minus sign, a digit (optionally with a comma) or a decimal
# point comes immediately before.
UNSIGNED_INTEGER = r"(?: {nominus} {plus}? {integer} )".format(
    nominus=NO_PRECEDING_MINUS_SIGN_OR_DIGITCOMMA_OR_DOT,
    plus=PLUS_SIGN,  # optional
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
)

138 

# Integer part plus an optional fractional part; any preceding sign is not
# part of the match, so "-3.5" will be read as "3.5".
IGNORESIGN_FLOAT = r"(?: {integer} {fp}? )".format(
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
    fp=FLOATING_POINT_GROUP,  # optional
)
# Optional plus or minus sign, integer part, optional fractional part.
SIGNED_FLOAT = r"(?: {sign}? {integer} {fp}? )".format(
    sign=SIGN,  # optional
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
    fp=FLOATING_POINT_GROUP,  # optional
)
# Optional plus sign, integer part, optional fractional part; the lookbehind
# stops it from matching when a minus sign, a digit (optionally with a comma)
# or a decimal point comes immediately before.
UNSIGNED_FLOAT = r"(?: {nominus} {plus}? {integer} {fp}? )".format(
    nominus=NO_PRECEDING_MINUS_SIGN_OR_DIGITCOMMA_OR_DOT,
    plus=PLUS_SIGN,  # optional
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
    fp=FLOATING_POINT_GROUP,  # optional
)

154 

# The most permissive number pattern here: optional sign, integer part
# (thousands commas allowed), optional fractional part, and optional
# scientific-notation exponent.
LIBERAL_NUMBER = r"(?: {sign}? {integer} {fp}? {exp}? )".format(
    sign=SIGN,  # optional
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
    fp=FLOATING_POINT_GROUP,  # optional
    exp=SCIENTIFIC_NOTATION_EXPONENT,  # optional
)

160)