Coverage for nlp_manager/tests/regex_numbers_tests.py: 100%

26 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/nlp_manager/tests/regex_numbers_tests.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26Unit tests. 

27 

28""" 

29 

30import unittest 

31 

32from crate_anon.nlp_manager.regex_numbers import ( 

33 MINUS_SIGN, 

34 MULTIPLY, 

35 PLUS_SIGN, 

36 POWER, 

37 POWER_INC_E, 

38 # POWER_INC_E_ASTERISK, 

39 SIGN, 

40 BILLION, 

41 TRILLION, 

42 PLAIN_INTEGER, 

43 PLAIN_INTEGER_W_THOUSAND_COMMAS, 

44 SCIENTIFIC_NOTATION_EXPONENT, 

45 IGNORESIGN_FLOAT, 

46 IGNORESIGN_INTEGER, 

47 LIBERAL_NUMBER, 

48 SIGNED_FLOAT, 

49 SIGNED_INTEGER, 

50 UNSIGNED_FLOAT, 

51 UNSIGNED_INTEGER, 

52) 

53from crate_anon.nlp_manager.tests.regex_test_helperfunc import ( 

54 assert_text_regex, 

55) 

56 

57 

58# ============================================================================= 

59# Unit tests 

60# ============================================================================= 

61 

62 

class NumberRegexesTests(unittest.TestCase):
    """
    Exercises the number-related regexes against sample text.
    """

    @staticmethod
    def test_number_regexes() -> None:
        """
        Run every number regex through ``assert_text_regex``.

        The checks are held in one ordered table. Each row is
        ``(label, regex, cases)``, where ``cases`` is a list of
        ``(text, expected)`` pairs; ``expected`` is presumably the list of
        substrings the regex should find in ``text`` (the exact comparison
        is performed by ``assert_text_regex``). Rows are processed in order
        and the first failing row aborts the test.
        """
        suites = [
            # -----------------------------------------------------------------
            # Operators, etc.
            # -----------------------------------------------------------------
            ("MULTIPLY", MULTIPLY, [
                ("a * b", ["*"]),
                ("a x b", ["x"]),
                ("a × b", ["×"]),
                ("a ⋅ b", ["⋅"]),
                ("a blah b", []),
            ]),
            ("POWER", POWER, [
                ("a ^ b", ["^"]),
                ("a ** b", ["**"]),
                ("10e5", []),
                ("10E5", []),
                ("a blah b", []),
            ]),
            ("POWER_INC_E", POWER_INC_E, [
                ("a ^ b", ["^"]),
                ("a ** b", ["**"]),
                ("10e5", ["e"]),
                ("10E5", ["E"]),
                ("a blah b", []),
            ]),
            ("BILLION", BILLION, [
                ("10 x 10^9/l", ["x 10^9"]),
            ]),
            ("PLUS_SIGN", PLUS_SIGN, [
                ("a + b", ["+"]),
                ("a blah b", []),
            ]),
            ("MINUS_SIGN", MINUS_SIGN, [
                # good:
                ("a - b", ["-"]),  # ASCII hyphen-minus
                ("a − b", ["−"]),  # Unicode minus
                ("a – b", ["–"]),  # en dash
                # bad:
                ("a — b", []),  # em dash
                ("a ‐ b", []),  # Unicode formal hyphen
                ("a blah b", []),
            ]),
            # Can't test optional regexes very easily! They match nothing.
            ("SIGN", SIGN, [
                # good:
                ("a + b", ["+"]),
                ("a - b", ["-"]),  # ASCII hyphen-minus
                ("a − b", ["−"]),  # Unicode minus
                ("a – b", ["–"]),  # en dash
                # bad:
                ("a — b", []),  # em dash
                ("a ‐ b", []),  # Unicode formal hyphen
                ("a blah b", []),
            ]),
            # -----------------------------------------------------------------
            # Quantities
            # -----------------------------------------------------------------
            ("BILLION", BILLION, [
                ("* 10^9", ["* 10^9"]),
                ("× 10e9", ["× 10e9"]),
                ("x 10 ** 9", ["x 10 ** 9"]),
            ]),
            ("TRILLION", TRILLION, [
                ("* 10^12", ["* 10^12"]),
                ("× 10e12", ["× 10e12"]),
                ("x 10 ** 12", ["x 10 ** 12"]),
            ]),
            # -----------------------------------------------------------------
            # Number elements
            # -----------------------------------------------------------------
            ("PLAIN_INTEGER", PLAIN_INTEGER, [
                ("a 1234 b", ["1234"]),
                ("a 1234.5 b", ["1234", "5"]),
                ("a 12,000 b", ["12", "000"]),
            ]),
            (
                "PLAIN_INTEGER_W_THOUSAND_COMMAS",
                PLAIN_INTEGER_W_THOUSAND_COMMAS,
                [
                    ("a 1234 b", ["1234"]),
                    ("a 1234.5 b", ["1234", "5"]),
                    ("a 12,000 b", ["12,000"]),
                ],
            ),
            ("SCIENTIFIC_NOTATION_EXPONENT", SCIENTIFIC_NOTATION_EXPONENT, [
                ("a 1234 b", []),
                ("E-4", ["E-4"]),
                ("e15", ["e15"]),
                ("e15.3", ["e15"]),
            ]),
            # -----------------------------------------------------------------
            # Number types
            # -----------------------------------------------------------------
            ("IGNORESIGN_FLOAT", IGNORESIGN_FLOAT, [
                ("1", ["1"]),
                ("12345", ["12345"]),
                ("-1", ["1"]),  # NB may be unexpected!
                ("1.2", ["1.2"]),
                ("-3.4", ["3.4"]),  # NB may be unexpected!
                ("+3.4", ["3.4"]),
                ("-3.4e27.3", ["3.4", "27.3"]),
                ("3.4e-27", ["3.4", "27"]),
                ("9,800", ["9,800"]),
                ("17,600.34", ["17,600.34"]),
                ("-17,300.6588", ["17,300.6588"]),
                ("+12345", ["12345"]),
                ("-12345", ["12345"]),  # NB may be unexpected!
                ("-12345.67", ["12345.67"]),  # NB may be unexpected!
                ("12345.67", ["12345.67"]),
                # NB may be unexpected!
                ("-12345.67e-5", ["12345.67", "5"]),
                # NB may be unexpected!
                ("12345.67e-5", ["12345.67", "5"]),
            ]),
            ("IGNORESIGN_INTEGER", IGNORESIGN_INTEGER, [
                ("1", ["1"]),
                ("12345", ["12345"]),
                ("-1", ["1"]),  # will drop sign
                ("1.2", ["1", "2"]),
                ("-3.4", ["3", "4"]),
                ("+3.4", ["3", "4"]),
                ("-3.4e27.3", ["3", "4", "27", "3"]),
                ("3.4e-27", ["3", "4", "27"]),
                ("9,800", ["9,800"]),
                ("17,600.34", ["17,600", "34"]),
                ("-17,300.6588", ["17,300", "6588"]),
                ("-12345", ["12345"]),  # NB may be unexpected!
                ("-12345.67", ["12345", "67"]),  # NB may be unexpected!
            ]),
            ("LIBERAL_NUMBER", LIBERAL_NUMBER, [
                ("1", ["1"]),
                ("12345", ["12345"]),
                ("-1", ["-1"]),
                ("1.2", ["1.2"]),
                ("-3.4", ["-3.4"]),
                ("+3.4", ["+3.4"]),
                # not valid scientific notation
                ("-3.4e27.3", ["-3.4e27", "3"]),
                ("3.4e-27", ["3.4e-27"]),
                ("9,800", ["9,800"]),
                ("17,600.34", ["17,600.34"]),
                ("-17,300.6588", ["-17,300.6588"]),
                ("+12345", ["+12345"]),
                ("-12345", ["-12345"]),
                ("-12345.67", ["-12345.67"]),
                ("-12345.67e-5", ["-12345.67e-5"]),
            ]),
            ("SIGNED_FLOAT", SIGNED_FLOAT, [
                ("1", ["1"]),
                ("12345", ["12345"]),
                ("-1", ["-1"]),
                ("1.2", ["1.2"]),
                ("-3.4", ["-3.4"]),
                ("+3.4", ["+3.4"]),
                ("-3.4e27.3", ["-3.4", "27.3"]),
                ("3.4e-27", ["3.4", "-27"]),
                ("9,800", ["9,800"]),
                ("17,600.34", ["17,600.34"]),
                ("-17,300.6588", ["-17,300.6588"]),
                ("+12345", ["+12345"]),
                ("-12345", ["-12345"]),
                ("-12345.67", ["-12345.67"]),
                # NB may be unexpected!
                ("-12345.67e-5", ["-12345.67", "-5"]),
            ]),
            ("SIGNED_INTEGER", SIGNED_INTEGER, [
                ("1", ["1"]),
                ("12345", ["12345"]),
                ("-1", ["-1"]),
                ("1.2", ["1", "2"]),
                ("-3.4", ["-3", "4"]),
                ("+3.4", ["+3", "4"]),
                ("-3.4e27.3", ["-3", "4", "27", "3"]),
                ("3.4e-27", ["3", "4", "-27"]),
                ("9,800", ["9,800"]),
                ("17,600.34", ["17,600", "34"]),
                ("-17,300.6588", ["-17,300", "6588"]),
                ("+12345", ["+12345"]),
                ("-12345", ["-12345"]),
                ("-12345.67", ["-12345", "67"]),  # NB may be unexpected!
                # NB may be unexpected!
                ("-12345.67e-5", ["-12345", "67", "-5"]),
            ]),
            ("UNSIGNED_FLOAT", UNSIGNED_FLOAT, [
                ("1", ["1"]),
                ("12345", ["12345"]),
                ("-1", []),
                ("1.2", ["1.2"]),
                ("-3.4", []),
                ("+3.4", ["+3.4"]),
                ("-3.4e27.3", ["27.3"]),
                ("3.4e-27", ["3.4"]),
                ("9,800", ["9,800"]),
                ("17,600.34", ["17,600.34"]),
                ("-17,300.6588", []),
                ("+12345", ["+12345"]),
                ("-12345", []),
                ("-12345.67", []),
                ("12345.67", ["12345.67"]),
                ("-12345.67e-5", []),
                ("12345.67e-5", ["12345.67"]),  # NB may be unexpected!
            ]),
            ("UNSIGNED_INTEGER", UNSIGNED_INTEGER, [
                ("12345", ["12345"]),
                ("+12345", ["+12345"]),
                ("-12345", []),
                ("-12345.67", []),
                ("-12345.67e-5", []),
            ]),
        ]

        # Same call, same order, same arguments as a hand-written sequence of
        # assert_text_regex() invocations; verbose output is always requested.
        for label, regex, cases in suites:
            assert_text_regex(label, regex, cases, verbose=True)

371 ], 

372 verbose=verbose, 

373 )