Coverage for common/regex_helpers.py: 41%

83 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-02-05 06:46 -0600

1""" 

2crate_anon/common/regex_helpers.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Constants and helper functions for use with regexes.** 

27 

28""" 

29 

30from typing import Iterable, List, Union 

31 

32import regex # sudo apt-get install python-regex 

33 

34 

35# ============================================================================= 

36# Constants 

37# ============================================================================= 

38 

39# Reminders: ? zero or one, + one or more, * zero or more 

40# Non-capturing groups: (?:...) 

41# ... https://docs.python.org/3/howto/regex.html 

42# ... https://stackoverflow.com/questions/3512471/non-capturing-group 

43 

44ASTERISK = r"\*" 

45AT_LEAST_ONE_NONWORD = r"\W+" # 1 or more non-word characters (not alphanumeric or underscore) 

46AT_LEAST_ONE_WHITESPACE = r"\s+" # one or more whitespace chars 

47AT_LEAST_ONE_NON_NEWLINE_WHITESPACE = r"[ \t]+" # one or more spaces/tabs 

48 

49HYPHEN_OR_EN_DASH = r"[-–]" 

50 

51LEFT_BRACKET = r"\(" 

52 

53NON_ALPHANUMERIC_SPLITTERS = regex.compile(AT_LEAST_ONE_NONWORD, regex.UNICODE) 

54 

55# http://www.regular-expressions.info/lookaround.html 

56# Not all engines support lookbehind; e.g. regexr.com doesn't; but Python does 

57NOT_DIGIT_LOOKBEHIND = r"(?<!\d)" 

58NOT_DIGIT_LOOKAHEAD = r"(?!\d)" 

59 

60# The Kleene star has highest precedence. 

61# So, for example, ab*c matches abbbc, but not (all of) ababc. See regexr.com 

62OPTIONAL_NONWORD = r"\W*" # zero or more non-alphanumeric characters... 

63# ... doesn't need to be [\W]*, for precedence reasons as above. 

64OPTIONAL_WHITESPACE = r"\s*" # zero or more whitespace chars 

65OPTIONAL_NON_NEWLINE_WHITESPACE = r"[ \t]*" # zero or more spaces/tabs 

66 

67REGEX_METACHARS = [ 

68 "\\", 

69 "^", 

70 "$", 

71 ".", 

72 "|", 

73 "?", 

74 "*", 

75 "+", 

76 "(", 

77 ")", 

78 "[", 

79 "{", 

80 "#", 

81 " ", 

82] 

83# http://www.regular-expressions.info/characters.html 

84# Keep "\" first, so backslashes added for later metachars aren't re-escaped. 

85 

86RIGHT_BRACKET = r"\)" 

87 

88WB = r"\b" # word boundary; escape the backslash if not using a raw string 

89WHITESPACE_CHARACTERS = [" ", "\t", "\n"] 

90WORD_BOUNDARY = WB 

91 

92_NOT_EMPTY_WORD_ONLY_REGEX = regex.compile(r"^\w+$") 

93_NOT_EMPTY_ALPHABETICAL_ONLY_REGEX = regex.compile("^[a-zA-Z]+$") 

94# cf. https://stackoverflow.com/questions/336210/regular-expression-for-alphanumeric-and-underscores # noqa: E501 

95 

96 

97# ============================================================================= 

98# Helper functions 

99# ============================================================================= 

100 

101 

102def escape_literal_string_for_regex(s: str) -> str: 

103 r""" 

104 Escape any regex characters. Returns a string. 

105 

106 For example, maps ``Hello there.`` to ``Hello\ there\.`` 

107 

108 Start with ``\`` -> ``\\``; this should be the first replacement in 

109 :data:`REGEX_METACHARS`. 

110 """ 

111 for c in REGEX_METACHARS: 

112 s = s.replace(c, "\\" + c) 

113 return s 

114 

115 

116def escape_literal_for_regex_giving_charlist(s: str) -> List[str]: 

117 r""" 

118 Escape any regex characters. Returns a list of characters or escaped 

119 characters. 

120 

121 Start with ``\`` -> ``\\``; this should be the first replacement in 

122 :data:`REGEX_METACHARS`. 

123 """ 

124 chars = [] # type: List[str] 

125 for unescaped_char in s: 

126 if unescaped_char in REGEX_METACHARS: 

127 chars.append("\\" + unescaped_char) 

128 else: 

129 chars.append(unescaped_char) 

130 return chars 

131 

132 

133def escape_literal_for_regex_allowing_flexible_whitespace(s: str) -> str: 

134 r""" 

135 Escapes literal characters, but creates a regex that allows flexible 

136 whitespace (e.g. double space) for every bit of whitespace in the original. 

137 

138 For example, maps ``Hello there.`` to ``Hello\s+there\.`` 

139 """ 

140 # Replace all forms of whitespace with spaces. 

141 for c in WHITESPACE_CHARACTERS: 

142 s = s.replace(c, " ") 

143 # Eliminate double spaces 

144 while "  " in s: 

145 s = s.replace("  ", " ") 

146 # Escape regex characters, except handling whitespace (now, spaces) 

147 # differently. 

148 s = escape_literal_string_for_regex(s) 

149 s = s.replace(r"\ ", AT_LEAST_ONE_WHITESPACE) 

150 return s 

151 

152 

153def at_wb_start_end(regex_str: str) -> str: 

154 """ 

155 Returns a version of the regex starting and ending with a word boundary. 

156 

157 Caution using this. Digits do not end a word, so "mm3" will not match if 

158 your "mm" group ends in a word boundary. 

159 """ 

160 return rf"\b{regex_str}\b" 

161 

162 

163def at_start_wb(regex_str: str) -> str: 

164 """ 

165 Returns a version of the regex starting with a word boundary. 

166 

167 Beware, though: e.g. "3kg" is reasonable, but it contains NO word boundary 

168 before "kg", so a regex for "kg" built this way will not match it. 

169 """ 

170 return rf"\b{regex_str}" 

171 

172 

173def noncapture_group(regex_str: str) -> str: 

174 """ 

175 Wraps the string in a non-capture group, ``(?: ... )`` 

176 """ 

177 return f"(?:{regex_str})" 

178 

179 

180def optional_noncapture_group(regex_str: str) -> str: 

181 """ 

182 Wraps the string in an optional non-capture group, ``(?: ... )?`` 

183 """ 

184 return f"(?:{regex_str})?" 

185 

186 

187def named_capture_group(regex_str: str, name: str) -> str: 

188 """ 

189 Wraps the string in a named capture group, ``(?P<name>...)`` 

190 The P is for Python extensions; 

191 https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups 

192 """ 

193 return f"(?P<{name}>{regex_str})" 

194 

195 

196def optional_named_capture_group(regex_str: str, name: str) -> str: 

197 """ 

198 As for :func:`named_capture_group`, but optional. 

199 """ 

200 return f"(?P<{name}>{regex_str})?" 

201 

202 

203def regex_or( 

204 *regex_strings: str, 

205 wrap_each_in_noncapture_group: bool = False, 

206 wrap_result_in_noncapture_group: bool = False, 

207) -> str: 

208 """ 

209 Returns a regex representing an "or" join of the components. 

210 

211 Args: 

212 regex_strings: 

213 The strings to join with ``|``. 

214 wrap_each_in_noncapture_group: 

215 Convert each ``component`` into ``(?:component)`` before joining? 

216 wrap_result_in_noncapture_group: 

217 Convert the final ``result`` into ``(?:result)``? 

218 """ 

219 if len(regex_strings) == 1: 

220 # Add a bit of efficiency. 

221 only_string = regex_strings[0] 

222 if wrap_each_in_noncapture_group or wrap_result_in_noncapture_group: 

223 return noncapture_group(only_string) 

224 else: 

225 return only_string 

226 if wrap_each_in_noncapture_group: 

227 result = "|".join(noncapture_group(x) for x in regex_strings) 

228 else: 

229 result = "|".join(x for x in regex_strings) 

230 if wrap_result_in_noncapture_group: 

231 return noncapture_group(result) 

232 else: 

233 return result 

234 

235 

236def assert_alphabetical(x: Union[str, Iterable[str]]) -> None: 

237 """ 

238 Asserts that the string is not empty and contains only alphabetical 

239 characters. 

240 """ 

241 if isinstance(x, str): 

242 assert _NOT_EMPTY_ALPHABETICAL_ONLY_REGEX.match(x), ( 

243 f"Should be non-empty and contain only alphabetical characters: " 

244 f"{x!r}" 

245 ) 

246 else: 

247 for s in x: 

248 assert isinstance(s, str) 

249 assert _NOT_EMPTY_ALPHABETICAL_ONLY_REGEX.match(s), ( 

250 f"Should be non-empty and contain only alphabetical " 

251 f"characters: {s!r} (part of {x!r})" 

252 ) 

253 

254 

255def first_n_characters_required(x: str, n: int) -> str: 

256 """ 

257 Returns a regex string that requires the first n characters, and then 

258 allows the rest as optional as long as they are in sequence. 

259 

260 Args: 

261 x: 

262 String 

263 n: 

264 Minimum number of characters required at the start 

265 """ 

266 assert _NOT_EMPTY_WORD_ONLY_REGEX.match(x) 

267 assert n >= 0 

268 start = x[0:n] 

269 rest = x[n:] 

270 rest_regex = "" 

271 for c in reversed(rest): 

272 rest_regex = optional_noncapture_group(c + rest_regex) 

273 return start + rest_regex 

274 

275 

276def anchor(x: str, start: bool = True, end: bool = True) -> str: 

277 """ 

278 Anchor a regex at the start and/or end. 

279 """ 

280 s = "^" if start else "" 

281 e = "$" if end else "" 

282 return s + x + e