Coverage for common/regex_helpers.py: 41%
83 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
1"""
2crate_anon/common/regex_helpers.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Constants and helper functions for use with regexes.**
28"""
30from typing import Iterable, List, Union
32import regex # sudo apt-get install python-regex
# =============================================================================
# Constants
# =============================================================================
# Regex fragments (plain strings) and a few precompiled patterns, intended to
# be concatenated into larger regular expressions.

# Reminders: ? zero or one, + one or more, * zero or more
# Non-capturing groups: (?:...)
# ... https://docs.python.org/3/howto/regex.html
# ... https://stackoverflow.com/questions/3512471/non-capturing-group

ASTERISK = r"\*"  # a literal asterisk
AT_LEAST_ONE_NONWORD = r"\W+"  # one or more non-word (\W) characters
AT_LEAST_ONE_WHITESPACE = r"\s+"  # one or more whitespace characters
AT_LEAST_ONE_NON_NEWLINE_WHITESPACE = r"[ \t]+"  # one or more spaces/tabs

HYPHEN_OR_EN_DASH = r"[-–]"  # ASCII hyphen-minus or an en dash

LEFT_BRACKET = r"\("  # a literal opening parenthesis

# Compiled pattern matching a run of non-word characters.
# NOTE(review): the name suggests it is used to split text into words; the
# callers are not visible here -- confirm.
NON_ALPHANUMERIC_SPLITTERS = regex.compile(AT_LEAST_ONE_NONWORD, regex.UNICODE)

# http://www.regular-expressions.info/lookaround.html
# Not all engines support lookbehind; e.g. regexr.com doesn't; but Python does
NOT_DIGIT_LOOKBEHIND = r"(?<!\d)"  # the preceding character is not a digit
NOT_DIGIT_LOOKAHEAD = r"(?!\d)"  # the following character is not a digit

# The Kleene star has highest precedence.
# So, for example, ab*c matches abbbc, but not (all of) ababc. See regexr.com
OPTIONAL_NONWORD = r"\W*"  # zero or more non-word (\W) characters...
# ... doesn't need to be [\W]*, for precedence reasons as above.
OPTIONAL_WHITESPACE = r"\s*"  # zero or more whitespace characters
OPTIONAL_NON_NEWLINE_WHITESPACE = r"[ \t]*"  # zero or more spaces/tabs

# Characters that the escaping helpers in this module prefix with a backslash.
# The backslash itself MUST stay first: escape_literal_string_for_regex()
# applies these replacements in list order, so escaping "\" later would
# double-escape the backslashes added for the other characters.
REGEX_METACHARS = [
    "\\",
    "^",
    "$",
    ".",
    "|",
    "?",
    "*",
    "+",
    "(",
    ")",
    "[",
    "{",
    "#",
    " ",
]
# http://www.regular-expressions.info/characters.html
# Start with \, for replacement.

RIGHT_BRACKET = r"\)"  # a literal closing parenthesis

WB = r"\b"  # word boundary; escape the slash if not using a raw string
# Characters treated as interchangeable whitespace by
# escape_literal_for_regex_allowing_flexible_whitespace().
WHITESPACE_CHARACTERS = [" ", "\t", "\n"]
WORD_BOUNDARY = WB  # longer alias for WB

# Validation patterns used by the helper functions below. Note that "$" also
# matches just before a single trailing newline.
_NOT_EMPTY_WORD_ONLY_REGEX = regex.compile(r"^\w+$")
_NOT_EMPTY_ALPHABETICAL_ONLY_REGEX = regex.compile("^[a-zA-Z]+$")
# cf. https://stackoverflow.com/questions/336210/regular-expression-for-alphanumeric-and-underscores  # noqa: E501
97# =============================================================================
98# Helper functions
99# =============================================================================
def escape_literal_string_for_regex(s: str) -> str:
    r"""
    Backslash-escape every regex metacharacter in ``s``, so that the result
    can be embedded in a regex and match ``s`` literally. Returns a string.

    For example, ``Hello there.`` becomes ``Hello\ there\.``

    A character is escaped exactly when it is listed in
    :data:`REGEX_METACHARS`; that list includes the backslash itself, so
    ``\`` becomes ``\\``.
    """
    # Single pass: copy each character, prefixing it with a backslash when it
    # is a metacharacter. Because every entry of REGEX_METACHARS is a single
    # character, this gives the same result as replacing them one at a time
    # with the backslash handled first.
    return "".join(
        "\\" + char if char in REGEX_METACHARS else char for char in s
    )
def escape_literal_for_regex_giving_charlist(s: str) -> List[str]:
    r"""
    Escape regex metacharacters, returning one list element per input
    character.

    Each element is either the original character, or (for characters listed
    in :data:`REGEX_METACHARS`, which includes ``\`` itself) that character
    preceded by a backslash.
    """
    return [
        "\\" + char if char in REGEX_METACHARS else char
        for char in s
    ]
def escape_literal_for_regex_allowing_flexible_whitespace(s: str) -> str:
    r"""
    Escapes literal characters, but creating a regex that allows flexible
    whitespace (e.g. double space) for every bit of whitespace in the original.

    For example, maps ``Hello there.`` to ``Hello\s+there\.``

    Args:
        s:
            The literal text to convert.

    Returns:
        A regex string in which every run of whitespace from ``s`` (as
        defined by :data:`WHITESPACE_CHARACTERS`) has become
        :data:`AT_LEAST_ONE_WHITESPACE`, and all other regex metacharacters
        are backslash-escaped.
    """
    # Replace all forms of whitespace with spaces.
    for c in WHITESPACE_CHARACTERS:
        s = s.replace(c, " ")
    # Collapse every run of spaces down to a single space. The loop must look
    # for TWO consecutive spaces: testing for a single space would never
    # terminate on any input containing a space. The literal is built
    # explicitly so it cannot be misread or squashed to one space.
    double_space = " " * 2
    while double_space in s:
        s = s.replace(double_space, " ")
    # Escape regex characters, except handling whitespace (now, spaces)
    # differently: the escaper turns each space into "\ ", which we then swap
    # for the flexible-whitespace pattern.
    s = escape_literal_string_for_regex(s)
    return s.replace(r"\ ", AT_LEAST_ONE_WHITESPACE)
def at_wb_start_end(regex_str: str) -> str:
    r"""
    Surround a regex with word-boundary assertions (``\b``) on both sides.

    Use with care: a digit does not end a word, so a units pattern such as
    "mm" wrapped this way will not match inside "mm3".
    """
    return r"\b{}\b".format(regex_str)
def at_start_wb(regex_str: str) -> str:
    r"""
    Prefix a regex with a word-boundary assertion (``\b``).

    Beware: text such as "3kg" is reasonable input, yet it contains no word
    boundary between the digit and the letters, so a "kg" pattern anchored
    this way will not match there.
    """
    return r"\b{}".format(regex_str)
def noncapture_group(regex_str: str) -> str:
    """
    Enclose a regex in a non-capturing group: ``x`` becomes ``(?:x)``.
    """
    return "(?:{})".format(regex_str)
def optional_noncapture_group(regex_str: str) -> str:
    """
    Enclose a regex in an optional non-capturing group: ``x`` becomes
    ``(?:x)?``.
    """
    return "(?:{})?".format(regex_str)
def named_capture_group(regex_str: str, name: str) -> str:
    """
    Enclose a regex in a named capture group, ``(?P<name>...)``.

    The ``P`` marks a Python-specific extension; see
    https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups
    """
    return "(?P<{group}>{body})".format(group=name, body=regex_str)
def optional_named_capture_group(regex_str: str, name: str) -> str:
    """
    Like :func:`named_capture_group`, but the whole group is optional:
    ``(?P<name>...)?``.
    """
    return "(?P<{group}>{body})?".format(group=name, body=regex_str)
def regex_or(
    *regex_strings: str,
    wrap_each_in_noncapture_group: bool = False,
    wrap_result_in_noncapture_group: bool = False,
) -> str:
    """
    Builds a regex that matches any one of the supplied alternatives.

    Args:
        regex_strings:
            The alternatives, which are joined with ``|``.
        wrap_each_in_noncapture_group:
            Turn each ``alternative`` into ``(?:alternative)`` before joining?
        wrap_result_in_noncapture_group:
            Turn the joined ``result`` into ``(?:result)``?

    Returns:
        The combined regex string. A single alternative is wrapped at most
        once, whichever of the two flags is set.
    """
    if len(regex_strings) == 1:
        # Nothing to join; also avoids wrapping a lone alternative twice.
        (sole,) = regex_strings
        wrap_needed = (
            wrap_each_in_noncapture_group or wrap_result_in_noncapture_group
        )
        return noncapture_group(sole) if wrap_needed else sole

    if wrap_each_in_noncapture_group:
        alternatives = [noncapture_group(alt) for alt in regex_strings]
    else:
        alternatives = regex_strings
    joined = "|".join(alternatives)

    if not wrap_result_in_noncapture_group:
        return joined
    return noncapture_group(joined)
def assert_alphabetical(x: Union[str, Iterable[str]]) -> None:
    """
    Asserts that a string (or every string in an iterable) is not empty and
    contains only alphabetical characters (``a-z``, ``A-Z``).

    Args:
        x:
            A single string, or an iterable of strings to check one by one.

    Raises:
        AssertionError:
            If a string is empty or contains any other character, or if an
            item of the iterable is not a string. As with all ``assert``
            statements, the checks are skipped when Python runs with ``-O``.
    """
    # fullmatch(), not match(): the pattern ends in "$", which also matches
    # just before a trailing newline, so match() would wrongly accept a
    # string such as "abc\\n". fullmatch() requires the whole string to be
    # consumed.
    if isinstance(x, str):
        assert _NOT_EMPTY_ALPHABETICAL_ONLY_REGEX.fullmatch(x), (
            f"Should be non-empty and contain only alphabetical characters: "
            f"{x!r}"
        )
    else:
        for s in x:
            assert isinstance(s, str)
            assert _NOT_EMPTY_ALPHABETICAL_ONLY_REGEX.fullmatch(s), (
                f"Should be non-empty and contain only alphabetical "
                f"characters: {s!r} (part of {x!r})"
            )
def first_n_characters_required(x: str, n: int) -> str:
    """
    Returns a regex string that requires the first n characters, and then
    allows the rest as optional as long as they are in sequence.

    For example, ``x="abcde", n=2`` gives ``ab(?:c(?:d(?:e)?)?)?``, which
    matches ``ab``, ``abc``, ``abcd`` and ``abcde``.

    Args:
        x:
            String; must be non-empty and consist only of word characters
            (so no regex escaping is needed).
        n:
            Minimum number of characters required at the start

    Raises:
        AssertionError:
            If ``x`` is empty or contains a non-word character, or if ``n``
            is negative (checks are skipped under ``python -O``).
    """
    # fullmatch(), not match(): the validation pattern ends in "$", which also
    # matches just before a trailing newline, so match() would let "abc\\n"
    # through and the newline would end up inside the generated regex.
    assert _NOT_EMPTY_WORD_ONLY_REGEX.fullmatch(x)
    assert n >= 0
    start = x[0:n]
    rest = x[n:]
    # Build the optional tail from the inside out: the last character forms
    # the innermost optional group, and each earlier character wraps it.
    rest_regex = ""
    for c in reversed(rest):
        rest_regex = optional_noncapture_group(c + rest_regex)
    return start + rest_regex
def anchor(x: str, start: bool = True, end: bool = True) -> str:
    """
    Add a start-of-string (``^``) and/or end-of-string (``$``) anchor to a
    regex.

    Args:
        x:
            The regex to anchor.
        start:
            Prepend ``^``?
        end:
            Append ``$``?
    """
    pieces = []
    if start:
        pieces.append("^")
    pieces.append(x)
    if end:
        pieces.append("$")
    return "".join(pieces)