Coverage for anonymise/anonregex.py: 24%
147 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
1"""
2crate_anon/anonymise/anonregex.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Regular expression functions for anonymisation.**
28"""
30# =============================================================================
31# Imports
32# =============================================================================
34import calendar
35import datetime
36import logging
37from typing import Iterable, List, Optional, Pattern, Union
39from cardinal_pythonlib.lists import unique_list
41# https://pypi.python.org/pypi/regex/
42# https://bitbucket.org/mrabarnett/mrab-regex
43import regex # sudo apt-get install python-regex
45# noinspection PyProtectedMember
46from regex import _regex_core
48from crate_anon.common.regex_helpers import (
49 assert_alphabetical,
50 AT_LEAST_ONE_NONWORD,
51 escape_literal_for_regex_giving_charlist,
52 escape_literal_string_for_regex,
53 first_n_characters_required,
54 named_capture_group,
55 NON_ALPHANUMERIC_SPLITTERS,
56 noncapture_group,
57 NOT_DIGIT_LOOKAHEAD,
58 NOT_DIGIT_LOOKBEHIND,
59 OPTIONAL_NON_NEWLINE_WHITESPACE,
60 optional_noncapture_group,
61 OPTIONAL_NONWORD,
62 WORD_BOUNDARY as WB,
63)
65log = logging.getLogger(__name__)
68# =============================================================================
69# Constants
70# =============================================================================
# Suffixes that turn an English cardinal number into an ordinal:
# 1st, 2nd, 3rd, 4th...
ORDINAL_SUFFIXES_ENGLISH = ("st", "nd", "rd", "th")

# Month names for months 1-12, in calendar order, as supplied by the standard
# "calendar" module (index 0 of calendar.month_name is a placeholder, hence
# the slice starting at 1).
# https://docs.python.org/3/library/calendar.html
# NOTE(review): calendar.month_name is locale-dependent, so these are only
# English under an English/C locale -- confirm that is acceptable.
MONTHS_ENGLISH = tuple(calendar.month_name[1:13])
# Flags passed to regex.compile() by get_regex_from_elements():
# - MULTILINE and UNICODE: standard line-anchor and Unicode-class behaviour;
# - VERBOSE: unescaped whitespace and "#" comments in patterns are ignored;
# - IGNORECASE: matching is case-insensitive.
REGEX_COMPILE_FLAGS = (
    regex.MULTILINE | regex.VERBOSE | regex.UNICODE | regex.IGNORECASE
)
# Regex source text for recognising e-mail addresses.
#
# - The pattern is laid out over several indented lines, so it only means what
#   it appears to mean when compiled in verbose mode (REGEX_COMPILE_FLAGS
#   includes regex.VERBOSE).
# - Only lower-case letter ranges ("a-z") are listed, so it needs
#   case-insensitive compilation to match upper-case addresses.
# - Structure: local part (dot-separated atoms, or a quoted string), then "@",
#   then either a dotted domain name or a bracketed address literal.
EMAIL_REGEX_STR = (
    # http://emailregex.com/
    # The simple Python example doesn't cope with "r&d@somewhere.nhs.uk".
    # The "full" version is:
    r"""
(?:
    [a-z0-9!#$%&'*+/=?^_`{|}~-]+
    (?:
        \.[a-z0-9!#$%&'*+/=?^_`{|}~-]+
    )*|
    "
    (?:
        [\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|
        \\
        [\x01-\x09\x0b\x0c\x0e-\x7f]
    )*
    "
)
@
(?:
    (?:
        [a-z0-9]
        (?:
            [a-z0-9-]*
            [a-z0-9]
        )?
        \.
    )+
    [a-z0-9]
    (?:
        [a-z0-9-]*
        [a-z0-9]
    )?
    |
    \[
    (?:
        (?:
            25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?
        )
        \.
    ){3}
    (?:
        25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?
        |
        [a-z0-9-]*[a-z0-9]:
        (?:
            [\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]
            |
            \\[\x01-\x09\x0b\x0c\x0e-\x7f]
        )+
    )
    \]
)

"""
)
138# =============================================================================
139# String manipulation
140# =============================================================================
def get_anon_fragments_from_string(s: str) -> List[str]:
    """
    Break a compound string (such as a name or an address, whose parts are
    separated by spaces, commas, etc.) into the fragments that should each be
    used for anonymisation.

    The string is split with ``NON_ALPHANUMERIC_SPLITTERS`` and empty pieces
    are thrown away.

    - Examples: ``"John Smith"`` gives ``["John", "Smith"]``;
      ``"John D'Souza"`` gives ``["John", "D", "Souza"]``;
      ``"42 West Street"`` gives ``["42", "West", "Street"]``.

    - Things to try:

      .. code-block:: python

        get_anon_fragments_from_string("Bob D'Souza")
        get_anon_fragments_from_string("Jemima Al-Khalaim")
        get_anon_fragments_from_string("47 Russell Square")

    - This is a LIBERAL algorithm: it is prone to anonymising too much (e.g.
      every occurrence of ``"Street"`` if that word is part of someone's
      address).
    - *Replacement uses the "word boundary" facility, which treats apostrophes
      and hyphens as word boundaries.* That is why the largest-level chunks,
      such as ``D'Souza``, are not needed in the result.
    """
    pieces = NON_ALPHANUMERIC_SPLITTERS.split(s)
    # Keep only non-empty pieces; empty strings arise e.g. from
    # leading/trailing whitespace.
    return [piece for piece in pieces if piece]
173# =============================================================================
174# Anonymisation regexes
175# =============================================================================
177# -----------------------------------------------------------------------------
178# Dates
179# -----------------------------------------------------------------------------
def _month_word_regex_fragment(month_name: str) -> str:
    """
    Build the regex fragment for a month written as a word.

    At least the first three characters of the name are required; any number
    of the remaining characters may follow. For example, ``September``
    becomes something like ``Sep(?:tember)?``, which also covers intermediate
    forms such as ``Sept``.
    """
    n_required_characters = 3
    fragment = first_n_characters_required(month_name, n_required_characters)
    return fragment
def get_date_regex_elements(
    dt: Union[datetime.datetime, datetime.date],
    at_word_boundaries_only: bool = False,
    ordinal_suffixes: Iterable[str] = ORDINAL_SUFFIXES_ENGLISH,
) -> List[str]:
    """
    Takes a datetime object and returns a list of regex strings with which
    to scrub.

    For example, a date/time of 13 Sep 2014 will produce regexes that recognize
    "13 Sep 2014", "September 13, 2014", "2014/09/13", and many more.

    Args:
        dt:
            The datetime or date or similar object (anything offering
            ``day``, ``month`` and ``year`` attributes).
        at_word_boundaries_only:
            Ensure that all regexes begin and end with a word boundary
            requirement.
        ordinal_suffixes:
            Language-specific suffixes that may be appended to numbers to make
            them ordinal. In English, "st", "nd", "rd", and "th". Must be an
            iterable of strings, not a single string. It is read exactly once,
            so one-shot iterables (e.g. generators) are acceptable.

    Returns:
        the list of regular expression strings, as above: one each for
        day-month-year, month-day-year and year-month-day order.

    Raises:
        AssertionError:
            if ``ordinal_suffixes`` is a plain string, or fails the
            alphabetical check.
    """
    # Reject a bare string first (it would otherwise be treated as an iterable
    # of single characters), then materialize the iterable once. Without this,
    # a one-shot iterable could be exhausted by the validation below and
    # contribute nothing to the regex.
    assert not isinstance(ordinal_suffixes, str)
    ordinal_suffixes = tuple(ordinal_suffixes)
    assert_alphabetical(ordinal_suffixes)

    # Day (numeric), allowing leading zeroes and e.g. "1st, 2nd"
    optional_suffixes = optional_noncapture_group("|".join(ordinal_suffixes))
    day = "0*" + str(dt.day) + optional_suffixes

    # Month
    # ... numerically, allowing leading zeroes
    month_numeric = "0*" + str(dt.month)
    # ... as a word, e.g. Feb/February
    # month_word = dt.strftime("%B")  # can't cope with years < 1900
    month_name = calendar.month_name[dt.month]  # localized
    # Allow first 3 characters, or whole month name:
    month_word = _month_word_regex_fragment(month_name)
    month = "(?:" + month_numeric + "|" + month_word + ")"

    # Year
    year = str(dt.year)
    if len(year) == 4:
        year = "(?:" + year[0:2] + ")?" + year[2:4]
        # ... converts e.g. 1986 to (?:19)?86, to match 1986 or 86

    # Separator
    sep = OPTIONAL_NONWORD

    # Regexes
    basic_regexes = [
        day + sep + month + sep + year,  # e.g. 13 Sep 2014
        month + sep + day + sep + year,  # e.g. Sep 13, 2014
        year + sep + month + sep + day,  # e.g. 2014/09/13
    ]
    if at_word_boundaries_only:
        return [WB + x + WB for x in basic_regexes]
    return basic_regexes
class DateRegexNames:
    """
    Names given to the named capture groups in date regexes.
    """

    # Individual date components. These are captured because they may need to
    # be preserved for blurring.
    NUMERIC_DAY = "numeric_day"
    NUMERIC_MONTH = "numeric_month"
    ALPHABETICAL_MONTH = "alphabetical_month"
    TWO_DIGIT_YEAR = "two_digit_year"
    FOUR_DIGIT_YEAR = "four_digit_year"

    # Whole-date groups, one per supported layout.
    DAY_MONTH_YEAR = "day_month_year"
    MONTH_DAY_YEAR = "month_day_year"
    YEAR_MONTH_DAY = "year_month_day"
    ISODATE_NO_SEP = "isodate_no_sep"
def get_generic_date_regex_elements(
    at_word_boundaries_only: bool = True,
    ordinal_suffixes: Iterable[str] = ORDINAL_SUFFIXES_ENGLISH,
    all_month_names: Iterable[str] = MONTHS_ENGLISH,
) -> List[str]:
    """
    Build the regex elements that scrub *any* date, rather than one specific
    date.

    Word boundaries are strongly preferred! Without them, some odd things get
    matched; see the associated unit tests.

    Args:
        at_word_boundaries_only:
            If true, each regex must start and end at a word boundary. If
            false, each regex must instead be flanked by a word boundary or by
            a non-digit character.
        ordinal_suffixes:
            Suffixes that may follow a numeric day (e.g. "st", "nd").
        all_month_names:
            Month names to recognise as words.

    Returns:
        Four regex strings, each wrapped in a named group (see
        :class:`DateRegexNames`): day-month-year, month-day-year,
        year-month-day, and ISO date without separators.
    """
    # https://stackoverflow.com/questions/51224/regular-expression-to-match-valid-dates  # noqa: E501
    names = DateRegexNames

    # --- Numeric components ---
    # Day, range [1, 31]:
    numeric_day = named_capture_group(
        r"0?[1-9]|[12]\d|30|31", names.NUMERIC_DAY
    )
    # Month, range [1, 12]:
    numeric_month = named_capture_group(
        r"0?[1-9]|1[0-2]", names.NUMERIC_MONTH
    )
    # Year: a 2-digit or a 4-digit number.
    two_digit_year = named_capture_group(r"\d{2}", names.TWO_DIGIT_YEAR)
    four_digit_year = named_capture_group(r"\d{4}", names.FOUR_DIGIT_YEAR)
    year = noncapture_group(two_digit_year + "|" + four_digit_year)

    # --- Separator: exactly one "active" separating character ---
    #   ^  = anything not in the set
    #   \w = word (alphanumeric and underscore)
    #   \d = digit [probably redundant given \w]
    #   \r = carriage return (code 13)
    #   \n = linefeed (code 10)
    #   :  = colon
    sep = r"[^\w\d\r\n:]"

    # --- Day, optionally ordinal ("1st", "2nd", ...) ---
    suffix_alternatives = "|".join(ordinal_suffixes)
    day = numeric_day + optional_noncapture_group(suffix_alternatives)

    # --- ISO dates with no separators, like "20010101" ---
    # Separators are normally compulsory (so that e.g. "31/12" is not read as
    # 3, 1, 12); this special case demands strictly two-digit month and day.
    two_digit_day = noncapture_group(r"0[1-9]|[12]\d|30|31")
    two_digit_month = noncapture_group(r"0[1-9]|1[0-2]")
    isodate_no_sep = four_digit_year + two_digit_month + two_digit_day

    # --- Month, either numeric or as a word ---
    month_word_alternatives = "|".join(
        [_month_word_regex_fragment(m) for m in all_month_names]
    )
    alphabetical_months = named_capture_group(
        month_word_alternatives, names.ALPHABETICAL_MONTH
    )
    month = noncapture_group(numeric_month + "|" + alphabetical_months)

    # --- Assemble the separated layouts, then add the unseparated ISO one ---
    separated_layouts = (
        ((day, month, year), names.DAY_MONTH_YEAR),  # e.g. UK
        ((month, day, year), names.MONTH_DAY_YEAR),  # e.g. USA
        ((year, month, day), names.YEAR_MONTH_DAY),  # e.g. ISO
    )
    date_patterns = [
        named_capture_group(sep.join(parts), group_name)
        for parts, group_name in separated_layouts
    ]
    date_patterns.append(
        named_capture_group(isodate_no_sep, names.ISODATE_NO_SEP)
    )

    # --- Choose what must flank each pattern ---
    if at_word_boundaries_only:
        edge = WB
    else:
        # Even without a strict word boundary, not just anything can be
        # allowed -- numbers directly before numeric dates produce garbage.
        #   \b = word boundary (change from word to non-word or the reverse)
        #   \W = nonword = everything not in \w
        #   A-Za-z_ = the part of \w that is alphabetical or underscore
        edge = noncapture_group(r"\b|[\WA-Za-z_]")
    return [edge + pattern + edge for pattern in date_patterns]
362# -----------------------------------------------------------------------------
363# Generic codes
364# -----------------------------------------------------------------------------
def get_code_regex_elements(
    s: str,
    liberal: bool = True,
    very_liberal: bool = True,
    at_word_boundaries_only: bool = True,
    at_numeric_boundaries_only: bool = True,
) -> List[str]:
    """
    Takes a **string** representation of a number or an alphanumeric code,
    which may include leading zeros (as for phone numbers), and produces a list
    of regex strings for scrubbing.

    We allow all sorts of separators. For example, 0123456789 might appear as

    .. code-block:: none

        (01234) 56789
        0123 456 789
        01234-56789
        0123.456.789

    This can also be used for postcodes, which should have whitespace
    prestripped, so e.g. PE123AB might appear as

    .. code-block:: none

        PE123AB
        PE12 3AB
        PE 12 3 AB

    Args:
        s:
            The string representation of a number or code.
        liberal:
            Boolean. Use "optional non-newline whitespace" to separate
            characters in the source. Ignored if ``very_liberal`` is set.
        very_liberal:
            Boolean. Use "optional nonword" to separate characters in the
            source. Takes precedence over ``liberal``.
        at_word_boundaries_only:
            Boolean. Ensure that the regex begins and ends with a word boundary
            requirement. So, if True, "123" will not be scrubbed from "M123".
        at_numeric_boundaries_only:
            Boolean. Only applicable if ``at_word_boundaries_only`` is
            False. Ensure that the number/code is only recognized when
            surrounded by non-numbers; that is, only at the boundaries of
            numbers (at numeric boundaries).

            - Even though we're not restricting to word boundaries, because
              (for example) we want ``123456`` to match ``M123456``, it can be
              undesirable to match numbers that are bordered only by numbers;
              that is, with this setting, ``23`` should never match ``234`` or
              ``1234`` or ``123``.

            - If set, this option ensures that the number/code is recognized
              only when it is bordered by non-numbers.

            - But if you want to anonymise "123456" out of a phone number
              written like "01223123456", you might have to turn this off...

    Returns:
        a list of regular expression strings (empty if ``s`` is empty,
        otherwise exactly one element)

    """
    if not s:
        return []

    # One regex-safe element per source character (escapes any decimal
    # points, etc.).
    chars = escape_literal_for_regex_giving_charlist(s)

    # What may appear between consecutive characters of the code:
    if very_liberal:
        separators = OPTIONAL_NONWORD
    elif liberal:
        separators = OPTIONAL_NON_NEWLINE_WHITESPACE
    else:
        separators = ""
    pattern = separators.join(chars)  # ... separators can appear anywhere

    if at_word_boundaries_only:
        return [WB + pattern + WB]
    if at_numeric_boundaries_only:
        # http://www.regular-expressions.info/lookaround.html
        # https://stackoverflow.com/questions/15099150/regex-find-one-digit-number  # noqa: E501
        return [NOT_DIGIT_LOOKBEHIND + pattern + NOT_DIGIT_LOOKAHEAD]
    return [pattern]
456# -----------------------------------------------------------------------------
457# Generic numbers
458# -----------------------------------------------------------------------------
def get_number_of_length_n_regex_elements(
    n: int,
    liberal: bool = True,
    very_liberal: bool = False,
    at_word_boundaries_only: bool = True,
) -> List[str]:
    """
    Get a list of regex strings for scrubbing n-digit numbers -- for
    example, to remove all 10-digit numbers as putative NHS numbers, or all
    11-digit numbers as putative UK phone numbers.

    Args:
        n:
            The length of the number (number of digits). If this is less than
            1, no regex is produced.
        liberal:
            Boolean. Use "optional non-newline whitespace" to separate
            the digits. Ignored if ``very_liberal`` is set.
        very_liberal:
            Boolean. Use "optional nonword" to separate the digits. Takes
            precedence over ``liberal``.
        at_word_boundaries_only:
            Boolean. If set, ensure that the regex begins and ends with a word
            boundary requirement. If not set, the regex must be surrounded by
            non-digits. (If it were surrounded by more digits, it wouldn't be
            an n-digit number!)

    Returns:
        a list of regular expression strings (empty if ``n < 1``, otherwise
        exactly one element)

    """
    if n < 1:
        # A zero-digit "number" would give an empty pattern wrapped only in
        # boundary assertions, which matches the empty string. Produce
        # nothing instead.
        return []

    # What may appear between consecutive digits:
    if very_liberal:
        separators = OPTIONAL_NONWORD
    elif liberal:
        separators = OPTIONAL_NON_NEWLINE_WHITESPACE
    else:
        separators = ""
    digits = separators.join(["[0-9]"] * n)

    if at_word_boundaries_only:
        return [WB + digits + WB]
    # ... if there was a digit before/after, it's not an n-digit number
    return [NOT_DIGIT_LOOKBEHIND + digits + NOT_DIGIT_LOOKAHEAD]
504# -----------------------------------------------------------------------------
505# UK postcodes
506# -----------------------------------------------------------------------------
def get_uk_postcode_regex_elements(
    at_word_boundaries_only: bool = True,
) -> List[str]:
    """
    Return the regex strings used to scrub UK postcodes, which follow a
    well-defined format.

    The pattern lists upper-case letters only, so it matches upper-case
    postcodes only unless it is compiled with ``re.IGNORECASE``.

    Args:
        at_word_boundaries_only:
            Boolean. If set, the regex must begin and end at a word boundary.

    Returns:
        a list of regular expression strings (currently a single element)

    See:

    - https://stackoverflow.com/questions/164979/regex-for-matching-uk-postcodes
    """  # noqa: E501
    # History: before 2020-04-28 this function returned one regex per explicit
    # layout -- "AN NAA", "ANN NAA", "AAN NAA", "AANN NAA", "ANA NAA" and
    # "AANA NAA", where A = letter [A-Z], N = digit [0-9], and the space was
    # optional whitespace. Since 2020-04-28 a single pattern is used instead,
    # which is much more efficient:
    #   one or two letters; a digit; optionally one more letter or digit;
    #   optional whitespace; a digit; two letters.
    postcode = r"[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}"
    return [(WB + postcode + WB) if at_word_boundaries_only else postcode]
def get_uk_postcode_regex_string(at_word_boundaries_only: bool = True) -> str:
    """
    Convenience wrapper giving the UK postcode regex as one string rather than
    a list. This relies on :func:`get_uk_postcode_regex_elements` returning
    exactly one element, as it has done since the changes of 2020-04-28.

    Args:
        at_word_boundaries_only:
            Passed through to :func:`get_uk_postcode_regex_elements`.
    """
    elements = get_uk_postcode_regex_elements(
        at_word_boundaries_only=at_word_boundaries_only
    )
    n_elements = len(elements)
    assert n_elements == 1  # as of 2020-04-28, this is true
    return elements[0]
572# -----------------------------------------------------------------------------
573# Generic strings and phrases
574# -----------------------------------------------------------------------------
575# Note, for strings, several typo-detecting methods:
576# http://en.wikipedia.org/wiki/Levenshtein_distance
577# http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
578# http://en.wikipedia.org/wiki/TRE_(computing)
579# https://pypi.python.org/pypi/regex
580# ... let's go with the fuzzy regex method (Python regex module).
def get_string_regex_elements(
    s: str,
    suffixes: List[str] = None,
    at_word_boundaries_only: bool = True,
    max_errors: int = 0,
) -> List[str]:
    """
    Build the regex strings with which to scrub a given literal string.

    Args:
        s:
            The string to scrub (treated literally; it is escaped here).
        suffixes:
            Suffixes that may optionally follow the string, typically
            ``["s"]``. They are escaped here too.
        at_word_boundaries_only:
            Boolean. If set, the regex must begin and end at a word boundary.
            (If false: will scrub ``ANN`` from ``bANNed``.)
        max_errors:
            The maximum number of typographical insertion/deletion/substitution
            errors to tolerate (fuzzy matching); 0 means exact matching.

    Returns:
        a list of regular expression strings (empty if ``s`` is empty,
        otherwise one element)

    """
    if not s:
        return []

    pattern = escape_literal_string_for_regex(s)

    if max_errors > 0:
        # Fuzzy matching, as provided by the "regex" module:
        #   (...)   the pattern itself
        #   {e<n}   fewer than n insertion/deletion/substitution errors
        # ... https://pypi.python.org/pypi/regex
        # ... http://www.gossamer-threads.com/lists/python/python/1002881
        # A leading (?e), or the regex.ENHANCEMATCH flag, would force a search
        # for a better match than the first. That is deliberately not used:
        # it caused a segmentation fault when applied in
        # get_regex_from_elements(), and (less consistently) when applied here.
        pattern = f"({pattern}){{e<{max_errors + 1}}}"

    suffix_pattern = ""
    if suffixes:
        escaped_suffixes = "|".join(
            [escape_literal_string_for_regex(x) for x in suffixes]
        )
        # The trailing empty alternative allows for no suffix at all.
        suffix_pattern = f"(?:{escaped_suffixes}|)"

    core = pattern + suffix_pattern
    return [WB + core + WB] if at_word_boundaries_only else [core]
def get_phrase_regex_elements(
    phrase: str,
    suffixes: List[str] = None,
    at_word_boundaries_only: bool = True,
    max_errors: int = 0,
    alternatives: List[List[str]] = None,
) -> List[str]:
    """
    Build the regular expressions that scrub a whole phrase; that is, all of
    its words appearing consecutively.

    Args:
        phrase:
            E.g. '4 Privet Drive'.
        suffixes:
            A list of suffixes to permit after the phrase (unusual).
        at_word_boundaries_only:
            Apply regex only at word boundaries?
        max_errors:
            Maximum number of typos, as defined by the regex module.
        alternatives:
            This allows words to be substituted by equivalents; such as
            ``St`` for ``Street`` or ``Rd`` for ``Road``. The parameter is a
            list of lists of equivalents; see
            :func:`crate_anon.anonymise.config.get_word_alternatives`.
            A word is looked up in upper-case form.

    Returns:
        A list of regex fragments (empty if the phrase contains no words,
        otherwise one element).
    """

    def word_pattern(word: str) -> str:
        """
        Regex for a single word of the phrase: a group of all its equivalents
        (including the word itself) if it belongs to a set of alternatives,
        or otherwise the escaped word.
        """
        if alternatives:
            upper_word = word.upper()
            # First set of equivalents containing this word, if any:
            equivalent_words = next(
                (group for group in alternatives if upper_word in group),
                None,
            )
            if equivalent_words is not None:
                escaped_equivalents = "|".join(
                    escape_literal_string_for_regex(x)
                    for x in equivalent_words
                )
                return "(?:" + escaped_equivalents + ")"
        return escape_literal_string_for_regex(word)

    # Break the phrase into its consecutive words.
    words = get_anon_fragments_from_string(phrase)
    if not words:
        return []

    # The words must follow one another, separated by at least one nonword
    # character.
    pattern = AT_LEAST_ONE_NONWORD.join([word_pattern(w) for w in words])

    if max_errors > 0:
        # Fuzzy matching: fewer than (max_errors + 1) errors allowed.
        pattern = f"({pattern}){{e<{max_errors + 1}}}"

    suffix_pattern = ""
    if suffixes:
        escaped_suffixes = "|".join(
            [escape_literal_string_for_regex(x) for x in suffixes]
        )
        # The trailing empty alternative allows for no suffix at all.
        suffix_pattern = f"(?:{escaped_suffixes}|)"

    core = pattern + suffix_pattern
    return [WB + core + WB] if at_word_boundaries_only else [core]
716# =============================================================================
717# Combining regex elements into a giant regex
718# =============================================================================
def get_regex_string_from_elements(elementlist: List[str]) -> str:
    """
    Combine a list of regex elements into one regex string, by joining the
    distinct elements with the "or" operator, ``|``. An empty list gives an
    empty string.

    The or operator | has the lowest precedence
    (http://www.regular-expressions.info/alternation.html), and we also want
    to minimize the number of brackets. THEREFORE, ANYTHING CONTRIBUTING
    FRAGMENTS HERE SHOULD NOT HAVE | OPERATORS AT ITS TOP LEVEL. If it does,
    it should encapsulate them in a non-capturing group, ``(?:...)``.
    """
    return "|".join(unique_list(elementlist)) if elementlist else ""
def get_regex_from_elements(elementlist: List[str]) -> Optional[Pattern]:
    """
    Compile a list of regex elements into a single regex, using
    ``REGEX_COMPILE_FLAGS`` (so that, among other things, it operates in
    case-insensitive fashion on Unicode strings).

    Returns:
        the compiled regex, or ``None`` if the list is empty

    Raises:
        the regex module's error, after logging the offending element list,
        if compilation fails
    """
    if elementlist:
        try:
            combined = get_regex_string_from_elements(elementlist)
            return regex.compile(combined, REGEX_COMPILE_FLAGS)
        except _regex_core.error:
            # Record which elements were at fault, then let the error
            # propagate to the caller.
            log.exception(f"Failed regex: elementlist={elementlist}")
            raise
    return None