Coverage for anonymise/anonregex.py: 24%

147 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-02-05 06:46 -0600

1""" 

2crate_anon/anonymise/anonregex.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Regular expression functions for anonymisation.** 

27 

28""" 

29 

30# ============================================================================= 

31# Imports 

32# ============================================================================= 

33 

34import calendar 

35import datetime 

36import logging 

37from typing import Iterable, List, Optional, Pattern, Union 

38 

39from cardinal_pythonlib.lists import unique_list 

40 

41# https://pypi.python.org/pypi/regex/ 

42# https://bitbucket.org/mrabarnett/mrab-regex 

43import regex # sudo apt-get install python-regex 

44 

45# noinspection PyProtectedMember 

46from regex import _regex_core 

47 

48from crate_anon.common.regex_helpers import ( 

49 assert_alphabetical, 

50 AT_LEAST_ONE_NONWORD, 

51 escape_literal_for_regex_giving_charlist, 

52 escape_literal_string_for_regex, 

53 first_n_characters_required, 

54 named_capture_group, 

55 NON_ALPHANUMERIC_SPLITTERS, 

56 noncapture_group, 

57 NOT_DIGIT_LOOKAHEAD, 

58 NOT_DIGIT_LOOKBEHIND, 

59 OPTIONAL_NON_NEWLINE_WHITESPACE, 

60 optional_noncapture_group, 

61 OPTIONAL_NONWORD, 

62 WORD_BOUNDARY as WB, 

63) 

64 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

66 

67 

68# ============================================================================= 

69# Constants 

70# ============================================================================= 

71 

# Suffixes that turn a day number into an ordinal: 1st, 2nd, 3rd, 4th...
ORDINAL_SUFFIXES_ENGLISH = ("st", "nd", "rd", "th")

# Month names for months 1-12. Index 0 of calendar.month_name is an empty
# placeholder string, so it is skipped.
# https://docs.python.org/3/library/calendar.html
MONTHS_ENGLISH = tuple(calendar.month_name[1:13])

75 

# Flags passed to regex.compile() by get_regex_from_elements():
# - IGNORECASE: case-insensitive matching;
# - UNICODE: Unicode-aware character classes;
# - VERBOSE: whitespace and "#" comments within patterns are ignored;
# - MULTILINE: "^" and "$" also match at line boundaries.
REGEX_COMPILE_FLAGS = (
    regex.IGNORECASE | regex.UNICODE | regex.VERBOSE | regex.MULTILINE
)

79 

# Regex string for e-mail addresses: a local part (dot-separated atoms, or a
# double-quoted string), then "@", then either a dotted domain name or a
# square-bracketed address literal.
# NOTE(review): the pattern is laid out over several lines and its character
# classes are lower-case only, so it presumably has to be compiled with
# VERBOSE and IGNORECASE (both are in REGEX_COMPILE_FLAGS) -- confirm at the
# call sites.
EMAIL_REGEX_STR = (
    # http://emailregex.com/
    # The simple Python example doesn't cope with "r&d@somewhere.nhs.uk".
    # The "full" version is:
    r"""
(?:
    [a-z0-9!#$%&'*+/=?^_`{|}~-]+
    (?:
        \.[a-z0-9!#$%&'*+/=?^_`{|}~-]+
    )*|
    "
    (?:
        [\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|
        \\
        [\x01-\x09\x0b\x0c\x0e-\x7f]
    )*
    "
)
@
(?:
    (?:
        [a-z0-9]
        (?:
            [a-z0-9-]*
            [a-z0-9]
        )?
        \.
    )+
    [a-z0-9]
    (?:
        [a-z0-9-]*
        [a-z0-9]
    )?
    |
    \[
    (?:
        (?:
            25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?
        )
        \.
    ){3}
    (?:
        25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?
        |
        [a-z0-9-]*[a-z0-9]:
        (?:
            [\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]
            |
            \\[\x01-\x09\x0b\x0c\x0e-\x7f]
        )+
    )
    \]
)

"""
)

136 

137 

138# ============================================================================= 

139# String manipulation 

140# ============================================================================= 

141 

142 

def get_anon_fragments_from_string(s: str) -> List[str]:
    """
    Split a compound string (e.g. a name or an address, whose parts are
    separated by spaces, commas, and so on) into the fragments that should
    each be scrubbed.

    - Examples: ``"John Smith"`` gives ``["John", "Smith"]``;
      ``"John D'Souza"`` gives ``["John", "D", "Souza"]``;
      ``"42 West Street"`` gives ``["42", "West", "Street"]``.

    - Things to try:

      .. code-block:: python

        get_anon_fragments_from_string("Bob D'Souza")
        get_anon_fragments_from_string("Jemima Al-Khalaim")
        get_anon_fragments_from_string("47 Russell Square")

    - This is a LIBERAL algorithm: it errs towards scrubbing too much (for
      example, every ``"Street"`` if that word is part of someone's address).
    - *Replacement uses the "word boundary" facility, which treats
      apostrophes and hyphens as word boundaries.* That is why larger chunks
      such as ``D'Souza`` need not be returned as well.

    Args:
        s: the string to split

    Returns:
        the non-empty fragments, in their original order
    """
    # Splitting can yield empty strings (e.g. from leading/trailing
    # whitespace); those are discarded.
    return [
        fragment for fragment in NON_ALPHANUMERIC_SPLITTERS.split(s) if fragment
    ]

171 

172 

173# ============================================================================= 

174# Anonymisation regexes 

175# ============================================================================= 

176 

177# ----------------------------------------------------------------------------- 

178# Dates 

179# ----------------------------------------------------------------------------- 

180 

181 

def _month_word_regex_fragment(month_name: str) -> str:
    """
    Build a regex fragment for a month written as a word, accepting anything
    from its first 3 characters up to the full name. For example,
    ``September`` is matched as ``Sep``, ``Sept``, ... or ``September``.

    Args:
        month_name: the full month name

    Returns:
        a regex fragment string
    """
    minimum_characters = 3
    return first_n_characters_required(month_name, minimum_characters)

190 

191 

def get_date_regex_elements(
    dt: Union[datetime.datetime, datetime.date],
    at_word_boundaries_only: bool = False,
    ordinal_suffixes: Iterable[str] = ORDINAL_SUFFIXES_ENGLISH,
) -> List[str]:
    """
    Takes a datetime object and returns a list of regex strings with which
    to scrub.

    For example, a date/time of 13 Sep 2014 will produce regexes that recognize
    "13 Sep 2014", "September 13, 2014", "2014/09/13", and many more.

    Args:
        dt:
            The datetime or date or similar object; its ``day``, ``month``
            and ``year`` attributes are used.
        at_word_boundaries_only:
            Ensure that all regexes begin and end with a word boundary
            requirement.
        ordinal_suffixes:
            Language-specific suffixes that may be appended to numbers to make
            them ordinal. In English, "st", "nd", "rd", and "th". Any iterable
            of strings is accepted (including a one-shot iterator), but not a
            bare string.

    Returns:
        the list of regular expression strings, as above: day-month-year,
        month-day-year, and year-month-day forms

    Raises:
        AssertionError: if ``ordinal_suffixes`` is a single string (or fails
            the alphabetical check)
    """
    # Reject a bare string up front: iterating it would silently treat each
    # character as a separate suffix.
    assert not isinstance(ordinal_suffixes, str)
    # The suffixes are needed twice (validation, then joining), so materialise
    # them once; a generator would otherwise be exhausted by the first use.
    ordinal_suffixes = tuple(ordinal_suffixes)
    assert_alphabetical(ordinal_suffixes)

    # Day (numeric), allowing leading zeroes and e.g. "1st, 2nd"
    optional_suffixes = optional_noncapture_group("|".join(ordinal_suffixes))
    day = "0*" + str(dt.day) + optional_suffixes

    # Month
    # ... numerically, allowing leading zeroes
    month_numeric = "0*" + str(dt.month)
    # ... as a word. (We avoid dt.strftime("%B"), which can't cope with
    # years < 1900.)
    month_name = calendar.month_name[dt.month]  # localized
    # Allow first 3 characters, or whole month name (e.g. Feb/February):
    month_word = _month_word_regex_fragment(month_name)
    month = "(?:" + month_numeric + "|" + month_word + ")"

    # Year
    year = str(dt.year)
    if len(year) == 4:
        # Make the century optional: converts e.g. 1986 to (?:19)?86, to
        # match 1986 or 86.
        year = "(?:" + year[0:2] + ")?" + year[2:4]

    # Separator
    sep = OPTIONAL_NONWORD

    # Regexes
    basic_regexes = [
        day + sep + month + sep + year,  # e.g. 13 Sep 2014
        month + sep + day + sep + year,  # e.g. Sep 13, 2014
        year + sep + month + sep + day,  # e.g. 2014/09/13
    ]
    if at_word_boundaries_only:
        return [WB + x + WB for x in basic_regexes]
    return basic_regexes

253 

254 

class DateRegexNames:
    """
    Names of the named groups used in date regexes.
    """

    # Individual date components. These are captured because they may need
    # to be preserved for blurring.
    ALPHABETICAL_MONTH = "alphabetical_month"
    FOUR_DIGIT_YEAR = "four_digit_year"
    NUMERIC_DAY = "numeric_day"
    NUMERIC_MONTH = "numeric_month"
    TWO_DIGIT_YEAR = "two_digit_year"

    # Whole-date groups, one per date layout.
    DAY_MONTH_YEAR = "day_month_year"
    MONTH_DAY_YEAR = "month_day_year"
    YEAR_MONTH_DAY = "year_month_day"
    ISODATE_NO_SEP = "isodate_no_sep"

272 

273 

def get_generic_date_regex_elements(
    at_word_boundaries_only: bool = True,
    ordinal_suffixes: Iterable[str] = ORDINAL_SUFFIXES_ENGLISH,
    all_month_names: Iterable[str] = MONTHS_ENGLISH,
) -> List[str]:
    """
    Returns a list of regex elements to scrub *any* date.

    Word boundaries are strongly preferred! This will match some odd things
    otherwise; see the associated unit tests.

    Args:
        at_word_boundaries_only:
            If set, every regex begins and ends with a word boundary
            requirement. If not, a looser "non-numeric boundary" is used
            instead.
        ordinal_suffixes:
            Suffixes that may follow a day number to make it ordinal.
        all_month_names:
            Full month names; each may be abbreviated as per
            :func:`_month_word_regex_fragment`.

    Returns:
        a list of four regex strings, in this order: day-month-year,
        month-day-year, year-month-day, and ISO date without separators.
        Groups are named using :class:`DateRegexNames`.
    """
    # https://stackoverflow.com/questions/51224/regular-expression-to-match-valid-dates  # noqa: E501
    names = DateRegexNames

    # --- Numeric components -------------------------------------------------
    # Day: range [1, 31], leading zero optional
    numeric_day = named_capture_group(
        r"0?[1-9]|[12]\d|30|31", names.NUMERIC_DAY
    )
    # Month: range [1, 12], leading zero optional
    numeric_month = named_capture_group(
        r"0?[1-9]|1[0-2]", names.NUMERIC_MONTH
    )
    # Year: a 2-digit or a 4-digit number
    two_digit_year = named_capture_group(r"\d{2}", names.TWO_DIGIT_YEAR)
    four_digit_year = named_capture_group(r"\d{4}", names.FOUR_DIGIT_YEAR)
    year = noncapture_group(f"{two_digit_year}|{four_digit_year}")

    # --- Separator ------------------------------------------------------------
    # An active (mandatory) separator: exactly one character that is none of
    #   \w = word (alphanumeric and underscore)
    #   \d = digit [probably redundant given \w]
    #   \r = carriage return (code 13)
    #   \n = linefeed (code 10)
    #   :  = colon
    sep = r"[^\w\d\r\n:]"

    # --- Day, possibly ordinal --------------------------------------------------
    day = numeric_day + optional_noncapture_group("|".join(ordinal_suffixes))

    # --- ISO date with no separators ------------------------------------------
    # Separators are normally required, so that e.g. "31/12" isn't captured
    # as 3, 1, 12. Dates like "20010101" get a special form instead, with
    # strictly two-digit month and day.
    two_digit_day = noncapture_group(r"0[1-9]|[12]\d|30|31")
    two_digit_month = noncapture_group(r"0[1-9]|1[0-2]")
    isodate_no_sep = four_digit_year + two_digit_month + two_digit_day

    # --- Month, numeric or as a word --------------------------------------------
    alphabetical_months = named_capture_group(
        "|".join(_month_word_regex_fragment(m) for m in all_month_names),
        names.ALPHABETICAL_MONTH,
    )
    month = noncapture_group(f"{numeric_month}|{alphabetical_months}")

    # --- Whole-date layouts -----------------------------------------------------
    layouts = (
        (day + sep + month + sep + year, names.DAY_MONTH_YEAR),  # e.g. UK
        (month + sep + day + sep + year, names.MONTH_DAY_YEAR),  # e.g. USA
        (year + sep + month + sep + day, names.YEAR_MONTH_DAY),  # e.g. ISO
        (isodate_no_sep, names.ISODATE_NO_SEP),  # ISO with no separators
    )
    basic_regexes = [
        named_capture_group(pattern, group_name)
        for pattern, group_name in layouts
    ]

    # --- Boundaries ---------------------------------------------------------------
    if at_word_boundaries_only:
        edge = WB
    else:
        # Even without a strict word boundary, we can't allow just anything:
        # you get garbage if numbers precede numeric dates. So accept either
        #   \b = word boundary (change from word to non-word, or the reverse)
        # or one character that is
        #   \W = nonword (everything not in \w), or
        #   a letter/underscore (the non-digit subset of \w).
        edge = noncapture_group(r"\b|[\WA-Za-z_]")
    return [edge + x + edge for x in basic_regexes]

360 

361 

362# ----------------------------------------------------------------------------- 

363# Generic codes 

364# ----------------------------------------------------------------------------- 

365 

366 

def get_code_regex_elements(
    s: str,
    liberal: bool = True,
    very_liberal: bool = True,
    at_word_boundaries_only: bool = True,
    at_numeric_boundaries_only: bool = True,
) -> List[str]:
    """
    Produces a list of regex strings for scrubbing a number or alphanumeric
    code, supplied as a **string** (so that leading zeros, as in phone
    numbers, are kept).

    All sorts of separators are allowed between the characters. For example,
    0123456789 might appear as

    .. code-block:: none

        (01234) 56789
        0123 456 789
        01234-56789
        0123.456.789

    This also works for postcodes, which should have whitespace prestripped,
    so e.g. PE123AB might appear as

    .. code-block:: none

        PE123AB
        PE12 3AB
        PE 12 3 AB

    Args:
        s:
            The string representation of a number or code.
        liberal:
            Boolean. Use "optional non-newline whitespace" to separate
            characters in the source. Ignored if ``very_liberal`` is set.
        very_liberal:
            Boolean. Use "optional nonword" to separate characters in the
            source. Takes precedence over ``liberal``.
        at_word_boundaries_only:
            Boolean. Ensure that the regex begins and ends with a word boundary
            requirement. So, if True, "123" will not be scrubbed from "M123".
        at_numeric_boundaries_only:
            Boolean. Only applicable if ``at_word_boundaries_only`` is
            False. Ensure that the number/code is only recognized when
            surrounded by non-numbers; that is, only at the boundaries of
            numbers (at numeric boundaries).

            - Even though we're not restricting to word boundaries, because
              (for example) we want ``123456`` to match ``M123456``, it can be
              undesirable to match numbers that are bordered only by numbers;
              that is, with this setting, ``23`` should never match ``234`` or
              ``1234`` or ``123``.

            - If set, this option ensures that the number/code is recognized
              only when it is bordered by non-numbers.

            - But if you want to anonymise "123456" out of a phone number
              written like "01223123456", you might have to turn this off...

    Returns:
        a list of regular expression strings (empty if ``s`` is empty)

    """
    if not s:
        return []

    # Escape any decimal points, etc., character by character.
    escaped_chars = escape_literal_for_regex_giving_charlist(s)

    # Choose what may appear between consecutive characters of the code.
    if very_liberal:
        between = OPTIONAL_NONWORD
    elif liberal:
        between = OPTIONAL_NON_NEWLINE_WHITESPACE
    else:
        between = ""
    pattern = between.join(escaped_chars)  # ... can appear anywhere

    if at_word_boundaries_only:
        return [WB + pattern + WB]
    if at_numeric_boundaries_only:
        # Lookbehind/lookahead: no digit immediately before or after.
        # http://www.regular-expressions.info/lookaround.html
        # https://stackoverflow.com/questions/15099150/regex-find-one-digit-number  # noqa: E501
        return [NOT_DIGIT_LOOKBEHIND + pattern + NOT_DIGIT_LOOKAHEAD]
    return [pattern]

454 

455 

456# ----------------------------------------------------------------------------- 

457# Generic numbers 

458# ----------------------------------------------------------------------------- 

459 

460 

def get_number_of_length_n_regex_elements(
    n: int,
    liberal: bool = True,
    very_liberal: bool = False,
    at_word_boundaries_only: bool = True,
) -> List[str]:
    """
    Get a list of regex strings for scrubbing n-digit numbers -- for
    example, to remove all 10-digit numbers as putative NHS numbers, or all
    11-digit numbers as putative UK phone numbers.

    Args:
        n: the length of the number (number of digits)
        liberal:
            Boolean. Use "optional non-newline whitespace" to separate
            the digits. Ignored if ``very_liberal`` is set.
        very_liberal:
            Boolean. Use "optional nonword" to separate the digits. Takes
            precedence over ``liberal``.
        at_word_boundaries_only:
            Boolean. If set, ensure that the regex begins and ends with a word
            boundary requirement. If not set, the regex must be surrounded by
            non-digits. (If it were surrounded by more digits, it wouldn't be
            an n-digit number!)

    Returns:
        a list of regular expression strings; empty if ``n`` is less than 1

    """
    if n < 1:
        # A "number" with no digits would give a pattern made only of
        # boundary assertions, which matches the empty string. Return nothing
        # to scrub, as the other element builders do for empty input.
        return []
    if very_liberal:
        separators = OPTIONAL_NONWORD
    elif liberal:
        separators = OPTIONAL_NON_NEWLINE_WHITESPACE
    else:
        separators = ""
    digits = separators.join(["[0-9]"] * n)
    if at_word_boundaries_only:
        return [WB + digits + WB]
    # ... if there was a digit before/after, it's not an n-digit number
    return [NOT_DIGIT_LOOKBEHIND + digits + NOT_DIGIT_LOOKAHEAD]

502 

503 

504# ----------------------------------------------------------------------------- 

505# UK postcodes 

506# ----------------------------------------------------------------------------- 

507 

508 

def get_uk_postcode_regex_elements(
    at_word_boundaries_only: bool = True,
) -> List[str]:
    """
    Get a list of regex strings for scrubbing UK postcodes, which have a
    well-defined format.

    The pattern uses upper-case letter classes, so it matches upper-case
    postcodes only unless compiled with ``re.IGNORECASE``.

    Args:
        at_word_boundaries_only:
            Boolean. If set, ensure that the regex begins and ends with a word
            boundary requirement.

    Returns:
        a list of regular expression strings (currently exactly one)

    See:

    - https://stackoverflow.com/questions/164979/regex-for-matching-uk-postcodes
    """  # noqa: E501
    # History: the old implementation returned one regex for each of the
    # templates "AN NAA", "ANN NAA", "AAN NAA", "AANN NAA", "ANA NAA" and
    # "AANA NAA" (A = letter, N = digit, space = optional whitespace).
    # New 2020-04-28: a single pattern, much more efficient:
    #   1-2 letters, a digit, an optional letter or digit, optional
    #   whitespace, then a digit and 2 letters.
    postcode = r"[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}"
    if not at_word_boundaries_only:
        return [postcode]
    return [WB + postcode + WB]

558 

559 

def get_uk_postcode_regex_string(at_word_boundaries_only: bool = True) -> str:
    """
    Convenience wrapper returning the UK postcode regex as a single string
    (possible since the changes of 2020-04-28). See
    :func:`get_uk_postcode_regex_elements`.

    Args:
        at_word_boundaries_only:
            passed through to :func:`get_uk_postcode_regex_elements`

    Returns:
        the sole regex string from that function
    """
    elements = get_uk_postcode_regex_elements(
        at_word_boundaries_only=at_word_boundaries_only
    )
    # As of 2020-04-28, there is exactly one element.
    assert len(elements) == 1
    return elements[0]

570 

571 

572# ----------------------------------------------------------------------------- 

573# Generic strings and phrases 

574# ----------------------------------------------------------------------------- 

575# Note, for strings, several typo-detecting methods: 

576# http://en.wikipedia.org/wiki/Levenshtein_distance 

577# http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/ 

578# http://en.wikipedia.org/wiki/TRE_(computing) 

579# https://pypi.python.org/pypi/regex 

580# ... let's go with the fuzzy regex method (Python regex module). 

581 

582 

def get_string_regex_elements(
    s: str,
    suffixes: List[str] = None,
    at_word_boundaries_only: bool = True,
    max_errors: int = 0,
) -> List[str]:
    """
    Builds a list of regex strings with which to scrub a given string.

    Args:
        s:
            The starting string.
        suffixes:
            A list of suffixes to permit, typically ``["s"]``.
        at_word_boundaries_only:
            Boolean. If set, ensure that the regex begins and ends with a word
            boundary requirement.
            (If false: will scrub ``ANN`` from ``bANNed``.)
        max_errors:
            The maximum number of typographical insertion/deletion/substitution
            errors to permit.

    Returns:
        a list of regular expression strings (empty if ``s`` is empty)

    """
    if not s:
        return []

    pattern = escape_literal_string_for_regex(s)

    if max_errors > 0:
        # Fuzzy matching, as provided by the "regex" module:
        # - (...) is the pattern;
        # - the {e<N} suffix permits fewer than N (i.e. up to max_errors)
        #   insertion/deletion/substitution errors.
        #   ... https://pypi.python.org/pypi/regex
        #   ... http://www.gossamer-threads.com/lists/python/python/1002881
        # - A leading (?e), or the regex.ENHANCEMATCH flag, would force a
        #   search for a better match than the first. However, doing that in
        #   get_regex_from_elements() gave a segmentation fault, and (less
        #   consistently) so did doing it here. So skip that!
        pattern = f"({pattern}){{e<{max_errors + 1}}}"

    suffix_pattern = ""
    if suffixes:
        escaped_suffixes = "|".join(
            escape_literal_string_for_regex(x) for x in suffixes
        )
        # The trailing empty alternative allows for no suffix at all.
        suffix_pattern = f"(?:{escaped_suffixes}|)"

    combined = pattern + suffix_pattern
    return [WB + combined + WB] if at_word_boundaries_only else [combined]

635 

636 

def get_phrase_regex_elements(
    phrase: str,
    suffixes: List[str] = None,
    at_word_boundaries_only: bool = True,
    max_errors: int = 0,
    alternatives: List[List[str]] = None,
) -> List[str]:
    """
    Gets regular expressions to scrub a phrase; that is, all the words of the
    phrase, appearing consecutively.

    Args:
        phrase:
            E.g. '4 Privet Drive'.
        suffixes:
            A list of suffixes to permit (unusual).
        at_word_boundaries_only:
            Apply regex only at word boundaries?
        max_errors:
            Maximum number of typos, as defined by the regex module.
        alternatives:
            This allows words to be substituted by equivalents; such as
            ``St`` for ``Street`` or ``Rd`` for ``Road``. The parameter is a
            list of lists of equivalents; see
            :func:`crate_anon.anonymise.config.get_word_alternatives`.
            A word is looked up in upper case.

    Returns:
        A list of regex fragments (empty if the phrase yields no words).
    """
    # Break the phrase into consecutive words.
    words = get_anon_fragments_from_string(phrase)
    if not words:
        return []

    # Turn each word into a regex fragment.
    word_patterns = []
    for word in words:
        upper_word = word.upper()
        for equivalent_words in alternatives or ():
            if upper_word in equivalent_words:
                # Found it. Use a regex representing the whole set of
                # alternatives (including what we started with).
                choices = "|".join(
                    escape_literal_string_for_regex(x)
                    for x in equivalent_words
                )
                word_patterns.append("(?:" + choices + ")")
                break
        else:
            # No equivalents (or alternatives not in use): just escape the
            # word itself.
            word_patterns.append(escape_literal_string_for_regex(word))

    pattern = AT_LEAST_ONE_NONWORD.join(word_patterns)
    if max_errors > 0:
        # Fuzzy matching: permit fewer than (max_errors + 1) errors.
        pattern = f"({pattern}){{e<{max_errors + 1}}}"

    suffix_pattern = ""
    if suffixes:
        escaped_suffixes = "|".join(
            escape_literal_string_for_regex(x) for x in suffixes
        )
        # The trailing empty alternative allows for no suffix at all.
        suffix_pattern = f"(?:{escaped_suffixes}|)"

    combined = pattern + suffix_pattern
    return [WB + combined + WB] if at_word_boundaries_only else [combined]

714 

715 

716# ============================================================================= 

717# Combining regex elements into a giant regex 

718# ============================================================================= 

719 

720 

def get_regex_string_from_elements(elementlist: List[str]) -> str:
    """
    Combine a list of regex elements into one regex string, as alternatives.

    Args:
        elementlist: the regex element strings

    Returns:
        the de-duplicated elements joined with ``|``, or an empty string if
        there are no elements
    """
    # The or operator | has the lowest precedence.
    # ... http://www.regular-expressions.info/alternation.html
    # We also want to minimize the number of brackets.
    # THEREFORE, ANYTHING CONTRIBUTING FRAGMENTS HERE SHOULD NOT HAVE |
    # OPERATORS AT ITS TOP LEVEL. If it does, it should encapsulate them in a
    # non-capturing group, (?:...)
    return "|".join(unique_list(elementlist)) if elementlist else ""

734 

735 

def get_regex_from_elements(elementlist: List[str]) -> Optional[Pattern]:
    """
    Compile a list of regex elements into a single regex, using
    ``REGEX_COMPILE_FLAGS`` (so it operates in case-insensitive fashion on
    Unicode strings).

    Args:
        elementlist: the regex element strings

    Returns:
        the compiled regex, or ``None`` if there are no elements

    Raises:
        regex._regex_core.error: if compilation fails; the offending element
            list is logged first, then the error is re-raised
    """
    if elementlist:
        try:
            combined = get_regex_string_from_elements(elementlist)
            return regex.compile(combined, REGEX_COMPILE_FLAGS)
        except _regex_core.error:
            log.exception(f"Failed regex: elementlist={elementlist}")
            raise
    return None
748 raise