Coverage for nlp_manager/regex_func.py: 88%
26 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/nlp_manager/regex_func.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Functions to assist in building regular expressions.**
282019-01-01: RM notes Ragel (https://en.wikipedia.org/wiki/Ragel) for embedding
29actions within a regex parser. Not immediately applicable here, I don't think,
30but bear in mind.
32"""
34import logging
35from typing import Any, Dict, Optional, Pattern, Tuple
37import regex
39# noinspection PyProtectedMember
40from regex import _regex_core
42log = logging.getLogger(__name__)
45# =============================================================================
46# Core regex functions
47# =============================================================================
48# - All will use VERBOSE mode for legibility. (No impact on speed: compiled.)
49# - Don't forget to use raw strings for all regex definitions!
50# - Beware comments inside regexes. The comment parser isn't quite as benign
51# as you might think. Use very plain text only.
52# - (?: XXX ) makes XXX into an unnamed group.
# Standard flag set passed to regex.compile() by compile_regex().
REGEX_COMPILE_FLAGS = (
    regex.IGNORECASE
    | regex.MULTILINE
    | regex.VERBOSE
    | regex.UNICODE
)
def compile_regex(regex_str: str) -> Pattern:
    """
    Compiles a regular expression with our standard flags.

    Args:
        regex_str:
            regular expression source text

    Returns:
        the compiled pattern, as produced by ``regex.compile`` with
        ``REGEX_COMPILE_FLAGS``

    Raises:
        regex.error:
            if the string cannot be compiled; the offending regex is logged
            at CRITICAL level and the original exception is then re-raised
    """
    try:
        return regex.compile(regex_str, REGEX_COMPILE_FLAGS)
    except regex.error:
        # Catch via the public name exported by the regex package, rather
        # than reaching into its protected _regex_core module.
        log.critical(f"FAILING REGEX:\n{regex_str}")
        raise
def compile_regex_dict(
    regexstr_to_value_dict: Dict[str, Any]
) -> Dict[Pattern, Any]:
    """
    Builds a ``{compiled_regex: value}`` dictionary from a
    ``{regex_str: value}`` dictionary.

    Every key is compiled with :func:`compile_regex`; the values are carried
    over unchanged.
    """
    compiled: Dict[Pattern, Any] = {}
    for pattern_text, payload in regexstr_to_value_dict.items():
        compiled[compile_regex(pattern_text)] = payload
    return compiled
def get_regex_dict_match(
    text: Optional[str],
    regex_to_value_dict: Dict[Pattern, Any],
    default: Any = None,
) -> Tuple[bool, Any]:
    """
    Tests text against each regular expression in a dictionary, in the
    dictionary's iteration order, and reports the value stored against the
    first regex that matches.

    (Note: "match", as usual, means "match at the beginning of the string".)

    Args:
        text:
            text to test; ``None`` or an empty string is never tested and
            always counts as "no match"
        regex_to_value_dict:
            dictionary mapping ``{compiled_regex: value}``
        default:
            value to return if there is no match

    Returns:
        tuple: ``matched, associated_value_or_default``

    """
    if not text:
        # Nothing to test against.
        return False, default
    for pattern, associated_value in regex_to_value_dict.items():
        if pattern.match(text):
            # First hit wins; later patterns are not tried.
            return True, associated_value
    return False, default
def get_regex_dict_search(
    text: Optional[str],
    regex_to_value_dict: Dict[Pattern, Any],
    default: Any = None,
) -> Tuple[bool, Any]:
    """
    Like :func:`get_regex_dict_match`, except that each regex is applied as a
    search (it may be found anywhere in the string) rather than as a match
    anchored at the start.

    Args:
        text:
            text to test; ``None`` or an empty string is never tested and
            always counts as "not found"
        regex_to_value_dict:
            dictionary mapping ``{compiled_regex: value}``
        default:
            value to return if nothing is found

    Returns:
        tuple: ``found, associated_value_or_default``

    """
    if not text:
        # Nothing to search in.
        return False, default
    for pattern, associated_value in regex_to_value_dict.items():
        if pattern.search(text):
            # First hit wins; later patterns are not tried.
            return True, associated_value
    return False, default