Coverage for nlp_manager/regex_func.py: 88%

26 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/nlp_manager/regex_func.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Functions to assist in building regular expressions.** 

27 

2019-01-01: RM notes Ragel (https://en.wikipedia.org/wiki/Ragel) for embedding
actions within a regex parser. Not immediately applicable here, I don't think,
but bear in mind.

31 

32""" 

33 

34import logging 

35from typing import Any, Dict, Optional, Pattern, Tuple 

36 

37import regex 

38 

39# noinspection PyProtectedMember 

40from regex import _regex_core 

41 

42log = logging.getLogger(__name__) 

43 

44 

# =============================================================================
# Core regex functions
# =============================================================================
# - All will use VERBOSE mode for legibility. (No impact on speed: compiled.)
# - Don't forget to use raw strings for all regex definitions!
# - Beware comments inside regexes. The comment parser isn't quite as benign
#   as you might think. Use very plain text only.
# - (?: XXX ) makes XXX into an unnamed (non-capturing) group.

53 

54 

# Flags passed to regex.compile() by compile_regex(): case-insensitive,
# multiline, verbose (whitespace and comments allowed in patterns), Unicode.
REGEX_COMPILE_FLAGS = (
    regex.IGNORECASE | regex.MULTILINE | regex.VERBOSE | regex.UNICODE
)

58 

59 

def compile_regex(regex_str: str) -> Pattern:
    """
    Compiles a regular expression with our standard flags.

    Args:
        regex_str:
            regular expression source text

    Returns:
        the pattern compiled with ``REGEX_COMPILE_FLAGS``

    Raises:
        the ``regex`` module's compilation error, re-raised unchanged after
        the offending text has been logged at CRITICAL level
    """
    try:
        return regex.compile(regex_str, REGEX_COMPILE_FLAGS)
    except _regex_core.error:
        # Log the source text before propagating, so the failing pattern can
        # be identified from the log output.
        log.critical(f"FAILING REGEX:\n{regex_str}")
        raise

69 

70 

def compile_regex_dict(
    regexstr_to_value_dict: Dict[str, Any]
) -> Dict[Pattern, Any]:
    """
    Converts a dictionary ``{regex_str: value}`` to a dictionary
    ``{compiled_regex: value}``.

    Args:
        regexstr_to_value_dict:
            dictionary mapping ``{regex_str: value}``

    Returns:
        a new dictionary in which each key has been passed through
        :func:`compile_regex` and each value is carried over unchanged
    """
    return {compile_regex(k): v for k, v in regexstr_to_value_dict.items()}

79 

80 

def get_regex_dict_match(
    text: Optional[str],
    regex_to_value_dict: Dict[Pattern, Any],
    default: Any = None,
) -> Tuple[bool, Any]:
    """
    Checks text against a set of regular expressions. Returns whether there is
    a match, and if there was a match, the value that was associated (in the
    dictionary) with the matching regex.

    (Note: "match", as usual, means "match at the beginning of the string".)

    The regexes are tried in dictionary iteration order and the first one
    that matches wins. If ``text`` is ``None`` or empty, no regex is tried
    and the result is "no match".

    Args:
        text:
            text to test
        regex_to_value_dict:
            dictionary mapping ``{compiled_regex: value}``
        default:
            value to return if there is no match

    Returns:
        tuple: ``matched, associated_value_or_default``

    """
    if text:
        for r, value in regex_to_value_dict.items():
            if r.match(text):
                return True, value
    return False, default

110 

111 

def get_regex_dict_search(
    text: Optional[str],
    regex_to_value_dict: Dict[Pattern, Any],
    default: Any = None,
) -> Tuple[bool, Any]:
    """
    As for :func:`get_regex_dict_match`, but performs a search (find anywhere
    in the string) rather than a match.

    The regexes are tried in dictionary iteration order and the first one
    found anywhere in the text wins. If ``text`` is ``None`` or empty, no
    regex is tried and the result is "no match".

    Args:
        text:
            text to test
        regex_to_value_dict:
            dictionary mapping ``{compiled_regex: value}``
        default:
            value to return if nothing is found

    Returns:
        tuple: ``found, associated_value_or_default``
    """
    if text:
        for r, value in regex_to_value_dict.items():
            if r.search(text):
                return True, value
    return False, default