Coverage for common/stringfunc.py: 31%

67 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-02-05 06:46 -0600

1""" 

2crate_anon/common/stringfunc.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Simple string functions.** 

27 

28""" 

29 

30import fnmatch 

31from functools import lru_cache 

32import sys 

33from typing import Any, List, Optional, Pattern, TextIO, Type 

34 

35from cardinal_pythonlib.extract_text import wordwrap 

36import prettytable 

37import regex 

38 

39 

40# ============================================================================= 

41# Simple string manipulation 

42# ============================================================================= 

43 

44 

def get_digit_string_from_vaguely_numeric_string(s: str) -> str:
    """
    Returns only the digit characters of a string, in their original order.

    Every character for which ``str.isdigit()`` is false (spaces, brackets,
    letters, punctuation, ...) is discarded.

    For example, converts ``"(01223) 123456"`` to ``"01223123456"``.
    """
    return "".join(filter(str.isdigit, s))

52 

53 

def reduce_to_alphanumeric(s: str) -> str:
    """
    Strips non-alphanumeric characters from a string.

    A character is kept only if ``str.isalnum()`` is true for it, so
    whitespace, punctuation and underscores are all removed.

    For example, converts ``"PE12 3AB"`` to ``"PE123AB"``.
    """
    return "".join(c for c in s if c.isalnum())

61 

62 

def remove_whitespace(s: str) -> str:
    """
    Returns the string with every run of whitespace deleted.

    The string is split on whitespace and the remaining pieces are glued
    back together with nothing between them.
    """
    non_blank_pieces = s.split()
    return "".join(non_blank_pieces)

68 

69 

70# ============================================================================= 

71# Specification matching 

72# ============================================================================= 

73 

74 

@lru_cache(maxsize=None)
def get_spec_match_regex(spec: str) -> Pattern:
    """
    Converts a shell-style pattern (using ``*``, ``?`` and similar wildcards;
    see https://docs.python.org/3.5/library/fnmatch.html) into a compiled,
    case-insensitive regular expression.

    Results are memoized per ``spec`` in an unbounded cache, so each distinct
    pattern is translated and compiled only once.

    Args:
        spec: the pattern to pass to ``fnmatch``, e.g. ``"patient_addr*"``.

    Returns:
        the compiled regular expression
    """
    regex_source = fnmatch.translate(spec)
    return regex.compile(regex_source, regex.IGNORECASE)

89 

90 

91# ============================================================================= 

92# Printing/encoding 

93# ============================================================================= 

94 

95 

def uprint(
    *objects: Any, sep: str = " ", end: str = "\n", file: TextIO = sys.stdout
) -> None:
    """
    Prints strings to outputs that support UTF-8 encoding, but also to those
    that do not (e.g. Windows stdout, sometimes).

    For a stream whose encoding is not UTF-8, each object is converted with
    ``str()`` and any character the encoding cannot represent is replaced by
    a backslash escape, so that printing does not raise
    ``UnicodeEncodeError``. Streams that report no encoding at all (such as
    ``io.StringIO``, whose ``encoding`` is ``None``) are printed to directly.

    Args:
        *objects: things to print
        sep: separator between those objects
        end: print this at the end
        file: file-like object to print to (the default is whatever
            ``sys.stdout`` was when this function was defined)

    See
    https://stackoverflow.com/questions/14630288/unicodeencodeerror-charmap-codec-cant-encode-character-maps-to-undefined

    Examples:

    - Linux, Python 3.6.8 console: ``sys.stdout.encoding == "UTF-8"``
    - Windows, Python 3.7.4 console: ``sys.stdout.encoding == "utf-8"``
    - Windows, Python 3.7.4, from script: ``sys.stdout.encoding == "cp1252"``
    """
    # A missing or None encoding means there is no byte encoding to respect,
    # so there is nothing to escape (and nothing to call .lower() on).
    enc = (getattr(file, "encoding", None) or "").lower()
    if not enc or enc == "utf-8":
        print(*objects, sep=sep, end=end, file=file)
        return

    def f(obj: Any) -> str:
        # Round-trip through the target encoding so that unencodable
        # characters become backslash escapes.
        # https://docs.python.org/3.5/library/codecs.html#codec-base-classes
        return str(obj).encode(enc, errors="backslashreplace").decode(enc)

    print(*map(f, objects), sep=sep, end=end, file=file)

128 

129 

130# ============================================================================= 

131# String tests 

132# ============================================================================= 

133 

134 

def does_text_contain_word_chars(text: str) -> bool:
    """
    Is a string worth treating as interesting text -- does it contain "word"
    characters?

    Here that means at least one character whose code point lies between 33
    and 126 inclusive. Empty (or otherwise falsy) input gives ``False``.
    """
    if not text:
        return False
    # A plain scan over code points; a regex-based check was measured as
    # slower (as per FS's tests).
    for char in text:
        if 33 <= ord(char) <= 126:
            return True
    return False

145 

146 

147# ============================================================================= 

148# Docstring manipulation 

149# ============================================================================= 

150 

151 

def get_docstring(cls: Type) -> str:
    """
    Returns the docstring of a class, or an empty string if it has none.
    """
    # Plain attribute access is used rather than getattr() with a default:
    # that is likely unnecessary, since even integer variables have the
    # __doc__ attribute.
    doc = cls.__doc__
    return doc if doc else ""

162 

163 

def compress_docstring(docstring: str) -> str:
    """
    Flattens a docstring onto a single line: newlines become spaces, every
    run of whitespace collapses to one space, and leading/trailing whitespace
    is dropped.
    """
    # https://stackoverflow.com/questions/2077897/substitute-multiple-whitespace-with-single-whitespace-in-python
    words = docstring.replace("\n", " ").split()
    return " ".join(words)

171 

172 

def trim_docstring(docstring: str) -> str:
    """
    Removes initial/terminal blank lines and leading whitespace from
    docstrings.

    This follows the PEP257 algorithm (https://peps.python.org/pep-0257/):
    tabs are expanded, the smallest indentation among the non-blank lines
    after the first is removed from every line after the first, the first
    line is stripped on both sides, and blank lines at either end are dropped.

    Demonstration:

    .. code-block:: python

        from crate_anon.common.stringfunc import trim_docstring
        print(trim_docstring.__doc__)
        print(trim_docstring(trim_docstring.__doc__))
    """
    if not docstring:
        return ""
    # Convert tabs to spaces (following the normal Python rules) and separate
    # the first line, which is treated specially, from the rest.
    first_line, *later_lines = docstring.expandtabs().splitlines()
    # Indentation of every non-blank later line.
    margins = [
        len(text) - len(text.lstrip()) for text in later_lines if text.lstrip()
    ]
    cleaned = [first_line.strip()]
    if margins:
        # Only keep later lines if at least one of them is non-blank.
        margin = min(margins)
        cleaned.extend(text[margin:].rstrip() for text in later_lines)
    # Find the span that excludes trailing, then leading, blank lines.
    start, stop = 0, len(cleaned)
    while stop > start and not cleaned[stop - 1]:
        stop -= 1
    while start < stop and not cleaned[start]:
        start += 1
    return "\n".join(cleaned[start:stop])

213 

214 

215# ============================================================================= 

216# Tabular 

217# ============================================================================= 

218 

219 

def make_twocol_table(
    colnames: List[str],
    rows: List[List[str]],
    max_table_width: int = 79,
    padding_width: int = 1,
    vertical_lines: bool = True,
    rewrap_right_col: bool = True,
) -> str:
    """
    Formats a two-column table. Tries not to split/wrap the left-hand column,
    but resizes the right-hand column.

    Args:
        colnames: the two column headings
        rows: the table rows; element 0 of each row goes in the left-hand
            column and element 1 in the right-hand column
        max_table_width: overall width budget for the table
        padding_width: padding passed to the table, also used when working
            out the right-hand column width
        vertical_lines: draw vertical rules?
        rewrap_right_col: word-wrap the right-hand text to the computed
            right-hand column width before adding it?

    Returns:
        the table rendered as a string
    """
    vrule_style = prettytable.ALL if vertical_lines else prettytable.NONE
    # The left column is as wide as its longest entry (heading included).
    left_width = max(len(r[0]) for r in [colnames] + rows)
    # The right column gets what remains:
    # 3 vertical lines (even if invisible); 4 paddings (2 per column).
    right_width = max_table_width - left_width - (4 * padding_width) - 3
    table = prettytable.PrettyTable(
        colnames,
        header=True,
        border=True,
        hrules=prettytable.ALL,
        vrules=vrule_style,
        align="l",  # default alignment for all columns (left)
        valign="t",  # default alignment for all rows (top)
        max_table_width=max_table_width,
        padding_width=padding_width,
    )
    table.max_width[colnames[0]] = left_width
    table.max_width[colnames[1]] = right_width
    for row in rows:
        right_text = (
            wordwrap(row[1], width=right_width) if rewrap_right_col else row[1]
        )
        table.add_row([row[0], right_text])
    return table.get_string()

255 

256 

257# ============================================================================= 

258# Checking strings for NLP 

259# ============================================================================= 

260 

# Pattern source: a single "word" character, anywhere in the string.
_RELEVANT_FOR_NLP_REGEX_STR = r"\w"
# Compiled once at import time and shared by all callers.
# regex deals with Unicode automatically, as verified in stringfunc_tests.py
RELEVANT_FOR_NLP_REGEX = regex.compile(
    _RELEVANT_FOR_NLP_REGEX_STR,
    flags=regex.IGNORECASE,
)

266 

267 

def relevant_for_nlp(x: Optional[str]) -> bool:
    """
    Does this string contain content that's relevant for NLP?

    ``None`` and the empty string are rejected outright. Anything else is
    relevant only if ``RELEVANT_FOR_NLP_REGEX`` finds a match somewhere in it,
    so a string containing only whitespace is not relevant.
    """
    return bool(x) and RELEVANT_FOR_NLP_REGEX.search(x) is not None