Coverage for common/stringfunc.py: 31%
67 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
1"""
2crate_anon/common/stringfunc.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Simple string functions.**
28"""
30import fnmatch
31from functools import lru_cache
32import sys
33from typing import Any, List, Optional, Pattern, TextIO, Type
35from cardinal_pythonlib.extract_text import wordwrap
36import prettytable
37import regex
40# =============================================================================
41# Simple string manipulation
42# =============================================================================
def get_digit_string_from_vaguely_numeric_string(s: str) -> str:
    """
    Keeps only the digit characters of a string, preserving their order.

    For example, converts ``"(01223) 123456"`` to ``"01223123456"``.

    Args:
        s: the string to filter

    Returns:
        the characters of ``s`` for which ``str.isdigit()`` is true, joined
        into a single string
    """
    return "".join(filter(str.isdigit, s))
def reduce_to_alphanumeric(s: str) -> str:
    """
    Strips non-alphanumeric characters from a string.

    For example, converts ``"PE12 3AB"`` to ``"PE123AB"``.

    Args:
        s: the string to filter

    Returns:
        the characters of ``s`` for which ``str.isalnum()`` is true, joined
        into a single string (so spaces and punctuation are removed)
    """
    return "".join(c for c in s if c.isalnum())
def remove_whitespace(s: str) -> str:
    """
    Returns the string with every run of whitespace deleted.

    Args:
        s: the string to process

    Returns:
        the non-whitespace chunks of ``s`` (as found by ``str.split()`` with
        no separator), concatenated with nothing between them
    """
    non_blank_chunks = s.split()
    return "".join(non_blank_chunks)
70# =============================================================================
71# Specification matching
72# =============================================================================
@lru_cache(maxsize=None)
def get_spec_match_regex(spec: str) -> Pattern:
    """
    Builds a compiled, case-insensitive regular expression equivalent to a
    shell-style pattern (``*``, ``?`` and similar wildcards; see
    https://docs.python.org/3/library/fnmatch.html).

    Results are memoized per ``spec`` by ``lru_cache`` with no size limit.

    Args:
        spec: the pattern to pass to ``fnmatch``, e.g. ``"patient_addr*"``.

    Returns:
        the compiled regular expression
    """
    # fnmatch converts the wildcard pattern to regex source text; the regex
    # module then compiles that text.
    translated = fnmatch.translate(spec)
    return regex.compile(translated, regex.IGNORECASE)
91# =============================================================================
92# Printing/encoding
93# =============================================================================
def uprint(
    *objects: Any,
    sep: str = " ",
    end: str = "\n",
    file: Optional[TextIO] = None,
) -> None:
    """
    Prints strings to outputs that support UTF-8 encoding, but also to those
    that do not (e.g. Windows stdout, sometimes).

    Characters that the target encoding cannot represent are written as
    backslash escapes rather than raising ``UnicodeEncodeError``.

    Args:
        *objects: things to print
        sep: separator between those objects
        end: print this at the end
        file: file-like object to print to; ``None`` (the default) means
            ``sys.stdout`` as it is at the time of the call

    See
    https://stackoverflow.com/questions/14630288/unicodeencodeerror-charmap-codec-cant-encode-character-maps-to-undefined

    Examples:

    - Linux, Python 3.6.8 console: ``sys.stdout.encoding == "UTF-8"``
    - Windows, Python 3.7.4 console: ``sys.stdout.encoding == "utf-8"``
    - Windows, Python 3.7.4, from script: ``sys.stdout.encoding == "cp1252"``
    """
    if file is None:
        # Look up sys.stdout now (as print() itself does), so that a
        # redirected or replaced sys.stdout is honoured.
        file = sys.stdout

    # In-memory text streams such as io.StringIO report encoding=None (and
    # arbitrary file-like objects may lack the attribute); they never encode
    # to bytes, so nothing needs escaping.
    enc = getattr(file, "encoding", None)
    if not enc:
        print(*objects, sep=sep, end=end, file=file)
        return

    enc = enc.lower()
    if enc == "utf-8":
        print(*objects, sep=sep, end=end, file=file)
        return

    def escape_unencodable(obj: Any) -> str:
        # Round-trip through the target encoding, replacing anything it
        # cannot represent with a backslash escape sequence.
        # https://docs.python.org/3/library/codecs.html#codec-base-classes
        return str(obj).encode(enc, errors="backslashreplace").decode(enc)

    print(*map(escape_unencodable, objects), sep=sep, end=end, file=file)
130# =============================================================================
131# String tests
132# =============================================================================
def does_text_contain_word_chars(text: str) -> bool:
    """
    Is a string worth treating as interesting text -- does it contain "word"
    characters?

    As implemented, the answer is yes when at least one character is a
    printable, non-space ASCII character (code points 33 to 126 inclusive).
    Empty (or otherwise falsy) input gives ``False``.
    """
    if not text:
        return False
    # A regex-based check was tried and found to be slower (as per FS's
    # tests), hence this plain character scan.
    # "!" is code point 33 and "~" is code point 126.
    return any("!" <= ch <= "~" for ch in text)
147# =============================================================================
148# Docstring manipulation
149# =============================================================================
def get_docstring(cls: Type) -> str:
    """
    Fetches a docstring from a class.

    Args:
        cls: the class to inspect

    Returns:
        the class's ``__doc__``, or an empty string if that is ``None`` or
        empty
    """
    # Plain attribute access is enough here; a getattr() fallback is likely
    # unnecessary, since even integer variables have a __doc__ attribute.
    doc = cls.__doc__
    return doc if doc else ""
def compress_docstring(docstring: str) -> str:
    """
    Flattens a docstring onto a single line: every run of whitespace
    (including newlines) becomes one space, and leading/trailing whitespace
    is dropped.
    """
    # str.split() with no separator already treats newlines as whitespace,
    # so splitting and re-joining does the whole job.
    # https://stackoverflow.com/questions/2077897/substitute-multiple-whitespace-with-single-whitespace-in-python
    words = docstring.split()
    return " ".join(words)
def trim_docstring(docstring: str) -> str:
    """
    Removes initial/terminal blank lines and leading whitespace from
    docstrings.

    This follows the PEP257 algorithm (https://peps.python.org/pep-0257/):
    the first line is stripped on its own, and the remaining lines lose the
    smallest indentation found among the non-blank ones.

    Demonstration:

    .. code-block:: python

        from crate_anon.common.stringfunc import trim_docstring
        print(trim_docstring.__doc__)
        print(trim_docstring(trim_docstring.__doc__))
    """
    if not docstring:
        return ""

    # Tabs become spaces (normal Python rules) before splitting into lines.
    # A non-empty string always yields at least one line.
    first, *rest = docstring.expandtabs().splitlines()

    # Indentation of every non-blank line after the first.
    margins = [len(ln) - len(ln.lstrip()) for ln in rest if ln.lstrip()]

    result = [first.strip()]
    if margins:
        margin = min(margins)
        result.extend(ln[margin:].rstrip() for ln in rest)

    # Find the slice that excludes blank lines at the end, then at the start.
    start, stop = 0, len(result)
    while stop > start and not result[stop - 1]:
        stop -= 1
    while start < stop and not result[start]:
        start += 1

    return "\n".join(result[start:stop])
215# =============================================================================
216# Tabular
217# =============================================================================
def make_twocol_table(
    colnames: List[str],
    rows: List[List[str]],
    max_table_width: int = 79,
    padding_width: int = 1,
    vertical_lines: bool = True,
    rewrap_right_col: bool = True,
) -> str:
    """
    Formats a two-column table. Tries not to split/wrap the left-hand column,
    but resizes the right-hand column.

    Args:
        colnames: the two column headings
        rows: the table rows; element 0 of each row goes in the left-hand
            column and element 1 in the right-hand column
        max_table_width: overall table width passed to ``PrettyTable``
        padding_width: padding either side of each cell's text
        vertical_lines: draw vertical rules?
        rewrap_right_col: word-wrap the right-hand text to the computed
            right-hand column width before adding it?

    Returns:
        the table rendered as a string
    """
    # The left-hand column is as wide as its longest entry, heading included.
    left_cells = [colnames[0]] + [r[0] for r in rows]
    leftcol_width = max(map(len, left_cells))

    # Fixed overhead: 3 vertical lines (even if invisible) and 4 paddings
    # (2 per column). Whatever remains goes to the right-hand column.
    overhead = (4 * padding_width) + 3
    rightcol_width = max_table_width - leftcol_width - overhead

    vrules = prettytable.ALL if vertical_lines else prettytable.NONE
    table = prettytable.PrettyTable(
        colnames,
        header=True,
        border=True,
        hrules=prettytable.ALL,
        vrules=vrules,
        align="l",  # default alignment for all columns (left)
        valign="t",  # default alignment for all rows (top)
        max_table_width=max_table_width,
        padding_width=padding_width,
    )
    table.max_width[colnames[0]] = leftcol_width
    table.max_width[colnames[1]] = rightcol_width

    for row in rows:
        right_text = row[1]
        if rewrap_right_col:
            right_text = wordwrap(right_text, width=rightcol_width)
        table.add_row([row[0], right_text])

    return table.get_string()
257# =============================================================================
258# Checking strings for NLP
259# =============================================================================
# Pattern text: matches wherever a single "word" character is present.
_RELEVANT_FOR_NLP_REGEX_STR = r"\w"

# Compiled once at import time. The regex module deals with Unicode
# automatically, as verified in stringfunc_tests.py.
RELEVANT_FOR_NLP_REGEX = regex.compile(
    _RELEVANT_FOR_NLP_REGEX_STR,
    flags=regex.IGNORECASE,
)
def relevant_for_nlp(x: Optional[str]) -> bool:
    """
    Does this string contain content that's relevant for NLP?

    ``None`` and the empty string are never relevant. Otherwise the string is
    relevant only if ``RELEVANT_FOR_NLP_REGEX`` finds a match somewhere in
    it, so a string containing only whitespace is not relevant.

    Args:
        x: the string to test (may be ``None``)

    Returns:
        ``True`` if the string is relevant, ``False`` otherwise
    """
    # bool(x) is False for both None and ""; the search only runs otherwise.
    return bool(x) and RELEVANT_FOR_NLP_REGEX.search(x) is not None