Coverage for src/dataknobs_xization/normalize.py: 18%
156 statements
« prev ^ index » next coverage.py v7.11.3, created at 2025-11-18 17:41 -0700
« prev ^ index » next coverage.py v7.11.3, created at 2025-11-18 17:41 -0700
1"""Text normalization utilities and regular expressions.
3Provides functions and regex patterns for normalizing text including
4whitespace handling, camelCase splitting, and symbol processing.
5"""
7import math
8import re
9from itertools import product
10from typing import List, Set
# Squash whitespace: collapse consecutive whitespace to a single space via
# SQUASH_WS_RE.sub(' ', text)
SQUASH_WS_RE = re.compile(r"\s+")

# All symbols: detect whether a string contains any symbol (non-word,
# non-whitespace) characters via ALL_SYMBOLS_RE.search(text)
ALL_SYMBOLS_RE = re.compile(r"[^\w\s]+")

# Camelcase LU: split between consecutive lower and upper chars via
# CAMELCASE_LU_RE.sub(r'\1 \2', text)  e.g. "camelCase" -> "camel Case"
CAMELCASE_LU_RE = re.compile(r"([a-z]+)([A-Z])")

# Camelcase UL: split between an upper-case run and a following upper+lower
# pair via CAMELCASE_UL_RE.sub(r'\1 \2', text)  e.g. "HTTPResponse" -> "HTTP Response"
CAMELCASE_UL_RE = re.compile(r"([A-Z]+)([A-Z][a-z])")

# Non-embedded symbols: symbol runs WITHOUT a word char on both sides,
# dropped via NON_EMBEDDED_WORD_SYMS_RE.sub('', text)
NON_EMBEDDED_WORD_SYMS_RE = re.compile(r"((?<!\w)[^\w\s]+)|([^\w\s]+(?!\w))")

# Embedded symbols: symbol runs WITH a word char on both sides (e.g. the
# apostrophe in "don't"), dropped via EMBEDDED_SYMS_RE.sub('', text)
EMBEDDED_SYMS_RE = re.compile(r"(?<=\w)[^\w\s]+(?=\w)")

# Hyphen-slash: split on an embedded hyphen, slash, or space via
# HYPHEN_SLASH_RE.split(text)
# NOTE(review): the character class also matches a plain space — presumably
# deliberate so space-delimited words expand too; confirm.
HYPHEN_SLASH_RE = re.compile(r"(?<=\w)[\-\/ ](?=\w)")

# Hyphen-only: split on an embedded hyphen (or space — see note above) via
# HYPHEN_ONLY_RE.split(text)
HYPHEN_ONLY_RE = re.compile(r"(?<=\w)[\- ](?=\w)")

# Slash-only: split on an embedded slash via SLASH_ONLY_RE.split(text)
SLASH_ONLY_RE = re.compile(r"(?<=\w)\/(?=\w)")

# Parenthetical expressions: drop them via PARENTHETICAL_RE.sub('', text)
# NOTE(review): `.*` is greedy, so "a (b) c (d) e" collapses to "a  e" —
# everything from the first '(' to the last ')' is removed; confirm intended.
PARENTHETICAL_RE = re.compile(r"\(.*\)")

# Ampersand: replace '&' (and surrounding whitespace) with " and " via
# AMPERSAND_RE.sub(' and ', text)
AMPERSAND_RE = re.compile(r"\s*\&\s*")
def expand_camelcase_fn(text: str) -> str:
    """Expand both "lU" and "UUl" camelcasing to "l U" and "U Ul" """
    # First insert a space at every lower->upper boundary, then split an
    # all-caps run from a trailing Upper+lower word (e.g. "HTTPResponse").
    spaced = re.sub(r"([a-z]+)([A-Z])", r"\1 \2", text)
    return re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", spaced)
def drop_non_embedded_symbols_fn(text: str, repl: str = "") -> str:
    """Replace symbol runs that lack a word character on both sides with
    ``repl`` (default: remove them)."""
    # A symbol run survives only when it is embedded in word characters,
    # i.e. preceded AND followed by a \w char.
    return re.sub(r"((?<!\w)[^\w\s]+)|([^\w\s]+(?!\w))", repl, text)
def drop_embedded_symbols_fn(text: str, repl: str = "") -> str:
    """Replace symbol runs embedded within word characters (a word char on
    both sides) with ``repl`` (default: remove them)."""
    return re.sub(r"(?<=\w)[^\w\s]+(?=\w)", repl, text)
def get_hyphen_slash_expansions_fn(
    text: str,
    subs: List[str] = ("-", " ", ""),
    add_self: bool = True,
    do_split: bool = True,
    min_split_token_len: int = 2,
    hyphen_slash_re: re.Pattern[str] = HYPHEN_SLASH_RE,
) -> Set[str]:
    """Given text with words that may or may not appear as hyphenated or with a
    slash, return the set of potential variations:
      - the text as-is (add_self)
      - with a hyphen between all words (if '-' in subs)
      - with a space between all words (if ' ' in subs)
      - with all words squashed together (empty string between if '' in subs)
      - with each word separately (do_split as long as min_split_token_len is
        met for all tokens)

    Note:
        * To add a variation with a slash, add '/' to subs.
        * To not add any variations with symbols, leave them out of subs
          and don't add self.

    Args:
        text: The hyphen-worthy snippet of text, either already
            hyphenated or with a slash or space delimited.
        subs: A string of characters or list of strings to insert between
            tokens.
        add_self: True to include the text itself in the result.
        do_split: True to add split tokens separately.
        min_split_token_len: If any of the split tokens fail
            to meet the min token length, don't add any of the splits.
        hyphen_slash_re: The regex to identify hyphen/slash to expand.

    Returns:
        The set of text variations.
    """
    variations = {text} if add_self else set()
    if subs:
        # Create one variant per substitution string.
        # BUGFIX: substitute with the provided regex; the original always
        # used the module-level HYPHEN_SLASH_RE here, silently ignoring the
        # hyphen_slash_re parameter for everything but splitting.
        for s in subs:
            variations.add(hyphen_slash_re.sub(s, text))
    if do_split:
        # Add each word separately, but only when every token meets the
        # minimum length (so e.g. "x-ray" isn't split into a lone "x").
        tokens = set(hyphen_slash_re.split(text))
        if all(len(t) >= min_split_token_len for t in tokens):
            variations.update(tokens)
    return variations
def drop_parentheticals_fn(text: str) -> str:
    """Drop parenthetical expressions from the text.

    Note: the pattern is greedy, so everything from the first '(' to the
    last ')' is removed as a single span.
    """
    return re.sub(r"\(.*\)", "", text)
def expand_ampersand_fn(text: str) -> str:
    """Replace '&' (plus any surrounding whitespace) with ' and '."""
    return re.sub(r"\s*\&\s*", " and ", text)
def get_lexical_variations(
    text: str,
    include_self: bool = True,
    expand_camelcase: bool = True,
    drop_non_embedded_symbols: bool = True,
    drop_embedded_symbols: bool = True,
    spacify_embedded_symbols: bool = False,
    do_hyphen_expansion: bool = True,
    hyphen_subs: List[str] = (" ", ""),
    do_hyphen_split: bool = True,
    min_hyphen_split_token_len: int = 2,
    do_slash_expansion: bool = True,
    slash_subs: List[str] = (" ", " or "),
    do_slash_split: bool = True,
    min_slash_split_token_len: int = 1,
    drop_parentheticals: bool = True,
    expand_ampersands: bool = True,
    add_eng_plurals: bool = True,
) -> Set[str]:
    """Get all variations for the text (including the text itself).

    Args:
        text: The text to generate variations for.
        include_self: True to include the original text in the result.
        expand_camelcase: True to expand camelCase text.
        drop_non_embedded_symbols: True to drop symbols not embedded in words.
        drop_embedded_symbols: True to drop symbols embedded in words.
        spacify_embedded_symbols: True to replace embedded symbols with spaces.
        do_hyphen_expansion: True to expand hyphenated text.
        hyphen_subs: List of strings to substitute for hyphens.
        do_hyphen_split: True to split on hyphens.
        min_hyphen_split_token_len: Minimum token length for hyphen splits.
        do_slash_expansion: True to expand slashes.
        slash_subs: List of strings to substitute for slashes.
        do_slash_split: True to split on slashes.
        min_slash_split_token_len: Minimum token length for slash splits.
        drop_parentheticals: True to drop parenthetical expressions.
        expand_ampersands: True to expand ampersands to ' and '.
        add_eng_plurals: True to add English plural forms.

    Returns:
        The set of all text variations.
    """
    variations = {text} if include_self else set()
    if expand_camelcase:
        variations.add(expand_camelcase_fn(text))
    if drop_non_embedded_symbols:
        variations.add(drop_non_embedded_symbols_fn(text))
    if drop_embedded_symbols:
        variations.add(drop_embedded_symbols_fn(text))
    if spacify_embedded_symbols:
        variations.add(drop_embedded_symbols_fn(text, " "))
    if (do_hyphen_expansion and hyphen_subs is not None and len(hyphen_subs) > 0) or do_hyphen_split:
        # BUGFIX: pass the hyphen-specific regex; the original fell back to
        # the combined HYPHEN_SLASH_RE default, so "hyphen" expansion also
        # consumed slashes and the dedicated HYPHEN_ONLY_RE went unused.
        variations.update(
            get_hyphen_slash_expansions_fn(
                text,
                subs=hyphen_subs,
                add_self=False,
                do_split=do_hyphen_split,
                min_split_token_len=min_hyphen_split_token_len,
                hyphen_slash_re=HYPHEN_ONLY_RE,
            )
        )
    if (do_slash_expansion and slash_subs is not None and len(slash_subs) > 0) or do_slash_split:
        # BUGFIX: likewise, restrict slash expansion to actual slashes.
        variations.update(
            get_hyphen_slash_expansions_fn(
                text,
                subs=slash_subs,
                add_self=False,
                do_split=do_slash_split,
                min_split_token_len=min_slash_split_token_len,
                hyphen_slash_re=SLASH_ONLY_RE,
            )
        )
    if drop_parentheticals:
        variations.add(drop_parentheticals_fn(text))
    if expand_ampersands:
        variations.add(expand_ampersand_fn(text))
    if add_eng_plurals:
        # TODO: Use a better pluralizer (naive "+s" only)
        variations.update({f"{v}s" for v in variations})
    return variations
def int_to_en(num: int) -> str:
    """Convert an integer to its English-words representation.

    Handles negative values and arbitrarily large magnitudes by recursing
    through the thousand/million/billion/trillion scales (values of a
    quadrillion or more render as e.g. "one thousand trillion").

    Args:
        num: The integer to convert. Non-int inputs are returned unchanged
            (legacy pass-through behavior).

    Returns:
        The English rendering of ``num`` as lower-case text.
    """
    d = {
        0: "zero",
        1: "one",
        2: "two",
        3: "three",
        4: "four",
        5: "five",
        6: "six",
        7: "seven",
        8: "eight",
        9: "nine",
        10: "ten",
        11: "eleven",
        12: "twelve",
        13: "thirteen",
        14: "fourteen",
        15: "fifteen",
        16: "sixteen",
        17: "seventeen",
        18: "eighteen",
        19: "nineteen",
        20: "twenty",
        30: "thirty",
        40: "forty",
        50: "fifty",
        60: "sixty",
        70: "seventy",
        80: "eighty",
        90: "ninety",
    }
    k = 1000
    m = k * 1000
    b = m * 1000
    t = b * 1000

    if not isinstance(num, int):
        # Pass non-int values through unchanged (legacy behavior).
        return num

    if num < 0:
        return "negative " + int_to_en(abs(num))

    if num < 20:
        return d[num]

    if num < 100:
        if num % 10 == 0:
            return d[num]
        return d[num // 10 * 10] + " " + d[num % 10]

    if num < k:
        if num % 100 == 0:
            return d[num // 100] + " hundred"
        return d[num // 100] + " hundred and " + int_to_en(num % 100)

    if num < m:
        if num % k == 0:
            return int_to_en(num // k) + " thousand"
        return int_to_en(num // k) + " thousand " + int_to_en(num % k)

    if num < b:
        if num % m == 0:
            return int_to_en(num // m) + " million"
        return int_to_en(num // m) + " million " + int_to_en(num % m)

    if num < t:
        if num % b == 0:
            return int_to_en(num // b) + " billion"
        return int_to_en(num // b) + " billion " + int_to_en(num % b)

    # All remaining magnitudes recurse through the trillion scale.
    # (The original ended with an unreachable `return str(num)` fallback,
    # removed here as dead code.)
    if num % t == 0:
        return int_to_en(num // t) + " trillion"
    return int_to_en(num // t) + " trillion " + int_to_en(num % t)
def zero_pad_variations(
    val: int,
    min_zpad_len: int,
    max_zpad_len: int,
) -> Set[str]:
    """Get (only) zero-padded variations of the given value from min (inclusive)
    to max (exclusive) zero-pad lengths.

    Examples:
        >>> from dataknobs_xization.normalize import zero_pad_variations
        >>> zero_pad_variations(9, 2, 4)
        {'09', '009'}
        >>> zero_pad_variations(90, 2, 4)
        {'090'}
        >>> zero_pad_variations(90, 2, 3)
        set()
        >>> zero_pad_variations(3, 0, 5)
        {'03', '003', '0003'}

    Args:
        val: The (non-negative) integer value to zero-pad.
        min_zpad_len: The minimum zero-padded string length (inclusive).
        max_zpad_len: The maximum zero-padded string length (exclusive).

    Returns:
        The set of all requested zero-padded number strings.
    """
    # The shortest width that actually pads is one more than the natural
    # digit count. BUGFIX: the original computed this as
    # ceil(log10(val)) + 1, which undercounts exact powers of 10
    # (val=10 or 100 admitted the unpadded "10"/"100") and let the
    # unpadded "0" through for val=0; len(str(val)) is exact and avoids
    # float rounding for large values.
    start = max(min_zpad_len, len(str(val)) + 1)
    return {f"{val:0{zpad}d}" for zpad in range(start, max_zpad_len)}
def month_day_variations_fn(
    month_or_day: int,
    do_int_to_en: bool = False,
) -> Set[str]:
    """Get the variations for a month or day number.

    Includes the number itself as a string, a 2-digit zero-padded form,
    and (optionally) the English word for the number.

    Args:
        month_or_day: The month or day for which to get variations.
        do_int_to_en: Optionally include the english word for the number.

    Returns:
        The set of variations for the value.
    """
    variations = {str(month_or_day)}
    variations |= zero_pad_variations(month_or_day, 2, 3)
    if do_int_to_en:
        variations.add(int_to_en(month_or_day))
    return variations
def year_variations_fn(
    year: int,
    min_year: int = 0,
    max_year: int = 9999,
    do_int_to_en_below_100: bool = False,
    numeric_only: bool = False,
) -> Set[str]:
    """Convert a year to various text representations.

    Generates variations including:
      * "1999" (numeric)
      * Long text: "one thousand, nine hundred and ninety nine"
      * Short text: "nineteen [hundred and] ninety nine"

    Args:
        year: The year value to convert.
        min_year: Minimum year to process (inclusive).
        max_year: Maximum year to process (inclusive).
        do_int_to_en_below_100: True to convert years below 100 to English text.
        numeric_only: True to return only numeric variations.

    Returns:
        The set of year variations.
    """
    # The plain numeric form is always included, even out of range.
    variations = {str(year)}

    # Out-of-range years get no further expansion.
    if year < min_year or year > max_year:
        return variations

    # Long form: "one thousand nine hundred and ninety nine"
    if not numeric_only and (do_int_to_en_below_100 or year >= 100):
        variations.add(int_to_en(year))

    # Short form pieces: split the year as <century><2-digit remainder>,
    # e.g. 1995 -> century=19, remainder=95 -> "nineteen ninety five"
    century = year // 100
    remainder = year % 100
    remainder_text = int_to_en(remainder)

    # Zero-padded 2-digit remainder (only padded forms are added).
    variations.update(zero_pad_variations(remainder, 2, 3))

    if century > 0:
        # Build the remainder halves to pair with each century/scale prefix.
        # NOTE(review): when remainder == 0 (e.g. 1900) remainder_texts stays
        # empty, so NO century-based variants (like "nineteen hundred") are
        # produced — confirm this is intended.
        remainder_texts = []
        if remainder > 0:
            if remainder < 10:
                # e.g. 2005 -> "oh five" / "05"
                if not numeric_only:
                    remainder_texts.append(f" oh {remainder_text}")
                remainder_texts.append(f" 0{remainder}")
            else:
                # e.g. 1995 -> "ninety five" / "95" / "and ninety five"
                if not numeric_only:
                    remainder_texts.append(f" {remainder_text}")
                remainder_texts.append(f" {remainder}")
                if not numeric_only:
                    remainder_texts.append(f" and {remainder_text}")

        # Century/scale prefixes: bare remainder, "nineteen ...", and either
        # "two thousand ..." (century divisible by 10) or "nineteen hundred ...".
        century_text = int_to_en(century)
        scales = ["", century_text]
        if century % 10 == 0:
            mil_text = int_to_en(century // 10)
            scales.append(f"{mil_text} thousand")
        else:
            scales.append(f"{century_text} hundred")

        def clean_up(s):
            # Trim whitespace and a dangling leading "and " (which occurs
            # when the empty-scale prefix pairs with an "and ..." remainder).
            s = s.strip()
            if s.startswith("and "):
                s = s[4:]
            return s

        # Cross every scale prefix with every remainder form.
        variations.update({clean_up("".join(v)) for v in product(scales, remainder_texts)})

    return variations
def replace_smart_quotes_fn(text: str) -> str:
    """Replace "smart" quotes with their ascii version."""
    # Single-pass character translation:
    #   U+201C/U+201D (left/right double quote) -> '"'
    #   U+2018/U+2019 (left/right single quote) -> "'"
    smart_to_ascii = str.maketrans(
        {
            "\u201c": '"',
            "\u201d": '"',
            "\u2018": "'",
            "\u2019": "'",
        }
    )
    return text.translate(smart_to_ascii)
def basic_normalization_fn(
    text: str,
    lowercase: bool = True,
    expand_camelcase: bool = True,
    simplify_quote_chars: bool = True,
    drop_non_embedded_symbols: bool = False,
    spacify_embedded_symbols: bool = False,
    drop_embedded_symbols: bool = False,
    squash_whitespace: bool = False,
    do_all: bool = False,
) -> str:
    """Basic normalization functions include:
      * lowercasing [default]
      * expanding camelcase [default]
      * replacing "smart" quotes and apostrophes with ascii versions [default]
      * dropping non_embedded symbols [optional]
      * replacing embedded symbols with a space [takes precedence over dropping unless do_all]
      * or dropping embedded symbols [optional]
      * collapsing multiple spaces and stripping spaces from ends [optional]

    Args:
        text: The text to normalize.
        lowercase: True to convert to lowercase.
        expand_camelcase: True to expand camelCase text.
        simplify_quote_chars: True to replace smart quotes with ASCII quotes.
        drop_non_embedded_symbols: True to drop symbols not embedded in words.
        spacify_embedded_symbols: True to replace embedded symbols with spaces.
        drop_embedded_symbols: True to drop embedded symbols.
        squash_whitespace: True to collapse whitespace and strip ends.
        do_all: True to apply all normalization steps.

    Returns:
        The normalized text.
    """
    # NOTE: do this before changing case (lowercasing destroys the
    # upper/lower boundaries that camelcase expansion relies on)
    if expand_camelcase or do_all:
        text = expand_camelcase_fn(text)

    if lowercase or do_all:
        text = text.lower()

    # Symbol handling: exactly one branch runs, in priority order:
    # drop-everything (both drop flags, or do_all) > drop_non_embedded >
    # spacify_embedded > drop_embedded.
    # NOTE(review): when drop_non_embedded_symbols=True together with
    # spacify_embedded_symbols=True, only the former runs because of the
    # elif chain — confirm that shadowing is intended.
    if (drop_non_embedded_symbols and drop_embedded_symbols) or do_all:
        text = re.sub(r"[^\w\s]+", "", text)
    elif drop_non_embedded_symbols:
        text = drop_non_embedded_symbols_fn(text)
    elif spacify_embedded_symbols:
        text = drop_embedded_symbols_fn(text, " ")
    elif drop_embedded_symbols:
        text = drop_embedded_symbols_fn(text)

    # NOTE: do this after dropping (only some) symbols
    if simplify_quote_chars and (not drop_non_embedded_symbols or not drop_embedded_symbols):
        # NOTE: It only makes sense to do this if we're keeping symbols
        text = replace_smart_quotes_fn(text)

    # NOTE: do this last (earlier steps may introduce extra spaces)
    if squash_whitespace or do_all:
        text = re.sub(r"\s+", " ", text).strip()
    return text