Coverage for src / dataknobs_xization / normalize.py: 18%

156 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-26 15:46 -0700

1"""Text normalization utilities and regular expressions. 

2 

3Provides functions and regex patterns for normalizing text including 

4whitespace handling, camelCase splitting, and symbol processing. 

5""" 

6 

7import math 

8import re 

9from itertools import product 

10from typing import List, Set 

11 

# Collapse consecutive whitespace to a single space via:
#   SQUASH_WS_RE.sub(' ', text)
SQUASH_WS_RE = re.compile(r"\s+")


# Detect whether a string contains any symbol (non-word, non-space) chars via:
#   ALL_SYMBOLS_RE.search(text)
ALL_SYMBOLS_RE = re.compile(r"[^\w\s]+")


# camelCase "lU" boundary: split between a lowercase run and an uppercase
# char via CAMELCASE_LU_RE.sub(r'\1 \2', text)
CAMELCASE_LU_RE = re.compile(r"([a-z]+)([A-Z])")


# camelCase "UUl" boundary: split between an uppercase run and an
# upper-then-lower pair via CAMELCASE_UL_RE.sub(r'\1 \2', text)
CAMELCASE_UL_RE = re.compile(r"([A-Z]+)([A-Z][a-z])")


# Non-embedded symbols: symbol runs that lack a word char on at least one
# side; drop via NON_EMBEDDED_WORD_SYMS_RE.sub('', text)
NON_EMBEDDED_WORD_SYMS_RE = re.compile(r"((?<!\w)[^\w\s]+)|([^\w\s]+(?!\w))")


# Embedded symbols: symbol runs with a word char on both sides; drop via
# EMBEDDED_SYMS_RE.sub('', text)
EMBEDDED_SYMS_RE = re.compile(r"(?<=\w)[^\w\s]+(?=\w)")


# Split on an embedded hyphen, slash, or space via:
#   HYPHEN_SLASH_RE.split(text)
HYPHEN_SLASH_RE = re.compile(r"(?<=\w)[\-\/ ](?=\w)")


# Split on an embedded hyphen or space (not slash) via:
#   HYPHEN_ONLY_RE.split(text)
HYPHEN_ONLY_RE = re.compile(r"(?<=\w)[\- ](?=\w)")


# Split on an embedded slash only via:
#   SLASH_ONLY_RE.split(text)
SLASH_ONLY_RE = re.compile(r"(?<=\w)\/(?=\w)")


# Drop parenthetical expressions via PARENTHETICAL_RE.sub('', text)
# NOTE: the '.*' is greedy, so this spans from the first '(' to the
# last ')' on the line.
PARENTHETICAL_RE = re.compile(r"\(.*\)")


# Replace an ampersand (and any surrounding whitespace) with " and " via:
#   AMPERSAND_RE.sub(' and ', text)
AMPERSAND_RE = re.compile(r"\s*\&\s*")

def expand_camelcase_fn(text: str) -> str:
    """Insert spaces at camelCase boundaries: "lU" -> "l U" and "UUl" -> "U Ul"."""
    spaced = CAMELCASE_LU_RE.sub(r"\1 \2", text)
    spaced = CAMELCASE_UL_RE.sub(r"\1 \2", spaced)
    return spaced

71 

72 

def drop_non_embedded_symbols_fn(text: str, repl: str = "") -> str:
    """Replace symbol runs that are not flanked by word chars with ``repl``."""
    result = NON_EMBEDDED_WORD_SYMS_RE.sub(repl, text)
    return result

76 

77 

def drop_embedded_symbols_fn(text: str, repl: str = "") -> str:
    """Replace symbol runs embedded between word chars with ``repl``."""
    result = EMBEDDED_SYMS_RE.sub(repl, text)
    return result

81 

82 

def get_hyphen_slash_expansions_fn(
    text: str,
    subs: List[str] = ("-", " ", ""),
    add_self: bool = True,
    do_split: bool = True,
    min_split_token_len: int = 2,
    hyphen_slash_re: re.Pattern[str] = HYPHEN_SLASH_RE,
) -> Set[str]:
    """Given text with words that may or may not appear as hyphenated or with a
    slash, return the set of potential variations:
    - the text as-is (add_self)
    - with a hyphen between all words (if '-' in subs)
    - with a space between all words (if ' ' in subs)
    - with all words squashed together (empty string between if '' in subs)
    - with each word separately (do_split as long as min_split_token_len is
      met for all tokens)

    Note:
        * To add a variation with a slash, add '/' to subs.
        * To not add any variations with symbols, leave them out of subs
          and don't add self.

    Args:
        text: The hyphen-worthy snippet of text, either already
            hyphenated or with a slash or space delimited.
        subs: A string of characters or list of strings to insert between
            tokens.
        add_self: True to include the text itself in the result.
        do_split: True to add split tokens separately.
        min_split_token_len: If any of the split tokens fail
            to meet the min token length, don't add any of the splits.
        hyphen_slash_re: The regex to identify hyphen/slash to expand.

    Returns:
        The set of text variations.
    """
    variations = {text} if add_self else set()
    if subs is not None and len(subs) > 0:
        # Create one variant per substitution string, joining all tokens
        # with it.
        # BUG FIX: use the caller-supplied regex here. The substitution
        # previously hard-coded HYPHEN_SLASH_RE while the split below
        # honored hyphen_slash_re, so a custom regex was silently ignored
        # for the joined variants.
        for sub in subs:
            variations.add(hyphen_slash_re.sub(sub, text))
    if do_split:
        # Add each token separately, but only when every token meets the
        # minimum length (avoids polluting the result with fragments).
        tokens = set(hyphen_slash_re.split(text))
        if all(len(token) >= min_split_token_len for token in tokens):
            variations.update(tokens)
    return variations

130 

131 

def drop_parentheticals_fn(text: str) -> str:
    """Remove parenthetical expressions (greedy: first '(' to last ')')."""
    cleaned = PARENTHETICAL_RE.sub("", text)
    return cleaned

135 

136 

def expand_ampersand_fn(text: str) -> str:
    """Replace each '&' (with any surrounding whitespace) by ' and '."""
    expanded = AMPERSAND_RE.sub(" and ", text)
    return expanded

140 

141 

def get_lexical_variations(
    text: str,
    include_self: bool = True,
    expand_camelcase: bool = True,
    drop_non_embedded_symbols: bool = True,
    drop_embedded_symbols: bool = True,
    spacify_embedded_symbols: bool = False,
    do_hyphen_expansion: bool = True,
    hyphen_subs: List[str] = (" ", ""),
    do_hyphen_split: bool = True,
    min_hyphen_split_token_len: int = 2,
    do_slash_expansion: bool = True,
    slash_subs: List[str] = (" ", " or "),
    do_slash_split: bool = True,
    min_slash_split_token_len: int = 1,
    drop_parentheticals: bool = True,
    expand_ampersands: bool = True,
    add_eng_plurals: bool = True,
) -> Set[str]:
    """Get all variations for the text (including the text itself).

    Args:
        text: The text to generate variations for.
        include_self: True to include the original text in the result.
        expand_camelcase: True to expand camelCase text.
        drop_non_embedded_symbols: True to drop symbols not embedded in words.
        drop_embedded_symbols: True to drop symbols embedded in words.
        spacify_embedded_symbols: True to replace embedded symbols with spaces.
        do_hyphen_expansion: True to expand hyphenated text.
        hyphen_subs: List of strings to substitute for hyphens.
        do_hyphen_split: True to split on hyphens.
        min_hyphen_split_token_len: Minimum token length for hyphen splits.
        do_slash_expansion: True to expand slashes.
        slash_subs: List of strings to substitute for slashes.
        do_slash_split: True to split on slashes.
        min_slash_split_token_len: Minimum token length for slash splits.
        drop_parentheticals: True to drop parenthetical expressions.
        expand_ampersands: True to expand ampersands to ' and '.
        add_eng_plurals: True to add English plural forms.

    Returns:
        The set of all text variations.
    """
    variations = {text} if include_self else set()
    if expand_camelcase:
        variations.add(expand_camelcase_fn(text))
    if drop_non_embedded_symbols:
        variations.add(drop_non_embedded_symbols_fn(text))
    if drop_embedded_symbols:
        variations.add(drop_embedded_symbols_fn(text))
    if spacify_embedded_symbols:
        variations.add(drop_embedded_symbols_fn(text, " "))
    if (
        do_hyphen_expansion and hyphen_subs is not None and len(hyphen_subs) > 0
    ) or do_hyphen_split:
        # BUG FIX: use the hyphen-specific regex so that hyphen_subs are
        # never substituted for slashes (the default combined regex
        # previously applied hyphen_subs and slash_subs to BOTH chars,
        # making the separate parameters meaningless).
        variations.update(
            get_hyphen_slash_expansions_fn(
                text,
                subs=hyphen_subs,
                add_self=False,
                do_split=do_hyphen_split,
                min_split_token_len=min_hyphen_split_token_len,
                hyphen_slash_re=HYPHEN_ONLY_RE,
            )
        )
    if (do_slash_expansion and slash_subs is not None and len(slash_subs) > 0) or do_slash_split:
        # BUG FIX: likewise, restrict slash_subs (e.g. " or ") to slashes.
        variations.update(
            get_hyphen_slash_expansions_fn(
                text,
                subs=slash_subs,
                add_self=False,
                do_split=do_slash_split,
                min_split_token_len=min_slash_split_token_len,
                hyphen_slash_re=SLASH_ONLY_RE,
            )
        )
    if drop_parentheticals:
        variations.add(drop_parentheticals_fn(text))
    if expand_ampersands:
        variations.add(expand_ampersand_fn(text))
    if add_eng_plurals:
        # TODO: Use a better pluralizer (naive "+s" mishandles e.g. "-y" words)
        plurals = {f"{v}s" for v in variations}
        variations.update(plurals)
    return variations

225 

226 

def int_to_en(num: int) -> str:
    """Spell out an integer in English words.

    Handles negatives and magnitudes through trillions; values of a
    quadrillion or more recurse on the trillions quotient (yielding e.g.
    "one thousand trillion"). Hundreds use "and" ("one hundred and one").

    Args:
        num: The integer to convert. Non-``int`` inputs are returned
            unchanged (legacy pass-through behavior).

    Returns:
        The English words for ``num``.
    """
    # Pass non-ints through untouched before doing any arithmetic.
    if not isinstance(num, int):
        return num

    d = {
        0: "zero",
        1: "one",
        2: "two",
        3: "three",
        4: "four",
        5: "five",
        6: "six",
        7: "seven",
        8: "eight",
        9: "nine",
        10: "ten",
        11: "eleven",
        12: "twelve",
        13: "thirteen",
        14: "fourteen",
        15: "fifteen",
        16: "sixteen",
        17: "seventeen",
        18: "eighteen",
        19: "nineteen",
        20: "twenty",
        30: "thirty",
        40: "forty",
        50: "fifty",
        60: "sixty",
        70: "seventy",
        80: "eighty",
        90: "ninety",
    }
    k = 1000
    m = k * 1000
    b = m * 1000
    t = b * 1000

    if num < 0:
        return "negative " + int_to_en(abs(num))

    if num < 20:
        return d[num]

    if num < 100:
        tens = d[num // 10 * 10]
        return tens if num % 10 == 0 else tens + " " + d[num % 10]

    if num < k:
        hundreds = d[num // 100] + " hundred"
        # Hundreds join their remainder with "and"; larger scales do not.
        return hundreds if num % 100 == 0 else hundreds + " and " + int_to_en(num % 100)

    # Thousands through billions share one recurrence; consolidating them
    # also removes the unreachable trailing `return str(num)` the original
    # carried (both trillion branches already returned).
    for divisor, name in ((k, "thousand"), (m, "million"), (b, "billion")):
        if num < divisor * 1000:
            head = int_to_en(num // divisor) + " " + name
            return head if num % divisor == 0 else head + " " + int_to_en(num % divisor)

    # Trillions and beyond (quotient may itself exceed 999).
    head = int_to_en(num // t) + " trillion"
    return head if num % t == 0 else head + " " + int_to_en(num % t)

309 

310 

def zero_pad_variations(
    val: int,
    min_zpad_len: int,
    max_zpad_len: int,
) -> Set[str]:
    """Get (only) zero-padded variations of the given value from min (inclusive)
    to max (exclusive) zero-pad lengths.

    Examples:
        >>> from dataknobs_xization.normalize import zero_pad_variations
        >>> zero_pad_variations(9, 2, 4)
        {'09', '009'}
        >>> zero_pad_variations(90, 2, 4)
        {'090'}
        >>> zero_pad_variations(90, 2, 3)
        set()
        >>> zero_pad_variations(3, 0, 5)
        {'03', '003', '0003'}

    Args:
        val: The (non-negative) integer value to zero-pad.
        min_zpad_len: The minimum zero-padded string length (inclusive).
        max_zpad_len: The maximum zero-padded string length (exclusive).

    Returns:
        The set of all requested zero-padded number strings.
    """
    # A rendering only counts as "zero-padded" when it is strictly longer
    # than the plain decimal form, so start at len(str(val)) + 1.
    # BUG FIX: the previous math.ceil(math.log10(val)) + 1 under-counted
    # digits for exact powers of ten (val=10 started at width 2, emitting
    # the UNpadded "10"); len(str(val)) is exact (and handles 0).
    shortest = max(min_zpad_len, len(str(val)) + 1)
    return {f"{val:0{zpad}d}" for zpad in range(shortest, max_zpad_len)}

344 

345 

def month_day_variations_fn(
    month_or_day: int,
    do_int_to_en: bool = False,
) -> Set[str]:
    """Get the variations for a month or day number: the number itself as a
    string, its 2-digit zero-padded form, and (optionally) the English word
    for the number.

    Args:
        month_or_day: The month or day for which to get variations.
        do_int_to_en: Optionally include the english word for the number.

    Returns:
        The set of variations for the value.
    """
    variations = {str(month_or_day)}
    variations |= zero_pad_variations(month_or_day, 2, 3)
    if do_int_to_en:
        variations.add(int_to_en(month_or_day))
    return variations

366 

367 

def year_variations_fn(
    year: int,
    min_year: int = 0,
    max_year: int = 9999,
    do_int_to_en_below_100: bool = False,
    numeric_only: bool = False,
) -> Set[str]:
    """Convert a year to various text representations.

    Generates variations including:
    * "1999" (numeric)
    * Long text: "one thousand, nine hundred and ninety nine"
    * Short text: "nineteen [hundred and] ninety nine"

    Args:
        year: The year value to convert.
        min_year: Minimum year to process (inclusive).
        max_year: Maximum year to process (inclusive).
        do_int_to_en_below_100: True to convert years below 100 to English text.
        numeric_only: True to return only numeric variations.

    Returns:
        The set of year variations (always includes str(year)).
    """
    variations = {str(year)}

    # Out-of-range years get only the plain numeric form.
    if year < min_year or year > max_year:
        return variations

    # Long form, e.g. "one thousand nine hundred and ninety nine"
    # (years below 100 only when explicitly requested).
    if not numeric_only and (do_int_to_en_below_100 or year >= 100):
        variations.add(int_to_en(year))

    # Short ("spoken") form pieces, e.g. "nineteen ninety five":
    # split the year into its century and 2-digit remainder.
    century = year // 100
    remainder = year % 100
    remainder_text = int_to_en(remainder)

    # Also add the zero-padded 2-digit remainder (e.g. "05" for 2005).
    variations.update(zero_pad_variations(remainder, 2, 3))

    if century > 0:
        # Build the possible remainder suffixes (each with a leading space
        # so they join cleanly onto a scale prefix below).
        remainder_texts = []
        if remainder > 0:
            if remainder < 10:
                if not numeric_only:
                    # e.g. "nineteen oh five"
                    remainder_texts.append(f" oh {remainder_text}")
                remainder_texts.append(f" 0{remainder}")
            else:
                if not numeric_only:
                    remainder_texts.append(f" {remainder_text}")
                remainder_texts.append(f" {remainder}")
            if not numeric_only:
                # e.g. "nineteen hundred and five"; clean_up strips a
                # leading "and " when paired with the empty scale.
                remainder_texts.append(f" and {remainder_text}")

        # Scale prefixes: bare remainder, "nineteen ...", and either
        # "two thousand ..." (round centuries) or "nineteen hundred ...".
        century_text = int_to_en(century)
        scales = ["", century_text]
        if century % 10 == 0:
            mil_text = int_to_en(century // 10)
            scales.append(f"{mil_text} thousand")
        else:
            scales.append(f"{century_text} hundred")

        def clean_up(s: str) -> str:
            # Strip outer whitespace and any dangling leading "and ".
            s = s.strip()
            if s.startswith("and "):
                s = s[4:]
            return s

        # Cartesian product of every scale with every remainder suffix.
        variations.update({clean_up("".join(v)) for v in product(scales, remainder_texts)})

    return variations

439 

440 

def replace_smart_quotes_fn(text: str) -> str:
    """Replace curly "smart" quote characters with their ASCII equivalents."""
    # U+201C/U+201D (double quotes) -> '"', U+2018/U+2019 (single) -> "'"
    smart_to_ascii = str.maketrans(
        {
            "\u201c": '"',
            "\u201d": '"',
            "\u2018": "'",
            "\u2019": "'",
        }
    )
    return text.translate(smart_to_ascii)

461 

462 

def basic_normalization_fn(
    text: str,
    lowercase: bool = True,
    expand_camelcase: bool = True,
    simplify_quote_chars: bool = True,
    drop_non_embedded_symbols: bool = False,
    spacify_embedded_symbols: bool = False,
    drop_embedded_symbols: bool = False,
    squash_whitespace: bool = False,
    do_all: bool = False,
) -> str:
    """Basic normalization functions include:
    * lowercasing [default]
    * expanding camelcase [default]
    * replacing "smart" quotes and apostrophes with ascii versions [default]
    * dropping non_embedded symbols [optional]
    * replacing embedded symbols with a space [takes precedence over dropping unless do_all]
    * or dropping embedded symbols [optional]
    * collapsing multiple spaces and stripping spaces from ends [optional]

    Args:
        text: The text to normalize.
        lowercase: True to convert to lowercase.
        expand_camelcase: True to expand camelCase text.
        simplify_quote_chars: True to replace smart quotes with ASCII quotes.
        drop_non_embedded_symbols: True to drop symbols not embedded in words.
        spacify_embedded_symbols: True to replace embedded symbols with spaces.
        drop_embedded_symbols: True to drop embedded symbols.
        squash_whitespace: True to collapse whitespace and strip ends.
        do_all: True to apply all normalization steps.

    Returns:
        The normalized text.
    """
    result = text

    # camelCase boundaries must be found before lowercasing erases them.
    if do_all or expand_camelcase:
        result = expand_camelcase_fn(result)

    if do_all or lowercase:
        result = result.lower()

    # Symbol handling: dropping everything wins; otherwise the first
    # applicable single-sided option applies.
    if do_all or (drop_non_embedded_symbols and drop_embedded_symbols):
        result = re.sub(r"[^\w\s]+", "", result)
    elif drop_non_embedded_symbols:
        result = drop_non_embedded_symbols_fn(result)
    elif spacify_embedded_symbols:
        result = drop_embedded_symbols_fn(result, " ")
    elif drop_embedded_symbols:
        result = drop_embedded_symbols_fn(result)

    # Quote simplification only makes sense while some symbols survive.
    if simplify_quote_chars and not (drop_non_embedded_symbols and drop_embedded_symbols):
        result = replace_smart_quotes_fn(result)

    # Whitespace squashing goes last so replacements above can't reintroduce
    # runs of spaces.
    if do_all or squash_whitespace:
        result = SQUASH_WS_RE.sub(" ", result).strip()
    return result