Coverage for src/usaspending/utils/formatter.py: 84%

201 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-03 17:15 -0700

1from typing import List, Any, Optional, Set 

2from datetime import datetime 

3import re 

4import yaml 

5from pathlib import Path 

6 

7from ..config import BUSINESS_CATEGORIES 

8from titlecase import titlecase 

9from ..logging_config import USASpendingLogger 

10 

11logger = USASpendingLogger.get_logger(__name__) 

12 

13 

def to_date(date_string: str) -> Optional[datetime]:
    """Convert a date string to a datetime object.

    Supported formats, tried in this order:
    - YYYY-MM-DD (date only)
    - YYYY-MM-DDTHH:MM:SS (ISO datetime)
    - YYYY-MM-DDTHH:MM:SS.ffffff (ISO datetime with fractional seconds)
    - YYYY-MM-DDTHH:MM:SSZ (ISO datetime with UTC indicator)
    - YYYY-MM-DDTHH:MM:SS+/-HH:MM (ISO datetime with timezone offset)
    - YYYY-MM-DDTHH:MM:SS.ffffffZ (fractional seconds with UTC indicator)
    - YYYY-MM-DDTHH:MM:SS.ffffff+/-HH:MM (fractional seconds with offset)

    Strings ending in a literal "Z" are matched by the literal-"Z" formats and
    therefore come back as naive datetimes; strings with a numeric offset are
    matched by ``%z`` and come back timezone-aware.

    Args:
        date_string: Date string in any supported format

    Returns:
        datetime object, or None if the string is empty or matches no format
    """
    if not date_string:
        return None

    # Formats to try, in order of likelihood. The two fractional-second
    # variants at the end are additions; inputs in those shapes previously
    # fell through to the warning below.
    formats = (
        "%Y-%m-%d",  # Date only (original format)
        "%Y-%m-%dT%H:%M:%S",  # ISO datetime without timezone
        "%Y-%m-%dT%H:%M:%S.%f",  # ISO datetime with microseconds
        "%Y-%m-%dT%H:%M:%SZ",  # ISO datetime with UTC indicator
        "%Y-%m-%dT%H:%M:%S%z",  # ISO datetime with timezone offset
        "%Y-%m-%dT%H:%M:%S.%fZ",  # Microseconds with UTC indicator
        "%Y-%m-%dT%H:%M:%S.%f%z",  # Microseconds with timezone offset
    )

    for fmt in formats:
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            # Wrong shape for this format; try the next one
            continue

    # If no format matched, log warning and return None
    logger.warning(f"Could not parse date string: {date_string}")
    return None

51 

52 

def round_to_millions(amount: float) -> str:
    """Render a dollar amount as a short, human-readable string.

    Amounts of at least one billion are shown in billions and amounts of at
    least one million are shown in millions, both with one decimal place.
    Anything smaller is shown in full with thousands separators and two
    decimal places. ``None`` is rendered as "$0.00".
    """
    if amount is None:
        return "$0.00"

    billion = 1_000_000_000
    million = 1_000_000

    if amount >= billion:
        return f"${amount / billion:,.1f} billion"
    if amount >= million:
        return f"${amount / million:,.1f} million"
    return f"${amount:,.2f}"

64 

65 

def current_fiscal_year() -> int:
    """Return the fiscal year that contains the current local date.

    January through September belong to the fiscal year with the same number
    as the calendar year; October through December belong to the next one.

    Returns:
        The fiscal year as an integer.
    """
    # Take a single clock reading so month and year always describe the same
    # instant (two separate now() calls could straddle a month/year rollover).
    today = datetime.now()
    return today.year if today.month < 10 else today.year + 1

74 

75 

def get_past_fiscal_years(num_years: int = 3) -> List[int]:
    """
    Return the most recent completed fiscal years, newest first.

    The federal fiscal year starts on October 1, so before October the last
    completed fiscal year is the previous calendar year; from October onward
    it is the current calendar year.

    Args:
        num_years: How many fiscal years to return

    Returns:
        Fiscal years in descending order, starting with the last completed one
    """
    today = datetime.now()

    # Anchor on the last fiscal year that has fully ended
    latest_completed = today.year if today.month >= 10 else today.year - 1

    return list(range(latest_completed, latest_completed - num_years, -1))

97 

98 

def to_float(x: Any) -> Optional[float]:
    """Best-effort conversion of ``x`` to ``float``.

    When ``float(x)`` raises ``TypeError`` or ``ValueError`` the input is
    returned unchanged (so ``None`` stays ``None`` and an unparseable string
    comes back as that same string).
    """
    try:
        converted = float(x)
    except (TypeError, ValueError):
        # Not convertible: hand back exactly what was passed in
        return x
    return converted

104 

def to_int(x: Any) -> Optional[int]:
    """Best-effort conversion of ``x`` to ``int``.

    When ``int(x)`` raises ``TypeError`` or ``ValueError`` the input is
    returned unchanged (so ``None`` stays ``None`` and a string such as
    "7.5" comes back as that same string).
    """
    try:
        converted = int(x)
    except (TypeError, ValueError):
        # Not convertible: hand back exactly what was passed in
        return x
    return converted

110 

111 

112# --- Configuration --- 

113# Set of acronyms and initialisms to always keep uppercase. 

114# This could be loaded from a config file or environment variables in a larger application. 

115DEFAULT_KEEP_UPPERCASE: Set[str] = { 

116 # Common Business / Legal 

117 "LLC", 

118 "INC", 

119 "LLP", 

120 "LTD", 

121 "L.L.C.", 

122 "I.N.C.", 

123 "L.L.P.", 

124 "L.T.D.", 

125 # Geographical / Governmental 

126 "USA", 

127 "US", 

128 "UK", 

129 # Organizations / Agencies 

130 "NASA", 

131 "ESA", 

132 "JAXA", 

133 # NASA Facilities & Major Programs (add more as needed) 

134 "JPL", # Jet Propulsion Laboratory 

135 "JSC", # Johnson Space Center 

136 "KSC", # Kennedy Space Center 

137 "GSFC", # Goddard Space Flight Center 

138 "MSFC", # Marshall Space Flight Center 

139 "ARC", # Ames Research Center 

140 "GRC", # Glenn Research Center 

141 "LARC", # Langley Research Center (or LaRC - handled by case-insensitive check) 

142 "AFRC", # Armstrong Flight Research Center 

143 "SSC", # Stennis Space Center 

144 "ISS", # International Space Station 

145 "JWST", # James Webb Space Telescope 

146 # Specific examples from user input 

147 "CSOS", 

148 "CL", 

149 "FL", 

150 "FPRW", 

151 "PADF", 

152 "ICAT", 

153 "ICATEQ", 

154 "AC" # For A.C. style 

155 # Add other common contract/technical acronyms as needed 

156 "RFQ", 

157 "RFP", 

158 "SOW", 

159 "CDR", 

160 "PDR", 

161 "QA", 

162 "PI", 

163 "COTS", 

164} 

165 

166# Maximum length for parenthesized text to be uppercased 

167PAREN_UPPERCASE_MAX_LEN: int = 9 # Fewer than 10 characters 

168 

169# --- Helper Function --- 

170 

171 

def smart_sentence_case(
    text: Optional[str],
    paren_max_len: int = PAREN_UPPERCASE_MAX_LEN,
) -> str:
    """
    Convert text to sentence case via TextFormatter.to_sentence_case.

    This is a function-style entry point; all of the work is done by
    TextFormatter.to_sentence_case, which:
        1. Lowercases the text as a base.
        2. Capitalizes the first word of the text and of each sentence.
        3. Restores special cases from the YAML configuration to their
           configured spelling.
        4. Uppercases parenthesized text whose length is at most
           paren_max_len characters.

    Args:
        text: The input string, expected to be mostly uppercase.
              Can be None or empty.
        paren_max_len: The maximum character length of text inside parentheses
                       to be kept uppercase. Defaults to PAREN_UPPERCASE_MAX_LEN.

    Returns:
        The converted string, or an empty string if the input was None or
        empty.
    """
    # Delegate to the TextFormatter implementation
    convert = TextFormatter.to_sentence_case
    return convert(text, paren_max_len)

200 

201 

class TextFormatter:
    """Unified text formatting utility class for sentence and title case conversions.

    Every method is a classmethod. The only state kept on the class is the
    lazily loaded list of special-case words read from ``special_cases.yaml``
    located next to this module.
    """

    # Parsed content of special_cases.yaml; None until the first load attempt.
    _special_cases_cache = None

    # Dotted business suffixes (upper-cased) and the spelling they are rewritten
    # to before the special-case lookup in titlecase_callback.
    _NORMALIZED_SUFFIXES = {
        "L.L.C.": "LLC",
        "I.N.C.": "Inc",
        "L.L.P.": "LLP",
        "L.T.D.": "LTD",
        "P.L.L.C.": "PLLC",
        "P.A.": "PA",
        "P.C.": "PC",
    }

    # Small words to ignore when lining an acronym up with the words before it.
    _SMALL_WORDS_RE = re.compile(
        r'\b(a|an|and|as|at|but|by|en|for|if|in|of|on|or|the|to|v\.?|via|vs\.?)\b',
        re.IGNORECASE,
    )
    # A parenthesized group with non-empty content.
    _PAREN_RE = re.compile(r'\(([^)]+)\)')
    # A purely alphabetic word, optionally hyphenated.
    _WORD_RE = re.compile(r'\b([a-zA-Z]+(?:-[a-zA-Z]+)*)\b')
    # Sentence-ending punctuation followed by whitespace at the end of a prefix.
    _SENTENCE_END_RE = re.compile(r'[.!?]\s+$')

    @classmethod
    def _load_special_cases(cls):
        """Load and cache special cases from YAML file.

        A missing or unreadable file is logged and cached as an empty list, so
        the load is attempted only once per class.
        """
        if cls._special_cases_cache is None:
            yaml_path = Path(__file__).parent / "special_cases.yaml"
            try:
                # Explicit encoding so the result does not depend on the
                # platform's default locale encoding.
                with open(yaml_path, "r", encoding="utf-8") as f:
                    cls._special_cases_cache = yaml.safe_load(f) or []
            except FileNotFoundError:
                logger.warning(f"Special cases file not found: {yaml_path}")
                cls._special_cases_cache = []
            except Exception as e:
                # Best effort: formatting still works without special cases
                logger.error(f"Error loading special cases: {e}")
                cls._special_cases_cache = []
        return cls._special_cases_cache

    @classmethod
    def _get_special_cases_set(cls):
        """Get special cases as a set of upper-cased strings for fast lookups."""
        special_cases = cls._load_special_cases()
        return {case.upper() for case in special_cases if isinstance(case, str)}

    @classmethod
    def _split_word_punctuation(cls, word):
        """Split a word into clean word and trailing punctuation.

        Words containing an apostrophe are split at the first apostrophe
        ("NASA's" -> ("NASA", "'s")). Otherwise everything after the last
        alphanumeric character is treated as trailing punctuation. A word with
        no alphanumeric characters is returned whole with no punctuation.

        Returns:
            Tuple of (clean_word, trailing_punct).
        """
        if not word:
            return word, ""

        # Handle contractions/possessives separately
        if "'" in word:
            head, apostrophe, tail = word.partition("'")
            return head, apostrophe + tail

        # Find where the alphanumeric part ends
        last_alnum = len(word) - 1
        while last_alnum >= 0 and not word[last_alnum].isalnum():
            last_alnum -= 1

        if last_alnum < 0:
            # All punctuation, no alphanumeric chars
            return word, ""
        return word[: last_alnum + 1], word[last_alnum + 1 :]

    @classmethod
    def _preserve_special_case(cls, word):
        """Check if word should be preserved as special case, return preserved version or None.

        Matching is case-insensitive and tried, per special case, in this order:
        the full word, the word without trailing punctuation, and the word
        without trailing punctuation against a special case that ends in "."
        (so "inc" matches "Inc."). The first special case that matches wins.

        Returns:
            The special-case spelling (plus any trailing punctuation that was
            split off), the word itself when it is wrapped in parentheses, or
            None when the word is not a string or nothing matches.
        """
        if not isinstance(word, str):
            return None

        # If the word is enclosed in parentheses, preserve the case inside
        if word.startswith("(") and word.endswith(")"):
            return word

        clean_word, trailing_punct = cls._split_word_punctuation(word)
        word_lower = word.lower()
        clean_lower = clean_word.lower()

        for special_word in cls._load_special_cases():
            # Skip non-string YAML entries
            if not isinstance(special_word, str):
                continue

            special_lower = special_word.lower()

            # First try exact match with full word (including punctuation)
            if word_lower == special_lower:
                return special_word

            # Then try match with clean word
            if clean_lower == special_lower:
                return special_word + trailing_punct

            # Also try if special_word has punctuation that matches,
            # e.g. word "inc" matching "Inc."
            if special_word.endswith(".") and clean_lower == special_word[:-1].lower():
                return special_word + trailing_punct

        return None

    @classmethod
    def _match_acronym_words(cls, words_before, acronym_letters):
        """Return the trailing words whose initials spell the acronym, else [].

        The last ``len(acronym_letters)`` words are tried first; if their
        initials do not match, the same check is repeated after dropping small
        words (a, of, the, ...).
        """
        count = len(acronym_letters)
        if len(words_before) < count:
            return []

        # Try direct match first
        direct = words_before[-count:]
        if [w[0].lower() for w in direct] == acronym_letters:
            return direct

        # Direct match failed: retry with small words filtered out
        content_words = [w for w in words_before if not cls._SMALL_WORDS_RE.match(w)]
        if len(content_words) >= count:
            filtered = content_words[-count:]
            if [w[0].lower() for w in filtered] == acronym_letters:
                return filtered

        return []

    @classmethod
    def to_sentence_case(cls, text: Optional[str], paren_max_len: int = 9) -> str:
        """
        Convert text to sentence case, preserving special cases from YAML.

        True sentence case: only capitalize first word of sentences and special cases.
        Parenthesized text of at most ``paren_max_len`` characters is uppercased,
        and when the words before it spell it out as an acronym, those words are
        capitalized wherever they occur in the text.

        Args:
            text: Input text to convert
            paren_max_len: Max length for parenthesized text to keep uppercase

        Returns:
            Text in sentence case with special cases preserved; an empty string
            for None/empty input; the original text if processing fails.
        """
        if not text:
            return ""

        try:
            # Start with lowercase
            lowered = text.lower()

            # Upper-cased key -> configured spelling. setdefault keeps the first
            # entry, matching a first-match scan of the list.
            canonical = {}
            for case in cls._load_special_cases():
                if isinstance(case, str):
                    canonical.setdefault(case.upper(), case)

            # Words to capitalize because they spell out a parenthesized acronym.
            # Kept local to this call so concurrent or nested calls cannot
            # interfere and nothing is left behind on the class after an error.
            expansion_words = set()

            def expand_acronyms(match):
                paren_content = match.group(1)

                # If content is too long, leave the parenthetical as it is
                if len(paren_content) > paren_max_len:
                    return match.group(0)

                acronym_letters = [c.lower() for c in paren_content if c.isalpha()]
                text_before = lowered[: match.start()].strip()
                if acronym_letters and text_before:
                    words_before = re.findall(r'\b\w+\b', text_before)
                    expansion_words.update(
                        cls._match_acronym_words(words_before, acronym_letters)
                    )

                return f"({paren_content.upper()})"

            expanded = cls._PAREN_RE.sub(expand_acronyms, lowered)

            def word_replacer(match):
                word = match.group(1)
                word_start = match.start()

                # Part of an acronym expansion
                if word in expansion_words:
                    return word.capitalize()

                # Special case from YAML
                special = canonical.get(word.upper())
                if special is not None:
                    return special

                # Start of the text, or right after . ! ? plus whitespace
                if word_start == 0 or cls._SENTENCE_END_RE.search(expanded[:word_start]):
                    return word.capitalize()

                return word

            return cls._WORD_RE.sub(word_replacer, expanded)

        except Exception as e:
            logger.error(f"Error processing text: '{text[:50]}...' - {e}", exc_info=True)
            return text  # Fallback to original text on error

    @classmethod
    def titlecase_callback(cls, word, **kwargs):
        """Custom titlecase callback using YAML configuration.

        Non-string input is returned unchanged. Dotted business suffixes such
        as "L.L.C." are rewritten to their undotted form before the special-case
        lookup.

        Returns:
            The result of ``_preserve_special_case`` for the (possibly
            normalized) word, which is None when no special case applies.
        """
        if not isinstance(word, str):
            return word

        # Normalizations for common business suffixes
        word = cls._NORMALIZED_SUFFIXES.get(word.upper(), word)

        return cls._preserve_special_case(word)

435 

436 

# Module-level cache slot kept for backward compatibility; the loader below
# goes through TextFormatter, which holds its own cache.
_special_cases_cache = None


def _load_special_cases():
    """Return the special cases list via TextFormatter._load_special_cases."""
    loader = TextFormatter._load_special_cases
    return loader()

444 

445 

446# Define a callback function for custom word handling 

def custom_titlecase_callback(word, **kwargs):
    """Function-style alias for TextFormatter.titlecase_callback.

    Forwards ``word`` and any keyword arguments unchanged and returns whatever
    the class method returns.
    """
    callback = TextFormatter.titlecase_callback
    return callback(word, **kwargs)

450 

451 

def contracts_titlecase(text):
    """Title-case text using TextFormatter.titlecase_callback for word overrides.

    ``None`` is passed through as ``None``; any other value is handed to
    ``titlecase`` with the callback attached.
    """
    if text is not None:
        return titlecase(text, callback=TextFormatter.titlecase_callback)
    return None

457 

458 

def get_business_category_display_names(business_category_list):
    """Map business category keys to their display names.

    Each key is looked up in BUSINESS_CATEGORIES; keys that are missing or
    whose display name is falsy are skipped. Input order is preserved.

    Returns:
        List of display names for the keys that resolved.
    """
    # Lazy lookup: BUSINESS_CATEGORIES is only consulted per element
    looked_up = (BUSINESS_CATEGORIES.get(key) for key in business_category_list)
    return [label for label in looked_up if label]