Coverage for src/usaspending/utils/formatter.py: 84%
201 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-03 17:15 -0700
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-03 17:15 -0700
1from typing import List, Any, Optional, Set
2from datetime import datetime
3import re
4import yaml
5from pathlib import Path
7from ..config import BUSINESS_CATEGORIES
8from titlecase import titlecase
9from ..logging_config import USASpendingLogger
11logger = USASpendingLogger.get_logger(__name__)
def to_date(date_string: str) -> Optional[datetime]:
    """Convert a date string to a datetime object.

    Supported formats, tried in this order:
        - YYYY-MM-DD (date only)
        - YYYY-MM-DDTHH:MM:SS (ISO datetime)
        - YYYY-MM-DDTHH:MM:SS.ffffff (ISO datetime with fractional seconds)
        - YYYY-MM-DDTHH:MM:SSZ (ISO datetime with UTC indicator)
        - YYYY-MM-DDTHH:MM:SS.ffffffZ (fractional seconds with UTC indicator)
        - YYYY-MM-DDTHH:MM:SS+/-HH:MM (ISO datetime with timezone offset)
        - YYYY-MM-DDTHH:MM:SS.ffffff+/-HH:MM (fractional seconds with offset)

    The "Z" formats match the letter literally, so they yield a naive
    datetime; the offset formats yield a timezone-aware datetime.

    Args:
        date_string: Date string in any supported format.

    Returns:
        The parsed datetime, or None if the input is empty, is not a string,
        or matches none of the supported formats.
    """
    if not date_string:
        return None

    # strptime raises TypeError (not ValueError) for non-string input; treat
    # that the same as an unparseable value instead of crashing the caller.
    if not isinstance(date_string, str):
        logger.warning(f"Could not parse date string: {date_string}")
        return None

    # Formats to try, in order of likelihood. The literal-"Z" variants come
    # before their "%z" counterparts so a trailing "Z" keeps producing a
    # naive datetime, as it always has for the seconds-only format.
    formats = (
        "%Y-%m-%d",  # Date only (original format)
        "%Y-%m-%dT%H:%M:%S",  # ISO datetime without timezone
        "%Y-%m-%dT%H:%M:%S.%f",  # ISO datetime with microseconds
        "%Y-%m-%dT%H:%M:%SZ",  # ISO datetime with UTC indicator
        "%Y-%m-%dT%H:%M:%S.%fZ",  # Microseconds with UTC indicator
        "%Y-%m-%dT%H:%M:%S%z",  # ISO datetime with timezone offset
        "%Y-%m-%dT%H:%M:%S.%f%z",  # Microseconds with timezone offset
    )

    for fmt in formats:
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            continue

    # If no format matched, log warning and return None
    logger.warning(f"Could not parse date string: {date_string}")
    return None
def round_to_millions(amount: Optional[float]) -> str:
    """Format a dollar amount, abbreviating large values as millions or billions.

    The unit is chosen from the magnitude (absolute value) of ``amount`` so
    that large negative amounts are abbreviated the same way as positive ones.

    Args:
        amount: Dollar amount, or None.

    Returns:
        - "$0.00" when ``amount`` is None.
        - "$X.X billion" (one decimal) when the magnitude is at least 1e9.
        - "$X.X million" (one decimal) when the magnitude is at least 1e6.
        - Otherwise the full amount with thousands separators and two
          decimal places, e.g. "$1,234.50".
    """
    if amount is None:
        return "$0.00"

    magnitude = abs(amount)
    if magnitude >= 1_000_000_000:
        return "${:,.1f} billion".format(amount / 1_000_000_000)
    if magnitude >= 1_000_000:
        return "${:,.1f} million".format(amount / 1_000_000)
    return "${:,.2f}".format(amount)
def current_fiscal_year() -> int:
    """Return the fiscal year that contains today's date.

    Months January through September map to the current calendar year;
    October through December map to the next calendar year.

    Returns:
        The fiscal year as an integer.
    """
    # Read the clock once so year and month always come from the same
    # instant (two separate now() calls could straddle a year boundary).
    today = datetime.now()
    if today.month < 10:
        return today.year
    return today.year + 1
def get_past_fiscal_years(num_years: int = 3) -> List[int]:
    """
    Return the most recent completed fiscal years, newest first.

    The fiscal year is treated as starting on October 1. The year in
    progress is never included.

    Args:
        num_years: How many completed fiscal years to return.

    Returns:
        List of fiscal years in descending order; empty when ``num_years``
        is zero or negative.
    """
    today = datetime.now()

    # Before October the fiscal year named after this calendar year is still
    # running, so the last completed one is the previous calendar year.
    latest_completed = today.year if today.month >= 10 else today.year - 1

    return list(range(latest_completed, latest_completed - num_years, -1))
def to_float(x: Any) -> Any:
    """Convert ``x`` to a float, returning ``x`` unchanged if it cannot be converted.

    Args:
        x: Any value.

    Returns:
        ``float(x)`` on success. On failure the original value is passed
        through as-is (so ``None`` stays ``None`` and an unparseable string
        stays that string).
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: integers too large for a float, e.g. 10**400.
        return x
def to_int(x: Any) -> Any:
    """Convert ``x`` to an int, returning ``x`` unchanged if it cannot be converted.

    Args:
        x: Any value.

    Returns:
        ``int(x)`` on success. On failure the original value is passed
        through as-is (so ``None`` stays ``None`` and an unparseable string
        stays that string).
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of an infinite float.
        return x
112# --- Configuration ---
113# Set of acronyms and initialisms to always keep uppercase.
114# This could be loaded from a config file or environment variables in a larger application.
115DEFAULT_KEEP_UPPERCASE: Set[str] = {
116 # Common Business / Legal
117 "LLC",
118 "INC",
119 "LLP",
120 "LTD",
121 "L.L.C.",
122 "I.N.C.",
123 "L.L.P.",
124 "L.T.D.",
125 # Geographical / Governmental
126 "USA",
127 "US",
128 "UK",
129 # Organizations / Agencies
130 "NASA",
131 "ESA",
132 "JAXA",
133 # NASA Facilities & Major Programs (add more as needed)
134 "JPL", # Jet Propulsion Laboratory
135 "JSC", # Johnson Space Center
136 "KSC", # Kennedy Space Center
137 "GSFC", # Goddard Space Flight Center
138 "MSFC", # Marshall Space Flight Center
139 "ARC", # Ames Research Center
140 "GRC", # Glenn Research Center
141 "LARC", # Langley Research Center (or LaRC - handled by case-insensitive check)
142 "AFRC", # Armstrong Flight Research Center
143 "SSC", # Stennis Space Center
144 "ISS", # International Space Station
145 "JWST", # James Webb Space Telescope
146 # Specific examples from user input
147 "CSOS",
148 "CL",
149 "FL",
150 "FPRW",
151 "PADF",
152 "ICAT",
153 "ICATEQ",
154 "AC" # For A.C. style
155 # Add other common contract/technical acronyms as needed
156 "RFQ",
157 "RFP",
158 "SOW",
159 "CDR",
160 "PDR",
161 "QA",
162 "PI",
163 "COTS",
164}
# Longest parenthesized text (in characters) that is still kept uppercase.
PAREN_UPPERCASE_MAX_LEN: int = 9  # Fewer than 10 characters

# --- Helper Function ---


def smart_sentence_case(
    text: Optional[str],
    paren_max_len: int = PAREN_UPPERCASE_MAX_LEN,
) -> str:
    """
    Convert text to sentence case via ``TextFormatter.to_sentence_case``.

    This function is a thin entry point; all of the casing rules live in
    ``TextFormatter.to_sentence_case``, which receives both arguments
    unchanged.

    Args:
        text: The input string, expected to be mostly uppercase.
            Can be None or empty.
        paren_max_len: The maximum character length of text inside
            parentheses to be kept uppercase. Defaults to
            PAREN_UPPERCASE_MAX_LEN.

    Returns:
        Whatever ``TextFormatter.to_sentence_case`` returns for the given
        arguments (an empty string when ``text`` is None or empty).
    """
    return TextFormatter.to_sentence_case(text, paren_max_len=paren_max_len)
class TextFormatter:
    """Unified text formatting utility class for sentence and title case conversions.

    Special-case words (acronyms, brand names, ...) are read once from
    ``special_cases.yaml`` next to this module and cached on the class.
    Only string entries of that file are used; anything else is ignored.
    """

    # Lazily filled by _load_special_cases(); None means "not loaded yet".
    _special_cases_cache = None

    # Small words that may be skipped when lining an acronym up with the
    # words that precede it, e.g. "and" in "... Aeronautics and Space ...".
    _SMALL_WORDS_RE = re.compile(
        r'\b(a|an|and|as|at|but|by|en|for|if|in|of|on|or|the|to|v\.?|via|vs\.?)\b',
        re.IGNORECASE,
    )

    # Dotted business suffixes and the compact form they are normalized to.
    _SUFFIX_NORMALIZATIONS = {
        "L.L.C.": "LLC",
        "I.N.C.": "Inc",
        "L.L.P.": "LLP",
        "L.T.D.": "LTD",
        "P.L.L.C.": "PLLC",
        "P.A.": "PA",
        "P.C.": "PC",
    }

    @classmethod
    def _load_special_cases(cls):
        """Load and cache special cases from YAML file.

        Returns:
            The parsed YAML content (an empty list when the file is missing,
            empty, or cannot be read). The result is cached on the class, so
            the file is read at most once.
        """
        if cls._special_cases_cache is None:
            yaml_path = Path(__file__).parent / "special_cases.yaml"
            try:
                # Explicit encoding: the platform default is not always UTF-8.
                with open(yaml_path, "r", encoding="utf-8") as f:
                    cls._special_cases_cache = yaml.safe_load(f) or []
            except FileNotFoundError:
                logger.warning(f"Special cases file not found: {yaml_path}")
                cls._special_cases_cache = []
            except Exception as e:
                # Best effort: a broken config file must not break formatting.
                logger.error(f"Error loading special cases: {e}")
                cls._special_cases_cache = []
        return cls._special_cases_cache

    @classmethod
    def _get_special_cases_set(cls):
        """Get special cases as a set of uppercased strings for fast lookups."""
        special_cases = cls._load_special_cases()
        return {case.upper() for case in special_cases if isinstance(case, str)}

    @classmethod
    def _get_special_cases_lookup(cls):
        """Map each uppercased special case to its configured spelling.

        When two entries differ only by case, the first one listed wins.
        """
        lookup = {}
        for case in cls._load_special_cases():
            if isinstance(case, str):
                lookup.setdefault(case.upper(), case)
        return lookup

    @classmethod
    def _split_word_punctuation(cls, word):
        """Split a word into clean word and trailing punctuation.

        Args:
            word: A single token, possibly with punctuation attached.

        Returns:
            ``(clean_word, trailing_punct)``. A word containing an apostrophe
            is split at the first apostrophe ("NASA's" -> ("NASA", "'s"));
            otherwise trailing non-alphanumeric characters are split off.
            A token with no alphanumeric characters is returned whole with
            empty trailing punctuation.
        """
        if not word:
            return word, ""

        # Handle contractions/possessives separately: split at the apostrophe.
        if "'" in word:
            head, apostrophe, tail = word.partition("'")
            return head, apostrophe + tail

        # Walk back from the end to where the alphanumeric part stops.
        end = len(word)
        while end > 0 and not word[end - 1].isalnum():
            end -= 1

        if end == 0:
            # All punctuation, no alphanumeric chars
            return word, ""
        return word[:end], word[end:]

    @classmethod
    def _preserve_special_case(cls, word):
        """Return the configured spelling of ``word`` if it is a special case.

        Args:
            word: Token to look up.

        Returns:
            - ``word`` unchanged when it is wrapped in parentheses.
            - The matching special case (with the word's trailing punctuation
              re-attached where it was split off) on a case-insensitive match.
            - None when ``word`` is not a string or nothing matches.
        """
        if not isinstance(word, str):
            return None

        # If the word is enclosed in parentheses, preserve the case inside
        if word.startswith("(") and word.endswith(")"):
            return word

        clean_word, trailing_punct = cls._split_word_punctuation(word)
        word_key = word.lower()
        clean_key = clean_word.lower()

        for special_word in cls._load_special_cases():
            if not isinstance(special_word, str):
                continue

            special_key = special_word.lower()

            # First try exact match with full word (including punctuation)
            if word_key == special_key:
                return special_word

            # Then try match with clean word
            if clean_key == special_key:
                return special_word + trailing_punct

            # Finally allow "inc" to match a configured "Inc."
            if special_word.endswith(".") and clean_key == special_word[:-1].lower():
                return special_word + trailing_punct

        return None

    @classmethod
    def to_sentence_case(cls, text: Optional[str], paren_max_len: int = 9) -> str:
        """
        Convert text to sentence case, preserving special cases from YAML.

        Processing steps:
            1. Lowercase the whole text.
            2. For each parenthesized group of at most ``paren_max_len``
               characters, uppercase its content. If its letters are the
               initials of the words right before it (optionally skipping
               small words such as "and" or "of"), those words are marked
               for capitalization.
            3. Walk every word: capitalize marked words, substitute special
               cases with their configured spelling, and capitalize the first
               word of the text and of each sentence (after ``.``, ``!`` or
               ``?`` followed by whitespace).

        Marked words are matched by their text, so every occurrence of such a
        word in the input is capitalized, not only the one next to the
        acronym.

        Args:
            text: Input text to convert
            paren_max_len: Max length for parenthesized text to keep uppercase

        Returns:
            Text in sentence case with special cases preserved; an empty
            string for None/empty input; the original text if processing
            fails unexpectedly.
        """
        if not text:
            return ""

        try:
            lowered = text.lower()
            special_lookup = cls._get_special_cases_lookup()

            # Words to capitalize because they spell out an acronym. Kept in a
            # local (not on the class) so concurrent calls cannot interfere
            # and nothing is left behind if an error occurs.
            expansion_words = set()

            def initials(words):
                return [w[0].lower() for w in words]

            def expand_acronyms(match):
                paren_content = match.group(1)

                # Long parentheticals are ordinary text: leave them lowercase.
                if len(paren_content) > paren_max_len:
                    return match.group(0)

                text_before = lowered[:match.start()].strip()
                acronym_letters = [c.lower() for c in paren_content if c.isalpha()]

                if text_before and acronym_letters:
                    words_before = re.findall(r'\b\w+\b', text_before)
                    count = len(acronym_letters)

                    if len(words_before) >= count:
                        direct = words_before[-count:]
                        if initials(direct) == acronym_letters:
                            expansion_words.update(direct)
                        else:
                            # Direct match failed: retry ignoring small words.
                            content_words = [
                                w for w in words_before
                                if not cls._SMALL_WORDS_RE.match(w)
                            ]
                            if len(content_words) >= count:
                                skipped = content_words[-count:]
                                if initials(skipped) == acronym_letters:
                                    expansion_words.update(skipped)

                return f"({paren_content.upper()})"

            expanded = re.sub(r'\(([^)]+)\)', expand_acronyms, lowered)

            def word_replacer(match):
                word = match.group(1)
                word_start = match.start()

                # Part of a spelled-out acronym
                if word in expansion_words:
                    return word.capitalize()

                # Special case from YAML
                preserved = special_lookup.get(word.upper())
                if preserved is not None:
                    return preserved

                # First word of the text
                if word_start == 0:
                    return word.capitalize()

                # First word of a sentence (punctuation + one or more spaces)
                if re.search(r'[.!?]\s+$', expanded[:word_start]):
                    return word.capitalize()

                return word

            return re.sub(r'\b([a-zA-Z]+(?:-[a-zA-Z]+)*)\b', word_replacer, expanded)

        except Exception as e:
            logger.error(f"Error processing text: '{text[:50]}...' - {e}", exc_info=True)
            return text  # Fallback to original text on error

    @classmethod
    def titlecase_callback(cls, word, **kwargs):
        """Custom titlecase callback using YAML configuration.

        Args:
            word: The token handed over by ``titlecase``.
            **kwargs: Accepted for compatibility with the callback signature;
                not used.

        Returns:
            The spelling to use for ``word``, or None when this callback has
            no opinion. Non-string input is returned unchanged. Dotted
            business suffixes ("L.L.C.") are normalized to their compact
            form, which is kept even when it is not listed as a special case.
        """
        if not isinstance(word, str):
            return word

        normalized = cls._SUFFIX_NORMALIZATIONS.get(word.upper())
        if normalized is not None:
            # Prefer the configured spelling of the normalized form, but do
            # not lose the normalization when the YAML has no entry for it.
            return cls._preserve_special_case(normalized) or normalized

        return cls._preserve_special_case(word)
# Module-level cache name retained for backward compatibility. The loader
# below does not write to it; caching happens on the TextFormatter class.
_special_cases_cache = None


def _load_special_cases():
    """Return the special cases loaded (and cached) by ``TextFormatter``.

    Thin module-level wrapper around ``TextFormatter._load_special_cases``.
    """
    special_cases = TextFormatter._load_special_cases()
    return special_cases
# Module-level callback for custom word handling during title casing.
def custom_titlecase_callback(word, **kwargs):
    """Forward a titlecase callback invocation to ``TextFormatter``.

    Args:
        word: The token being title-cased.
        **kwargs: Extra keyword arguments, passed through unchanged.

    Returns:
        The result of ``TextFormatter.titlecase_callback(word, **kwargs)``.
    """
    replacement = TextFormatter.titlecase_callback(word, **kwargs)
    return replacement
def contracts_titlecase(text):
    """Title-case ``text`` with ``TextFormatter.titlecase_callback`` as the word callback.

    Args:
        text: Text to convert, or None.

    Returns:
        The title-cased text, or None when ``text`` is None.
    """
    if text is not None:
        return titlecase(text, callback=TextFormatter.titlecase_callback)
    return None
def get_business_category_display_names(business_category_list):
    """Translate business category codes into their display names.

    Each code is looked up in ``BUSINESS_CATEGORIES``; codes with no entry
    (or with an empty display name) are dropped. Input order is preserved.

    Args:
        business_category_list: Iterable of category codes. None or an empty
            collection yields an empty list.

    Returns:
        List of display names for the recognized codes.
    """
    # Treat a missing list the same as an empty one instead of raising.
    if not business_category_list:
        return []

    display_names = (
        BUSINESS_CATEGORIES.get(category) for category in business_category_list
    )
    return [name for name in display_names if name]