readabs.recalibrate
Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.
"""Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000."""

import sys
from collections.abc import Callable
from operator import mul, truediv
from typing import Any

import numpy as np
from pandas import DataFrame, Series

from readabs.datatype import Datatype as DataT

# Constants
NDIM_SERIES = 1  # len(data.shape) for a Series
NDIM_DATAFRAME = 2  # len(data.shape) for a DataFrame
MAX_VALUE_THRESHOLD = 1000  # largest absolute value must end up below this
MIN_VALUE_THRESHOLD = 1  # ... and at or above this
STEP_SIZE = 3  # each rescaling step moves three positions in _keywords
DIVISOR = 1000  # data is divided/multiplied by this on each step


# --- public
def recalibrate(
    data: DataT,
    units: str,
) -> tuple[DataT, str]:
    """Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.

    The data is repeatedly divided or multiplied by 1000 until its largest
    absolute value is at least 1 and below 1000 (where the units allow it).
    The name of the units is changed to reflect the recalibration.

    Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar.
    If you provide a Series, you will get a Series back. If you provide a DataFrame,
    you will get a DataFrame back.

    Parameters
    ----------
    data : Series or DataFrame
        The data to recalibrate.
    units : str
        The units of the data. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[Series or DataFrame, str]
        The recalibrated data (a Series if a Series was provided, a DataFrame
        if a DataFrame was provided) and the recalibrated units string.
        If the data cannot be recalibrated, a message is printed and the
        values are returned unscaled.

    Raises
    ------
    TypeError
        If data is neither a Series nor a DataFrame.

    Examples
    --------
    ```python
    from pandas import Series
    from readabs import recalibrate
    s = Series([1_000, 10_000, 100_000, 1_000_000])
    recalibrated, units = recalibrate(s, "$")
    print(f"{recalibrated=}, {units=}")  # units == "$ Million"
    ```

    """
    if not isinstance(data, (Series, DataFrame)):
        raise TypeError("data must be a Series or DataFrame")
    units, restore_name = _prepare_units(units)
    flat_data = data.to_numpy().flatten()
    flat_data, units = _recalibrate(flat_data, units)

    if restore_name:
        units = f"{restore_name} {units}"
        # Drop the "number(s)" placeholder next to a real unit name. The match
        # is done on a lower-cased copy because _do_recal() inserts the
        # title-case keyword "Number", which a lower-case-only search misses
        # (previously giving "$ Numbers" rather than "$"). Lower-casing is
        # harmless here as the result is title-cased immediately below.
        lowered = units.lower()
        for n in ("numbers", "number"):
            if n in lowered:
                units = lowered.replace(n, "").strip()
                break
    units = units.title()

    # Rebuild the same container type, then restore its labels.
    result = data.__class__(flat_data.reshape(data.shape))
    result.index = data.index
    if len(data.shape) == NDIM_DATAFRAME:
        result.columns = data.columns
    if len(data.shape) == NDIM_SERIES:
        result.name = data.name  # pyright: ignore[reportAttributeAccessIssue]
    return result, units


def recalibrate_value(value: float, units: str) -> tuple[float, str]:
    """Recalibrate a floating point value.

    The value will be recalibrated so it is in the range -1000 to 1000.
    The units will be changed to reflect the recalibration.
    The work is delegated to recalibrate() using a one-element Series.

    Parameters
    ----------
    value : float
        The value to recalibrate.
    units : str
        The units of the value. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[float, str]
        A tuple containing the recalibrated value and the recalibrated units.

    Examples
    --------
    ```python
    from readabs import recalibrate_value
    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
    print(recalibrated, units)  # 10.0 Billion
    ```

    """
    series = Series([value])
    output, units = recalibrate(series, units)
    return output.to_numpy()[0], units


# --- private
_MIN_RECALIBRATE = "number"  # all lower case
_MAX_RECALIBRATE = "decillion"  # all lower case
# Unit keyword (title case) -> position on the scale; consecutive keywords are
# STEP_SIZE apart, matching one multiplication/division by DIVISOR.
_keywords = {
    _MIN_RECALIBRATE.title(): 0,
    "Thousand": 3,
    "Million": 6,
    "Billion": 9,
    "Trillion": 12,
    "Quadrillion": 15,
    "Quintillion": 18,
    "Sextillion": 21,
    "Septillion": 24,
    "Octillion": 27,
    "Nonillion": 30,
    _MAX_RECALIBRATE.title(): 33,
}
# Reverse lookup: scale position -> unit keyword.
_r_keywords = {v: k for k, v in _keywords.items()}


def _prepare_units(units: str) -> tuple[str, str]:
    """Prepare the units for recalibration.

    Numeric shorthand such as "'000" or "$'000,000" is replaced by the
    corresponding keyword text, then a known unit name ("$" or "Tonnes") is
    split off so it can be restored after recalibration.

    Returns a tuple of (units, found_unit). found_unit is "" when no known
    unit name is present. When one is found, the returned units are
    lower-cased with the unit name removed, or "number" if nothing remains.
    """
    # Order matters: each pattern is tried in turn on the running string, so
    # the more specific patterns must come before the bare "000".
    substitutions = [
        ("000 Hours", "Thousand Hours"),
        ("$'000,000", "$ Million"),
        ("$'000", " $ Thousand"),
        ("'000,000", "Millions"),
        ("'000", "Thousands"),
        ("000,000", "Millions"),
        ("000", "Thousands"),
    ]
    units = units.strip()
    for pattern, replacement in substitutions:
        units = units.replace(pattern, replacement)

    # manage the names for some gnarly units
    possible_units = ("$", "Tonnes")  # there may be more possible units
    found_unit = ""
    for pu in possible_units:
        if pu.lower() in units.lower():
            units = units.lower().replace(pu.lower(), "").strip()
            if units == "":
                units = "number"
            found_unit = pu
            break

    return units, found_unit


def _find_calibration(units: str) -> str | None:
    """Return the first keyword from _keywords that occurs in units.

    Each keyword is looked for in both its title-case and lower-case form.
    Returns None when no keyword is present.
    """
    found = None
    for keyword in _keywords:
        if keyword in units or keyword.lower() in units:
            found = keyword
            break
    return found


# private
def _perfect_already(data: np.ndarray) -> bool:
    """No need to recalibrate if the data is already perfect.

    "Perfect" means the largest absolute value (ignoring NaN) is at least
    MIN_VALUE_THRESHOLD and below MAX_VALUE_THRESHOLD.
    """
    check_max = np.nanmax(np.abs(data))
    return bool(MIN_VALUE_THRESHOLD <= check_max < MAX_VALUE_THRESHOLD)


def _all_zero(data: np.ndarray) -> bool:
    """Cannot recalibrate if all the data is zero (prints a message if so)."""
    if np.nanmax(np.abs(data)) == 0:
        print("recalibrate(): All zero data")
        return True
    return False


def _not_numbers(data: np.ndarray) -> bool:
    """Cannot recalibrate if the data is not numeric.

    True (with a printed message) when the dtype is not numeric, any value
    is infinite, or every value is NaN.
    """
    if (not np.issubdtype(data.dtype, np.number)) or np.isinf(data).any() or np.isnan(data).all():
        print("recalibrate(): Data is partly or completely non-numeric.")
        return True
    return False


def _can_recalibrate(flat_data: np.ndarray, units: str) -> bool:
    """Check if the data can be recalibrated."""
    if _find_calibration(units) is None:
        print(f"recalibrate(): Units not appropriately calibrated: {units}")
        return False

    # all() stops at the first failing check, so _not_numbers must stay first:
    # the later checks call np.nanmax(np.abs(...)) on the data.
    return all(not f(flat_data) for f in (_not_numbers, _all_zero, _perfect_already))


def _recalibrate(flat_data: np.ndarray, units: str) -> tuple[np.ndarray, str]:
    """Recalibrate the data.

    Loop over the data until its maximum value is between -1000 and 1000.
    The loop also stops (with a printed message) once the units reach the
    largest or smallest supported keyword.
    """
    if _can_recalibrate(flat_data, units):
        while True:
            maximum = np.nanmax(np.abs(flat_data))
            if maximum >= MAX_VALUE_THRESHOLD:
                if _MAX_RECALIBRATE in units.lower():
                    print("recalibrate() is not designed for very big units")
                    break
                flat_data, units = _do_recal(flat_data, units, STEP_SIZE, truediv)
                continue
            if maximum < 1:
                if _MIN_RECALIBRATE in units.lower():
                    print("recalibrate() is not designed for very small units")
                    break
                flat_data, units = _do_recal(flat_data, units, -STEP_SIZE, mul)
                continue
            break
    return flat_data, units


def _do_recal(
    flat_data: np.ndarray, units: str, step: int, operator: Callable[[np.ndarray, int], np.ndarray]
) -> tuple[np.ndarray, str]:
    """Apply one rescaling step to the data and rename the units to match.

    step is +STEP_SIZE or -STEP_SIZE as passed by _recalibrate(); operator is
    applied as operator(flat_data, DIVISOR). Raises ValueError if units holds
    no known keyword.
    """
    calibration = _find_calibration(units)
    if calibration is None:
        raise ValueError(f"No calibration found for units: {units}")
    factor = _keywords[calibration]
    if factor + step not in _r_keywords:
        print(f"Unexpected factor: {factor + step}")
        sys.exit(-1)
    replacement = _r_keywords[factor + step]
    # Replace the keyword in whichever case it appears; the replacement is
    # always the title-case keyword.
    units = units.replace(calibration, replacement)
    units = units.replace(calibration.lower(), replacement)
    flat_data = operator(flat_data, DIVISOR)
    return flat_data, units


# --- test
if __name__ == "__main__":

    def test_example() -> None:
        """Test the example in the docstring."""
        s = Series([1_000, 10_000, 100_000, 1_000_000])
        recalibrated, units = recalibrate(s, "$")
        print(f"{recalibrated=}, {units=}")

        recalibrated_val, units_val = recalibrate_value(10_000_000, "Thousand")
        print(f"{recalibrated_val=}, {units_val=}")
        print("=" * 40)

    test_example()

    def test_recalibrate() -> None:
        """Test the recalibrate() function."""

        def run_test(dataset: tuple[tuple[list[Any], str], ...]) -> None:
            for d, u in dataset:
                data: Series[Any] = Series(d)
                recalibrated, units = recalibrate(data, u)
                print(f"{data.to_numpy()}, {u} ==> {recalibrated.to_numpy()}, {units}")
                print("=" * 40)

        # good examples
        good = (
            ([1, 2, 3, 4, 5], "Number"),  # no change
            ([1_000, 10_000, 100_000, 1_000_000], "$"),
            ([1_000, 10_000, 100_000, 1_000_000], "Number Spiders"),
            ([1_000, 10_000, 100_000, 1_000_000], "Thousand"),
            ([0.2, 0.3], "Thousands"),
            ([0.000_000_2, 0.000_000_3], "Trillion"),
        )
        run_test(good)

        # bad sets of data - should produce error messages and do nothing
        bad = (
            ([1, 2, 3, 4, 5], "Hundreds"),
            ([0, 0, 0], "Thousands"),
            ([np.nan, 0, 0], "Thousands"),
            ([np.inf, 1, 2], "Thousands"),
            ([0, 0, "a"], "Thousands"),
        )
        run_test(bad)

    test_recalibrate()

    def test_recalibrate_value() -> None:
        """Test the recalibrate_value() function."""
        # good example
        recalibrated, units = recalibrate_value(10_000_000, "Thousand")
        print(recalibrated, units)
        print("=" * 40)

        # bad example
        recalibrated, units = recalibrate_value(3_900, "Spiders")
        print(recalibrated, units)
        print("=" * 40)

    test_recalibrate_value()
def recalibrate(
    data: DataT,
    units: str,
) -> tuple[DataT, str]:
    """Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.

    Change the name of the units to reflect the recalibration.

    Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar.
    If you provide a Series, you will get a Series back. If you provide a DataFrame,
    you will get a DataFrame back.

    Parameters
    ----------
    data : Series or DataFrame
        The data to recalibrate.
    units : str
        The units of the data. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[Series or DataFrame, str]
        The recalibrated data (a Series if a Series was provided, a DataFrame
        if a DataFrame was provided) and the recalibrated units string.

    Raises
    ------
    TypeError
        If data is neither a Series nor a DataFrame.

    Examples
    --------
    ```python
    from pandas import Series
    from readabs import recalibrate
    s = Series([1_000, 10_000, 100_000, 1_000_000])
    recalibrated, units = recalibrate(s, "$")
    print(f"{recalibrated=}, {units=}")
    ```

    """
    if not isinstance(data, (Series, DataFrame)):
        raise TypeError("data must be a Series or DataFrame")
    units, restore_name = _prepare_units(units)
    flat_data = data.to_numpy().flatten()
    flat_data, units = _recalibrate(flat_data, units)

    if restore_name:
        units = f"{restore_name} {units}"
        # Drop the "number(s)" placeholder next to a real unit name. Match on
        # a lower-cased copy: the units may hold the title-case word "Number"
        # by this point, which a lower-case-only search would miss and leave
        # behind (e.g. "$ Numbers"). Lower-casing is harmless because the
        # result is title-cased immediately below.
        lowered = units.lower()
        for n in ("numbers", "number"):
            if n in lowered:
                units = lowered.replace(n, "").strip()
                break
    units = units.title()

    # Rebuild the same container type, then restore its labels.
    result = data.__class__(flat_data.reshape(data.shape))
    result.index = data.index
    if len(data.shape) == NDIM_DATAFRAME:
        result.columns = data.columns
    if len(data.shape) == NDIM_SERIES:
        result.name = data.name  # pyright: ignore[reportAttributeAccessIssue]
    return result, units
Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.
Change the name of the units to reflect the recalibration.
Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar. If you provide a Series, you will get a Series back. If you provide a DataFrame, you will get a DataFrame back.
Parameters
data : Series or DataFrame
    The data to recalibrate.
units : str
    The units of the data. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.
Returns
tuple[Series or DataFrame, str]
    A tuple containing the recalibrated data and the recalibrated units. The data will be a Series if a Series was provided, or a DataFrame if a DataFrame was provided.
Examples
from pandas import Series
from readabs import recalibrate
s = Series([1_000, 10_000, 100_000, 1_000_000])
recalibrated, units = recalibrate(s, "$")
print(f"{recalibrated=}, {units=}")
def recalibrate_value(value: float, units: str) -> tuple[float, str]:
    """Recalibrate a single floating point value.

    The value is wrapped in a one-element Series and passed to recalibrate(),
    so it is rescaled into the range -1000 to 1000 and the units are renamed
    to reflect the rescaling.

    Parameters
    ----------
    value : float
        The value to recalibrate.
    units : str
        The units of the value. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[float, str]
        A tuple containing the recalibrated value and the recalibrated units.

    Examples
    --------
    ```python
    from readabs import recalibrate_value
    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
    print(recalibrated, units)
    ```

    """
    rescaled, new_units = recalibrate(Series([value]), units)
    # Unwrap the single element from the one-element result.
    only_value = rescaled.to_numpy()[0]
    return only_value, new_units
Recalibrate a floating point value.
The value will be recalibrated so it is in the range -1000 to 1000. The units will be changed to reflect the recalibration.
Parameters
value : float
    The value to recalibrate.
units : str
    The units of the value. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.
Returns
tuple[float, str] A tuple containing the recalibrated value and the recalibrated units.
Examples
from readabs import recalibrate_value
recalibrated, units = recalibrate_value(10_000_000, "Thousand")
print(recalibrated, units)