readabs.recalibrate

Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.

  1"""Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000."""
  2
  3import sys
  4from collections.abc import Callable
  5from operator import mul, truediv
  6from typing import Any
  7
  8import numpy as np
  9from pandas import DataFrame, Series
 10
 11from readabs.datatype import Datatype as DataT
 12
 13# Constants
 14NDIM_SERIES = 1  # len(data.shape) for a Series; recalibrate() uses it to decide whether to copy .name
 15NDIM_DATAFRAME = 2  # len(data.shape) for a DataFrame; recalibrate() uses it to decide whether to copy .columns
 16MAX_VALUE_THRESHOLD = 1000  # exclusive upper bound on the largest absolute value after recalibration
 17MIN_VALUE_THRESHOLD = 1  # inclusive lower bound on the largest absolute value after recalibration
 18STEP_SIZE = 3  # change in the power-of-ten exponent for one recalibration step
 19DIVISOR = 1000  # amount the data is divided or multiplied by in one step (10 ** STEP_SIZE)
 20
 21
 22# --- public
 23def recalibrate(
 24    data: DataT,
 25    units: str,
 26) -> tuple[DataT, str]:
 27    """Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.
 28
 29    Change the name of the units to reflect the recalibration.
 30
 31    Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar.
 32    If you provide a Series, you will get a Series back. If you provide a DataFrame,
 33    you will get a DataFrame back.
 34
 35    Parameters
 36    ----------
 37    data : Series or DataFrame
 38        The data to recalibrate.
 39    units : str
 40        The units of the data. This string should be in the form of
 41        "Number", "Thousands", "Millions", "Billions", etc. The units
 42        should be in title case.
 43
 44    Returns
 45    -------
 46    Series or DataFrame
 47        The recalibrated data will be a Series if a Series was provided,
 48        or a DataFrame if a DataFrame was provided.
 49
 50    Examples
 51    --------
 52    ```python
 53    from pandas import Series
 54    from readabs import recalibrate
 55    s = Series([1_000, 10_000, 100_000, 1_000_000])
 56    recalibrated, units = recalibrate(s, "$")
 57    print(f"{recalibrated=}, {units=}")
 58    ```
 59
 60    """
 61    if not isinstance(data, (Series, DataFrame)):
 62        raise TypeError("data must be a Series or DataFrame")
 63    units, restore_name = _prepare_units(units)
 64    flat_data = data.to_numpy().flatten()
 65    flat_data, units = _recalibrate(flat_data, units)
 66
 67    if restore_name:
 68        units = f"{restore_name} {units}"
 69        for n in "numbers", "number":
 70            if n in units:
 71                units = units.replace(n, "").strip()
 72                break
 73    units = units.title()
 74
 75    result = data.__class__(flat_data.reshape(data.shape))
 76    result.index = data.index
 77    if len(data.shape) == NDIM_DATAFRAME:
 78        result.columns = data.columns
 79    if len(data.shape) == NDIM_SERIES:
 80        result.name = data.name  # pyright: ignore[reportAttributeAccessIssue]
 81    return result, units
 82
 83
 84def recalibrate_value(value: float, units: str) -> tuple[float, str]:
 85    """Recalibrate a floating point value.
 86
 87    The value will be recalibrated so it is in the range -1000 to 1000.
 88    The units will be changed to reflect the recalibration.
 89
 90    Parameters
 91    ----------
 92    value : float
 93        The value to recalibrate.
 94    units : str
 95        The units of the value. This string should be in the form of
 96        "Number", "Thousands", "Millions", "Billions", etc. The units
 97        should be in title case.
 98
 99    Returns
100    -------
101    tuple[float, str]
102        A tuple containing the recalibrated value and the recalibrated units.
103
104    Examples
105    --------
106    ```python
107    from readabs import recalibrate_value
108    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
109    print(recalibrated, units)
110    ```
111
112    """
113    series = Series([value])
114    output, units = recalibrate(series, units)
115    return output.to_numpy()[0], units
116
117
118# --- private
119_MIN_RECALIBRATE = "number"  # all lower case
120_MAX_RECALIBRATE = "decillion"  # all lower case
121_keywords = {
122    _MIN_RECALIBRATE.title(): 0,
123    "Thousand": 3,
124    "Million": 6,
125    "Billion": 9,
126    "Trillion": 12,
127    "Quadrillion": 15,
128    "Quintillion": 18,
129    "Sextillion": 21,
130    "Septillion": 24,
131    "Octillion": 27,
132    "Nonillion": 30,
133    _MAX_RECALIBRATE.title(): 33,
134}
135_r_keywords = {v: k for k, v in _keywords.items()}
136
137
138def _prepare_units(units: str) -> tuple[str, str]:
139    """Prepare the units for recalibration."""
140    substitutions = [
141        ("000 Hours", "Thousand Hours"),
142        ("$'000,000", "$ Million"),
143        ("$'000", " $ Thousand"),
144        ("'000,000", "Millions"),
145        ("'000", "Thousands"),
146        ("000,000", "Millions"),
147        ("000", "Thousands"),
148    ]
149    units = units.strip()
150    for pattern, replacement in substitutions:
151        units = units.replace(pattern, replacement)
152
153    # manage the names for some gnarly units
154    possible_units = ("$", "Tonnes")  # there may be more possible units
155    found_unit = ""
156    for pu in possible_units:
157        if pu.lower() in units.lower():
158            units = units.lower().replace(pu.lower(), "").strip()
159            if units == "":
160                units = "number"
161            found_unit = pu
162            break
163
164    return units, found_unit
165
166
def _find_calibration(units: str) -> str | None:
    """Return the first scale keyword that appears in units, or None.

    Keywords are tried in the order they are stored in _keywords, and each is
    matched both as stored and in lower case.
    """
    return next(
        (word for word in _keywords if word in units or word.lower() in units),
        None,
    )
174
175
176# private
def _perfect_already(data: np.ndarray) -> bool:
    """Report whether the largest absolute value is already within range.

    True means the NaN-ignoring peak magnitude lies between
    MIN_VALUE_THRESHOLD (inclusive) and MAX_VALUE_THRESHOLD (exclusive),
    so no recalibration is needed.
    """
    peak = np.nanmax(np.abs(data))
    in_range = (peak >= MIN_VALUE_THRESHOLD) and (peak < MAX_VALUE_THRESHOLD)
    return bool(in_range)
181
182
183def _all_zero(data: np.ndarray) -> bool:
184    """Cannot recalibrate if all the data is zero."""
185    if np.nanmax(np.abs(data)) == 0:
186        print("recalibrate(): All zero data")
187        return True
188    return False
189
190
191def _not_numbers(data: np.ndarray) -> bool:
192    """Cannot recalibrate if the data is not numeric."""
193    if (not np.issubdtype(data.dtype, np.number)) or np.isinf(data).any() or np.isnan(data).all():
194        print("recalibrate(): Data is partly or completely non-numeric.")
195        return True
196    return False
197
198
def _can_recalibrate(flat_data: np.ndarray, units: str) -> bool:
    """Decide whether recalibration should be attempted.

    Returns False if the units contain no recognised scale keyword (a message
    is printed), or if the data is non-numeric, all zero, or already in range.
    """
    if _find_calibration(units) is None:
        print(f"recalibrate(): Units not appropriately calibrated: {units}")
        return False

    # The checks run in this order and stop at the first one that objects.
    for blocker in (_not_numbers, _all_zero, _perfect_already):
        if blocker(flat_data):
            return False
    return True
206
207
def _recalibrate(flat_data: np.ndarray, units: str) -> tuple[np.ndarray, str]:
    """Rescale the data step by step and rename the units to match.

    Each pass looks at the largest absolute value (ignoring NaN). If it is at
    or above MAX_VALUE_THRESHOLD the data is stepped down; if it is below 1
    the data is stepped up. The loop ends when the value is in range or the
    units already name the largest or smallest supported scale.
    Data that fails _can_recalibrate() is returned untouched.
    """
    if not _can_recalibrate(flat_data, units):
        return flat_data, units

    while True:
        peak = np.nanmax(np.abs(flat_data))
        lowered = units.lower()
        if peak >= MAX_VALUE_THRESHOLD:
            if _MAX_RECALIBRATE in lowered:
                print("recalibrate() is not designed for very big units")
                break
            flat_data, units = _do_recal(flat_data, units, STEP_SIZE, truediv)
        elif peak < 1:
            if _MIN_RECALIBRATE in lowered:
                print("recalibrate() is not designed for very small units")
                break
            flat_data, units = _do_recal(flat_data, units, -STEP_SIZE, mul)
        else:
            break
    return flat_data, units
230
231
def _do_recal(
    flat_data: np.ndarray, units: str, step: int, operator: Callable[[np.ndarray, int], np.ndarray]
) -> tuple[np.ndarray, str]:
    """Apply one recalibration step to the data and its units label.

    Parameters
    ----------
    flat_data : np.ndarray
        The values to rescale.
    units : str
        Units text containing one of the keywords in _keywords.
    step : int
        Change to apply to the keyword's exponent (positive moves to a
        larger scale word, negative to a smaller one).
    operator : Callable
        Binary function applied as operator(flat_data, DIVISOR) to rescale
        the values.

    Returns
    -------
    tuple[np.ndarray, str]
        The rescaled values and the units text with the keyword swapped.

    Raises
    ------
    ValueError
        If units contain no known keyword, or the stepped exponent has no
        matching keyword.
    """
    calibration = _find_calibration(units)
    if calibration is None:
        raise ValueError(f"No calibration found for units: {units}")

    target = _keywords[calibration] + step
    replacement = _r_keywords.get(target)
    if replacement is None:
        # Raise rather than exit: library code must not terminate the
        # interpreter of whoever imported it.
        raise ValueError(f"Unexpected factor: {target}")

    # The keyword may be present in title case or in lower case.
    units = units.replace(calibration, replacement)
    units = units.replace(calibration.lower(), replacement)
    return operator(flat_data, DIVISOR), units
247
248
249# --- test
if __name__ == "__main__":
    # Ad-hoc smoke tests: each helper prints its results for visual inspection.
    SEPARATOR = "=" * 40

    def test_example() -> None:
        """Run the examples quoted in the public docstrings."""
        recalibrated, units = recalibrate(Series([1_000, 10_000, 100_000, 1_000_000]), "$")
        print(f"{recalibrated=}, {units=}")

        recalibrated_val, units_val = recalibrate_value(10_000_000, "Thousand")
        print(f"{recalibrated_val=}, {units_val=}")
        print(SEPARATOR)

    def test_recalibrate() -> None:
        """Exercise recalibrate() on well-formed and malformed inputs."""
        # good examples
        good = (
            ([1, 2, 3, 4, 5], "Number"),  # no change
            ([1_000, 10_000, 100_000, 1_000_000], "$"),
            ([1_000, 10_000, 100_000, 1_000_000], "Number Spiders"),
            ([1_000, 10_000, 100_000, 1_000_000], "Thousand"),
            ([0.2, 0.3], "Thousands"),
            ([0.000_000_2, 0.000_000_3], "Trillion"),
        )
        # bad sets of data - should produce error messages and do nothing
        bad = (
            ([1, 2, 3, 4, 5], "Hundreds"),
            ([0, 0, 0], "Thousands"),
            ([np.nan, 0, 0], "Thousands"),
            ([np.inf, 1, 2], "Thousands"),
            ([0, 0, "a"], "Thousands"),
        )
        for values, label in (*good, *bad):
            data: Series[Any] = Series(values)
            recalibrated, units = recalibrate(data, label)
            print(f"{data.to_numpy()}, {label} ==> {recalibrated.to_numpy()}, {units}")
            print(SEPARATOR)

    def test_recalibrate_value() -> None:
        """Exercise recalibrate_value() with one good and one bad units label."""
        cases = (
            (10_000_000, "Thousand"),  # good example
            (3_900, "Spiders"),  # bad example
        )
        for value, label in cases:
            recalibrated, units = recalibrate_value(value, label)
            print(recalibrated, units)
            print(SEPARATOR)

    test_example()
    test_recalibrate()
    test_recalibrate_value()
NDIM_SERIES = 1  # len(data.shape) for a Series; recalibrate() uses it to decide whether to copy .name
NDIM_DATAFRAME = 2  # len(data.shape) for a DataFrame; recalibrate() uses it to decide whether to copy .columns
MAX_VALUE_THRESHOLD = 1000  # exclusive upper bound on the largest absolute value after recalibration
MIN_VALUE_THRESHOLD = 1  # inclusive lower bound on the largest absolute value after recalibration
STEP_SIZE = 3  # change in the power-of-ten exponent for one recalibration step
DIVISOR = 1000  # amount the data is divided or multiplied by in one step (10 ** STEP_SIZE)
def recalibrate(data: ~Datatype, units: str) -> tuple[~Datatype, str]:
def recalibrate(
    data: DataT,
    units: str,
) -> tuple[DataT, str]:
    """Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.

    Change the name of the units to reflect the recalibration.

    Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar.
    If you provide a Series, you will get a Series back. If you provide a DataFrame,
    you will get a DataFrame back.

    Parameters
    ----------
    data : Series or DataFrame
        The data to recalibrate.
    units : str
        The units of the data. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[Series or DataFrame, str]
        The recalibrated data (a Series if a Series was provided, or a
        DataFrame if a DataFrame was provided), followed by the
        recalibrated units in title case.

    Raises
    ------
    TypeError
        If data is neither a Series nor a DataFrame.

    Examples
    --------
    ```python
    from pandas import Series
    from readabs import recalibrate
    s = Series([1_000, 10_000, 100_000, 1_000_000])
    recalibrated, units = recalibrate(s, "$")
    print(f"{recalibrated=}, {units=}")
    ```

    """
    if not isinstance(data, (Series, DataFrame)):
        raise TypeError("data must be a Series or DataFrame")
    units, restore_name = _prepare_units(units)
    flat_data = data.to_numpy().flatten()
    flat_data, units = _recalibrate(flat_data, units)

    if restore_name:
        units = f"{restore_name} {units}"
        # _prepare_units() yields lower-case text but _do_recal() substitutes
        # title-case keywords ("Number"), so the filler word must be matched
        # without regard to case. Lower-casing here is harmless because the
        # whole string is title-cased just below.
        lowered = units.lower()
        for n in "numbers", "number":
            if n in lowered:
                units = lowered.replace(n, "").strip()
                break
    units = units.title()

    # rebuild the original container around the rescaled values
    result = data.__class__(flat_data.reshape(data.shape))
    result.index = data.index
    if len(data.shape) == NDIM_DATAFRAME:
        result.columns = data.columns
    if len(data.shape) == NDIM_SERIES:
        result.name = data.name  # pyright: ignore[reportAttributeAccessIssue]
    return result, units

Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.

Change the name of the units to reflect the recalibration.

Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar. If you provide a Series, you will get a Series back. If you provide a DataFrame, you will get a DataFrame back.

Parameters

- data : Series or DataFrame. The data to recalibrate.
- units : str. The units of the data. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.

Returns

tuple[Series or DataFrame, str]. The recalibrated data will be a Series if a Series was provided, or a DataFrame if a DataFrame was provided. It is followed by the recalibrated units string.

Examples

from pandas import Series
from readabs import recalibrate
s = Series([1_000, 10_000, 100_000, 1_000_000])
recalibrated, units = recalibrate(s, "$")
print(f"{recalibrated=}, {units=}")
def recalibrate_value(value: float, units: str) -> tuple[float, str]:
 85def recalibrate_value(value: float, units: str) -> tuple[float, str]:
 86    """Recalibrate a floating point value.
 87
 88    The value will be recalibrated so it is in the range -1000 to 1000.
 89    The units will be changed to reflect the recalibration.
 90
 91    Parameters
 92    ----------
 93    value : float
 94        The value to recalibrate.
 95    units : str
 96        The units of the value. This string should be in the form of
 97        "Number", "Thousands", "Millions", "Billions", etc. The units
 98        should be in title case.
 99
100    Returns
101    -------
102    tuple[float, str]
103        A tuple containing the recalibrated value and the recalibrated units.
104
105    Examples
106    --------
107    ```python
108    from readabs import recalibrate_value
109    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
110    print(recalibrated, units)
111    ```
112
113    """
114    series = Series([value])
115    output, units = recalibrate(series, units)
116    return output.to_numpy()[0], units

Recalibrate a floating point value.

The value will be recalibrated so it is in the range -1000 to 1000. The units will be changed to reflect the recalibration.

Parameters

- value : float. The value to recalibrate.
- units : str. The units of the value. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.

Returns

tuple[float, str]. A tuple containing the recalibrated value and the recalibrated units.

Examples

from readabs import recalibrate_value
recalibrated, units = recalibrate_value(10_000_000, "Thousand")
print(recalibrated, units)