Source code for stats

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
:Purpose:   Provide access to various statistical calculations, namely:

            - **CUSUM:** :meth:`~Stats.cusum`
            - **Gaussian KDE:** :meth:`~Stats.kde`
            - **Linear Regression:** :class:`~LinearRegression`

:Platform:  Linux/Windows | Python 3.7+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

:Example:

    Create a sample dataset for the stats methods::

        >>> import matplotlib.pyplot as plt
        >>> import numpy as np
        >>> import pandas as pd

        >>> np.random.seed(73)
        >>> data = np.random.normal(size=100)*100
        >>> x = np.arange(data.size)
        >>> y = pd.Series(data).rolling(window=25, min_periods=25).mean().cumsum()

        >>> # Preview the trend.
        >>> plt.plot(x, y)

"""
# pylint: disable=line-too-long
# pylint: disable=wrong-import-order

import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde
from typing import Union
# local
from utils4.reporterror import reporterror


class LinearRegression:
    """Calculate the linear regression of a dataset.

    Args:
        x (Union[list, tuple, np.ndarray, pd.Series]): Array of X-values.
        y (Union[list, tuple, np.ndarray, pd.Series]): Array of Y-values.

    Plain ``list`` and ``tuple`` inputs are converted to a
    ``numpy.ndarray`` on construction. ``numpy`` arrays and ``pandas``
    Series are stored unchanged.

    :Slope Calculation:
        The calculation for the slope itself is borrowed from the
        :func:`scipy.stats.linregress` function. Whose `source code`_ was
        obtained on GitHub.

    .. _source code: https://github.com/scipy/scipy/blob/v1.8.0/scipy/stats/_stats_mstats_common.py#L16-L203

    :Example Use:

        .. tip::

            For a sample dataset and imports to go along with this
            example, refer to the docstring for
            :mod:`this module <stats>`.

        Calculate a linear regression line on an X/Y dataset::

            >>> from utils4.stats import LinearRegression

            >>> linreg = LinearRegression(x, y)
            >>> linreg.calculate()

            >>> # Obtain the regression line array.
            >>> y_ = linreg.regression_line

            >>> # View the intercept value.
            >>> linreg.intercept
            -31.26630

            >>> # View the slope value.
            >>> linreg.slope
            1.95332

            >>> # Plot the trend and regression line.
            >>> plt.plot(x, y, 'grey')
            >>> plt.plot(x, y_, 'red')
            >>> plt.show()

    """

    def __init__(self,
                 x: Union[list, tuple, np.ndarray, pd.Series],
                 y: Union[list, tuple, np.ndarray, pd.Series]):
        """LinearRegression class initialiser."""
        # Only plain sequences are converted; they have no ``.mean()`` and do
        # not support element-wise arithmetic, both of which the calculation
        # steps rely on.  Array and Series inputs are deliberately left as-is.
        self._x = np.asarray(x) if isinstance(x, (list, tuple)) else x
        self._y = np.asarray(y) if isinstance(y, (list, tuple)) else y
        self._xbar = 0.0
        self._ybar = 0.0
        self._c = 0.0
        self._m = 0.0
        self._line = np.array(())

    @property
    def slope(self):
        """Accessor to the slope value."""
        return self._m

    @property
    def intercept(self):
        """Accessor to the slope's y-intercept."""
        return self._c

    @property
    def regression_line(self):
        """Accessor to the calculated regression line, as y-values."""
        return self._line

    def calculate(self):
        """Calculate the linear regression for the X/Y data arrays.

        The result of the calculation is accessible via the
        :attr:`regression_line` property. The fitted parameters are
        accessible via the :attr:`slope` and :attr:`intercept` properties.

        """
        # Order matters: the slope needs the means, the intercept needs the
        # slope, and the regression line needs both slope and intercept.
        self._calc_means()
        self._calc_slope()
        self._calc_intercept()
        self._calc_regression_line()

    def _calc_intercept(self) -> None:
        """Calculate the intercept as: ybar - m * xbar."""
        self._c = self._ybar - self._m * self._xbar

    def _calc_means(self) -> None:
        """Calculate (and store) the mean of the X and Y arrays."""
        self._xbar = self._x.mean()
        self._ybar = self._y.mean()

    def _calc_regression_line(self) -> None:
        """Calculate the regression line as: y = mx + c."""
        self._line = self._m * self._x + self._c

    def _calc_slope(self) -> None:
        """Calculate the slope value as: ssxym / ssxm.

        This is algebraically the same as: R * ( std(y) / std(x) ).

        Per the ``scipy`` source code comments::

            # Average sums of square differences from the mean
            #   ssxm = mean( (x-mean(x))^2 )
            #   ssxym = mean( (x-mean(x)) * (y-mean(y)) )

            ...

            slope = ssxym / ssxm

        """
        x_dev = self._x - self._xbar
        ssxm = np.mean(x_dev**2)
        ssxym = np.mean(x_dev * (self._y - self._ybar))
        self._m = ssxym / ssxm

    @staticmethod
    def _calc_std(data: np.ndarray, ddof: int=1) -> float:  # pragma nocover
        """Calculate the standard deviation.

        Args:
            data (np.ndarray): Array of values.
            ddof (int): Degrees of freedom. Defaults to 1.

        Returns:
            float: Standard deviation of the given values.

        """
        return np.std(data, ddof=ddof)
class Stats:
    """Wrapper class for various statistical calculations."""

    @staticmethod
    def cusum(df: pd.DataFrame,
              cols: Union[list, str],
              window: Union[int, None]=None,
              min_periods: int=1,
              inplace=False,
              show_plot: bool=False) -> Union[pd.DataFrame, None]:
        r"""Calculate a CUSUM on a set of data.

        A CUSUM is a generalised method for smoothing a noisy trend, or
        for detecting a change in the trend.

        Note:
            A CUSUM is *not* a cumulative sum (cumsum), although a
            cumulative sum is used. A CUSUM is a cumulative sum of
            derived values, where each derived value is calculated as the
            delta of a single value relative to the rolling mean of all
            previous values.

        Args:
            df (pd.DataFrame): The DataFrame containing the column(s) on
                which a CUSUM is to be calculated.
            cols (Union[list, str]): The column (or list of columns) on
                which the CUSUM is to be calculated.
            window (int, optional): Size of the window on which the
                rolling mean is to be calculated. This corresponds to the
                ``pandas.df.rolling(window)`` parameter.
                Defaults to None.

                - If None is received, a 5% window is calculated based on
                  the length of the DataFrame (with a minimum window of
                  1, so short DataFrames are supported). This method
                  helps smooth the trend, while keeping a representation
                  of the original trend.
                - For a *true* CUSUM, a running average should be
                  calculated on the length of the DataFrame, except for
                  the current value. For this method, pass
                  ``window=len(df)``.

            min_periods (int, optional): Number of periods to wait before
                calculating the rolling average. Defaults to 1.
            inplace (bool, optional): Update the passed DataFrame
                (in-place), rather than returning a *copy* of the passed
                DataFrame. Defaults to False.
            show_plot (bool, optional): Display a graph of the raw value,
                and the calculated CUSUM results. Defaults to False.

        :Calculation:
            The CUSUM is calculated by taking a rolling mean :math:`RA`
            (optionally locked at the first value), and calculating the
            delta of the current value, relative to the rolling mean of
            all previous values. A cumulative sum is applied to the
            deltas. The cumulative sum for each data point is returned as
            the CUSUM value.

        :Equation:

            :math:`c_i = \sum_{i=1}^{n}(x_i - RA_i)`

            where :math:`RA` (Rolling Mean) is defined as:

            :math:`RA_{i+1} = \frac{1}{n}\sum_{j=1}^{n}x_j`

        :Example Use:

            Generate a *sample* trend dataset::

                >>> import numpy as np
                >>> import pandas as pd

                >>> np.random.seed(13)
                >>> s1 = pd.Series(np.random.randn(1000)).rolling(window=100).mean()
                >>> np.random.seed(73)
                >>> s2 = pd.Series(np.random.randn(1000)).rolling(window=100).mean()
                >>> df = pd.DataFrame({'sample1': s1, 'sample2': s2})

            Example for calculating a CUSUM on two columns::

                >>> from utils4.stats import stats

                >>> df_c = stats.cusum(df=df,
                                       cols=['sample1', 'sample2'],
                                       window=len(df),
                                       inplace=False,
                                       show_plot=True)
                >>> df_c.tail()
                      sample1   sample2  sample1_cusum  sample2_cusum
                995  0.057574  0.065887      23.465337      29.279936
                996  0.062781  0.072213      23.556592      29.369397
                997  0.028513  0.072658      23.613478      29.459204
                998  0.024518  0.070769      23.666305      29.547022
                999  0.000346  0.074849      23.694901      29.638822

        Returns:
            Union[pd.DataFrame, None]: If the ``inplace`` argument is
            ``False``, a *copy* of the original DataFrame with the new
            CUSUM columns appended is returned. Otherwise, the passed
            DataFrame is *updated*, and ``None`` is returned.

        """
        # Convert a single column name to a list.
        cols = [cols] if not isinstance(cols, list) else cols
        if not inplace:
            df = df.copy(deep=True)
        if window is None:
            # Default window is 5% of the rows, floored at 1.  Without the
            # floor, any DataFrame shorter than 20 rows yields a window of 0,
            # which pandas rejects when min_periods is 1.
            window = max(1, int(len(df) * 0.05))
        for col in cols:
            new_col = f'{col}_cusum'
            # Rolling mean of the *previous* values (hence the shift of 1).
            prior_mean = (df[col].rolling(window=window, min_periods=min_periods)
                          .mean()
                          .shift(1))
            # A rolling sum over a window as long as the DataFrame acts as
            # the cumulative sum of the deltas.
            df[new_col] = ((df[col] - prior_mean)
                           .rolling(window=len(df), min_periods=min_periods)
                           .sum())
            # Show simple plot if requested.
            if show_plot:  # pragma: nocover
                df[[col, new_col]].plot(title=f'TEMP PLOT\n{col} vs {new_col}',
                                        color=['lightgrey', 'red'],
                                        secondary_y=new_col,
                                        legend=False,
                                        grid=False)
        return None if inplace else df

    def kde(self,
            data: Union[list, np.ndarray, pd.Series],
            n: int=500) -> tuple:
        """Calculate the kernel density estimate (KDE) for an array X.

        This function returns the *probability density* (PDF) using
        Gaussian KDE.

        Args:
            data (Union[list, np.ndarray, pd.Series]): An array-like
                object containing the data against which the Gaussian KDE
                is calculated. This can be a list, numpy array, or pandas
                Series.
            n (int, optional): Number of values returned in the X, Y
                arrays. Defaults to 500.

        :Example Use:

            .. tip::

                For a sample dataset and imports to go along with this
                example, refer to the docstring for
                :mod:`this module <stats>`.

            Calculate a Gaussian KDE on Y::

                >>> from utils4.stats import stats

                >>> # Preview the histogram.
                >>> _ = plt.hist(data)

                >>> X, Y, max_x = stats.kde(data=data, n=500)
                >>> plt.plot(X, Y)

                >>> # Show X value at peak of curve.
                >>> max_x
                -9.718684033029376

        :Max X:
            This function also returns the X value of the curve's peak;
            where ``max_x`` is the ``X`` value corresponding to the max
            ``Y`` value on the curve. The result (``max_x``) is returned
            as the third tuple element.

        :Further Detail:
            This method uses the :func:`scipy.stats.gaussian_kde` method
            for the KDE calculation. For further detail on the
            calculation itself, refer to that function's docstring.

        :Background:
            Originally, :func:`plotly.figure_factory.dist_plot` was used
            to calculate the KDE. However, to remove the ``plotly``
            dependency from this library, their code was copied and
            refactored (simplified) into this function. Both the
            :func:`dist_plot` and :func:`pandas.DataFrame.plot.kde`
            methods call :func:`scipy.stats.gaussian_kde` for the
            calculation, which this function also calls.

        Returns:
            tuple: A tuple containing the X-array, Y-array (both of ``n``
            size), as well as the X value at max Y, as::

                (curve_x, curve_y, max_x)

            If the calculation fails for any reason, the error is
            reported and the following is returned::

                (np.array(()), np.array(()), 0.0)

        """
        try:
            data_ = self._obj_to_array(data=data)
            # Evaluate the density on n evenly spaced points spanning the data.
            curve_x = np.linspace(data_.min(), data_.max(), n)
            curve_y = gaussian_kde(data_).evaluate(curve_x)
            max_x = curve_x[curve_y.argmax()]
            return (curve_x, curve_y, max_x)
        except Exception as err:
            # Deliberate best-effort: report the error and hand back empty
            # results rather than propagating the exception to the caller.
            reporterror(err)
            return (np.array(()), np.array(()), 0.0)

    @staticmethod
    def _obj_to_array(data: Union[list, tuple, np.ndarray, pd.Series]) -> np.ndarray:
        """Convert an iterable object to a numpy array.

        Args:
            data (Union[list, tuple, np.ndarray, pd.Series]): Array-like
                object to be converted into a ``numpy.ndarray``.

        :NaN Values:
            In addition to converting the supported types to a
            ``numpy.ndarray``, any ``nan`` values are dropped from the
            resulting array, regardless of the input type.

        Raises:
            TypeError: If ``data`` is not one of the supported types.

        Returns:
            np.ndarray: A ``numpy.ndarray``, with ``nan`` values removed.

        """
        if isinstance(data, np.ndarray):
            data_ = data
        elif isinstance(data, pd.Series):
            data_ = data.astype(float).to_numpy()
        elif isinstance(data, (list, tuple)):
            data_ = np.array(data)
        else:
            # Fail with a clear message instead of the opaque TypeError that
            # np.isnan(None) would otherwise raise.
            raise TypeError(f'Unsupported data type for conversion: {type(data).__name__}')
        return data_[~np.isnan(data_)]
# Shared module-level ``Stats`` instance, so callers can use the methods
# directly via: ``from utils4.stats import stats``.
stats = Stats()