Module statkit.dataset

Various methods for partitioning the dataset, such as downsampling and splitting.

Expand source code
"""Various methods for partitioning the dataset, such as downsampling and splitting."""

from functools import partial

import numpy as np
from numpy.typing import NDArray
from pandas import DataFrame, Series
from sklearn.linear_model import LogisticRegression  # type: ignore
from sklearn.preprocessing import LabelBinarizer  # type: ignore


def _as_categories(x_multinomial):
    """Convert multinomial sample to long vector of categorical draws."""
    #  Line up all draws in one long vector (of size `n_samples`), indicating which
    # feature was drawn.
    # Use Numpy instead of JAX implementation because it is faster.
    return np.repeat(np.arange(len(x_multinomial)), x_multinomial)  # type: ignore


def _as_multinomial(x_categorical, n_features: int):
    """Convert string of categorical draws to multinomial representation."""
    x_test = np.zeros(n_features, dtype=int)
    np.add.at(x_test, x_categorical, 1)  # In place change.
    return x_test  # type: ignore


def _single_multinomial_train_test_split(
    random_state, x_i, test_size: float = 0.2
) -> tuple:
    """Make train-test split for a single multinomial draw.

    Args:
        random_state: Instance of NumPy pseudo random number state.
        x_i: A single multinomial observation.
        test_size: Proportion of draws for test set.
    """
    x_i = x_i.astype(int)
    draws = _as_categories(x_i)
    # Flip a biased coin for every individual draw: on average a proportion of
    # `test_size` lands in the test set (i.e., selection without replacement).
    in_test = random_state.uniform(size=len(draws)) <= test_size
    # Back to multinomial (per-category count) representation.
    x_test = _as_multinomial(draws[in_test], n_features=len(x_i))  # type: ignore
    # Whatever was not assigned to the test set makes up the train set.
    x_train = x_i - x_test
    return x_train, x_test


def split_multinomial_dataset(
    X: NDArray[np.int_] | DataFrame, test_size: float = 0.5, random_state=None
) -> tuple[NDArray[np.int_] | DataFrame, NDArray[np.int_] | DataFrame]:
    """Partition dataset, with number of observations per row, in a train-test split.

    Each row of `X` holds per-category observation counts (one column per
    category), i.e., each row is a multinomial draw. Row by row, the individual
    observations are divided over a train and a test set, with the test set
    receiving a proportion of `test_size` on average.

    Example:
        Let's say you have a dataset with questionnaire fields, with the total number of
        product ratings:
        ```python
        import pandas as pd
        product_names = ["a", "b"]
        rating_names = ["🙁", "😐", "😃"]
        product_ratings = pd.DataFrame(
            [[0, 1, 0], [2, 3, 7]], product_names, rating_names,
        )
        ```

        The total ratings of each product is multinomially distributed.

        >>> product_ratings
        🙁  😐  😃
        a  0  1  0
        b  2  3  7

        Here is how you make a train test split, equally partitioning the ratings per
        product:

        >>> from statkit.dataset import split_multinomial_dataset
        >>> x_train, x_test = split_multinomial_dataset(
            product_ratings, test_size=0.5,
        )
        >>> x_train
           🙁  😐  😃
            a  0  1  0
            b  1  2  4
        >>> x_test
        🙁  😐  😃
        a  0  0  0
        b  1  1  3

    Args:
        X: A dataset where each row counts the number of observations per category
            (columns). That is, each row is a multinomial draw.
        test_size: Proportion of draws to reserve for the test set.
        random_state: Seed for numpy pseudo random number generator state.

    Returns:
        A pair `X_train`, `X_test` both shaped like `X`.
    """
    rng = np.random.default_rng(random_state)

    # Work on a plain array; restore the DataFrame wrapper at the end.
    rows = X.to_numpy() if isinstance(X, DataFrame) else X

    splits = [
        _single_multinomial_train_test_split(rng, row, test_size=test_size)
        for row in rows
    ]
    train_rows, test_rows = zip(*splits)
    X_train = np.stack(train_rows)
    X_test = np.stack(test_rows)

    if isinstance(X, DataFrame):
        return (
            DataFrame(X_train, index=X.index, columns=X.columns),
            DataFrame(X_test, index=X.index, columns=X.columns),
        )
    return X_train, X_test


def balanced_downsample(
    X: DataFrame | NDArray,
    y: Series | NDArray,
    ratio: int = 1,
    replace: bool = False,
    verbose: bool = False,
) -> NDArray:
    r"""Downsample majority class while stratifying for variables `X`.

    Propensity score matching is used to subsample the majority class so that
    the covariates `X` of both groups are balanced: the logits
    (i.e., \( \ln \frac{p}{1-p} \) predicted by a logistic regression model
    \( p(\pmb{y}|\pmb{X}) \)) of the cases (`y=1`) are matched against those of
    the control group (`y=0`). This ensures that, after downsampling, both
    groups are equally likely to be in the case and control group (according to
    the features `X`).

    Warning: In the worst case scenario, this method has a time complexity of
    \( O(m^2) \), where \( m \) is the number of samples.

    Example:
        Let's say you have two groups with systematic group differences in sex.
        ```python
        import numpy as np
        import pandas as pd

        names = ["eve", "alice", "carol", "dian", "bob", "frank"]
        group_label = pd.Series([1, 1, 0, 0, 0, 0], index=names)
        # Notice, no men in the case group (systematic bias).
        x_gender = np.array([0, 0, 0, 0, 1, 1])  # Female: 0; Male: 1.
        x_age = np.array([55, 75, 50, 60, 70, 80])
        demographics = pd.DataFrame(
            data={"gender": x_gender, "age": x_age}, index=names,
        )
        ```
        Make a subselection of the majority class matching on age and gender. After
        down sampling, the control group has similar age and gender distributions (
        namely, no men).

        >>> from statkit.dataset import balanced_downsample
        >>> controls = balanced_downsample(X=demographics, y=group_label)
        >>> controls
        Index(["carol", "dian"], dtype='object')

    Args:
        X: Downsample while balancing (stratifying) the classes based on these
            features/covariates/exogenous variables.
        y: Binary classes to match (e.g., `y=1` case, `y=0` is control).
        ratio: Downsample majority class to achieve this `majority:minority` ratio.
        replace: By default, subsample without replacement.
        verbose: If True, print progress.

    Returns:
        Indices of the matched majority class (control group).
    """
    if replace:
        raise NotImplementedError("Downsampling with replacement is not implemented.")

    if ratio != 1:
        raise NotImplementedError("Downsampling with ratio != 1 is not implemented.")

    # Coerce labels to {0, 1}.
    labels = np.squeeze(LabelBinarizer().fit_transform(y))
    # Make sure y=1 denotes the minority (case) class; flip labels otherwise.
    if sum(labels) > sum(1 - labels):
        labels = 1 - labels

    # 1) Propensity scores: logits of an unregularized logistic regression.
    classifier = LogisticRegression(penalty=None).fit(X, labels)
    logits = classifier.decision_function(X)

    # 2) Pair every case with its nearest control in logit space.
    control_indices = _find_nearest_matches_greedily(logits, labels, verbose)

    if isinstance(X, DataFrame):
        return X.index[control_indices]
    return control_indices


def _find_nearest_matches_greedily(logits, y, verbose):
    # Split cases and controls.
    case = y == 1
    control = y == 0

    # Select without replacement: we keep track of previously selected controls.
    not_selected = np.ones_like(y, dtype=bool)
    for idx_case in np.nonzero(case)[0]:
        idx_controls = np.nonzero(control & not_selected)[0]
        k = np.argmin(np.abs(logits[idx_controls] - logits[idx_case]))
        idx_match = idx_controls[k]
        not_selected[idx_match] = False
        if verbose:
            print(".", end="")

    if verbose:
        print()

    return np.nonzero(control & (~not_selected))[0]

Functions

def balanced_downsample(X: pandas.core.frame.DataFrame | numpy.ndarray[typing.Any, numpy.dtype[+_ScalarType_co]], y: pandas.core.series.Series | numpy.ndarray[typing.Any, numpy.dtype[+_ScalarType_co]], ratio: int = 1, replace: bool = False, verbose: bool = False) ‑> numpy.ndarray[typing.Any, numpy.dtype[+_ScalarType_co]]

Downsample majority class while stratifying for variables X.

This method uses propensity score matching to subsample the majority class so that covariates X of both groups are balanced. That is, the logits (i.e., \ln \frac{p}{1-p} predicted by a logistic regression model p(\pmb{y}|\pmb{X}) ) from the cases (y=1) are matched to the control group (y=0). This ensures that after downsampling both groups are equally likely to be in the case and control group (according to the features X).

Warning: In the worst case scenario, this method has a time complexity of O(m^2) , where m is the number of samples.

Example

Let's say you have two groups with systematic group differences in sex.

import numpy as np
import pandas as pd

names = ["eve", "alice", "carol", "dian", "bob", "frank"]
group_label = pd.Series([1, 1, 0, 0, 0, 0], index=names)
# Notice, no men in the case group (systematic bias).
x_gender = np.array([0, 0, 0, 0, 1, 1])  # Female: 0; Male: 1.
x_age = np.array([55, 75, 50, 60, 70, 80])
demographics = pd.DataFrame(
    data={"gender": x_gender, "age": x_age}, index=names,
)

Make a subselection of the majority class matching on age and gender. After down sampling, the control group has similar age and gender distributions ( namely, no men).

>>> from statkit.dataset import balanced_downsample
>>> controls = balanced_downsample(X=demographics, y=group_label)
>>> controls
Index(["carol", "dian"], dtype='object')

Args

X
Downsample while balancing (stratifying) the classes based on these features/covariates/exogenous variables.
y
Binary classes to match (e.g., y=1 case, y=0 is control).
ratio
Downsample majority class to achieve this majority:minority ratio.
replace
By default, subsample without replacement.
verbose
If True, print progress.

Returns

Indices of the matched majority class (control group).

Expand source code
def balanced_downsample(
    X: DataFrame | NDArray,
    y: Series | NDArray,
    ratio: int = 1,
    replace: bool = False,
    verbose: bool = False,
) -> NDArray:
    r"""Downsample majority class while stratifying for variables `X`.

    This method uses propensity score matching to subsample the majority class so that
    covariates `X` of both groups are balanced. That is, the logits
    (i.e., \( \ln \frac{p}{1-p} \) predicted by a logistic regression model
    \( p(\pmb{y}|\pmb{X}) \)) from the cases (`y=1`) are matched to the control group
    (`y=0`). This ensures that after downsampling both groups are equally likely to be in
    the case and control group (according to the features `X`).

    Warning: In the worst case scenario, this method has a time complexity of
    \( O(m^2) \), where \( m \) is the number of samples.

    Example:
        Let's say you have two groups with systematic group differences in sex.
        ```python
        import numpy as np
        import pandas as pd

        names = ["eve", "alice", "carol", "dian", "bob", "frank"]
        group_label = pd.Series([1, 1, 0, 0, 0, 0], index=names)
        # Notice, no men in the case group (systematic bias).
        x_gender = np.array([0, 0, 0, 0, 1, 1])  # Female: 0; Male: 1.
        x_age = np.array([55, 75, 50, 60, 70, 80])
        demographics = pd.DataFrame(
            data={"gender": x_gender, "age": x_age}, index=names,
        )
        ```
        Make a subselection of the majority class matching on age and gender. After
        down sampling, the control group has similar age and gender distributions (
        namely, no men).

        >>> from statkit.dataset import balanced_downsample
        >>> controls = balanced_downsample(X=demographics, y=group_label)
        >>> controls
        Index(["carol", "dian"], dtype='object')

    Args:
        X: Downsample while balancing (stratifying) the classes based on these
            features/covariates/exogenous variables.
        y: Binary classes to match (e.g., `y=1` case, `y=0` is control).
        ratio: Downsample majority class to achieve this `majority:minority` ratio.
        replace: By default, subsample without replacement.
        verbose: If True, print progress.

    Returns:
        Indices of the matched majority class (control group).
    """
    if replace:
        raise NotImplementedError("Downsampling with replacement is not implemented.")

    if ratio != 1:
        raise NotImplementedError("Downsampling with ratio != 1 is not implemented.")

    y_ = LabelBinarizer().fit_transform(y)
    y_ = np.squeeze(y_)
    # Swap classes if y=1 is the majority class.
    if sum(y_) > sum(1 - y_):
        y_ = 1 - y_

    # 1) Compute logits.
    model = LogisticRegression(penalty=None).fit(X, y_)
    logits = model.decision_function(X)

    # 2) Match the case with controls using propensity scores.
    control_indices = _find_nearest_matches_greedily(logits, y_, verbose)

    if isinstance(X, DataFrame):
        return X.index[control_indices]
    return control_indices
def split_multinomial_dataset(X: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | pandas.core.frame.DataFrame, test_size: float = 0.5, random_state=None) ‑> tuple[numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | pandas.core.frame.DataFrame, numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | pandas.core.frame.DataFrame]

Partition dataset, with number of observations per row, in a train-test split.

Each row in X counts the number of observations per category (columns). This function equally divides, for each row, the observations in a train and test set (with the test set getting a proportion of test_size).

Example

Let's say you have a dataset with questionnaire fields, with the total number of product ratings:

import pandas as pd
product_names = ["a", "b"]
rating_names = ["🙁", "😐", "😃"]
product_ratings = pd.DataFrame(
    [[0, 1, 0], [2, 3, 7]], product_names, rating_names,
)

The total ratings of each product is multinomially distributed.

>>> product_ratings
🙁  😐  😃
a  0  1  0
b  2  3  7

Here is how you make a train test split, equally partitioning the ratings per product:

>>> from statkit.dataset import split_multinomial_dataset
>>> x_train, x_test = split_multinomial_dataset(
    product_ratings, test_size=0.5,
)
>>> x_train
   🙁  😐  😃
    a  0  1  0
    b  1  2  4
>>> x_test
🙁  😐  😃
a  0  0  0
b  1  1  3

Args

X
A dataset where each row counts the number of observations per category (columns). That is, each row is a multinomial draw.
test_size
Proportion of draws to reserve for the test set.
random_state
Seed for numpy pseudo random number generator state.

Returns

A pair X_train, X_test both shaped like X.

Expand source code
def split_multinomial_dataset(
    X: NDArray[np.int_] | DataFrame, test_size: float = 0.5, random_state=None
) -> tuple[NDArray[np.int_] | DataFrame, NDArray[np.int_] | DataFrame]:
    """Partition dataset, with number of observations per row, in a train-test split.

    Each row in `X` counts the number of observations per category (columns). This
    function equally divides, for each row, the observations in a train and test set
    (with the test set getting a proportion of `test_size`).

    Example:
        Let's say you have a dataset with questionnaire fields, with the total number of
        product ratings:
        ```python
        import pandas as pd
        product_names = ["a", "b"]
        rating_names = ["🙁", "😐", "😃"]
        product_ratings = pd.DataFrame(
            [[0, 1, 0], [2, 3, 7]], product_names, rating_names,
        )
        ```

        The total ratings of each product is multinomially distributed.

        >>> product_ratings
        🙁  😐  😃
        a  0  1  0
        b  2  3  7

        Here is how you make a train test split, equally partitioning the ratings per
        product:

        >>> from statkit.dataset import split_multinomial_dataset
        >>> x_train, x_test = split_multinomial_dataset(
            product_ratings, test_size=0.5,
        )
        >>> x_train
           🙁  😐  😃
            a  0  1  0
            b  1  2  4
        >>> x_test
        🙁  😐  😃
        a  0  0  0
        b  1  1  3

    Args:
        X: A dataset where each row counts the number of observations per category
            (columns). That is, each row is a multinomial draw.
        test_size: Proportion of draws to reserve for the test set.
        random_state: Seed for numpy pseudo random number generator state.

    Returns:
        A pair `X_train`, `X_test` both shaped like `X`.
    """
    random_state = np.random.default_rng(random_state)

    _single_split = partial(_single_multinomial_train_test_split, test_size=test_size)

    X_np = X
    if isinstance(X, DataFrame):
        X_np = X.to_numpy()

    x_as = []
    x_bs = []
    for x_i in X_np:
        x_a, x_b = _single_split(random_state, x_i)
        x_as.append(x_a)
        x_bs.append(x_b)
    X_train = np.stack(x_as)
    X_test = np.stack(x_bs)

    if isinstance(X, DataFrame):
        df_train = DataFrame(X_train, index=X.index, columns=X.columns)
        df_test = DataFrame(X_test, index=X.index, columns=X.columns)
        return df_train, df_test
    return X_train, X_test