Module statkit.non_parametric
Confidence intervals and p-values of a model's (test) score.
This module contains a set of non-parametric (i.e., without assuming any specific distribution) and exact methods of computing 95 % confidence intervals and p-values of your scikit-learn model's predictions.
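As a quick orientation, the sketch below (using a hypothetical synthetic dataset; the scikit-learn imports are ordinary library calls, not part of this module) computes a bootstrapped confidence interval for a freshly fitted classifier:
```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from statkit.non_parametric import bootstrap_score

# Illustrative synthetic binary classification problem.
X, y = make_classification(n_samples=500, random_state=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)

model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

# Point estimate of the ROC AUC with a non-parametric 95 % confidence interval.
print(bootstrap_score(y_test, y_pred, metric=roc_auc_score, random_state=5))
```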
Source code
r"""Confidence intervals and \(p\)-values of a model's (test) score.
This module contains a set of non-parametric (i.e., without assuming any
specific distribution) and exact methods of computing 95 % confidence intervals
and \(p\)-values of your scikit-learn model's predictions.
"""
from typing import Callable, Literal, Optional, Union
import numpy as np
from numpy import (
array,
stack,
concatenate,
mean,
ones_like,
quantile,
unique,
where,
zeros_like,
)
from pandas import Series
from sklearn.utils import check_random_state, resample, shuffle
from statkit.types import Estimate
def bootstrap_score(
y_true,
y_pred,
metric: Callable,
n_iterations: int = 1000,
random_state=None,
pos_label: Optional[Union[str, int]] = None,
quantile_range: tuple[float, float] = (0.025, 0.975),
metrics_kwargs: dict = {},
) -> Estimate:
"""Estimate 95 % confidence interval for `metric` by bootstrapping.
Example:
        Estimate the 95 % confidence interval of the area under the receiver
        operating characteristic curve (ROC AUC) on the test set of a binary
        classifier:
```python
        from sklearn.metrics import roc_auc_score

        y_pred = model.predict_proba(X_test)[:, 1]
bootstrap_score(y_test, y_pred, metric=roc_auc_score)
```
Args:
y_true: Ground truth labels.
y_pred: Labels predicted by the classifier (or label probabilities,
depending on the metric).
metric: Performance metric that takes the true and predicted labels and
returns a score.
n_iterations: Resample the data (with replacement) this many times.
        random_state: Seed for reproducible resampling.
        pos_label: For binary classification, the label of the positive class.
quantile_range: Confidence interval range.
        metrics_kwargs: Pass additional keyword arguments to `metric`.
Returns:
        The point estimate (see `statkit.types.Estimate`) of `metric` together
        with its 95 % confidence interval.
"""
if pos_label is not None:
labels = unique(y_true)
if len(labels) != 2:
raise ValueError("Must be binary labels when `pos_label` is specified")
_y_true = where(y_true == pos_label, ones_like(y_true), zeros_like(y_true))
else:
_y_true = y_true
random_state = check_random_state(random_state)
statistics = []
for _ in range(n_iterations):
y_true_rnd, y_pred_rnd = resample(_y_true, y_pred, random_state=random_state)
# Reject sample if all class labels are the same.
if len(unique(y_true_rnd)) == 1:
continue
score = metric(y_true_rnd, y_pred_rnd, **metrics_kwargs)
statistics.append(score)
point_estimate = metric(_y_true, y_pred, **metrics_kwargs)
# If all samples were rejected, there is no variability in the dataset to bootstrap.
if len(statistics) == 0:
return Estimate(
point=point_estimate, lower=point_estimate, upper=point_estimate
)
# Estimate confidence intervals.
lower = quantile(statistics, quantile_range[0], axis=0)
upper = quantile(statistics, quantile_range[1], axis=0)
return Estimate(point=point_estimate, lower=lower, upper=upper)
def unpaired_permutation_test(
y_true_1: Series,
y_pred_1: Series,
y_true_2: Series,
y_pred_2: Series,
metric: Callable,
alternative: Literal["less", "greater", "two-sided"] = "two-sided",
n_iterations: int = 1000,
random_state=None,
metrics_kwargs: dict = {},
) -> float:
r"""Unpaired permutation test comparing scores `y_pred_1` with `y_pred_2`.
    Null hypothesis, \( H_0 \): the metric does not differ between the two groups.
Example:
```python
        from sklearn.metrics import roc_auc_score

        unpaired_permutation_test(
            # Ground truth - prediction pair of the first sample set.
            y_test_1,
            y_pred_1,
            # Ground truth - prediction pair of the second sample set.
            y_test_2,
            y_pred_2,
metric=roc_auc_score,
)
```
Args:
y_true_1, y_true_2: Ground truth labels of unpaired groups.
y_pred_1, y_pred_2: Predicted labels (or label probabilities,
depending on the metric) of corresponding groups.
metric: Performance metric that takes the true and predicted labels and
returns a score.
        alternative: One-sided (`"less"` or `"greater"`) or `"two-sided"`
            alternative hypothesis.
        n_iterations: Randomly permute the pooled data this many times.
        random_state: Seed for reproducible permutations.
        metrics_kwargs: Pass additional keyword arguments to `metric`.
Returns:
        The p-value for observing the difference given \( H_0 \).
"""
random_state = check_random_state(random_state)
score1 = metric(y_true_1, y_pred_1, **metrics_kwargs)
score2 = metric(y_true_2, y_pred_2, **metrics_kwargs)
observed_difference = score1 - score2
n_1 = len(y_pred_1)
score_diff = []
    # Pool the two groups; under the null hypothesis, group membership is
    # exchangeable.
    Y_1 = stack([y_true_1, y_pred_1], axis=-1)
    Y_2 = stack([y_true_2, y_pred_2], axis=-1)
    Y_pooled = concatenate([Y_1, Y_2])
    for _ in range(n_iterations):
        # Randomly split the pooled samples into groups of size n_1 and n_2.
        y_H0 = shuffle(Y_pooled, random_state=random_state)
y1_true = y_H0[:n_1, ..., 0]
y2_true = y_H0[n_1:, ..., 0]
y1_pred_H0 = y_H0[:n_1, ..., 1]
y2_pred_H0 = y_H0[n_1:, ..., 1]
        # Reject splits where either group contains only one class.
        if len(unique(y1_true)) == 1 or len(unique(y2_true)) == 1:
            continue
permuted_score1 = metric(y1_true, y1_pred_H0, **metrics_kwargs)
permuted_score2 = metric(y2_true, y2_pred_H0, **metrics_kwargs)
score_diff.append(permuted_score1 - permuted_score2)
permuted_diff = array(score_diff)
if alternative == "greater":
p_value = mean(permuted_diff >= observed_difference, axis=0)
elif alternative == "less":
p_value = mean(permuted_diff <= observed_difference, axis=0)
elif alternative == "two-sided":
p_value = mean(abs(permuted_diff) >= abs(observed_difference), axis=0)
return p_value
def one_vs_rest_permutation_test(
y_true,
y_pred_1,
*y_preds_rest,
metric: Callable,
alternative: Literal["greater", "two-sided"] = "greater",
n_iterations: int = 1000,
random_state=None,
metrics_kwargs: dict = {},
) -> float:
r"""Test superiority of first classifer over the rest.
    Non-parametric one-versus-rest comparison to test if the predictions
    `y_pred_1` are better than the predictions of the other models
    `y_preds_rest[0]`, .., `y_preds_rest[n]`.
    \( H_0 \): The first model's predictions score no higher, in terms of
    `metric`, than the best (i.e., highest-scoring) of the remaining models,
    and thus the first model does not outperform the rest.
    \( H_a \): The first model's predictions score significantly higher than
    (or, when `alternative="two-sided"`, significantly differently from) the
    best of the rest.
Example:
        Test if the average precision of `model_1` is statistically better than
        that of `model_2` and `model_3`:
```python
from sklearn.metrics import average_precision_score
y_pred_1 = model_1.predict_proba(X_test)[:, 1]
y_pred_2 = model_2.predict_proba(X_test)[:, 1]
y_pred_3 = model_3.predict_proba(X_test)[:, 1]
p_value = one_vs_rest_permutation_test(
y_test,
y_pred_1,
y_pred_2,
y_pred_3,
metric=average_precision_score,
)
```
Args:
y_true: True labels (ground truth) of the corresponding predictions.
y_pred_1: Test if these predictions are superior to the rest.
*y_preds_rest: Sequence of predictions of the other models. Each set of
predictions has the same shape as `y_pred_1`.
metric: Function that takes a pair of true and predicted labels and gives a
scalar score (higher is better).
        alternative: When `"greater"`, test superiority of the first model.
            When `"two-sided"`, test if the first model is significantly
            different (higher or lower) from the largest metric (i.e., `max`)
            of the rest.
        n_iterations: Randomly permute the data this many times.
        random_state: Seed for reproducible permutations.
        metrics_kwargs: Pass additional keyword arguments to `metric`.
Returns:
        The p-value for observing the difference given \( H_0 \).
"""
random_state = np.random.default_rng(seed=random_state)
n_rest = len(y_preds_rest)
Y_pred = stack([y_pred_1, *y_preds_rest], axis=-1)
score1 = metric(y_true, Y_pred[:, 0], **metrics_kwargs)
# Compare with largest metric (`max`) of the rest.
score2 = max(
(metric(y_true, Y_pred[:, i], **metrics_kwargs) for i in range(1, n_rest + 1))
)
observed_difference = score1 - score2
score_diff = []
for _ in range(n_iterations):
        # For each sample (row), independently shuffle which model each
        # prediction is attributed to.
Y_permuted = random_state.permuted(Y_pred, axis=1)
permuted_score1 = metric(y_true, Y_permuted[:, 0], **metrics_kwargs)
permuted_score2 = max(
(
metric(y_true, Y_permuted[:, i], **metrics_kwargs)
for i in range(1, n_rest + 1)
)
)
score_diff.append(permuted_score1 - permuted_score2)
permuted_diff = array(score_diff)
if alternative == "greater":
p_value = mean(permuted_diff >= observed_difference, axis=0)
elif alternative == "two-sided":
p_value = mean(abs(permuted_diff) >= abs(observed_difference), axis=0)
return p_value
def paired_permutation_test(
y_true,
y_pred_1,
y_pred_2,
metric: Callable,
alternative: Literal["less", "greater", "two-sided"] = "two-sided",
n_iterations: int = 1000,
random_state=None,
metrics_kwargs: dict = {},
) -> float:
r"""Paired permutation test comparing scores from `y_pred_1` with `y_pred_2`.
Non-parametric head-to-head comparison of two predictions. Test if
`y_pred_1` is statistically different from `y_pred_2` for a given `metric`.
\( H_0 \): metric scores of `y_pred_1` and `y_pred_2` come from the same population
(i.e., invariant under group permutation 1 <--> 2).
Example:
Test if the area under the receiver operating characteristic curve
        (ROC AUC) of model 1 is statistically significantly better than that of
        model 2:
```python
        from sklearn.metrics import roc_auc_score

        y_pred_1 = model_1.predict_proba(X_test)[:, 1]
y_pred_2 = model_2.predict_proba(X_test)[:, 1]
paired_permutation_test(
y_test,
y_pred_1,
y_pred_2,
metric=roc_auc_score,
)
```
Args:
y_true: Ground truth labels.
y_pred_1, y_pred_2: Predicted labels to compare (or label probabilities,
depending on the metric).
metric: Performance metric that takes the true and predicted labels and
returns a score.
        alternative: One-sided (`"less"` or `"greater"`) or `"two-sided"`
            alternative hypothesis.
        n_iterations: Randomly exchange the paired predictions this many times.
        random_state: Seed for reproducible permutations.
        metrics_kwargs: Pass additional keyword arguments to `metric`.
Returns:
        The p-value for observing the difference given \( H_0 \).
"""
random_state = check_random_state(random_state)
score1 = metric(y_true, y_pred_1, **metrics_kwargs)
score2 = metric(y_true, y_pred_2, **metrics_kwargs)
observed_difference = score1 - score2
# Broadcast mask to shape of y_pred_1.
mask_shape = (-1,) + len(y_pred_1.shape[1:]) * (1,)
m = len(y_true)
score_diff = []
for _ in range(n_iterations):
mask = random_state.randint(2, size=m).reshape(mask_shape)
# Randomly permute pairs of predictions.
p1 = where(mask, y_pred_1, y_pred_2)
p2 = where(mask, y_pred_2, y_pred_1)
permuted_score1 = metric(y_true, p1, **metrics_kwargs)
permuted_score2 = metric(y_true, p2, **metrics_kwargs)
score_diff.append(permuted_score1 - permuted_score2)
permuted_diff = array(score_diff)
if alternative == "greater":
p_value = mean(permuted_diff >= observed_difference, axis=0)
elif alternative == "less":
p_value = mean(permuted_diff <= observed_difference, axis=0)
elif alternative == "two-sided":
p_value = mean(abs(permuted_diff) >= abs(observed_difference), axis=0)
return p_value
Functions
def bootstrap_score(y_true, y_pred, metric: Callable, n_iterations: int = 1000, random_state=None, pos_label: Optional[Union[str, int]] = None, quantile_range: tuple[float, float] = (0.025, 0.975), metrics_kwargs: dict = {}) -> Estimate
-
Estimate 95 % confidence interval for `metric` by bootstrapping.
Example
Estimate the 95 % confidence interval of the area under the receiver operating characteristic curve (ROC AUC) on the test set of a binary classifier:
```python
from sklearn.metrics import roc_auc_score

y_pred = model.predict_proba(X_test)[:, 1]
bootstrap_score(y_test, y_pred, metric=roc_auc_score)
```
Args
y_true
- Ground truth labels.
y_pred
- Labels predicted by the classifier (or label probabilities, depending on the metric).
metric
- Performance metric that takes the true and predicted labels and returns a score.
n_iterations
- Resample the data (with replacement) this many times.
random_state
- Seed for reproducible resampling.
pos_label
- For binary classification, the label of the positive class.
quantile_range
- Confidence interval range.
metrics_kwargs
- Pass additional keyword arguments to `metric`.
Returns
The point estimate (see `Estimate`) of `metric` together with its 95 % confidence interval.
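As a self-contained illustration, the sketch below (the toy label and prediction arrays are hypothetical) requests a 90 % interval by moving `quantile_range` to the 5th and 95th percentiles of the bootstrap distribution:
```python
import numpy as np
from sklearn.metrics import accuracy_score

from statkit.non_parametric import bootstrap_score

y_test = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1, 1])
y_pred = np.array([0, 1, 0, 0, 1, 0, 1, 1, 1, 1])

# A 90 % confidence interval: take the 0.05 and 0.95 quantiles of the
# bootstrap distribution instead of the default 0.025 and 0.975.
estimate = bootstrap_score(
    y_test,
    y_pred,
    metric=accuracy_score,
    quantile_range=(0.05, 0.95),
    random_state=5,
)
print(estimate)
```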
def one_vs_rest_permutation_test(y_true, y_pred_1, *y_preds_rest, metric: Callable, alternative: Literal['greater', 'two-sided'] = 'greater', n_iterations: int = 1000, random_state=None, metrics_kwargs: dict = {}) -> float
-
Test superiority of the first classifier over the rest.
Non-parametric one-versus-rest comparison to test if the predictions `y_pred_1` are better than the predictions of the other models `y_preds_rest[0]`, .., `y_preds_rest[n]`.
\( H_0 \): The first model's predictions score no higher, in terms of `metric`, than the best (i.e., highest-scoring) of the remaining models, and thus the first model does not outperform the rest.
\( H_a \): The first model's predictions score significantly higher than (or, when `alternative="two-sided"`, significantly differently from) the best of the rest.
Example
Test if the average precision of `model_1` is statistically better than that of `model_2` and `model_3`:
```python
from sklearn.metrics import average_precision_score

y_pred_1 = model_1.predict_proba(X_test)[:, 1]
y_pred_2 = model_2.predict_proba(X_test)[:, 1]
y_pred_3 = model_3.predict_proba(X_test)[:, 1]
p_value = one_vs_rest_permutation_test(
    y_test,
    y_pred_1,
    y_pred_2,
    y_pred_3,
    metric=average_precision_score,
)
```
Args
y_true
- True labels (ground truth) of the corresponding predictions.
y_pred_1
- Test if these predictions are superior to the rest.
*y_preds_rest
- Sequence of predictions of the other models. Each set of predictions has the same shape as `y_pred_1`.
metric
- Function that takes a pair of true and predicted labels and gives a scalar score (higher is better).
alternative
- When `"greater"`, test superiority of the first model. When `"two-sided"`, test if the first model is significantly different (higher or lower) from the largest metric (i.e., `max`) of the rest.
n_iterations
- Randomly permute the data this many times.
random_state
- Seed for reproducible permutations.
metrics_kwargs
- Pass additional keyword arguments to `metric`.
Returns
The p-value for observing the difference given \( H_0 \).
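To make the example self-contained, here is a sketch on purely random data (all arrays below are hypothetical); because no model is genuinely superior, the returned p-value should typically be large:
```python
import numpy as np
from sklearn.metrics import average_precision_score

from statkit.non_parametric import one_vs_rest_permutation_test

rng = np.random.default_rng(5)
y_test = np.tile([0, 1], 50)  # Balanced binary labels (100 samples).

# Three sets of random "probabilities": none should outperform the others.
y_pred_1, y_pred_2, y_pred_3 = rng.uniform(size=(3, 100))

p_value = one_vs_rest_permutation_test(
    y_test,
    y_pred_1,
    y_pred_2,
    y_pred_3,
    metric=average_precision_score,
    random_state=5,
)
print(p_value)  # H0 holds by construction, so expect a large p-value.
```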
def paired_permutation_test(y_true, y_pred_1, y_pred_2, metric: Callable, alternative: Literal['less', 'greater', 'two-sided'] = 'two-sided', n_iterations: int = 1000, random_state=None, metrics_kwargs: dict = {}) -> float
-
Paired permutation test comparing scores from `y_pred_1` with `y_pred_2`.
Non-parametric head-to-head comparison of two predictions. Test if `y_pred_1` is statistically different from `y_pred_2` for a given `metric`.
\( H_0 \): metric scores of `y_pred_1` and `y_pred_2` come from the same population (i.e., invariant under group permutation 1 <--> 2).
Example
Test if the area under the receiver operating characteristic curve (ROC AUC) of model 1 is statistically significantly better than that of model 2:
```python
from sklearn.metrics import roc_auc_score

y_pred_1 = model_1.predict_proba(X_test)[:, 1]
y_pred_2 = model_2.predict_proba(X_test)[:, 1]
paired_permutation_test(
    y_test,
    y_pred_1,
    y_pred_2,
    metric=roc_auc_score,
)
```
Args
y_true
- Ground truth labels.
y_pred_1, y_pred_2
- Predicted labels to compare (or label probabilities, depending on the metric).
metric
- Performance metric that takes the true and predicted labels and returns a score.
alternative
- One-sided (`"less"` or `"greater"`) or `"two-sided"` alternative hypothesis.
n_iterations
- Randomly exchange the paired predictions this many times.
random_state
- Seed for reproducible permutations.
metrics_kwargs
- Pass additional keyword arguments to `metric`.
Returns
The p-value for observing the difference given \( H_0 \).
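For a runnable variant on synthetic data (hypothetical arrays; model 1 is constructed to be informative while model 2 is pure noise), one might write:
```python
import numpy as np
from sklearn.metrics import roc_auc_score

from statkit.non_parametric import paired_permutation_test

rng = np.random.default_rng(5)
y_test = np.tile([0, 1], 100)  # Balanced binary labels (200 samples).

# Model 1 tracks the labels (plus noise); model 2 is uninformative.
y_pred_1 = np.clip(y_test + rng.normal(scale=0.8, size=200), 0.0, 1.0)
y_pred_2 = rng.uniform(size=200)

# One-sided test: is model 1's ROC AUC significantly greater than model 2's?
p_value = paired_permutation_test(
    y_test,
    y_pred_1,
    y_pred_2,
    metric=roc_auc_score,
    alternative="greater",
    random_state=5,
)
print(p_value)  # Expect a small p-value: model 1 is genuinely better.
```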
def unpaired_permutation_test(y_true_1: Series, y_pred_1: Series, y_true_2: Series, y_pred_2: Series, metric: Callable, alternative: Literal['less', 'greater', 'two-sided'] = 'two-sided', n_iterations: int = 1000, random_state=None, metrics_kwargs: dict = {}) -> float
-
Unpaired permutation test comparing scores `y_pred_1` with `y_pred_2`.
Null hypothesis, \( H_0 \): the metric does not differ between the two groups.
Example
```python
from sklearn.metrics import roc_auc_score

unpaired_permutation_test(
    # Ground truth - prediction pair of the first sample set.
    y_test_1,
    y_pred_1,
    # Ground truth - prediction pair of the second sample set.
    y_test_2,
    y_pred_2,
    metric=roc_auc_score,
)
```
Args
y_true_1, y_true_2
- Ground truth labels of the unpaired groups.
y_pred_1, y_pred_2
- Predicted labels (or label probabilities, depending on the metric) of the corresponding groups.
metric
- Performance metric that takes the true and predicted labels and returns a score.
alternative
- One-sided (`"less"` or `"greater"`) or `"two-sided"` alternative hypothesis.
n_iterations
- Randomly permute the pooled data this many times.
random_state
- Seed for reproducible permutations.
metrics_kwargs
- Pass additional keyword arguments to `metric`.
Returns
The p-value for observing the difference given \( H_0 \).
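A self-contained sketch with two independent groups of different sizes (all data hypothetical; both prediction sets are random noise, so a large p-value is expected):
```python
import numpy as np
from pandas import Series
from sklearn.metrics import roc_auc_score

from statkit.non_parametric import unpaired_permutation_test

rng = np.random.default_rng(5)

# Two unpaired evaluation sets of different sizes.
y_test_1 = Series(np.tile([0, 1], 40))  # 80 samples.
y_pred_1 = Series(rng.uniform(size=80))
y_test_2 = Series(np.tile([0, 1], 60))  # 120 samples.
y_pred_2 = Series(rng.uniform(size=120))

p_value = unpaired_permutation_test(
    y_test_1, y_pred_1, y_test_2, y_pred_2, metric=roc_auc_score, random_state=5
)
print(p_value)
```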