Module statkit.metrics
Classification metrics not part of scikit-learn.
Source code
"""Classification metrics not part of sci-kit learn."""
from numpy import array, exp, isneginf, log, mean, ndarray, sum, where
from pandas import DataFrame, Series
from sklearn.metrics import (
accuracy_score,
average_precision_score,
f1_score,
roc_auc_score,
roc_curve,
)
from statkit.non_parametric import bootstrap_score
def youden_j_threshold(y_true, y_pred) -> float:
"""Classification threshold with highest Youden's J.
Args:
y_true: Ground truth labels.
        y_pred: Predicted scores (e.g., the probability of the positive class).
"""
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
j_scores = tpr - fpr
j_ordered = sorted(zip(j_scores, thresholds))
return j_ordered[-1][1]
def youden_j(y_true, y_pred) -> float:
r"""Classifier informedness as a balance between true and false postivies.
Youden's J statistic is defined as:
$$
J = r_{\mathrm{tp}} - r_{\mathrm{fp}}.
$$
Args:
y_true: Ground truth labels.
        y_pred: Predicted probability of the positive class (dichotomised at 0.5).
"""
return sensitivity(y_true, y_pred) + specificity(y_true, y_pred) - 1
def true_positive_rate(y_true, y_prob, threshold: float = 0.5) -> float:
r"""The number of true positive out of all positives (recall).
Aliases:
- Sensitivity,
- Recall,
- Hit rate.
$$r_{\mathrm{tp}} = \frac{t_p}{t_p + f_n} = \frac{t_p}{p}$$
Args:
y_true: Ground truth label (binarised).
y_prob: Probability of positive class.
        threshold: Classify as positive when the probability is at least this threshold.
"""
if not isinstance(y_true, (ndarray, Series)):
y_true = array(y_true)
if not isinstance(y_prob, (ndarray, Series)):
y_prob = array(y_prob)
y_pred = y_prob >= threshold
positives = sum(y_true)
true_positives = sum(y_true.astype(bool) & y_pred)
return true_positives / positives
def false_positive_rate(y_true, y_prob, threshold: float = 0.5) -> float:
r"""The number of false positive out of all negatives.
Also called the fall out rate.
$$r_{\mathrm{fp}} = \frac{f_p}{t_p + f_n} = \frac{f_p}{p}$$
Args:
y_true: Ground truth label (binarised).
y_prob: Probability of positive class.
        threshold: Classify as positive when the probability is at least this threshold.
"""
if not isinstance(y_true, (ndarray, Series)):
y_true = array(y_true)
if not isinstance(y_prob, (ndarray, Series)):
y_prob = array(y_prob)
    y_pred = y_prob >= threshold
negatives = y_true.size - sum(y_true)
# Actual negative, but classified as positive.
false_positives = sum((~y_true.astype(bool)) & y_pred)
return false_positives / negatives
def false_negative_rate(y_true, y_prob, threshold: float = 0.5) -> float:
r"""The number of false negatives out of all positives.
Also called the miss rate.
$$r_{\mathrm{fn}} = \frac{f_n}{t_p + f_n} = \frac{f_n}{p}$$
Args:
y_true: Ground truth label (binarised).
y_prob: Probability of positive class.
        threshold: Classify as positive when the probability is at least this threshold.
"""
if not isinstance(y_true, (ndarray, Series)):
y_true = array(y_true)
if not isinstance(y_prob, (ndarray, Series)):
y_prob = array(y_prob)
    y_pred = y_prob >= threshold
positives = sum(y_true)
# Actual positive, but classified as negative.
is_fn = (y_true == 1) & (y_pred == 0)
false_negatives = sum(is_fn)
return false_negatives / positives
def sensitivity(y_true, y_prob, threshold: float = 0.5) -> float:
r"""The number of true positive out of all positives.
Aliases:
- True positive rate,
- Recall,
- Hit rate.
$$r_{\mathrm{tp}} = \frac{t_p}{t_p + f_n} = \frac{t_p}{p}$$
Args:
y_true: Ground truth label (binarised).
y_prob: Probability of positive class.
        threshold: Classify as positive when the probability is at least this threshold.
"""
return true_positive_rate(y_true, y_prob, threshold)
def specificity(y_true, y_prob, threshold: float = 0.5) -> float:
r"""The number of true negatives out of all negatives.
$$r_{\mathrm{tn}} = \frac{t_n}{t_n + f_p} = \frac{t_n}{n} = 1 - r_{\mathrm{fp}}$$
Aliases:
- True negative rate,
- Selectivity.
Args:
y_true: Ground truth label (binarised).
y_prob: Probability of positive class.
        threshold: Classify as positive when the probability is at least this threshold.
"""
return 1 - false_positive_rate(y_true, y_prob, threshold)
def binary_classification_report(
y_true,
y_pred_proba,
threshold: float = 0.5,
n_iterations: int = 1000,
quantile_range: tuple = (0.025, 0.975),
random_state=None,
) -> DataFrame:
r"""Compile performance metrics of a binary classifier.
Evaluates the model according to the following metrics:
- Accuracy,
- Average precision (area under the precision-recall curve),
- \( F_1 \),
- Area under the receiver operating characteristic curve (ROC AUC),
    - Sensitivity (or true positive rate, see `sensitivity`),
    - Specificity (or true negative rate, see `specificity`).
Args:
y_true: Ground truth labels (0 or 1).
        y_pred_proba: Predicted probability of the positive class.
threshold: Dichotomise probabilities greater or equal to this threshold as
positive.
quantile_range: Confidence interval quantiles.
        n_iterations: Number of bootstrap resamples.
Returns:
A dataframe with the estimated classification metrics (`point`) and (by default
95 %) confidence interval (from `lower` to `upper`).
"""
scores = DataFrame(
index=[
"Accuracy",
"Average precision",
"$F_1$",
"ROC AUC",
"Sensitivity",
"Specificity",
],
columns=["point", "lower", "upper"],
)
kwargs = {
"n_iterations": n_iterations,
"random_state": random_state,
"quantile_range": quantile_range,
}
rank_scorers = {
"ROC AUC": roc_auc_score,
"Average precision": average_precision_score,
}
for name, scorer in rank_scorers.items():
score = bootstrap_score(y_true, y_pred_proba, metric=scorer, **kwargs)
scores.loc[name] = dict(score)
# Metrics that require the probability to be dichotomised.
class_scorers = {
"$F_1$": f1_score,
"Sensitivity": sensitivity,
"Specificity": specificity,
"Accuracy": accuracy_score,
}
for name, scorer in class_scorers.items():
y_pred = (y_pred_proba >= threshold).astype(int)
score = bootstrap_score(y_true, y_pred, metric=scorer, **kwargs)
scores.loc[name] = dict(score)
return scores
def perplexity(X, probs):
r"""Per word perplexity.
$$
\mathcal{L} = \exp\left(-\frac{1}{m}\sum_{i=1}^m \sum_{j=1}^n \frac{x_{ij}
\log p_{ij}}{\sum_{j=1}^n x_{ij}}\right)
$$
Args:
X: The word counts (a matrix of shape `(m, n)`).
probs: The probability of each word (a matrix of shape `(m, n)`).
Returns:
The perplexity over the dataset (a scalar), ranging from 1 (best) to infinity,
with a perplexity equal to `n` corresponding to a uniform distribution.
"""
n_words = X.sum(axis=1, keepdims=True)
log_probs = log(probs)
# Make sure we don't end up with nan because -inf * 0 = nan.
log_probs = where(isneginf(log_probs) & (X == 0.0), 0.0, log_probs)
log_likel = X * log_probs
# Carefully divide, since `n_words` might be zero.
is_observed = (n_words > 0).squeeze()
return exp(-mean(sum(log_likel[is_observed] / n_words[is_observed], axis=1)))
Functions
def binary_classification_report(y_true, y_pred_proba, threshold: float = 0.5, n_iterations: int = 1000, quantile_range: tuple = (0.025, 0.975), random_state=None) -> pandas.core.frame.DataFrame
Compile performance metrics of a binary classifier.
Evaluates the model according to the following metrics:
- Accuracy,
- Average precision (area under the precision-recall curve),
- \( F_1 \),
- Area under the receiver operating characteristic curve (ROC AUC),
- Sensitivity (or true positive rate, see sensitivity()),
- Specificity (or true negative rate, see specificity()).
Args
y_true
- Ground truth labels (0 or 1).
y_pred_proba
- Predicted probability of the positive class.
threshold
- Dichotomise probabilities greater or equal to this threshold as positive.
quantile_range
- Confidence interval quantiles.
n_iterations
- Number of bootstrap resamples.
Returns
A dataframe with the estimated classification metrics (`point`) and (by default 95 %) confidence interval (from `lower` to `upper`).
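Example
A minimal usage sketch on toy labels and probabilities (illustrative values only; a real evaluation would use held-out test data):
from numpy import array
from statkit.metrics import binary_classification_report
y_true = array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1] * 2)
y_prob = array([0.1, 0.2, 0.3, 0.4, 0.6, 0.4, 0.6, 0.7, 0.8, 0.9] * 2)
report = binary_classification_report(y_true, y_prob, n_iterations=100, random_state=5)
print(report)  # rows: Accuracy, Average precision, $F_1$, ROC AUC, Sensitivity, Specificity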
def false_negative_rate(y_true, y_prob, threshold: float = 0.5) -> float
The fraction of false negatives out of all positives.
Also called the miss rate.
$$r_{\mathrm{fn}} = \frac{f_n}{t_p + f_n} = \frac{f_n}{p}$$
Args
y_true
- Ground truth label (binarised).
y_prob
- Probability of positive class.
threshold
- Classify as positive when the probability is at least this threshold.
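Example
A small sketch with made-up values; two of the three positives are detected, so one is missed:
from statkit.metrics import false_negative_rate
y_true = [0, 1, 1, 0, 1]
y_prob = [0.2, 0.9, 0.4, 0.6, 0.8]
false_negative_rate(y_true, y_prob)  # 1 missed positive out of 3 -> 1/3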
def false_positive_rate(y_true, y_prob, threshold: float = 0.5) -> float
The fraction of false positives out of all negatives.
Also called the fall-out rate.
$$r_{\mathrm{fp}} = \frac{f_p}{f_p + t_n} = \frac{f_p}{n}$$
Args
y_true
- Ground truth label (binarised).
y_prob
- Probability of positive class.
threshold
- Classify as positive when the probability is at least this threshold.
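Example
A small sketch with made-up values; one of the two negatives is flagged as positive:
from statkit.metrics import false_positive_rate
y_true = [0, 1, 1, 0, 1]
y_prob = [0.2, 0.9, 0.4, 0.6, 0.8]
false_positive_rate(y_true, y_prob)  # 1 false alarm out of 2 negatives -> 1/2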
def perplexity(X, probs)
Per-word perplexity.
$$
\mathcal{L} = \exp\left(-\frac{1}{m}\sum_{i=1}^m \sum_{j=1}^n \frac{x_{ij} \log p_{ij}}{\sum_{j=1}^n x_{ij}}\right)
$$
Args
X
- The word counts (a matrix of shape (m, n)).
probs
- The probability of each word (a matrix of shape (m, n)).
Returns
The perplexity over the dataset (a scalar), ranging from 1 (best) to infinity, with a perplexity equal to n corresponding to a uniform distribution.
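Example
A minimal sketch with two toy documents over a three-word vocabulary (each row of probs sums to 1):
from numpy import array
from statkit.metrics import perplexity
X = array([[2.0, 1.0, 1.0], [0.0, 3.0, 1.0]])  # word counts per document
probs = array([[0.5, 0.25, 0.25], [0.25, 0.5, 0.25]])  # per-document word probabilities
perplexity(X, probs)  # a value near n = 3 would indicate an (almost) uniform fit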
def sensitivity(y_true, y_prob, threshold: float = 0.5) -> float
The fraction of true positives out of all positives.
Aliases
- True positive rate,
- Recall,
- Hit rate.
$$r_{\mathrm{tp}} = \frac{t_p}{t_p + f_n} = \frac{t_p}{p}$$
Args
y_true
- Ground truth label (binarised).
y_prob
- Probability of positive class.
threshold
- Classify as positive when the probability is at least this threshold.
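Example
sensitivity is a thin alias; a quick check on toy data that it matches true_positive_rate:
from statkit.metrics import sensitivity, true_positive_rate
y_true = [0, 1, 1, 0, 1]
y_prob = [0.2, 0.9, 0.4, 0.6, 0.8]
assert sensitivity(y_true, y_prob) == true_positive_rate(y_true, y_prob)  # 2/3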
def specificity(y_true, y_prob, threshold: float = 0.5) -> float
The fraction of true negatives out of all negatives.
$$r_{\mathrm{tn}} = \frac{t_n}{t_n + f_p} = \frac{t_n}{n} = 1 - r_{\mathrm{fp}}$$
Aliases
- True negative rate,
- Selectivity.
Args
y_true
- Ground truth label (binarised).
y_prob
- Probability of positive class.
threshold
- Classify as positive when the probability is at least this threshold.
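Example
A small sketch with made-up values; one of the two negatives is correctly rejected:
from statkit.metrics import specificity
y_true = [0, 1, 1, 0, 1]
y_prob = [0.2, 0.9, 0.4, 0.6, 0.8]
specificity(y_true, y_prob)  # 1 - false_positive_rate -> 1/2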
def true_positive_rate(y_true, y_prob, threshold: float = 0.5) -> float
The fraction of true positives out of all positives (recall).
Aliases
- Sensitivity,
- Recall,
- Hit rate.
$$r_{\mathrm{tp}} = \frac{t_p}{t_p + f_n} = \frac{t_p}{p}$$
Args
y_true
- Ground truth label (binarised).
y_prob
- Probability of positive class.
threshold
- Classify as positive when the probability is at least this threshold.
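Example
A small sketch with made-up values; two of the three positives score above the default threshold of 0.5:
from statkit.metrics import true_positive_rate
y_true = [0, 1, 1, 0, 1]
y_prob = [0.2, 0.9, 0.4, 0.6, 0.8]
true_positive_rate(y_true, y_prob)  # 2 detected positives out of 3 -> 2/3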
def youden_j(y_true, y_pred) -> float
Classifier informedness as a balance between true and false positive rates.
Youden's J statistic is defined as:
$$
J = r_{\mathrm{tp}} - r_{\mathrm{fp}}.
$$
Args
y_true
- Ground truth labels.
y_pred
- Predicted probability of the positive class (dichotomised at 0.5).
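Example
A small sketch with made-up values; J combines the sensitivity and specificity from the examples above:
from statkit.metrics import youden_j
y_true = [0, 1, 1, 0, 1]
y_prob = [0.2, 0.9, 0.4, 0.6, 0.8]
youden_j(y_true, y_prob)  # sensitivity + specificity - 1 = 2/3 + 1/2 - 1 = 1/6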
def youden_j_threshold(y_true, y_pred) -> float
Classification threshold with highest Youden's J.
Args
y_true
- Ground truth labels.
y_pred
- Predicted scores (e.g., the probability of the positive class).
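Example
A small sketch with made-up scores; the returned threshold is the point on the ROC curve that maximises tpr - fpr:
from statkit.metrics import youden_j_threshold
y_true = [0, 0, 1, 0, 1, 1]
y_prob = [0.1, 0.3, 0.4, 0.5, 0.7, 0.9]
tau = youden_j_threshold(y_true, y_prob)
# tau can then be used to dichotomise new predictions: y_prob >= tau.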