Source code for scitex_seizure_metrics.calibration

"""Calibration metrics — reliability diagrams and Brier-score
decomposition.

Brier(p, y) = mean((p - y)^2)
            = reliability − resolution + uncertainty       (Murphy 1973)

where, for K probability bins (equal-width by default, or equal-frequency):
- reliability = sum_k (n_k/N) * (mean_p_k - mean_y_k)^2
                  → 0 when predicted probabilities match observed freqs
- resolution  = sum_k (n_k/N) * (mean_y_k - mean_y)^2
                  → high when bins differ in their observed positive rate
- uncertainty = mean_y * (1 - mean_y)
                  → property of the data alone (max 0.25 at mean_y = 0.5)

Smaller Brier is better; the decomposition tells you whether bad scores
come from miscalibration (reliability) or low discrimination (low
resolution).
"""
from __future__ import annotations

from dataclasses import dataclass

import numpy as np


@dataclass
class CalibrationReport:
    """Calibration scores together with the points of the reliability curve.

    The scalar fields hold the Brier score, its three decomposition terms
    and the expected calibration error. The ``bin_*`` arrays hold one entry
    per probability bin.
    """

    # --- scalar scores ---
    brier: float
    reliability: float
    resolution: float
    uncertainty: float
    expected_calibration_error: float

    # --- per-bin reliability table ---
    bin_centers: np.ndarray  # mean predicted probability in each bin
    bin_observed: np.ndarray  # observed positive rate in each bin
    bin_counts: np.ndarray  # number of samples in each bin
def calibration_report(y_true, y_proba, *, n_bins: int = 10,
                       strategy: str = "uniform") -> CalibrationReport:
    """Compute the Brier score, its binned decomposition and a reliability table.

    Args:
        y_true: binary labels; converted to float and flattened to 1-D.
        y_proba: continuous predictions in [0, 1]; converted to float and
            flattened to 1-D.
        n_bins: number of probability bins for the reliability table
            (must be >= 1).
        strategy: 'uniform' (equal-width bins on [0, 1]) or 'quantile'
            (equal-frequency bins taken from the quantiles of ``y_proba``).
            With 'quantile', duplicate quantile edges are merged, so the
            table may contain fewer than ``n_bins`` bins.

    Returns:
        CalibrationReport. Empty bins stay in the table with a count of 0,
        the bin midpoint as ``bin_centers`` and 0.0 as ``bin_observed``;
        they carry zero weight in reliability, resolution and ECE.

    Raises:
        ValueError: if the flattened inputs differ in length, are empty,
            ``n_bins`` is smaller than 1, or ``strategy`` is not recognised.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_proba = np.asarray(y_proba, dtype=float).ravel()
    if y_true.shape != y_proba.shape:
        raise ValueError("y_true and y_proba shape mismatch")
    n = y_true.size
    if n == 0:
        raise ValueError("empty inputs")
    # Without this guard a non-positive n_bins produces an empty bin table
    # and all-zero scores, which would look like perfect calibration.
    if n_bins < 1:
        raise ValueError(f"n_bins must be >= 1, got {n_bins!r}")

    # Bin edges
    if strategy == "uniform":
        edges = np.linspace(0.0, 1.0, n_bins + 1)
    elif strategy == "quantile":
        edges = np.unique(
            np.quantile(y_proba, np.linspace(0.0, 1.0, n_bins + 1))
        )
        # All predictions identical -> a single unique edge; fall back to
        # one bin spanning the whole probability range.
        if edges.size < 2:
            edges = np.array([0.0, 1.0])
    else:
        raise ValueError(f"unknown strategy {strategy!r}")

    # Only the interior edges are used for assignment, so values at or
    # beyond the outer edges fall into the first / last bin.
    n_slots = len(edges) - 1
    bin_idx = np.clip(
        np.digitize(y_proba, edges[1:-1], right=False), 0, n_slots - 1
    )

    bin_p, bin_o, bin_c = [], [], []
    for k in range(n_slots):
        mask = bin_idx == k
        if mask.any():
            bin_p.append(float(y_proba[mask].mean()))
            bin_o.append(float(y_true[mask].mean()))
            bin_c.append(int(mask.sum()))
        else:
            # Placeholder row for an empty bin (weight 0 in every score).
            bin_p.append(float((edges[k] + edges[k + 1]) / 2))
            bin_o.append(0.0)
            bin_c.append(0)
    bin_p = np.asarray(bin_p)
    bin_o = np.asarray(bin_o)
    bin_c = np.asarray(bin_c)

    base_rate = float(y_true.mean())
    weight = bin_c / n  # n > 0 is guaranteed by the check above
    gap = bin_p - bin_o

    reliability = float(np.sum(weight * gap ** 2))
    resolution = float(np.sum(weight * (bin_o - base_rate) ** 2))
    uncertainty = float(base_rate * (1.0 - base_rate))
    brier = float(np.mean((y_proba - y_true) ** 2))
    # ECE = sum_k (n_k/N) * |mean_p_k - mean_y_k|
    ece = float(np.sum(weight * np.abs(gap)))

    return CalibrationReport(
        brier=brier,
        reliability=reliability,
        resolution=resolution,
        uncertainty=uncertainty,
        expected_calibration_error=ece,
        bin_centers=bin_p,
        bin_observed=bin_o,
        bin_counts=bin_c,
    )