Edit on GitHub

insurance_gam.ebm

insurance_gam.ebm — EBM subpackage.

Re-exports the full public API of the original insurance-ebm package.

Requires the ebm extra::

pip install insurance-gam[ebm]
 1"""
 2insurance_gam.ebm — EBM subpackage.
 3
 4Re-exports the full public API of the original insurance-ebm package.
 5
 6Requires the ``ebm`` extra::
 7
 8    pip install insurance-gam[ebm]
 9"""
10
11try:
12    from ._model import InsuranceEBM
13    from ._relativities import RelativitiesTable
14    from ._monotonicity import MonotonicityEditor
15    from ._comparison import GLMComparison
16    from . import _diagnostics as diagnostics
17
18    from ._diagnostics import (
19        gini,
20        lorenz_curve,
21        double_lift,
22        deviance,
23        residual_plot,
24        calibration_table,
25    )
26except ImportError as _e:
27    raise ImportError(
28        f"insurance_gam.ebm requires the 'interpret' package. "
29        f"Install it with: pip install insurance-gam[ebm]\n"
30        f"Original error: {_e}"
31    ) from _e
32
33__all__ = [
34    "InsuranceEBM",
35    "RelativitiesTable",
36    "MonotonicityEditor",
37    "GLMComparison",
38    "diagnostics",
39    "gini",
40    "lorenz_curve",
41    "double_lift",
42    "deviance",
43    "residual_plot",
44    "calibration_table",
45]
class InsuranceEBM:
    """
    Explainable Boosting Machine configured for insurance pricing.

    Wraps interpretML's ExplainableBoostingRegressor and adds:
    - Exposure-aware fitting via log(exposure) init_score
    - Polars DataFrame input support
    - predict() returning expected values (frequency, severity, or pure premium)
      rather than raw log scores
    - Deviance scoring with the correct GLM family

    Parameters
    ----------
    loss : str
        Loss function. One of 'poisson' (default), 'tweedie', 'gamma', 'mse',
        'mae', 'huber'. Poisson is standard for claim frequency; Tweedie or
        Gamma for pure premium or severity.
    variance_power : float
        Tweedie variance power. Only used when loss='tweedie'. Set to 1.0 for
        Poisson, 2.0 for Gamma, 1.5 for compound Poisson-Gamma (pure premium).
    interactions : int or str
        Number of pairwise interaction terms, or a multiplier string of the form
        ``'Nx'`` (e.g. ``'3x'``, ``'1x'``) meaning ``N * n_features`` interaction
        terms. ``'3x'`` is the default and usually works well. Use ``0`` or ``'0x'``
        to disable interactions entirely (reduces memory and training time).
        Passed through to ExplainableBoostingRegressor as ``interactions``.
    monotone_constraints : dict, optional
        Mapping of {feature_name: direction} where direction is +1 (increasing)
        or -1 (decreasing). Applied at fit time via the interpretML API.
    **ebm_kwargs
        Any additional keyword arguments forwarded to ExplainableBoostingRegressor.
        Common overrides: n_estimators, learning_rate, max_bins, min_samples_leaf,
        random_state, n_jobs.

    Notes
    -----
    interpretML uses a log link for Poisson, Tweedie, and Gamma families.
    Exposure is incorporated as an offset: init_score = log(exposure). When no
    exposure is provided, all exposures are treated as 1 (no offset needed).
    """

    def __init__(
        self,
        loss: str = "poisson",
        variance_power: float = 1.5,
        interactions: Union[int, str] = "3x",
        monotone_constraints: Optional[dict] = None,
        **ebm_kwargs,
    ) -> None:
        if loss not in _SUPPORTED_LOSSES:
            raise ValueError(f"loss must be one of {_SUPPORTED_LOSSES}, got '{loss}'")
        self.loss = loss
        self.variance_power = variance_power
        self.interactions = interactions
        self.monotone_constraints = monotone_constraints or {}
        self.ebm_kwargs = ebm_kwargs
        # Populated by fit(); the underlying EBM is built lazily.
        self.ebm_: Optional[ExplainableBoostingRegressor] = None
        self._feature_names: Optional[list] = None

    def _build_ebm(self, feature_names: list) -> ExplainableBoostingRegressor:
        """Instantiate the underlying EBM with appropriate settings."""
        # Resolve interactions: 'Nx' means N * n_features (e.g. '3x', '1x', '0x')
        if isinstance(self.interactions, str):
            m = re.match(r'^(\d+)x$', self.interactions)
            if m:
                n_interactions = int(m.group(1)) * len(feature_names)
            else:
                raise ValueError(
                    f"interactions string must be of the form 'Nx' (e.g. '3x', '1x'), "
                    f"got {self.interactions!r}"
                )
        else:
            n_interactions = int(self.interactions)

        # Map loss string to interpretML objective.
        # For Tweedie, the variance power must be embedded in the objective
        # string as "tweedie_deviance:variance_power=<p>". Without it,
        # interpretML falls back to a default power and the deviance is wrong.
        if self.loss == "tweedie":
            objective = f"tweedie_deviance:variance_power={self.variance_power}"
        else:
            objective = {
                "poisson": "poisson_deviance",
                "gamma": "gamma_deviance",
                "mse": "rmse",
                "mae": "mae",
                "huber": "huber",
            }[self.loss]

        # Monotone constraints aligned to feature order; 0 means unconstrained.
        mc = [self.monotone_constraints.get(fn, 0) for fn in feature_names]

        kwargs = dict(self.ebm_kwargs)
        kwargs.setdefault("random_state", 42)
        kwargs.setdefault("n_jobs", -1)

        ebm_args = {
            "interactions": n_interactions,
            "feature_names": feature_names,
            "feature_types": None,  # let interpretML infer
            "objective": objective,
            # Pass None (not a list of zeros) when nothing is constrained.
            "monotone_constraints": mc if any(v != 0 for v in mc) else None,
        }
        # No extra kwarg is needed for Tweedie: the variance power is already
        # embedded in the objective string above.
        ebm_args.update(kwargs)
        return ExplainableBoostingRegressor(**ebm_args)

    def fit(
        self,
        X: Union[pl.DataFrame, pd.DataFrame],
        y: Union[np.ndarray, list, pl.Series, pd.Series],
        exposure: Optional[Union[np.ndarray, list, pl.Series, pd.Series]] = None,
        sample_weight: Optional[Union[np.ndarray, list, pl.Series, pd.Series]] = None,
    ) -> "InsuranceEBM":
        """
        Fit the EBM to insurance training data.

        Parameters
        ----------
        X : polars.DataFrame or pandas.DataFrame
            Feature matrix. Column names are used as feature identifiers.
        y : array-like
            Target variable. For Poisson frequency models this is claim count.
            For Tweedie pure premium models this is incurred loss amount (may
            include zeros). For Gamma severity models this is average claim
            cost on non-zero rows only.
        exposure : array-like, optional
            Exposure measure (e.g. earned car years). Incorporated as a log
            offset: init_score = log(exposure). If omitted, all exposures are
            assumed to be 1.
        sample_weight : array-like, optional
            Row-level observation weights. Applied in addition to exposure.
            Useful for credibility weighting or re-balancing portfolio mix.

        Returns
        -------
        self
        """
        X_pd = _to_pandas(X)
        self._feature_names = list(X_pd.columns)

        y_arr = _ensure_array(y)
        exp_arr = _ensure_array(exposure)
        sw_arr = _ensure_array(sample_weight)

        self.ebm_ = self._build_ebm(self._feature_names)

        fit_kwargs: dict = {}

        if exp_arr is not None:
            # Exposure enters as a log offset (init_score); log() requires > 0.
            if np.any(exp_arr <= 0):
                raise ValueError("All exposure values must be strictly positive.")
            fit_kwargs["init_score"] = np.log(exp_arr)

        if sw_arr is not None:
            fit_kwargs["sample_weight"] = sw_arr

        self.ebm_.fit(X_pd, y_arr, **fit_kwargs)
        return self

    def _check_fitted(self) -> None:
        # Guard shared by every post-fit accessor.
        if self.ebm_ is None:
            raise RuntimeError("Model has not been fitted yet. Call fit() first.")

    def predict(
        self,
        X: Union[pl.DataFrame, pd.DataFrame],
        exposure: Optional[Union[np.ndarray, list, pl.Series, pd.Series]] = None,
    ) -> np.ndarray:
        """
        Predict expected values on the original (not log) scale.

        For log-link families (Poisson, Tweedie, Gamma), the EBM produces additive
        scores on the log scale. This method applies exp() and multiplies by
        exposure to return predicted counts or amounts.

        Parameters
        ----------
        X : polars.DataFrame or pandas.DataFrame
            Feature matrix. Must have the same columns as the training data.
        exposure : array-like, optional
            Exposure values for prediction. If provided, predictions are scaled
            by exposure (equivalent to adding log(exposure) to the log score).
            For rate models, pass exposure=None and scale externally.

        Returns
        -------
        numpy.ndarray
            Predicted values on the response scale.
        """
        self._check_fitted()
        X_pd = _to_pandas(X)
        log_scores = self.ebm_.predict(X_pd)

        _log_link_losses = {"poisson", "tweedie", "gamma"}
        if self.loss in _log_link_losses:
            predictions = np.exp(log_scores)
            if exposure is not None:
                exp_arr = _ensure_array(exposure)
                if np.any(exp_arr <= 0):
                    raise ValueError("All exposure values must be strictly positive.")
                predictions = predictions * exp_arr
        else:
            # Identity-link losses: scores are already on the response scale.
            predictions = log_scores

        return predictions

    def predict_log_score(
        self,
        X: Union[pl.DataFrame, pd.DataFrame],
    ) -> np.ndarray:
        """
        Return raw additive scores on the log scale (before exp transformation).

        Useful for combining predictions from separate frequency and severity
        models on a common additive scale.

        Parameters
        ----------
        X : polars.DataFrame or pandas.DataFrame
            Feature matrix.

        Returns
        -------
        numpy.ndarray
            Log-scale scores.
        """
        self._check_fitted()
        X_pd = _to_pandas(X)
        return self.ebm_.predict(X_pd)

    def score(
        self,
        X: Union[pl.DataFrame, pd.DataFrame],
        y: Union[np.ndarray, list, pl.Series, pd.Series],
        exposure: Optional[Union[np.ndarray, list, pl.Series, pd.Series]] = None,
    ) -> float:
        """
        Compute mean deviance on test data (lower is better).

        The deviance family matches the loss used at fit time. For Tweedie,
        the same variance_power is used.

        Parameters
        ----------
        X : polars.DataFrame or pandas.DataFrame
            Feature matrix.
        y : array-like
            Observed values. For Poisson frequency models, this must be raw
            claim counts (not rates). When exposure is provided, the model
            computes expected counts (mu = exp(log_score) * exposure) and
            compares to y; passing claim rates instead of counts will give
            incorrect deviance values.
        exposure : array-like, optional
            Exposure values used to scale predictions.

        Returns
        -------
        float
            Mean deviance (negated so higher = better, consistent with sklearn).
        """
        self._check_fitted()
        y_arr = _ensure_array(y)
        exp_arr = _ensure_array(exposure)
        y_pred = self.predict(X, exposure=exposure)
        weights = exp_arr  # use exposure as weights in deviance calculation

        if self.loss == "poisson":
            dev = _deviance_poisson(y_arr, y_pred, weights)
        elif self.loss == "gamma":
            dev = _deviance_gamma(y_arr, y_pred, weights)
        elif self.loss == "tweedie":
            dev = _deviance_tweedie(y_arr, y_pred, self.variance_power, weights)
        else:
            # MSE for non-GLM losses
            if weights is not None:
                dev = float(np.average((y_arr - y_pred) ** 2, weights=weights))
            else:
                dev = float(np.mean((y_arr - y_pred) ** 2))

        return -dev  # higher is better convention

    @property
    def feature_names(self) -> list:
        """Feature names from the training data."""
        self._check_fitted()
        return self._feature_names

    def __repr__(self) -> str:
        fitted = self.ebm_ is not None
        return (
            f"InsuranceEBM(loss='{self.loss}', "
            f"variance_power={self.variance_power}, "
            f"interactions='{self.interactions}', "
            f"fitted={fitted})"
        )

Explainable Boosting Machine configured for insurance pricing.

Wraps interpretML's ExplainableBoostingRegressor and adds:

  • Exposure-aware fitting via log(exposure) init_score
  • Polars DataFrame input support
  • predict() returning expected values (frequency, severity, or pure premium) rather than raw log scores
  • Deviance scoring with the correct GLM family

Parameters

loss : str
    Loss function. One of 'poisson' (default), 'tweedie', 'gamma', 'mse', 'mae', 'huber'. Poisson is standard for claim frequency; Tweedie or Gamma for pure premium or severity.
variance_power : float
    Tweedie variance power. Only used when loss='tweedie'. Set to 1.0 for Poisson, 2.0 for Gamma, 1.5 for compound Poisson-Gamma (pure premium).
interactions : int or str
    Number of pairwise interaction terms, or a multiplier string of the form 'Nx' (e.g. '3x', '1x') meaning N * n_features interaction terms. '3x' is the default and usually works well. Use 0 or '0x' to disable interactions entirely (reduces memory and training time). Passed through to ExplainableBoostingRegressor as interactions.
monotone_constraints : dict, optional
    Mapping of {feature_name: direction} where direction is +1 (increasing) or -1 (decreasing). Applied at fit time via the interpretML API.
**ebm_kwargs
    Any additional keyword arguments forwarded to ExplainableBoostingRegressor. Common overrides: n_estimators, learning_rate, max_bins, min_samples_leaf, random_state, n_jobs.

Notes

interpretML uses a log link for Poisson, Tweedie, and Gamma families. Exposure is incorporated as an offset: init_score = log(exposure). When no exposure is provided, all exposures are treated as 1 (no offset needed).

InsuranceEBM( loss: str = 'poisson', variance_power: float = 1.5, interactions: Union[int, str] = '3x', monotone_constraints: Optional[dict] = None, **ebm_kwargs)
139    def __init__(
140        self,
141        loss: str = "poisson",
142        variance_power: float = 1.5,
143        interactions: Union[int, str] = "3x",
144        monotone_constraints: Optional[dict] = None,
145        **ebm_kwargs,
146    ) -> None:
147        if loss not in _SUPPORTED_LOSSES:
148            raise ValueError(f"loss must be one of {_SUPPORTED_LOSSES}, got '{loss}'")
149        self.loss = loss
150        self.variance_power = variance_power
151        self.interactions = interactions
152        self.monotone_constraints = monotone_constraints or {}
153        self.ebm_kwargs = ebm_kwargs
154        self.ebm_: Optional[ExplainableBoostingRegressor] = None
155        self._feature_names: Optional[list] = None
loss
variance_power
interactions
monotone_constraints
ebm_kwargs
ebm_: Optional[interpret.glassbox._ebm._ebm.ExplainableBoostingRegressor]
def fit( self, X: Union[polars.dataframe.frame.DataFrame, pandas.DataFrame], y: Union[numpy.ndarray, list, polars.series.series.Series, pandas.Series], exposure: Union[numpy.ndarray, list, polars.series.series.Series, pandas.Series, NoneType] = None, sample_weight: Union[numpy.ndarray, list, polars.series.series.Series, pandas.Series, NoneType] = None) -> InsuranceEBM:
213    def fit(
214        self,
215        X: Union[pl.DataFrame, pd.DataFrame],
216        y: Union[np.ndarray, list, pl.Series, pd.Series],
217        exposure: Optional[Union[np.ndarray, list, pl.Series, pd.Series]] = None,
218        sample_weight: Optional[Union[np.ndarray, list, pl.Series, pd.Series]] = None,
219    ) -> "InsuranceEBM":
220        """
221        Fit the EBM to insurance training data.
222
223        Parameters
224        ----------
225        X : polars.DataFrame or pandas.DataFrame
226            Feature matrix. Column names are used as feature identifiers.
227        y : array-like
228            Target variable. For Poisson frequency models this is claim count.
229            For Tweedie pure premium models this is incurred loss amount (may
230            include zeros). For Gamma severity models this is average claim
231            cost on non-zero rows only.
232        exposure : array-like, optional
233            Exposure measure (e.g. earned car years). Incorporated as a log
234            offset: init_score = log(exposure). If omitted, all exposures are
235            assumed to be 1.
236        sample_weight : array-like, optional
237            Row-level observation weights. Applied in addition to exposure.
238            Useful for credibility weighting or re-balancing portfolio mix.
239
240        Returns
241        -------
242        self
243        """
244        X_pd = _to_pandas(X)
245        self._feature_names = list(X_pd.columns)
246
247        y_arr = _ensure_array(y)
248        exp_arr = _ensure_array(exposure)
249        sw_arr = _ensure_array(sample_weight)
250
251        self.ebm_ = self._build_ebm(self._feature_names)
252
253        fit_kwargs: dict = {}
254
255        if exp_arr is not None:
256            # Exposure enters as a log offset (init_score)
257            if np.any(exp_arr <= 0):
258                raise ValueError("All exposure values must be strictly positive.")
259            fit_kwargs["init_score"] = np.log(exp_arr)
260
261        if sw_arr is not None:
262            fit_kwargs["sample_weight"] = sw_arr
263
264        self.ebm_.fit(X_pd, y_arr, **fit_kwargs)
265        return self

Fit the EBM to insurance training data.

Parameters

X : polars.DataFrame or pandas.DataFrame Feature matrix. Column names are used as feature identifiers. y : array-like Target variable. For Poisson frequency models this is claim count. For Tweedie pure premium models this is incurred loss amount (may include zeros). For Gamma severity models this is average claim cost on non-zero rows only. exposure : array-like, optional Exposure measure (e.g. earned car years). Incorporated as a log offset: init_score = log(exposure). If omitted, all exposures are assumed to be 1. sample_weight : array-like, optional Row-level observation weights. Applied in addition to exposure. Useful for credibility weighting or re-balancing portfolio mix.

Returns

self

def predict( self, X: Union[polars.dataframe.frame.DataFrame, pandas.DataFrame], exposure: Union[numpy.ndarray, list, polars.series.series.Series, pandas.Series, NoneType] = None) -> numpy.ndarray:
271    def predict(
272        self,
273        X: Union[pl.DataFrame, pd.DataFrame],
274        exposure: Optional[Union[np.ndarray, list, pl.Series, pd.Series]] = None,
275    ) -> np.ndarray:
276        """
277        Predict expected values on the original (not log) scale.
278
279        For log-link families (Poisson, Tweedie, Gamma), the EBM produces additive
280        scores on the log scale. This method applies exp() and multiplies by
281        exposure to return predicted counts or amounts.
282
283        Parameters
284        ----------
285        X : polars.DataFrame or pandas.DataFrame
286            Feature matrix. Must have the same columns as the training data.
287        exposure : array-like, optional
288            Exposure values for prediction. If provided, predictions are scaled
289            by exposure (equivalent to adding log(exposure) to the log score).
290            For rate models, pass exposure=None and scale externally.
291
292        Returns
293        -------
294        numpy.ndarray
295            Predicted values on the response scale.
296        """
297        self._check_fitted()
298        X_pd = _to_pandas(X)
299        log_scores = self.ebm_.predict(X_pd)
300
301        _log_link_losses = {"poisson", "tweedie", "gamma"}
302        if self.loss in _log_link_losses:
303            predictions = np.exp(log_scores)
304            if exposure is not None:
305                exp_arr = _ensure_array(exposure)
306                if np.any(exp_arr <= 0):
307                    raise ValueError("All exposure values must be strictly positive.")
308                predictions = predictions * exp_arr
309        else:
310            predictions = log_scores
311
312        return predictions

Predict expected values on the original (not log) scale.

For log-link families (Poisson, Tweedie, Gamma), the EBM produces additive scores on the log scale. This method applies exp() and multiplies by exposure to return predicted counts or amounts.

Parameters

X : polars.DataFrame or pandas.DataFrame Feature matrix. Must have the same columns as the training data. exposure : array-like, optional Exposure values for prediction. If provided, predictions are scaled by exposure (equivalent to adding log(exposure) to the log score). For rate models, pass exposure=None and scale externally.

Returns

numpy.ndarray Predicted values on the response scale.

def predict_log_score( self, X: Union[polars.dataframe.frame.DataFrame, pandas.DataFrame]) -> numpy.ndarray:
314    def predict_log_score(
315        self,
316        X: Union[pl.DataFrame, pd.DataFrame],
317    ) -> np.ndarray:
318        """
319        Return raw additive scores on the log scale (before exp transformation).
320
321        Useful for combining predictions from separate frequency and severity
322        models on a common additive scale.
323
324        Parameters
325        ----------
326        X : polars.DataFrame or pandas.DataFrame
327            Feature matrix.
328
329        Returns
330        -------
331        numpy.ndarray
332            Log-scale scores.
333        """
334        self._check_fitted()
335        X_pd = _to_pandas(X)
336        return self.ebm_.predict(X_pd)

Return raw additive scores on the log scale (before exp transformation).

Useful for combining predictions from separate frequency and severity models on a common additive scale.

Parameters

X : polars.DataFrame or pandas.DataFrame Feature matrix.

Returns

numpy.ndarray Log-scale scores.

def score( self, X: Union[polars.dataframe.frame.DataFrame, pandas.DataFrame], y: Union[numpy.ndarray, list, polars.series.series.Series, pandas.Series], exposure: Union[numpy.ndarray, list, polars.series.series.Series, pandas.Series, NoneType] = None) -> float:
338    def score(
339        self,
340        X: Union[pl.DataFrame, pd.DataFrame],
341        y: Union[np.ndarray, list, pl.Series, pd.Series],
342        exposure: Optional[Union[np.ndarray, list, pl.Series, pd.Series]] = None,
343    ) -> float:
344        """
345        Compute mean deviance on test data (lower is better).
346
347        The deviance family matches the loss used at fit time. For Tweedie,
348        the same variance_power is used.
349
350        Parameters
351        ----------
352        X : polars.DataFrame or pandas.DataFrame
353            Feature matrix.
354        y : array-like
355            Observed values. For Poisson frequency models, this must be raw
356            claim counts (not rates). When exposure is provided, the model
357            computes expected counts (mu = exp(log_score) * exposure) and
358            compares to y; passing claim rates instead of counts will give
359            incorrect deviance values.
360        exposure : array-like, optional
361            Exposure values used to scale predictions.
362
363        Returns
364        -------
365        float
366            Mean deviance (negated so higher = better, consistent with sklearn).
367        """
368        self._check_fitted()
369        y_arr = _ensure_array(y)
370        exp_arr = _ensure_array(exposure)
371        y_pred = self.predict(X, exposure=exposure)
372        weights = exp_arr  # use exposure as weights in deviance calculation
373
374        if self.loss == "poisson":
375            dev = _deviance_poisson(y_arr, y_pred, weights)
376        elif self.loss == "gamma":
377            dev = _deviance_gamma(y_arr, y_pred, weights)
378        elif self.loss == "tweedie":
379            dev = _deviance_tweedie(y_arr, y_pred, self.variance_power, weights)
380        else:
381            # MSE for non-GLM losses
382            if weights is not None:
383                dev = float(np.average((y_arr - y_pred) ** 2, weights=weights))
384            else:
385                dev = float(np.mean((y_arr - y_pred) ** 2))
386
387        return -dev  # higher is better convention

Compute mean deviance on test data (lower is better).

The deviance family matches the loss used at fit time. For Tweedie, the same variance_power is used.

Parameters

X : polars.DataFrame or pandas.DataFrame Feature matrix. y : array-like Observed values. For Poisson frequency models, this must be raw claim counts (not rates). When exposure is provided, the model computes expected counts (mu = exp(log_score) * exposure) and compares to y; passing claim rates instead of counts will give incorrect deviance values. exposure : array-like, optional Exposure values used to scale predictions.

Returns

float Mean deviance (negated so higher = better, consistent with sklearn).

feature_names: list
389    @property
390    def feature_names(self) -> list:
391        """Feature names from the training data."""
392        self._check_fitted()
393        return self._feature_names

Feature names from the training data.

class RelativitiesTable:
    """
    Extract and present insurance-standard relativity tables from a fitted InsuranceEBM.

    A relativity is defined as exp(score_bin - score_base) where the base bin is the
    modal bin (the one with the highest training weight). A relativity of 1.0 means
    the bin contributes the same as the base level; >1 means higher risk, <1 lower.

    Parameters
    ----------
    model : InsuranceEBM
        A fitted InsuranceEBM instance.

    Examples
    --------
    >>> rt = RelativitiesTable(model)
    >>> rt.table("driver_age")
    shape: (10, 3)
    ┌────────────────┬───────────┬────────────┐
    │ bin_label      ┆ raw_score ┆ relativity │
    ...
    """

    def __init__(self, model: InsuranceEBM) -> None:
        # Fail fast: a RelativitiesTable over an unfitted model is meaningless.
        model._check_fitted()
        self.model = model
        self._ebm = model.ebm_

    def table(self, feature: str) -> pl.DataFrame:
        """
        Return relativity table for a single feature.

        Parameters
        ----------
        feature : str
            Name of the feature (must be a main effect, not an interaction).

        Returns
        -------
        polars.DataFrame
            Columns: bin_label (str), raw_score (float), relativity (float).
            The base bin has relativity = 1.0.
        """
        names, scores = _get_ebm_shape(self._ebm, feature)
        base_idx = _modal_bin_idx(self._ebm, feature)

        # Clip base_idx to valid range (can happen if weights vector is shorter)
        base_idx = min(base_idx, len(scores) - 1)
        base_score = scores[base_idx]

        relativities = np.exp(scores - base_score)

        return pl.DataFrame(
            {
                "bin_label": names.tolist(),
                "raw_score": scores.tolist(),
                "relativity": relativities.tolist(),
            }
        )

    def plot(
        self,
        feature: str,
        kind: str = "bar",
        ax=None,
        title: Optional[str] = None,
    ):
        """
        Bar or line chart of relativities for a feature.

        Parameters
        ----------
        feature : str
            Feature name.
        kind : str
            'bar' (default) or 'line'.
        ax : matplotlib.axes.Axes, optional
            Axes to draw on. Creates a new figure if None.
        title : str, optional
            Plot title. Defaults to the feature name.

        Returns
        -------
        matplotlib.axes.Axes
        """
        # Local import keeps matplotlib optional for non-plotting use.
        import matplotlib.pyplot as plt

        df = self.table(feature)
        labels = df["bin_label"].to_list()
        rels = df["relativity"].to_list()

        if ax is None:
            # Widen the figure for features with many bins.
            fig, ax = plt.subplots(figsize=(max(8, len(labels) * 0.6), 5))

        if kind == "bar":
            # Red = surcharge (>1), green = discount (<=1).
            colours = ["#d62728" if r > 1.0 else "#2ca02c" for r in rels]
            ax.bar(range(len(labels)), rels, color=colours, alpha=0.8, edgecolor="white")
        elif kind == "line":
            ax.plot(range(len(labels)), rels, marker="o", color="#1f77b4", linewidth=2)
            ax.fill_between(range(len(labels)), 1.0, rels, alpha=0.15, color="#1f77b4")
        else:
            raise ValueError(f"kind must be 'bar' or 'line', got '{kind}'")

        # Reference line at the base level (relativity 1.0).
        ax.axhline(1.0, color="black", linewidth=0.8, linestyle="--")
        ax.set_xticks(range(len(labels)))
        ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=9)
        ax.set_ylabel("Relativity")
        ax.set_title(title or feature)
        ax.grid(axis="y", alpha=0.3)

        return ax

    def export_excel(self, path: Union[str, Path]) -> None:
        """
        Export relativity tables for all main-effect features to an Excel workbook.

        One sheet per feature. Interaction terms are skipped (they have 2-D score
        arrays that don't translate cleanly to a flat relativity table).

        Parameters
        ----------
        path : str or Path
            Output path for the .xlsx file.

        Raises
        ------
        ImportError
            If openpyxl is not installed.
        RuntimeError
            If no main-effect feature table could be extracted.

        Notes
        -----
        Requires openpyxl. Install with: pip install insurance-gam[excel]
        """
        try:
            import openpyxl  # noqa: F401
        except ImportError as e:
            # Chain the original error so the real import failure is visible.
            raise ImportError(
                "openpyxl is required for Excel export. "
                "Install with: pip install insurance-gam[excel]"
            ) from e

        path = Path(path)
        feature_names = self.model.feature_names
        frames: dict[str, pl.DataFrame] = {}

        for feature in feature_names:
            try:
                frames[feature] = self.table(feature)
            except ValueError:
                # Skip interaction terms or features that can't be extracted
                continue

        if not frames:
            raise RuntimeError("No main-effect features found to export.")

        # Write via pandas ExcelWriter — polars converts to pandas per sheet
        import pandas as pd
        with pd.ExcelWriter(path, engine="openpyxl") as writer:
            for sheet_name, df in frames.items():
                # Excel sheet names are limited to 31 chars
                safe_name = sheet_name[:31]
                df.to_pandas().to_excel(writer, sheet_name=safe_name, index=False)

    def summary(self) -> pl.DataFrame:
        """
        Summary table of all main-effect features.

        Returns
        -------
        polars.DataFrame
            Columns: feature (str), n_bins (int), min_relativity (float),
            max_relativity (float), range (float).

            range = max_relativity / min_relativity, a measure of how much
            leverage the factor has over premium.

        Raises
        ------
        RuntimeError
            If no main-effect feature table could be extracted.
        """
        rows = []
        for feature in self.model.feature_names:
            try:
                df = self.table(feature)
                rels = df["relativity"].to_numpy()
                rows.append(
                    {
                        "feature": feature,
                        "n_bins": len(rels),
                        "min_relativity": float(np.min(rels)),
                        "max_relativity": float(np.max(rels)),
                        # Guard against division by ~0 relativities.
                        "range": float(np.max(rels) / np.maximum(np.min(rels), 1e-10)),
                    }
                )
            except ValueError:
                continue

        # An explicit error beats polars' obscure missing-column failure on sort.
        if not rows:
            raise RuntimeError("No main-effect features found to summarize.")

        return pl.DataFrame(rows).sort("range", descending=True)

Extract and present insurance-standard relativity tables from a fitted InsuranceEBM.

A relativity is defined as exp(score_bin - score_base) where the base bin is the modal bin (the one with the highest training weight). A relativity of 1.0 means the bin contributes the same as the base level; >1 means higher risk, <1 lower.

Parameters

model : InsuranceEBM A fitted InsuranceEBM instance.

Examples

>>> rt = RelativitiesTable(model)
>>> rt.table("driver_age")
shape: (10, 3)
┌────────────────┬───────────┬────────────┐
│ bin_label      ┆ raw_score ┆ relativity │
...
RelativitiesTable(model: InsuranceEBM)
119    def __init__(self, model: InsuranceEBM) -> None:
120        model._check_fitted()
121        self.model = model
122        self._ebm = model.ebm_
model
def table(self, feature: str) -> polars.dataframe.frame.DataFrame:
124    def table(self, feature: str) -> pl.DataFrame:
125        """
126        Return relativity table for a single feature.
127
128        Parameters
129        ----------
130        feature : str
131            Name of the feature (must be a main effect, not an interaction).
132
133        Returns
134        -------
135        polars.DataFrame
136            Columns: bin_label (str), raw_score (float), relativity (float).
137            The base bin has relativity = 1.0.
138        """
139        names, scores = _get_ebm_shape(self._ebm, feature)
140        base_idx = _modal_bin_idx(self._ebm, feature)
141
142        # Clip base_idx to valid range (can happen if weights vector is shorter)
143        base_idx = min(base_idx, len(scores) - 1)
144        base_score = scores[base_idx]
145
146        relativities = np.exp(scores - base_score)
147
148        return pl.DataFrame(
149            {
150                "bin_label": names.tolist(),
151                "raw_score": scores.tolist(),
152                "relativity": relativities.tolist(),
153            }
154        )

Return relativity table for a single feature.

Parameters

feature : str
    Name of the feature (must be a main effect, not an interaction).

Returns

polars.DataFrame
    Columns: bin_label (str), raw_score (float), relativity (float).
    The base bin has relativity = 1.0.

def plot( self, feature: str, kind: str = 'bar', ax=None, title: Optional[str] = None):
156    def plot(
157        self,
158        feature: str,
159        kind: str = "bar",
160        ax=None,
161        title: Optional[str] = None,
162    ):
163        """
164        Bar or line chart of relativities for a feature.
165
166        Parameters
167        ----------
168        feature : str
169            Feature name.
170        kind : str
171            'bar' (default) or 'line'.
172        ax : matplotlib.axes.Axes, optional
173            Axes to draw on. Creates a new figure if None.
174        title : str, optional
175            Plot title. Defaults to the feature name.
176
177        Returns
178        -------
179        matplotlib.axes.Axes
180        """
181        import matplotlib.pyplot as plt
182
183        df = self.table(feature)
184        labels = df["bin_label"].to_list()
185        rels = df["relativity"].to_list()
186
187        if ax is None:
188            fig, ax = plt.subplots(figsize=(max(8, len(labels) * 0.6), 5))
189
190        if kind == "bar":
191            colours = ["#d62728" if r > 1.0 else "#2ca02c" for r in rels]
192            ax.bar(range(len(labels)), rels, color=colours, alpha=0.8, edgecolor="white")
193        elif kind == "line":
194            ax.plot(range(len(labels)), rels, marker="o", color="#1f77b4", linewidth=2)
195            ax.fill_between(range(len(labels)), 1.0, rels, alpha=0.15, color="#1f77b4")
196        else:
197            raise ValueError(f"kind must be 'bar' or 'line', got '{kind}'")
198
199        ax.axhline(1.0, color="black", linewidth=0.8, linestyle="--")
200        ax.set_xticks(range(len(labels)))
201        ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=9)
202        ax.set_ylabel("Relativity")
203        ax.set_title(title or feature)
204        ax.grid(axis="y", alpha=0.3)
205
206        return ax

Bar or line chart of relativities for a feature.

Parameters

feature : str
    Feature name.
kind : str
    'bar' (default) or 'line'.
ax : matplotlib.axes.Axes, optional
    Axes to draw on. Creates a new figure if None.
title : str, optional
    Plot title. Defaults to the feature name.

Returns

matplotlib.axes.Axes

def export_excel(self, path: Union[str, pathlib.Path]) -> None:
208    def export_excel(self, path: Union[str, Path]) -> None:
209        """
210        Export relativity tables for all main-effect features to an Excel workbook.
211
212        One sheet per feature. Interaction terms are skipped (they have 2-D score
213        arrays that don't translate cleanly to a flat relativity table).
214
215        Parameters
216        ----------
217        path : str or Path
218            Output path for the .xlsx file.
219
220        Notes
221        -----
222        Requires openpyxl. Install with: pip install insurance-gam[excel]
223        """
224        try:
225            import openpyxl  # noqa: F401
226        except ImportError:
227            raise ImportError(
228                "openpyxl is required for Excel export. "
229                "Install with: pip install insurance-gam[excel]"
230            )
231
232        path = Path(path)
233        feature_names = self.model.feature_names
234        frames: dict[str, pl.DataFrame] = {}
235
236        for feature in feature_names:
237            try:
238                df = self.table(feature)
239                frames[feature] = df
240            except ValueError:
241                # Skip interaction terms or features that can't be extracted
242                continue
243
244        if not frames:
245            raise RuntimeError("No main-effect features found to export.")
246
247        # Write via pandas ExcelWriter — polars converts to pandas per sheet
248        import pandas as pd
249        with pd.ExcelWriter(path, engine="openpyxl") as writer:
250            for sheet_name, df in frames.items():
251                # Excel sheet names are limited to 31 chars
252                safe_name = sheet_name[:31]
253                df.to_pandas().to_excel(writer, sheet_name=safe_name, index=False)

Export relativity tables for all main-effect features to an Excel workbook.

One sheet per feature. Interaction terms are skipped (they have 2-D score arrays that don't translate cleanly to a flat relativity table).

Parameters

path : str or Path
    Output path for the .xlsx file.

Notes

Requires openpyxl. Install with: pip install insurance-gam[excel]

def summary(self) -> polars.dataframe.frame.DataFrame:
255    def summary(self) -> pl.DataFrame:
256        """
257        Summary table of all main-effect features.
258
259        Returns
260        -------
261        polars.DataFrame
262            Columns: feature (str), n_bins (int), min_relativity (float),
263            max_relativity (float), range (float).
264
265            range = max_relativity / min_relativity, a measure of how much
266            leverage the factor has over premium.
267        """
268        rows = []
269        for feature in self.model.feature_names:
270            try:
271                df = self.table(feature)
272                rels = df["relativity"].to_numpy()
273                rows.append(
274                    {
275                        "feature": feature,
276                        "n_bins": len(rels),
277                        "min_relativity": float(np.min(rels)),
278                        "max_relativity": float(np.max(rels)),
279                        "range": float(np.max(rels) / np.maximum(np.min(rels), 1e-10)),
280                    }
281                )
282            except ValueError:
283                continue
284
285        return pl.DataFrame(rows).sort("range", descending=True)

Summary table of all main-effect features.

Returns

polars.DataFrame
    Columns: feature (str), n_bins (int), min_relativity (float),
    max_relativity (float), range (float).

range = max_relativity / min_relativity, a measure of how much
leverage the factor has over premium.
class MonotonicityEditor:
 96class MonotonicityEditor:
 97    """
 98    Post-fit monotonicity enforcement for EBM shape functions.
 99
100    Applies isotonic regression to bin scores for a named feature, adjusting
101    the stored term_scores_ in-place. The base model object (ebm_) is modified
102    directly, so predictions from InsuranceEBM.predict() will reflect the change.
103
104    Parameters
105    ----------
106    model : InsuranceEBM
107        A fitted InsuranceEBM instance. The underlying ebm_ will be modified in-place.
108
109    Notes
110    -----
111    The adjustment is not reversible once applied (the original scores are
112    overwritten). Keep a copy of the model if you need to compare before/after
113    predictions on a holdout set.
114    """
115
116    def __init__(self, model: InsuranceEBM) -> None:
117        model._check_fitted()
118        self.model = model
119        self._ebm = model.ebm_
120
121    def enforce(
122        self,
123        feature: str,
124        direction: Literal["increase", "decrease", "auto"] = "auto",
125    ) -> "MonotonicityEditor":
126        """
127        Enforce monotonicity on a feature's shape function via isotonic regression.
128
129        The bin scores (term_scores_) are updated in-place. After calling this,
130        model.predict() will reflect the monotone shape.
131
132        Parameters
133        ----------
134        feature : str
135            Feature name.
136        direction : str
137            'increase', 'decrease', or 'auto'. When 'auto', the dominant
138            direction is detected from the existing shape function.
139
140        Returns
141        -------
142        self
143            Returns self for method chaining.
144        """
145        term_idx, scores = _get_term_scores(self._ebm, feature)
146
147        if direction == "auto":
148            direction = _detect_direction(scores)
149
150        if direction not in ("increase", "decrease"):
151            raise ValueError(f"direction must be 'increase', 'decrease', or 'auto', got '{direction}'")
152
153        if _is_monotone(scores, direction):
154            # Already monotone — nothing to do
155            return self
156
157        # interpretML stores a special first bin for missing values (index 0)
158        # We preserve the missing-value score and only enforce on the main bins
159        if len(scores) > 1:
160            main_scores = scores[1:]
161            main_scores_mono = _isotonic_regression(main_scores, direction)
162            new_scores = np.concatenate([[scores[0]], main_scores_mono])
163        else:
164            new_scores = _isotonic_regression(scores, direction)
165
166        self._ebm.term_scores_[term_idx] = new_scores
167        return self
168
169    def check(self, feature: str, direction: Optional[str] = None) -> bool:
170        """
171        Check whether a feature's shape function is monotone.
172
173        Parameters
174        ----------
175        feature : str
176            Feature name.
177        direction : str, optional
178            'increase' or 'decrease'. If None, checks both directions (returns
179            True if monotone in either direction).
180
181        Returns
182        -------
183        bool
184            True if the shape function is monotone in the specified direction.
185        """
186        _, scores = _get_term_scores(self._ebm, feature)
187        main_scores = scores[1:] if len(scores) > 1 else scores
188
189        if direction is None:
190            return _is_monotone(main_scores, "increase") or _is_monotone(main_scores, "decrease")
191        return _is_monotone(main_scores, direction)
192
193    def plot_before_after(
194        self,
195        feature: str,
196        scores_before: np.ndarray,
197        direction: Optional[str] = None,
198        ax=None,
199    ):
200        """
201        Side-by-side comparison of shape function before and after monotonicity enforcement.
202
203        Because enforce() modifies scores in-place, you must capture the scores
204        before calling enforce(). Use MonotonicityEditor.get_scores() to obtain
205        a copy before enforcement.
206
207        Parameters
208        ----------
209        feature : str
210            Feature name.
211        scores_before : numpy.ndarray
212            Scores before enforcement (use get_scores() to capture these).
213        direction : str, optional
214            Direction label for the title.
215        ax : matplotlib.axes.Axes, optional
216            Two-element axes array. Creates a new figure if None.
217
218        Returns
219        -------
220        matplotlib.figure.Figure
221        """
222        import matplotlib.pyplot as plt
223
224        _, scores_after = _get_term_scores(self._ebm, feature)
225        main_before = scores_before[1:] if len(scores_before) > 1 else scores_before
226        main_after = scores_after[1:] if len(scores_after) > 1 else scores_after
227
228        n = max(len(main_before), len(main_after))
229        x = np.arange(n)
230
231        if ax is None:
232            fig, axes = plt.subplots(1, 2, figsize=(14, 5))
233        else:
234            axes = ax
235            fig = axes[0].get_figure()
236
237        for a, scores, title in zip(
238            axes,
239            [main_before, main_after],
240            ["Before enforcement", f"After enforcement ({direction or 'monotone'})"],
241        ):
242            a.plot(x[:len(scores)], scores, marker="o", color="#1f77b4", linewidth=2)
243            a.set_title(f"{feature}{title}")
244            a.set_xlabel("Bin index")
245            a.set_ylabel("Score (log scale)")
246            a.grid(alpha=0.3)
247
248        fig.tight_layout()
249        return fig
250
251    def get_scores(self, feature: str) -> np.ndarray:
252        """
253        Return a copy of the current scores for a feature.
254
255        Use this to capture scores before calling enforce() if you want to
256        plot a before/after comparison.
257
258        Parameters
259        ----------
260        feature : str
261            Feature name.
262
263        Returns
264        -------
265        numpy.ndarray
266            Copy of term_scores_ for this feature.
267        """
268        _, scores = _get_term_scores(self._ebm, feature)
269        return scores.copy()

Post-fit monotonicity enforcement for EBM shape functions.

Applies isotonic regression to bin scores for a named feature, adjusting the stored term_scores_ in-place. The base model object (ebm_) is modified directly, so predictions from InsuranceEBM.predict() will reflect the change.

Parameters

model : InsuranceEBM
    A fitted InsuranceEBM instance. The underlying ebm_ will be modified in-place.

Notes

The adjustment is not reversible once applied (the original scores are overwritten). Keep a copy of the model if you need to compare before/after predictions on a holdout set.

MonotonicityEditor(model: InsuranceEBM)
116    def __init__(self, model: InsuranceEBM) -> None:
117        model._check_fitted()
118        self.model = model
119        self._ebm = model.ebm_
model
def enforce( self, feature: str, direction: Literal['increase', 'decrease', 'auto'] = 'auto') -> MonotonicityEditor:
121    def enforce(
122        self,
123        feature: str,
124        direction: Literal["increase", "decrease", "auto"] = "auto",
125    ) -> "MonotonicityEditor":
126        """
127        Enforce monotonicity on a feature's shape function via isotonic regression.
128
129        The bin scores (term_scores_) are updated in-place. After calling this,
130        model.predict() will reflect the monotone shape.
131
132        Parameters
133        ----------
134        feature : str
135            Feature name.
136        direction : str
137            'increase', 'decrease', or 'auto'. When 'auto', the dominant
138            direction is detected from the existing shape function.
139
140        Returns
141        -------
142        self
143            Returns self for method chaining.
144        """
145        term_idx, scores = _get_term_scores(self._ebm, feature)
146
147        if direction == "auto":
148            direction = _detect_direction(scores)
149
150        if direction not in ("increase", "decrease"):
151            raise ValueError(f"direction must be 'increase', 'decrease', or 'auto', got '{direction}'")
152
153        if _is_monotone(scores, direction):
154            # Already monotone — nothing to do
155            return self
156
157        # interpretML stores a special first bin for missing values (index 0)
158        # We preserve the missing-value score and only enforce on the main bins
159        if len(scores) > 1:
160            main_scores = scores[1:]
161            main_scores_mono = _isotonic_regression(main_scores, direction)
162            new_scores = np.concatenate([[scores[0]], main_scores_mono])
163        else:
164            new_scores = _isotonic_regression(scores, direction)
165
166        self._ebm.term_scores_[term_idx] = new_scores
167        return self

Enforce monotonicity on a feature's shape function via isotonic regression.

The bin scores (term_scores_) are updated in-place. After calling this, model.predict() will reflect the monotone shape.

Parameters

feature : str
    Feature name.
direction : str
    'increase', 'decrease', or 'auto'. When 'auto', the dominant
    direction is detected from the existing shape function.

Returns

self Returns self for method chaining.

def check(self, feature: str, direction: Optional[str] = None) -> bool:
169    def check(self, feature: str, direction: Optional[str] = None) -> bool:
170        """
171        Check whether a feature's shape function is monotone.
172
173        Parameters
174        ----------
175        feature : str
176            Feature name.
177        direction : str, optional
178            'increase' or 'decrease'. If None, checks both directions (returns
179            True if monotone in either direction).
180
181        Returns
182        -------
183        bool
184            True if the shape function is monotone in the specified direction.
185        """
186        _, scores = _get_term_scores(self._ebm, feature)
187        main_scores = scores[1:] if len(scores) > 1 else scores
188
189        if direction is None:
190            return _is_monotone(main_scores, "increase") or _is_monotone(main_scores, "decrease")
191        return _is_monotone(main_scores, direction)

Check whether a feature's shape function is monotone.

Parameters

feature : str
    Feature name.
direction : str, optional
    'increase' or 'decrease'. If None, checks both directions (returns
    True if monotone in either direction).

Returns

bool
    True if the shape function is monotone in the specified direction.

def plot_before_after( self, feature: str, scores_before: numpy.ndarray, direction: Optional[str] = None, ax=None):
193    def plot_before_after(
194        self,
195        feature: str,
196        scores_before: np.ndarray,
197        direction: Optional[str] = None,
198        ax=None,
199    ):
200        """
201        Side-by-side comparison of shape function before and after monotonicity enforcement.
202
203        Because enforce() modifies scores in-place, you must capture the scores
204        before calling enforce(). Use MonotonicityEditor.get_scores() to obtain
205        a copy before enforcement.
206
207        Parameters
208        ----------
209        feature : str
210            Feature name.
211        scores_before : numpy.ndarray
212            Scores before enforcement (use get_scores() to capture these).
213        direction : str, optional
214            Direction label for the title.
215        ax : matplotlib.axes.Axes, optional
216            Two-element axes array. Creates a new figure if None.
217
218        Returns
219        -------
220        matplotlib.figure.Figure
221        """
222        import matplotlib.pyplot as plt
223
224        _, scores_after = _get_term_scores(self._ebm, feature)
225        main_before = scores_before[1:] if len(scores_before) > 1 else scores_before
226        main_after = scores_after[1:] if len(scores_after) > 1 else scores_after
227
228        n = max(len(main_before), len(main_after))
229        x = np.arange(n)
230
231        if ax is None:
232            fig, axes = plt.subplots(1, 2, figsize=(14, 5))
233        else:
234            axes = ax
235            fig = axes[0].get_figure()
236
237        for a, scores, title in zip(
238            axes,
239            [main_before, main_after],
240            ["Before enforcement", f"After enforcement ({direction or 'monotone'})"],
241        ):
242            a.plot(x[:len(scores)], scores, marker="o", color="#1f77b4", linewidth=2)
243            a.set_title(f"{feature}{title}")
244            a.set_xlabel("Bin index")
245            a.set_ylabel("Score (log scale)")
246            a.grid(alpha=0.3)
247
248        fig.tight_layout()
249        return fig

Side-by-side comparison of shape function before and after monotonicity enforcement.

Because enforce() modifies scores in-place, you must capture the scores before calling enforce(). Use MonotonicityEditor.get_scores() to obtain a copy before enforcement.

Parameters

feature : str
    Feature name.
scores_before : numpy.ndarray
    Scores before enforcement (use get_scores() to capture these).
direction : str, optional
    Direction label for the title.
ax : matplotlib.axes.Axes, optional
    Two-element axes array. Creates a new figure if None.

Returns

matplotlib.figure.Figure

def get_scores(self, feature: str) -> numpy.ndarray:
251    def get_scores(self, feature: str) -> np.ndarray:
252        """
253        Return a copy of the current scores for a feature.
254
255        Use this to capture scores before calling enforce() if you want to
256        plot a before/after comparison.
257
258        Parameters
259        ----------
260        feature : str
261            Feature name.
262
263        Returns
264        -------
265        numpy.ndarray
266            Copy of term_scores_ for this feature.
267        """
268        _, scores = _get_term_scores(self._ebm, feature)
269        return scores.copy()

Return a copy of the current scores for a feature.

Use this to capture scores before calling enforce() if you want to plot a before/after comparison.

Parameters

feature : str
    Feature name.

Returns

numpy.ndarray
    Copy of term_scores_ for this feature.

class GLMComparison:
 92class GLMComparison:
 93    """
 94    Compare EBM shape functions against GLM factor relativities.
 95
 96    Use this when you have an existing GLM and want to see where the EBM agrees
 97    or disagrees. The key output is a ranked list of features by divergence —
 98    features near the top deserve the most scrutiny.
 99
100    Parameters
101    ----------
102    model : InsuranceEBM
103        A fitted InsuranceEBM instance.
104
105    Examples
106    --------
107    Comparing against a pre-computed GLM relativity table:
108
109    >>> glm_rel = pl.DataFrame({"level": ["A", "B", "C"], "relativity": [1.0, 1.25, 0.85]})
110    >>> cmp = GLMComparison(model)
111    >>> cmp.compare_shapes("vehicle_group", glm_relativities=glm_rel)
112    """
113
114    def __init__(self, model: InsuranceEBM) -> None:
115        model._check_fitted()
116        self.model = model
117        self._ebm = model.ebm_
118        self._rt = RelativitiesTable(model)
119
120    def compare_shapes(
121        self,
122        feature: str,
123        glm_model=None,
124        glm_relativities: Optional[pl.DataFrame] = None,
125    ) -> pl.DataFrame:
126        """
127        Align EBM bins with GLM factor levels for a single feature.
128
129        Either glm_model or glm_relativities must be provided.
130
131        Parameters
132        ----------
133        feature : str
134            Feature name.
135        glm_model : statsmodels GLM result, optional
136            A fitted statsmodels GLM result object. The method extracts
137            factor relativities from params automatically.
138        glm_relativities : polars.DataFrame, optional
139            Pre-computed GLM relativities with columns [level, relativity].
140            'level' values should match the EBM bin labels (for categorical
141            features) or be provided as string bin labels.
142
143        Returns
144        -------
145        polars.DataFrame
146            Columns: level (str), ebm_relativity (float), glm_relativity (float),
147            abs_diff (float), pct_diff (float).
148            Sorted by abs_diff descending.
149        """
150        if glm_model is None and glm_relativities is None:
151            raise ValueError("Provide either glm_model or glm_relativities.")
152        if glm_model is not None and glm_relativities is not None:
153            raise ValueError("Provide either glm_model or glm_relativities, not both.")
154
155        if glm_model is not None:
156            glm_df = _extract_glm_relativities(glm_model, feature)
157        else:
158            # Validate schema
159            required = {"level", "relativity"}
160            if not required.issubset(set(glm_relativities.columns)):
161                raise ValueError(
162                    f"glm_relativities must have columns {required}, "
163                    f"got {glm_relativities.columns}"
164                )
165            glm_df = glm_relativities
166
167        ebm_df = self._rt.table(feature)
168        aligned = _align_ebm_to_glm(ebm_df, glm_df)
169
170        if aligned.is_empty():
171            raise ValueError(
172                f"No matching levels between EBM bins and GLM for feature '{feature}'. "
173                "Check that bin labels match GLM factor level names."
174            )
175
176        aligned = aligned.with_columns(
177            abs_diff=(pl.col("ebm_relativity") - pl.col("glm_relativity")).abs(),
178            pct_diff=((pl.col("ebm_relativity") - pl.col("glm_relativity")).abs()
179                      / pl.col("glm_relativity") * 100),
180        ).sort("abs_diff", descending=True)
181
182        return aligned
183
184    def plot_comparison(
185        self,
186        feature: str,
187        glm_model=None,
188        glm_relativities: Optional[pl.DataFrame] = None,
189        ax=None,
190        title: Optional[str] = None,
191    ):
192        """
193        Overlaid chart of EBM shape function and GLM relativities.
194
195        Parameters
196        ----------
197        feature : str
198            Feature name.
199        glm_model : statsmodels GLM result, optional
200            Fitted statsmodels GLM.
201        glm_relativities : polars.DataFrame, optional
202            Pre-computed GLM relativities [level, relativity].
203        ax : matplotlib.axes.Axes, optional
204            Axes to draw on.
205        title : str, optional
206            Plot title. Defaults to feature name.
207
208        Returns
209        -------
210        matplotlib.axes.Axes
211        """
212        import matplotlib.pyplot as plt
213
214        df = self.compare_shapes(feature, glm_model=glm_model, glm_relativities=glm_relativities)
215        levels = df["level"].to_list()
216        ebm_rels = df["ebm_relativity"].to_list()
217        glm_rels = df["glm_relativity"].to_list()
218
219        if ax is None:
220            fig, ax = plt.subplots(figsize=(max(8, len(levels) * 0.7), 5))
221
222        x = np.arange(len(levels))
223        ax.bar(x - 0.2, ebm_rels, width=0.4, label="EBM", color="#1f77b4", alpha=0.8)
224        ax.bar(x + 0.2, glm_rels, width=0.4, label="GLM", color="#ff7f0e", alpha=0.8)
225        ax.axhline(1.0, color="black", linewidth=0.8, linestyle="--")
226        ax.set_xticks(x)
227        ax.set_xticklabels(levels, rotation=45, ha="right", fontsize=9)
228        ax.set_ylabel("Relativity")
229        ax.set_title(title or f"{feature}: EBM vs GLM")
230        ax.legend()
231        ax.grid(axis="y", alpha=0.3)
232
233        return ax
234
235    def divergence_summary(
236        self,
237        glm_model=None,
238        glm_relativities_by_feature: Optional[dict] = None,
239    ) -> pl.DataFrame:
240        """
241        Features ranked by maximum absolute relativity difference vs GLM.
242
243        Useful for triaging: features near the top of the table are where the
244        EBM and GLM disagree most strongly and deserve the most investigation.
245
246        Parameters
247        ----------
248        glm_model : statsmodels GLM result, optional
249            Fitted GLM. If provided, relativities are extracted for all features.
250        glm_relativities_by_feature : dict, optional
251            Mapping of {feature_name: polars.DataFrame} where each DataFrame
252            has columns [level, relativity]. Alternative to glm_model when you
253            have pre-computed tables for each feature.
254
255        Returns
256        -------
257        polars.DataFrame
258            Columns: feature (str), max_abs_diff (float), mean_abs_diff (float),
259            n_levels_compared (int).
260            Sorted by max_abs_diff descending.
261        """
262        if glm_model is None and glm_relativities_by_feature is None:
263            raise ValueError(
264                "Provide either glm_model or glm_relativities_by_feature."
265            )
266
267        rows = []
268        for feature in self.model.feature_names:
269            try:
270                if glm_relativities_by_feature is not None:
271                    rel_df = glm_relativities_by_feature.get(feature)
272                    if rel_df is None:
273                        continue
274                    df = self.compare_shapes(feature, glm_relativities=rel_df)
275                else:
276                    df = self.compare_shapes(feature, glm_model=glm_model)
277
278                diffs = df["abs_diff"].to_numpy()
279                rows.append(
280                    {
281                        "feature": feature,
282                        "max_abs_diff": float(np.max(diffs)),
283                        "mean_abs_diff": float(np.mean(diffs)),
284                        "n_levels_compared": len(diffs),
285                    }
286                )
287            except (ValueError, KeyError):
288                continue
289
290        return pl.DataFrame(rows).sort("max_abs_diff", descending=True)

Compare EBM shape functions against GLM factor relativities.

Use this when you have an existing GLM and want to see where the EBM agrees or disagrees. The key output is a ranked list of features by divergence — features near the top deserve the most scrutiny.

Parameters

model : InsuranceEBM
    A fitted InsuranceEBM instance.

Examples

Comparing against a pre-computed GLM relativity table:

>>> glm_rel = pl.DataFrame({"level": ["A", "B", "C"], "relativity": [1.0, 1.25, 0.85]})
>>> cmp = GLMComparison(model)
>>> cmp.compare_shapes("vehicle_group", glm_relativities=glm_rel)
GLMComparison(model: InsuranceEBM)
114    def __init__(self, model: InsuranceEBM) -> None:
115        model._check_fitted()
116        self.model = model
117        self._ebm = model.ebm_
118        self._rt = RelativitiesTable(model)
model
def compare_shapes( self, feature: str, glm_model=None, glm_relativities: Optional[polars.dataframe.frame.DataFrame] = None) -> polars.dataframe.frame.DataFrame:
120    def compare_shapes(
121        self,
122        feature: str,
123        glm_model=None,
124        glm_relativities: Optional[pl.DataFrame] = None,
125    ) -> pl.DataFrame:
126        """
127        Align EBM bins with GLM factor levels for a single feature.
128
129        Either glm_model or glm_relativities must be provided.
130
131        Parameters
132        ----------
133        feature : str
134            Feature name.
135        glm_model : statsmodels GLM result, optional
136            A fitted statsmodels GLM result object. The method extracts
137            factor relativities from params automatically.
138        glm_relativities : polars.DataFrame, optional
139            Pre-computed GLM relativities with columns [level, relativity].
140            'level' values should match the EBM bin labels (for categorical
141            features) or be provided as string bin labels.
142
143        Returns
144        -------
145        polars.DataFrame
146            Columns: level (str), ebm_relativity (float), glm_relativity (float),
147            abs_diff (float), pct_diff (float).
148            Sorted by abs_diff descending.
149        """
150        if glm_model is None and glm_relativities is None:
151            raise ValueError("Provide either glm_model or glm_relativities.")
152        if glm_model is not None and glm_relativities is not None:
153            raise ValueError("Provide either glm_model or glm_relativities, not both.")
154
155        if glm_model is not None:
156            glm_df = _extract_glm_relativities(glm_model, feature)
157        else:
158            # Validate schema
159            required = {"level", "relativity"}
160            if not required.issubset(set(glm_relativities.columns)):
161                raise ValueError(
162                    f"glm_relativities must have columns {required}, "
163                    f"got {glm_relativities.columns}"
164                )
165            glm_df = glm_relativities
166
167        ebm_df = self._rt.table(feature)
168        aligned = _align_ebm_to_glm(ebm_df, glm_df)
169
170        if aligned.is_empty():
171            raise ValueError(
172                f"No matching levels between EBM bins and GLM for feature '{feature}'. "
173                "Check that bin labels match GLM factor level names."
174            )
175
176        aligned = aligned.with_columns(
177            abs_diff=(pl.col("ebm_relativity") - pl.col("glm_relativity")).abs(),
178            pct_diff=((pl.col("ebm_relativity") - pl.col("glm_relativity")).abs()
179                      / pl.col("glm_relativity") * 100),
180        ).sort("abs_diff", descending=True)
181
182        return aligned

Align EBM bins with GLM factor levels for a single feature.

Either glm_model or glm_relativities must be provided.

Parameters

feature : str
    Feature name.
glm_model : statsmodels GLM result, optional
    A fitted statsmodels GLM result object. The method extracts factor
    relativities from params automatically.
glm_relativities : polars.DataFrame, optional
    Pre-computed GLM relativities with columns [level, relativity].
    'level' values should match the EBM bin labels (for categorical
    features) or be provided as string bin labels.

Returns

polars.DataFrame Columns: level (str), ebm_relativity (float), glm_relativity (float), abs_diff (float), pct_diff (float). Sorted by abs_diff descending.

def plot_comparison( self, feature: str, glm_model=None, glm_relativities: Optional[polars.dataframe.frame.DataFrame] = None, ax=None, title: Optional[str] = None):
184    def plot_comparison(
185        self,
186        feature: str,
187        glm_model=None,
188        glm_relativities: Optional[pl.DataFrame] = None,
189        ax=None,
190        title: Optional[str] = None,
191    ):
192        """
193        Overlaid chart of EBM shape function and GLM relativities.
194
195        Parameters
196        ----------
197        feature : str
198            Feature name.
199        glm_model : statsmodels GLM result, optional
200            Fitted statsmodels GLM.
201        glm_relativities : polars.DataFrame, optional
202            Pre-computed GLM relativities [level, relativity].
203        ax : matplotlib.axes.Axes, optional
204            Axes to draw on.
205        title : str, optional
206            Plot title. Defaults to feature name.
207
208        Returns
209        -------
210        matplotlib.axes.Axes
211        """
212        import matplotlib.pyplot as plt
213
214        df = self.compare_shapes(feature, glm_model=glm_model, glm_relativities=glm_relativities)
215        levels = df["level"].to_list()
216        ebm_rels = df["ebm_relativity"].to_list()
217        glm_rels = df["glm_relativity"].to_list()
218
219        if ax is None:
220            fig, ax = plt.subplots(figsize=(max(8, len(levels) * 0.7), 5))
221
222        x = np.arange(len(levels))
223        ax.bar(x - 0.2, ebm_rels, width=0.4, label="EBM", color="#1f77b4", alpha=0.8)
224        ax.bar(x + 0.2, glm_rels, width=0.4, label="GLM", color="#ff7f0e", alpha=0.8)
225        ax.axhline(1.0, color="black", linewidth=0.8, linestyle="--")
226        ax.set_xticks(x)
227        ax.set_xticklabels(levels, rotation=45, ha="right", fontsize=9)
228        ax.set_ylabel("Relativity")
229        ax.set_title(title or f"{feature}: EBM vs GLM")
230        ax.legend()
231        ax.grid(axis="y", alpha=0.3)
232
233        return ax

Overlaid chart of EBM shape function and GLM relativities.

Parameters

feature : str Feature name. glm_model : statsmodels GLM result, optional Fitted statsmodels GLM. glm_relativities : polars.DataFrame, optional Pre-computed GLM relativities [level, relativity]. ax : matplotlib.axes.Axes, optional Axes to draw on. title : str, optional Plot title. Defaults to feature name.

Returns

matplotlib.axes.Axes

def divergence_summary( self, glm_model=None, glm_relativities_by_feature: Optional[dict] = None) -> polars.dataframe.frame.DataFrame:
235    def divergence_summary(
236        self,
237        glm_model=None,
238        glm_relativities_by_feature: Optional[dict] = None,
239    ) -> pl.DataFrame:
240        """
241        Features ranked by maximum absolute relativity difference vs GLM.
242
243        Useful for triaging: features near the top of the table are where the
244        EBM and GLM disagree most strongly and deserve the most investigation.
245
246        Parameters
247        ----------
248        glm_model : statsmodels GLM result, optional
249            Fitted GLM. If provided, relativities are extracted for all features.
250        glm_relativities_by_feature : dict, optional
251            Mapping of {feature_name: polars.DataFrame} where each DataFrame
252            has columns [level, relativity]. Alternative to glm_model when you
253            have pre-computed tables for each feature.
254
255        Returns
256        -------
257        polars.DataFrame
258            Columns: feature (str), max_abs_diff (float), mean_abs_diff (float),
259            n_levels_compared (int).
260            Sorted by max_abs_diff descending.
261        """
262        if glm_model is None and glm_relativities_by_feature is None:
263            raise ValueError(
264                "Provide either glm_model or glm_relativities_by_feature."
265            )
266
267        rows = []
268        for feature in self.model.feature_names:
269            try:
270                if glm_relativities_by_feature is not None:
271                    rel_df = glm_relativities_by_feature.get(feature)
272                    if rel_df is None:
273                        continue
274                    df = self.compare_shapes(feature, glm_relativities=rel_df)
275                else:
276                    df = self.compare_shapes(feature, glm_model=glm_model)
277
278                diffs = df["abs_diff"].to_numpy()
279                rows.append(
280                    {
281                        "feature": feature,
282                        "max_abs_diff": float(np.max(diffs)),
283                        "mean_abs_diff": float(np.mean(diffs)),
284                        "n_levels_compared": len(diffs),
285                    }
286                )
287            except (ValueError, KeyError):
288                continue
289
290        return pl.DataFrame(rows).sort("max_abs_diff", descending=True)

Features ranked by maximum absolute relativity difference vs GLM.

Useful for triaging: features near the top of the table are where the EBM and GLM disagree most strongly and deserve the most investigation.

Parameters

glm_model : statsmodels GLM result, optional
    Fitted GLM. If provided, relativities are extracted for all features.
glm_relativities_by_feature : dict, optional
    Mapping of {feature_name: polars.DataFrame} where each DataFrame has
    columns [level, relativity]. Alternative to glm_model when you have
    pre-computed tables for each feature.

Returns

polars.DataFrame Columns: feature (str), max_abs_diff (float), mean_abs_diff (float), n_levels_compared (int). Sorted by max_abs_diff descending.

def gini( y_true: Union[numpy.ndarray, polars.series.series.Series, list], y_pred: Union[numpy.ndarray, polars.series.series.Series, list], exposure: Union[numpy.ndarray, polars.series.series.Series, list, NoneType] = None) -> float:
 42def gini(
 43    y_true: Union[np.ndarray, pl.Series, list],
 44    y_pred: Union[np.ndarray, pl.Series, list],
 45    exposure: Optional[Union[np.ndarray, pl.Series, list]] = None,
 46) -> float:
 47    """
 48    Normalised Gini coefficient for a predictive model.
 49
 50    The Gini measures the ordering power of the model — how well it separates
 51    high-risk from low-risk policies. A perfect model has Gini = 1.0; a random
 52    model has Gini = 0.0. Values below 0 indicate a model that ranks risks
 53    inversely.
 54
 55    The normalised Gini divides by the Gini of the oracle (perfect) model,
 56    so it is bounded [−1, 1].
 57
 58    Parameters
 59    ----------
 60    y_true : array-like
 61        Observed outcomes (claim counts or amounts).
 62    y_pred : array-like
 63        Model predictions (expected values).
 64    exposure : array-like, optional
 65        Exposure weights. When provided, each policy's contribution to the
 66        Lorenz curve is weighted by its exposure. Always provide exposure
 67        for frequency models.
 68
 69    Returns
 70    -------
 71    float
 72        Normalised Gini coefficient.
 73    """
 74    y_t = _coerce(y_true)
 75    y_p = _coerce(y_pred)
 76    w = _coerce(exposure)
 77
 78    if w is None:
 79        w = np.ones_like(y_t)
 80
 81    # Sort by predicted risk (ascending)
 82    order = np.argsort(y_p)
 83    y_t_sorted = y_t[order]
 84    w_sorted = w[order]
 85
 86    # Weighted cumulative sums
 87    w_cum = np.cumsum(w_sorted)
 88    loss_cum = np.cumsum(y_t_sorted * w_sorted)
 89
 90    w_total = w_cum[-1]
 91    loss_total = loss_cum[-1]
 92
 93    if loss_total == 0:
 94        return 0.0
 95
 96    # Gini of model predictions
 97    x = w_cum / w_total
 98    y = loss_cum / loss_total
 99    gini_model = 1.0 - 2.0 * float(_trapz(y, x))
100
101    # Gini of oracle (sort by actual outcomes)
102    order_oracle = np.argsort(y_t)
103    y_t_oracle = y_t[order_oracle]
104    w_oracle = w[order_oracle]
105    wc = np.cumsum(w_oracle)
106    lc = np.cumsum(y_t_oracle * w_oracle)
107    x_o = wc / wc[-1]
108    y_o = lc / lc[-1]
109    gini_oracle = 1.0 - 2.0 * float(_trapz(y_o, x_o))
110
111    if gini_oracle == 0:
112        return 0.0
113
114    return gini_model / gini_oracle

Normalised Gini coefficient for a predictive model.

The Gini measures the ordering power of the model — how well it separates high-risk from low-risk policies. A perfect model has Gini = 1.0; a random model has Gini = 0.0. Values below 0 indicate a model that ranks risks inversely.

The normalised Gini divides by the Gini of the oracle (perfect) model, so it is bounded [−1, 1].

Parameters

y_true : array-like Observed outcomes (claim counts or amounts). y_pred : array-like Model predictions (expected values). exposure : array-like, optional Exposure weights. When provided, each policy's contribution to the Lorenz curve is weighted by its exposure. Always provide exposure for frequency models.

Returns

float Normalised Gini coefficient.

def lorenz_curve( y_true: Union[numpy.ndarray, polars.series.series.Series, list], y_pred: Union[numpy.ndarray, polars.series.series.Series, list], exposure: Union[numpy.ndarray, polars.series.series.Series, list, NoneType] = None, plot: bool = False, ax=None) -> tuple[numpy.ndarray, numpy.ndarray]:
def lorenz_curve(
    y_true: Union[np.ndarray, pl.Series, list],
    y_pred: Union[np.ndarray, pl.Series, list],
    exposure: Optional[Union[np.ndarray, pl.Series, list]] = None,
    plot: bool = False,
    ax=None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute (and optionally plot) the Lorenz curve for a model.

    Policies are sorted ascending by predicted risk. The Lorenz curve shows
    the cumulative share of exposure on the x-axis against the cumulative
    share of losses on the y-axis.

    Parameters
    ----------
    y_true : array-like
        Observed outcomes.
    y_pred : array-like
        Model predictions.
    exposure : array-like, optional
        Exposure weights; unit weights when omitted.
    plot : bool
        If True, draw the Lorenz curve on ``ax`` or a new figure.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. Ignored if plot=False.

    Returns
    -------
    tuple[numpy.ndarray, numpy.ndarray]
        (frac_exposure, frac_losses) — both start at 0 and end at 1.
    """
    actual = _coerce(y_true)
    predicted = _coerce(y_pred)
    weights = _coerce(exposure)
    if weights is None:
        weights = np.ones_like(actual)

    rank = np.argsort(predicted)
    sorted_w = weights[rank]
    sorted_loss = actual[rank] * sorted_w

    # Prepend 0 so the curve starts at the origin.
    cum_exposure = np.concatenate([[0], np.cumsum(sorted_w)])
    cum_losses = np.concatenate([[0], np.cumsum(sorted_loss)])

    frac_exp = cum_exposure / cum_exposure[-1]
    # Guard against a portfolio with zero total losses.
    frac_loss = cum_losses / cum_losses[-1] if cum_losses[-1] > 0 else cum_losses

    if plot:
        import matplotlib.pyplot as plt

        if ax is None:
            _, ax = plt.subplots(figsize=(7, 6))

        ax.plot(frac_exp, frac_loss, label="Model", color="#1f77b4", linewidth=2)
        ax.plot([0, 1], [0, 1], "k--", linewidth=0.8, label="Random")
        ax.fill_between(frac_exp, frac_exp, frac_loss, alpha=0.12, color="#1f77b4")
        ax.set_xlabel("Cumulative share of exposure")
        ax.set_ylabel("Cumulative share of losses")
        ax.set_title("Lorenz Curve")
        ax.legend()
        ax.grid(alpha=0.3)

    return frac_exp, frac_loss

Compute (and optionally plot) the Lorenz curve for a model.

Policies are sorted by predicted risk. The Lorenz curve shows the cumulative share of exposure on the x-axis against cumulative share of losses on the y-axis.

Parameters

y_true : array-like Observed outcomes. y_pred : array-like Model predictions. exposure : array-like, optional Exposure weights. plot : bool If True, draw the Lorenz curve on ax or a new figure. ax : matplotlib.axes.Axes, optional Axes to draw on. Ignored if plot=False.

Returns

tuple[numpy.ndarray, numpy.ndarray] (frac_exposure, frac_losses) — both start at 0 and end at 1.

def double_lift( y_true: Union[numpy.ndarray, polars.series.series.Series, list], y_pred: Union[numpy.ndarray, polars.series.series.Series, list], exposure: Union[numpy.ndarray, polars.series.series.Series, list, NoneType] = None, n_bands: int = 10) -> polars.dataframe.frame.DataFrame:
def double_lift(
    y_true: Union[np.ndarray, pl.Series, list],
    y_pred: Union[np.ndarray, pl.Series, list],
    exposure: Optional[Union[np.ndarray, pl.Series, list]] = None,
    n_bands: int = 10,
) -> pl.DataFrame:
    """
    Double-lift chart: actual/expected by predicted risk decile.

    Policies are sorted by predicted value and grouped into n_bands equal-weight
    buckets. Within each bucket, the table shows mean actual, mean predicted, and
    the A/E ratio. A well-calibrated model has A/E close to 1.0 across all bands.

    Systematic deviations indicate model bias: monotone deviations suggest the
    model is over/under-dispersed; U-shaped patterns suggest missing interactions.

    Parameters
    ----------
    y_true : array-like
        Observed outcomes.
    y_pred : array-like
        Model predictions.
    exposure : array-like, optional
        Exposure weights. When provided, bands are equal-exposure buckets.
    n_bands : int
        Number of bands (default 10 for deciles).

    Returns
    -------
    polars.DataFrame
        Columns: band (int), exposure (float), actual (float), predicted (float),
        ae_ratio (float).
    """
    actual = _coerce(y_true)
    predicted = _coerce(y_pred)
    weights = _coerce(exposure)
    if weights is None:
        weights = np.ones_like(actual)

    # Order everything by predicted risk (ascending).
    rank = np.argsort(predicted)
    actual_s = actual[rank]
    predicted_s = predicted[rank]
    weights_s = weights[rank]

    # Cut points at equal steps of cumulative weight.
    cum_w = np.cumsum(weights_s)
    edges = np.linspace(0, cum_w[-1], n_bands + 1)
    cut_points = np.clip(
        np.searchsorted(cum_w, edges[1:], side="left"), 0, len(weights_s) - 1
    )

    records = []
    start = 0
    for band_no, cut in enumerate(cut_points, start=1):
        stop = int(cut) + 1
        band_w = weights_s[start:stop]
        total_w = float(np.sum(band_w))
        if total_w > 0:
            mean_actual = float(np.sum(actual_s[start:stop] * band_w)) / total_w
            mean_pred = float(np.sum(predicted_s[start:stop] * band_w)) / total_w
        else:
            # Degenerate (empty) band: report NaN rather than divide by zero.
            mean_actual = float("nan")
            mean_pred = float("nan")
        ae = mean_actual / mean_pred if mean_pred and mean_pred != 0 else float("nan")
        records.append(
            {
                "band": band_no,
                "exposure": total_w,
                "actual": mean_actual,
                "predicted": mean_pred,
                "ae_ratio": ae,
            }
        )
        start = stop

    return pl.DataFrame(records)

Double-lift chart: actual/expected by predicted risk decile.

Policies are sorted by predicted value and grouped into n_bands equal-weight buckets. Within each bucket, the table shows mean actual, mean predicted, and the A/E ratio. A well-calibrated model has A/E close to 1.0 across all bands.

Systematic deviations indicate model bias: monotone deviations suggest the model is over/under-dispersed; U-shaped patterns suggest missing interactions.

Parameters

y_true : array-like Observed outcomes. y_pred : array-like Model predictions. exposure : array-like, optional Exposure weights. When provided, bands are equal-exposure buckets. n_bands : int Number of bands (default 10 for deciles).

Returns

polars.DataFrame Columns: band (int), exposure (float), actual (float), predicted (float), ae_ratio (float).

def deviance( y_true: Union[numpy.ndarray, polars.series.series.Series, list], y_pred: Union[numpy.ndarray, polars.series.series.Series, list], exposure: Union[numpy.ndarray, polars.series.series.Series, list, NoneType] = None, family: str = 'poisson', variance_power: float = 1.5) -> float:
def deviance(
    y_true: Union[np.ndarray, pl.Series, list],
    y_pred: Union[np.ndarray, pl.Series, list],
    exposure: Optional[Union[np.ndarray, pl.Series, list]] = None,
    family: str = "poisson",
    variance_power: float = 1.5,
) -> float:
    """
    Mean deviance for a GLM family.

    Parameters
    ----------
    y_true : array-like
        Observed outcomes.
    y_pred : array-like
        Predicted values (on the response scale, not log scale).
    exposure : array-like, optional
        Exposure weights used in the weighted mean deviance.
    family : str
        One of 'poisson', 'gamma', 'tweedie'. Default 'poisson'.
    variance_power : float
        Tweedie variance power. Only used when family='tweedie'.

    Returns
    -------
    float
        Mean deviance (lower is better).

    Raises
    ------
    ValueError
        If ``family`` is not one of the supported names.
    """
    observed = _coerce(y_true)
    predicted = _coerce(y_pred)
    weights = _coerce(exposure)

    # Tweedie takes the extra variance-power argument, so dispatch it first.
    if family == "tweedie":
        return _deviance_tweedie(observed, predicted, variance_power, weights)

    two_arg_families = {
        "poisson": _deviance_poisson,
        "gamma": _deviance_gamma,
    }
    fn = two_arg_families.get(family)
    if fn is None:
        raise ValueError(f"family must be 'poisson', 'gamma', or 'tweedie', got '{family}'")
    return fn(observed, predicted, weights)

Mean deviance for a GLM family.

Parameters

y_true : array-like Observed outcomes. y_pred : array-like Predicted values (on the response scale, not log scale). exposure : array-like, optional Exposure weights used in the weighted mean deviance. family : str One of 'poisson', 'gamma', 'tweedie'. Default 'poisson'. variance_power : float Tweedie variance power. Only used when family='tweedie'.

Returns

float Mean deviance (lower is better).

def residual_plot( model: InsuranceEBM, X: "Union['pl.DataFrame', 'pd.DataFrame']", y: Union[numpy.ndarray, polars.series.series.Series, list], feature: str, exposure: Union[numpy.ndarray, polars.series.series.Series, list, NoneType] = None, n_bins: int = 20, ax=None):
def residual_plot(
    model: InsuranceEBM,
    X: Union["pl.DataFrame", "pd.DataFrame"],
    y: Union[np.ndarray, pl.Series, list],
    feature: str,
    exposure: Optional[Union[np.ndarray, pl.Series, list]] = None,
    n_bins: int = 20,
    ax=None,
):
    """
    Deviance residuals by feature bin.

    Groups observations by feature value (or quantile bin for continuous features)
    and plots mean deviance residual per bin. Useful for identifying features that
    the model is systematically mis-estimating.

    Parameters
    ----------
    model : InsuranceEBM
        Fitted model.
    X : DataFrame
        Feature matrix.
    y : array-like
        Observed outcomes.
    feature : str
        Feature to bin on (must be a column in X).
    exposure : array-like, optional
        Exposure weights.
    n_bins : int
        Number of quantile bins for continuous features. Ignored for categoricals.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on.

    Returns
    -------
    matplotlib.axes.Axes
    """
    import matplotlib.pyplot as plt

    model._check_fitted()

    # Accept polars or pandas input; work in polars internally.
    if isinstance(X, pl.DataFrame):
        X_pl = X
    else:
        X_pl = pl.from_pandas(X)

    y_arr = _coerce(y)
    w = _coerce(exposure)
    y_pred = model.predict(X_pl, exposure=exposure)

    # Compute per-observation Poisson-style deviance residuals (signed)
    # Unit deviance is 2*(y*ln(y/mu) - (y - mu)); the y == 0 branch is
    # handled explicitly so there is no log(0).
    eps = 1e-10
    y_safe = np.maximum(y_arr, 0.0)
    y_pred_safe = np.maximum(y_pred, eps)  # floor predictions away from zero
    # Positive residual means actual >= predicted (model under-predicts).
    sign = np.where(y_safe >= y_pred_safe, 1.0, -1.0)
    unit_dev = 2.0 * (
        np.where(y_safe > 0, y_safe * np.log(y_safe / y_pred_safe), 0.0)
        - (y_safe - y_pred_safe)
    )
    # Clamp tiny negative values from float rounding before the sqrt.
    residuals = sign * np.sqrt(np.maximum(unit_dev, 0.0))

    # Bin the feature
    col = X_pl[feature]
    if col.dtype in (pl.Utf8, pl.Categorical, pl.Boolean):
        # Categorical: group by value
        vals = col.cast(pl.Utf8).to_numpy()
        unique_vals = np.unique(vals)
        bin_labels = unique_vals
        # unique_vals is sorted, so searchsorted maps each row to the
        # index of its own level.
        assignments = np.searchsorted(unique_vals, vals)
    else:
        # Numeric: quantile bins
        col_np = col.cast(pl.Float64).to_numpy()
        quantiles = np.percentile(col_np, np.linspace(0, 100, n_bins + 1))
        # Heavily-tied features produce duplicate edges; collapsing them
        # yields fewer (but non-degenerate) bins than requested.
        quantiles = np.unique(quantiles)
        # Interior edges only: values at the min/max still fall into the
        # first/last bin rather than an out-of-range index.
        assignments = np.digitize(col_np, quantiles[1:-1])
        bin_labels = [f"Q{i+1}" for i in range(len(quantiles) - 1)]

    # Exposure-weighted mean residual per bin (plain mean without exposure).
    n_groups = len(bin_labels)
    mean_resid = np.zeros(n_groups)
    for i in range(n_groups):
        mask = assignments == i
        if mask.sum() == 0:
            mean_resid[i] = 0.0  # empty bin plotted as zero, not NaN
        elif w is not None:
            mean_resid[i] = float(np.average(residuals[mask], weights=w[mask]))
        else:
            mean_resid[i] = float(np.mean(residuals[mask]))

    if ax is None:
        fig, ax = plt.subplots(figsize=(max(8, n_groups * 0.5), 5))

    # Red bars where the model under-predicts (positive mean residual),
    # green where it over-predicts.
    colours = ["#d62728" if r > 0 else "#2ca02c" for r in mean_resid]
    ax.bar(range(n_groups), mean_resid, color=colours, alpha=0.8, edgecolor="white")
    ax.axhline(0, color="black", linewidth=0.8, linestyle="--")
    ax.set_xticks(range(n_groups))
    ax.set_xticklabels(bin_labels, rotation=45, ha="right", fontsize=8)
    ax.set_ylabel("Mean deviance residual (signed)")
    ax.set_title(f"Residuals by {feature}")
    ax.grid(axis="y", alpha=0.3)

    return ax

Deviance residuals by feature bin.

Groups observations by feature value (or quantile bin for continuous features) and plots mean deviance residual per bin. Useful for identifying features that the model is systematically mis-estimating.

Parameters

model : InsuranceEBM Fitted model. X : DataFrame Feature matrix. y : array-like Observed outcomes. feature : str Feature to bin on (must be a column in X). exposure : array-like, optional Exposure weights. n_bins : int Number of quantile bins for continuous features. Ignored for categoricals. ax : matplotlib.axes.Axes, optional Axes to draw on.

Returns

matplotlib.axes.Axes

def calibration_table( y_true: Union[numpy.ndarray, polars.series.series.Series, list], y_pred: Union[numpy.ndarray, polars.series.series.Series, list], segment: Union[numpy.ndarray, polars.series.series.Series, list], exposure: Union[numpy.ndarray, polars.series.series.Series, list, NoneType] = None) -> polars.dataframe.frame.DataFrame:
def calibration_table(
    y_true: Union[np.ndarray, pl.Series, list],
    y_pred: Union[np.ndarray, pl.Series, list],
    segment: Union[np.ndarray, pl.Series, list],
    exposure: Optional[Union[np.ndarray, pl.Series, list]] = None,
) -> pl.DataFrame:
    """
    Actual vs expected by segment.

    Standard actuarial A/E table. For each segment (e.g. area, vehicle group,
    risk band), shows total exposure, total actual claims, total predicted claims,
    and the A/E ratio.

    Parameters
    ----------
    y_true : array-like
        Observed outcomes.
    y_pred : array-like
        Predicted values.
    segment : array-like
        Segment labels (strings or integers).
    exposure : array-like, optional
        Exposure weights. If None, each observation has weight 1.

    Returns
    -------
    polars.DataFrame
        Columns: segment, exposure, actual_total, predicted_total, ae_ratio.
        Sorted by ae_ratio descending (worst calibrated segments first).
    """
    y_t = _coerce(y_true)
    y_p = _coerce(y_pred)
    w = _coerce(exposure)

    # Normalise segment labels to a numpy array. (A previous version also
    # pre-computed `seg` via _coerce, but that value was unconditionally
    # overwritten here — the dead assignment has been removed.)
    if isinstance(segment, pl.Series):
        seg = segment.to_numpy()
    else:
        seg = np.asarray(segment)

    if w is None:
        w = np.ones_like(y_t)

    rows = []
    for s in np.unique(seg):
        mask = seg == s
        total_w = float(np.sum(w[mask]))
        actual = float(np.sum(y_t[mask] * w[mask]))
        predicted = float(np.sum(y_p[mask] * w[mask]))
        # Guard the ratio: zero/negative predicted totals yield NaN.
        ae = actual / predicted if predicted > 0 else float("nan")
        rows.append(
            {
                "segment": str(s),
                "exposure": total_w,
                "actual_total": actual,
                "predicted_total": predicted,
                "ae_ratio": ae,
            }
        )

    return pl.DataFrame(rows).sort("ae_ratio", descending=True)

Actual vs expected by segment.

Standard actuarial A/E table. For each segment (e.g. area, vehicle group, risk band), shows total exposure, total actual claims, total predicted claims, and the A/E ratio.

Parameters

y_true : array-like Observed outcomes. y_pred : array-like Predicted values. segment : array-like Segment labels (strings or integers). exposure : array-like, optional Exposure weights. If None, each observation has weight 1.

Returns

polars.DataFrame Columns: segment, exposure, actual_total, predicted_total, ae_ratio. Sorted by ae_ratio descending (worst calibrated segments first).