Edit on GitHub

insurance_distributional_glm

insurance-distributional-glm

GAMLSS (Generalised Additive Models for Location, Scale and Shape) for insurance pricing in Python.

Models the full conditional distribution p(Y|X), not just E[Y|X]. Each distribution parameter (mean, variance, shape) can depend on covariates.

Quick start

>>> import numpy as np
>>> import polars as pl
>>> from insurance_distributional_glm import DistributionalGLM
>>> from insurance_distributional_glm.families import Gamma
>>>
>>> rng = np.random.default_rng(42)
>>> df = pl.DataFrame({"age_band": rng.integers(0, 3, 500).astype(float)})
>>> y = rng.gamma(shape=2.0, scale=500.0, size=500)
>>>
>>> model = DistributionalGLM(
...     family=Gamma(),
...     formulas={"mu": ["age_band"], "sigma": []},
... )
>>> model.fit(df, y)
>>> model.predict(df, parameter="mu")[:5]
 1"""
 2insurance-distributional-glm
 3=============================
 4
 5GAMLSS (Generalised Additive Models for Location, Scale and Shape) for
 6insurance pricing in Python.
 7
 8Models the full conditional distribution p(Y|X), not just E[Y|X].
 9Each distribution parameter (mean, variance, shape) can depend on covariates.
10
11Quick start
12-----------
13>>> import numpy as np
14>>> import polars as pl
15>>> from insurance_distributional_glm import DistributionalGLM
16>>> from insurance_distributional_glm.families import Gamma
17>>>
18>>> rng = np.random.default_rng(42)
19>>> df = pl.DataFrame({"age_band": rng.integers(0, 3, 500).astype(float)})
20>>> y = rng.gamma(shape=2.0, scale=500.0, size=500)
21>>>
22>>> model = DistributionalGLM(
23...     family=Gamma(),
24...     formulas={"mu": ["age_band"], "sigma": []},
25... )
26>>> model.fit(df, y)
27>>> model.predict(df, parameter="mu")[:5]
28"""
29
30from .model import DistributionalGLM
31from .selection import choose_distribution, gaic, SelectionResult
32from .diagnostics import quantile_residuals, worm_plot
33from . import families
34
35from importlib.metadata import version, PackageNotFoundError
36
37try:
38    __version__ = version("insurance-distributional-glm")
39except PackageNotFoundError:
40    __version__ = "0.0.0"  # not installed
41
42__all__ = [
43    "DistributionalGLM",
44    "choose_distribution",
45    "gaic",
46    "SelectionResult",
47    "quantile_residuals",
48    "worm_plot",
49    "families",
50]
class DistributionalGLM:
 84class DistributionalGLM:
 85    """
 86    GAMLSS model: model ALL distribution parameters as functions of covariates.
 87
 88    Standard GLM models only E[Y|X]. This models the full conditional
 89    distribution p(Y|X) by allowing each parameter (mean, variance, shape)
 90    to depend on covariates.
 91
 92    Example
 93    -------
 94    >>> import polars as pl
 95    >>> from insurance_distributional_glm import DistributionalGLM
 96    >>> from insurance_distributional_glm.families import Gamma
 97    >>>
 98    >>> df = pl.DataFrame({"age": [25, 35, 45], "vehicle_value": [5000, 15000, 30000]})
 99    >>> y = np.array([500.0, 1200.0, 3500.0])
100    >>>
101    >>> model = DistributionalGLM(
102    ...     family=Gamma(),
103    ...     formulas={"mu": ["age", "vehicle_value"], "sigma": ["age"]},
104    ... )
105    >>> model.fit(df, y)
106    >>> model.predict(df, parameter="mu")
107    """
108
109    def __init__(
110        self,
111        family: GamlssFamily,
112        formulas: Optional[Dict[str, List[str]]] = None,
113    ):
114        """
115        Parameters
116        ----------
117        family : GamlssFamily
118            Distribution family. Determines which parameters exist and their
119            default link functions.
120        formulas : dict, optional
121            Maps parameter name -> list of column names to include.
122            Intercept is always added automatically.
123            If None: mu gets all columns, other parameters get intercept-only.
124            If {'mu': [], 'sigma': []}: intercept-only for all parameters.
125        """
126        self.family = family
127        self.formulas = formulas
128        self._betas: Dict[str, np.ndarray] = {}
129        self._params: Dict[str, np.ndarray] = {}
130        self._columns: List[str] = []
131        self._loglik: float = float("nan")
132        self._loglik_history: List[float] = []
133        self._converged: bool = False
134        self._n: int = 0
135        self._fitted = False
136
137    def fit(
138        self,
139        X,
140        y,
141        exposure: Optional[np.ndarray] = None,
142        weights: Optional[np.ndarray] = None,
143        max_iter: int = 100,
144        tol: float = 1e-6,
145        verbose: bool = False,
146    ) -> "DistributionalGLM":
147        """
148        Fit the model via the RS algorithm.
149
150        Parameters
151        ----------
152        X : polars/pandas DataFrame or numpy array
153            Covariate matrix. Column names must match the formulas dict.
154        y : array-like
155            Response observations. Must be positive for Gamma/LogNormal/IG.
156        exposure : array-like, optional
157            Exposure values (not log). Applied as log-offset on mu.
158        weights : array-like, optional
159            Observation weights.
160        max_iter : int
161            Maximum RS outer iterations (default 100).
162        tol : float
163            Convergence tolerance on log-likelihood change (default 1e-6).
164        verbose : bool
165            Print log-likelihood at each RS iteration.
166
167        Returns
168        -------
169        self
170        """
171        y = np.asarray(y, dtype=float)
172        self._n = len(y)
173        self._columns = _get_columns(X)
174
175        # Resolve formulas
176        formulas = self._resolve_formulas()
177
178        # Build design matrices
179        design_matrices = {}
180        for pname in self.family.param_names:
181            cols = formulas.get(pname, [])
182            design_matrices[pname] = _build_design_matrix(X, cols)
183
184        # Exposure offset
185        log_offset = None
186        if exposure is not None:
187            exposure = np.asarray(exposure, dtype=float)
188            log_offset = np.log(np.clip(exposure, 1e-10, None))
189
190        if weights is not None:
191            weights = np.asarray(weights, dtype=float)
192
193        # Run RS algorithm
194        betas, params, ll_history, converged = rs_fit(
195            family=self.family,
196            design_matrices=design_matrices,
197            y=y,
198            log_offset=log_offset,
199            weights=weights,
200            max_iter=max_iter,
201            tol=tol,
202            verbose=verbose,
203        )
204
205        self._betas = betas
206        self._params = params
207        self._loglik_history = ll_history
208        self._loglik = ll_history[-1] if ll_history else float("nan")
209        self._converged = converged
210        self._design_matrices = design_matrices
211        self._formulas = formulas
212        self._fitted = True
213
214        if not converged and verbose:
215            print(f"Warning: RS algorithm did not converge in {max_iter} iterations.")
216
217        return self
218
219    def _resolve_formulas(self) -> Dict[str, List[str]]:
220        """Apply default formula logic if formulas is None."""
221        param_names = self.family.param_names
222        if self.formulas is None:
223            result = {}
224            for i, pname in enumerate(param_names):
225                result[pname] = self._columns if i == 0 else []
226            return result
227        # Fill in missing params with intercept-only
228        result = {}
229        for pname in param_names:
230            result[pname] = self.formulas.get(pname, [])
231        return result
232
233    def predict(
234        self,
235        X,
236        parameter: str = "mu",
237        exposure: Optional[np.ndarray] = None,
238    ) -> np.ndarray:
239        """
240        Predict a distribution parameter on the response scale.
241
242        Parameters
243        ----------
244        X : DataFrame or array
245            Covariates.
246        parameter : str
247            Which parameter to predict (e.g. 'mu', 'sigma', 'pi').
248        exposure : array-like, optional
249            Exposure offset for mu predictions.
250
251        Returns
252        -------
253        ndarray, shape (n,)
254            Predicted parameter values on response scale.
255        """
256        self._check_fitted()
257        cols = self._formulas.get(parameter, [])
258        X_mat = _build_design_matrix(X, cols)
259        link = self.family.default_links[parameter]
260
261        eta = X_mat @ self._betas[parameter]
262        if parameter == "mu" and exposure is not None:
263            exposure = np.asarray(exposure, dtype=float)
264            eta = eta + np.log(np.clip(exposure, 1e-10, None))
265
266        return link.inverse(eta)
267
268    def predict_distribution(self, X, exposure: Optional[np.ndarray] = None):
269        """
270        Return a list of scipy.stats frozen distribution objects, one per row.
271
272        Parameters
273        ----------
274        X : DataFrame or array
275        exposure : array-like, optional
276
277        Returns
278        -------
279        list of scipy.stats frozen distributions (or None if unsupported)
280        """
281        self._check_fitted()
282        from scipy import stats
283
284        param_arrays = {}
285        for pname in self.family.param_names:
286            param_arrays[pname] = self.predict(X, parameter=pname, exposure=exposure)
287
288        n = len(param_arrays[self.family.param_names[0]])
289        family_name = self.family.__class__.__name__
290        dists = []
291
292        for i in range(n):
293            p = {k: v[i] for k, v in param_arrays.items()}
294            dist = _make_scipy_dist(family_name, p, self.family)
295            dists.append(dist)
296
297        return dists
298
299    def predict_mean(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray:
300        """E[Y|X] — convenience wrapper.
301
302        For ZIP, E[Y] = (1-pi)*mu, not mu alone.
303        """
304        mu = self.predict(X, parameter="mu", exposure=exposure)
305        if self.family.__class__.__name__ == "ZIP":
306            pi = self.predict(X, parameter="pi")
307            return (1.0 - pi) * mu
308        return mu
309
310    def predict_variance(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray:
311        """
312        Var[Y|X] from the fitted distribution parameters.
313
314        Uses the distribution's variance formula rather than empirical variance.
315        """
316        self._check_fitted()
317        family_name = self.family.__class__.__name__
318        param_arrays = {
319            pname: self.predict(X, parameter=pname, exposure=exposure)
320            for pname in self.family.param_names
321        }
322        return _compute_variance(family_name, param_arrays, self.family)
323
324    def volatility_score(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray:
325        """
326        Coefficient of variation CV = sqrt(Var[Y|X]) / E[Y|X] per risk.
327
328        Insurance interpretation: risks with CV > 1 have larger standard
329        deviation than mean — typical for heavy-tailed severity.
330        Useful for pricing loading decisions and risk selection.
331        """
332        mu = self.predict_mean(X, exposure=exposure)
333        var = self.predict_variance(X, exposure=exposure)
334        return np.sqrt(np.clip(var, 0, None)) / np.clip(mu, 1e-10, None)
335
336    def relativities(self, parameter: str = "mu"):
337        """
338        Return coefficient relativities for a given parameter.
339
340        For log-linked parameters (mu, sigma on most families):
341          - Intercept relativity = exp(beta_intercept) — the fitted baseline
342            on the response scale.
343          - All other term relativities = exp(beta_j) — the multiplicative
344            effect of a one-unit increase in that term on the response scale.
345
346        For logit-linked parameters (pi in ZIP):
347          Returns odds ratios: exp(beta_j).
348
349        Returns
350        -------
351        polars DataFrame with columns: param, term, coefficient, relativity, link
352        """
353        self._check_fitted()
354        if not _HAS_POLARS:
355            raise ImportError("polars required for relativities(). Install polars.")
356
357        betas = self._betas[parameter]
358        cols = self._formulas.get(parameter, [])
359        link = self.family.default_links[parameter]
360
361        terms = ["(Intercept)"] + cols
362        intercept = betas[0]
363
364        rows = []
365        for term, beta in zip(terms, betas):
366            if link.name == "log":
367                rel = float(np.exp(beta))
368                if term == "(Intercept)":
369                    base = float(np.exp(intercept))
370                    rows.append({"param": parameter, "term": term, "coefficient": float(beta),
371                                 "relativity": base, "link": "log"})
372                else:
373                    rows.append({"param": parameter, "term": term, "coefficient": float(beta),
374                                 "relativity": rel, "link": "log"})
375            elif link.name == "logit":
376                rows.append({"param": parameter, "term": term, "coefficient": float(beta),
377                             "relativity": float(np.exp(beta)), "link": "logit (odds ratio)"})
378            else:
379                rows.append({"param": parameter, "term": term, "coefficient": float(beta),
380                             "relativity": float(beta), "link": link.name})
381
382        return pl.DataFrame(rows)
383
384    def summary(self) -> None:
385        """Print coefficient summary for all parameters."""
386        self._check_fitted()
387        print(f"\nDistributionalGLM — {self.family.__class__.__name__}")
388        print(f"  n = {self._n}, loglik = {self._loglik:.4f}")
389        print(f"  Converged: {self._converged}")
390        print(f"  GAIC(2): {self.gaic(penalty=2):.4f}")
391        print()
392
393        for pname in self.family.param_names:
394            betas = self._betas[pname]
395            cols = self._formulas.get(pname, [])
396            terms = ["(Intercept)"] + cols
397            link = self.family.default_links[pname]
398            print(f"  Parameter: {pname}  (link: {link.name})")
399            print(f"  {'Term':<30} {'Coef':>12}")
400            print(f"  {'-'*44}")
401            for term, beta in zip(terms, betas):
402                print(f"  {term:<30} {beta:>12.5f}")
403            print()
404
405    def gaic(self, penalty: float = 2.0) -> float:
406        """
407        Generalised AIC with given penalty.
408
409        penalty=2   -> standard AIC
410        penalty=log(n)  -> BIC
411        """
412        self._check_fitted()
413        total_params = sum(len(b) for b in self._betas.values())
414        return _gaic(self._loglik, total_params, penalty=penalty)
415
416    def score(
417        self,
418        X,
419        y,
420        metric: str = "nll",
421        exposure: Optional[np.ndarray] = None,
422    ) -> float:
423        """
424        Evaluate model on held-out data.
425
426        Parameters
427        ----------
428        X : DataFrame or array
429        y : array-like
430        metric : str
431            'nll' (negative log-likelihood), 'deviance', or 'crps'.
432        exposure : array-like, optional
433
434        Returns
435        -------
436        float
437            Metric value. Lower is better for nll and deviance.
438        """
439        self._check_fitted()
440        y = np.asarray(y, dtype=float)
441
442        param_arrays = {
443            pname: self.predict(X, parameter=pname, exposure=exposure)
444            for pname in self.family.param_names
445        }
446        ll = self.family.log_likelihood(y, param_arrays)
447
448        if metric == "nll":
449            return float(-np.mean(ll[np.isfinite(ll)]))
450        elif metric == "deviance":
451            # Saturated model: each obs gets its own parameter
452            ll_sat = self._saturated_loglik(y, param_arrays)
453            return float(2.0 * np.sum(ll_sat - ll))
454        elif metric == "crps":
455            dists = self.predict_distribution(X, exposure=exposure)
456            crps_vals = []
457            for dist, yi in zip(dists, y):
458                if dist is not None:
459                    # Monte Carlo CRPS approximation using the standard estimator:
460                    #   CRPS = E|X - y| - 0.5 * E|X - X'|
461                    # where X, X' are independent draws from the predicted distribution.
462                    # We draw n_samples and use a random split for the second term —
463                    # this avoids the O(n^2) full pairwise calculation and does not
464                    # depend on drawing exactly 1000 samples.
465                    n_samples = 500
466                    s1 = dist.rvs(n_samples)
467                    s2 = dist.rvs(n_samples)
468                    crps_vals.append(
469                        np.mean(np.abs(s1 - yi)) - 0.5 * np.mean(np.abs(s1 - s2))
470                    )
471            return float(np.mean(crps_vals))
472        else:
473            raise ValueError(f"Unknown metric: {metric}. Use 'nll', 'deviance', or 'crps'.")
474
475    def _saturated_loglik(self, y, params):
476        """Saturated log-likelihood (each obs perfectly fitted to itself)."""
477        sat_params = {k: v.copy() for k, v in params.items()}
478        sat_params["mu"] = y.copy()
479        try:
480            ll = self.family.log_likelihood(y, sat_params)
481            return np.where(np.isfinite(ll), ll, 0.0)
482        except Exception:
483            return np.zeros(len(y))
484
485    def _check_fitted(self):
486        if not self._fitted:
487            raise RuntimeError("Model not fitted. Call fit() first.")
488
489    @property
490    def coefficients(self) -> Dict[str, np.ndarray]:
491        """Fitted coefficients for all parameters."""
492        self._check_fitted()
493        return {k: v.copy() for k, v in self._betas.items()}
494
495    @property
496    def n_observations(self) -> int:
497        return self._n
498
499    @property
500    def loglikelihood(self) -> float:
501        self._check_fitted()
502        return self._loglik
503
504    @property
505    def converged(self) -> bool:
506        self._check_fitted()
507        return self._converged
508
509    def __repr__(self) -> str:
510        status = "fitted" if self._fitted else "unfitted"
511        return (
512            f"DistributionalGLM({self.family.__class__.__name__}, "
513            f"n={self._n}, {status})"
514        )

GAMLSS model: model ALL distribution parameters as functions of covariates.

Standard GLM models only E[Y|X]. This models the full conditional distribution p(Y|X) by allowing each parameter (mean, variance, shape) to depend on covariates.

Example

>>> import polars as pl
>>> from insurance_distributional_glm import DistributionalGLM
>>> from insurance_distributional_glm.families import Gamma
>>>
>>> df = pl.DataFrame({"age": [25, 35, 45], "vehicle_value": [5000, 15000, 30000]})
>>> y = np.array([500.0, 1200.0, 3500.0])
>>>
>>> model = DistributionalGLM(
...     family=Gamma(),
...     formulas={"mu": ["age", "vehicle_value"], "sigma": ["age"]},
... )
>>> model.fit(df, y)
>>> model.predict(df, parameter="mu")
DistributionalGLM( family: insurance_distributional_glm.families.GamlssFamily, formulas: Optional[Dict[str, List[str]]] = None)
109    def __init__(
110        self,
111        family: GamlssFamily,
112        formulas: Optional[Dict[str, List[str]]] = None,
113    ):
114        """
115        Parameters
116        ----------
117        family : GamlssFamily
118            Distribution family. Determines which parameters exist and their
119            default link functions.
120        formulas : dict, optional
121            Maps parameter name -> list of column names to include.
122            Intercept is always added automatically.
123            If None: mu gets all columns, other parameters get intercept-only.
124            If {'mu': [], 'sigma': []}: intercept-only for all parameters.
125        """
126        self.family = family
127        self.formulas = formulas
128        self._betas: Dict[str, np.ndarray] = {}
129        self._params: Dict[str, np.ndarray] = {}
130        self._columns: List[str] = []
131        self._loglik: float = float("nan")
132        self._loglik_history: List[float] = []
133        self._converged: bool = False
134        self._n: int = 0
135        self._fitted = False

Parameters

family : GamlssFamily Distribution family. Determines which parameters exist and their default link functions. formulas : dict, optional Maps parameter name -> list of column names to include. Intercept is always added automatically. If None: mu gets all columns, other parameters get intercept-only. If {'mu': [], 'sigma': []}: intercept-only for all parameters.

family
formulas
def fit( self, X, y, exposure: Optional[numpy.ndarray] = None, weights: Optional[numpy.ndarray] = None, max_iter: int = 100, tol: float = 1e-06, verbose: bool = False) -> DistributionalGLM:
137    def fit(
138        self,
139        X,
140        y,
141        exposure: Optional[np.ndarray] = None,
142        weights: Optional[np.ndarray] = None,
143        max_iter: int = 100,
144        tol: float = 1e-6,
145        verbose: bool = False,
146    ) -> "DistributionalGLM":
147        """
148        Fit the model via the RS algorithm.
149
150        Parameters
151        ----------
152        X : polars/pandas DataFrame or numpy array
153            Covariate matrix. Column names must match the formulas dict.
154        y : array-like
155            Response observations. Must be positive for Gamma/LogNormal/IG.
156        exposure : array-like, optional
157            Exposure values (not log). Applied as log-offset on mu.
158        weights : array-like, optional
159            Observation weights.
160        max_iter : int
161            Maximum RS outer iterations (default 100).
162        tol : float
163            Convergence tolerance on log-likelihood change (default 1e-6).
164        verbose : bool
165            Print log-likelihood at each RS iteration.
166
167        Returns
168        -------
169        self
170        """
171        y = np.asarray(y, dtype=float)
172        self._n = len(y)
173        self._columns = _get_columns(X)
174
175        # Resolve formulas
176        formulas = self._resolve_formulas()
177
178        # Build design matrices
179        design_matrices = {}
180        for pname in self.family.param_names:
181            cols = formulas.get(pname, [])
182            design_matrices[pname] = _build_design_matrix(X, cols)
183
184        # Exposure offset
185        log_offset = None
186        if exposure is not None:
187            exposure = np.asarray(exposure, dtype=float)
188            log_offset = np.log(np.clip(exposure, 1e-10, None))
189
190        if weights is not None:
191            weights = np.asarray(weights, dtype=float)
192
193        # Run RS algorithm
194        betas, params, ll_history, converged = rs_fit(
195            family=self.family,
196            design_matrices=design_matrices,
197            y=y,
198            log_offset=log_offset,
199            weights=weights,
200            max_iter=max_iter,
201            tol=tol,
202            verbose=verbose,
203        )
204
205        self._betas = betas
206        self._params = params
207        self._loglik_history = ll_history
208        self._loglik = ll_history[-1] if ll_history else float("nan")
209        self._converged = converged
210        self._design_matrices = design_matrices
211        self._formulas = formulas
212        self._fitted = True
213
214        if not converged and verbose:
215            print(f"Warning: RS algorithm did not converge in {max_iter} iterations.")
216
217        return self

Fit the model via the RS algorithm.

Parameters

X : polars/pandas DataFrame or numpy array Covariate matrix. Column names must match the formulas dict. y : array-like Response observations. Must be positive for Gamma/LogNormal/IG. exposure : array-like, optional Exposure values (not log). Applied as log-offset on mu. weights : array-like, optional Observation weights. max_iter : int Maximum RS outer iterations (default 100). tol : float Convergence tolerance on log-likelihood change (default 1e-6). verbose : bool Print log-likelihood at each RS iteration.

Returns

self

def predict( self, X, parameter: str = 'mu', exposure: Optional[numpy.ndarray] = None) -> numpy.ndarray:
233    def predict(
234        self,
235        X,
236        parameter: str = "mu",
237        exposure: Optional[np.ndarray] = None,
238    ) -> np.ndarray:
239        """
240        Predict a distribution parameter on the response scale.
241
242        Parameters
243        ----------
244        X : DataFrame or array
245            Covariates.
246        parameter : str
247            Which parameter to predict (e.g. 'mu', 'sigma', 'pi').
248        exposure : array-like, optional
249            Exposure offset for mu predictions.
250
251        Returns
252        -------
253        ndarray, shape (n,)
254            Predicted parameter values on response scale.
255        """
256        self._check_fitted()
257        cols = self._formulas.get(parameter, [])
258        X_mat = _build_design_matrix(X, cols)
259        link = self.family.default_links[parameter]
260
261        eta = X_mat @ self._betas[parameter]
262        if parameter == "mu" and exposure is not None:
263            exposure = np.asarray(exposure, dtype=float)
264            eta = eta + np.log(np.clip(exposure, 1e-10, None))
265
266        return link.inverse(eta)

Predict a distribution parameter on the response scale.

Parameters

X : DataFrame or array Covariates. parameter : str Which parameter to predict (e.g. 'mu', 'sigma', 'pi'). exposure : array-like, optional Exposure offset for mu predictions.

Returns

ndarray, shape (n,) Predicted parameter values on response scale.

def predict_distribution(self, X, exposure: Optional[numpy.ndarray] = None):
268    def predict_distribution(self, X, exposure: Optional[np.ndarray] = None):
269        """
270        Return a list of scipy.stats frozen distribution objects, one per row.
271
272        Parameters
273        ----------
274        X : DataFrame or array
275        exposure : array-like, optional
276
277        Returns
278        -------
279        list of scipy.stats frozen distributions (or None if unsupported)
280        """
281        self._check_fitted()
282        from scipy import stats
283
284        param_arrays = {}
285        for pname in self.family.param_names:
286            param_arrays[pname] = self.predict(X, parameter=pname, exposure=exposure)
287
288        n = len(param_arrays[self.family.param_names[0]])
289        family_name = self.family.__class__.__name__
290        dists = []
291
292        for i in range(n):
293            p = {k: v[i] for k, v in param_arrays.items()}
294            dist = _make_scipy_dist(family_name, p, self.family)
295            dists.append(dist)
296
297        return dists

Return a list of scipy.stats frozen distribution objects, one per row.

Parameters

X : DataFrame or array exposure : array-like, optional

Returns

list of scipy.stats frozen distributions (or None if unsupported)

def predict_mean(self, X, exposure: Optional[numpy.ndarray] = None) -> numpy.ndarray:
299    def predict_mean(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray:
300        """E[Y|X] — convenience wrapper.
301
302        For ZIP, E[Y] = (1-pi)*mu, not mu alone.
303        """
304        mu = self.predict(X, parameter="mu", exposure=exposure)
305        if self.family.__class__.__name__ == "ZIP":
306            pi = self.predict(X, parameter="pi")
307            return (1.0 - pi) * mu
308        return mu

E[Y|X] — convenience wrapper.

For ZIP, E[Y] = (1-pi)*mu, not mu alone.

def predict_variance(self, X, exposure: Optional[numpy.ndarray] = None) -> numpy.ndarray:
310    def predict_variance(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray:
311        """
312        Var[Y|X] from the fitted distribution parameters.
313
314        Uses the distribution's variance formula rather than empirical variance.
315        """
316        self._check_fitted()
317        family_name = self.family.__class__.__name__
318        param_arrays = {
319            pname: self.predict(X, parameter=pname, exposure=exposure)
320            for pname in self.family.param_names
321        }
322        return _compute_variance(family_name, param_arrays, self.family)

Var[Y|X] from the fitted distribution parameters.

Uses the distribution's variance formula rather than empirical variance.

def volatility_score(self, X, exposure: Optional[numpy.ndarray] = None) -> numpy.ndarray:
324    def volatility_score(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray:
325        """
326        Coefficient of variation CV = sqrt(Var[Y|X]) / E[Y|X] per risk.
327
328        Insurance interpretation: risks with CV > 1 have larger standard
329        deviation than mean — typical for heavy-tailed severity.
330        Useful for pricing loading decisions and risk selection.
331        """
332        mu = self.predict_mean(X, exposure=exposure)
333        var = self.predict_variance(X, exposure=exposure)
334        return np.sqrt(np.clip(var, 0, None)) / np.clip(mu, 1e-10, None)

Coefficient of variation CV = sqrt(Var[Y|X]) / E[Y|X] per risk.

Insurance interpretation: risks with CV > 1 have larger standard deviation than mean — typical for heavy-tailed severity. Useful for pricing loading decisions and risk selection.

def relativities(self, parameter: str = 'mu'):
336    def relativities(self, parameter: str = "mu"):
337        """
338        Return coefficient relativities for a given parameter.
339
340        For log-linked parameters (mu, sigma on most families):
341          - Intercept relativity = exp(beta_intercept) — the fitted baseline
342            on the response scale.
343          - All other term relativities = exp(beta_j) — the multiplicative
344            effect of a one-unit increase in that term on the response scale.
345
346        For logit-linked parameters (pi in ZIP):
347          Returns odds ratios: exp(beta_j).
348
349        Returns
350        -------
351        polars DataFrame with columns: param, term, coefficient, relativity, link
352        """
353        self._check_fitted()
354        if not _HAS_POLARS:
355            raise ImportError("polars required for relativities(). Install polars.")
356
357        betas = self._betas[parameter]
358        cols = self._formulas.get(parameter, [])
359        link = self.family.default_links[parameter]
360
361        terms = ["(Intercept)"] + cols
362        intercept = betas[0]
363
364        rows = []
365        for term, beta in zip(terms, betas):
366            if link.name == "log":
367                rel = float(np.exp(beta))
368                if term == "(Intercept)":
369                    base = float(np.exp(intercept))
370                    rows.append({"param": parameter, "term": term, "coefficient": float(beta),
371                                 "relativity": base, "link": "log"})
372                else:
373                    rows.append({"param": parameter, "term": term, "coefficient": float(beta),
374                                 "relativity": rel, "link": "log"})
375            elif link.name == "logit":
376                rows.append({"param": parameter, "term": term, "coefficient": float(beta),
377                             "relativity": float(np.exp(beta)), "link": "logit (odds ratio)"})
378            else:
379                rows.append({"param": parameter, "term": term, "coefficient": float(beta),
380                             "relativity": float(beta), "link": link.name})
381
382        return pl.DataFrame(rows)

Return coefficient relativities for a given parameter.

For log-linked parameters (mu, sigma on most families):

  • Intercept relativity = exp(beta_intercept) — the fitted baseline on the response scale.
  • All other term relativities = exp(beta_j) — the multiplicative effect of a one-unit increase in that term on the response scale.

For logit-linked parameters (pi in ZIP): Returns odds ratios: exp(beta_j).

Returns

polars DataFrame with columns: param, term, coefficient, relativity, link

def summary(self) -> None:
384    def summary(self) -> None:
385        """Print coefficient summary for all parameters."""
386        self._check_fitted()
387        print(f"\nDistributionalGLM — {self.family.__class__.__name__}")
388        print(f"  n = {self._n}, loglik = {self._loglik:.4f}")
389        print(f"  Converged: {self._converged}")
390        print(f"  GAIC(2): {self.gaic(penalty=2):.4f}")
391        print()
392
393        for pname in self.family.param_names:
394            betas = self._betas[pname]
395            cols = self._formulas.get(pname, [])
396            terms = ["(Intercept)"] + cols
397            link = self.family.default_links[pname]
398            print(f"  Parameter: {pname}  (link: {link.name})")
399            print(f"  {'Term':<30} {'Coef':>12}")
400            print(f"  {'-'*44}")
401            for term, beta in zip(terms, betas):
402                print(f"  {term:<30} {beta:>12.5f}")
403            print()

Print coefficient summary for all parameters.

def gaic(self, penalty: float = 2.0) -> float:
405    def gaic(self, penalty: float = 2.0) -> float:
406        """
407        Generalised AIC with given penalty.
408
409        penalty=2   -> standard AIC
410        penalty=log(n)  -> BIC
411        """
412        self._check_fitted()
413        total_params = sum(len(b) for b in self._betas.values())
414        return _gaic(self._loglik, total_params, penalty=penalty)

Generalised AIC with given penalty.

penalty=2 -> standard AIC penalty=log(n) -> BIC

def score( self, X, y, metric: str = 'nll', exposure: Optional[numpy.ndarray] = None) -> float:
416    def score(
417        self,
418        X,
419        y,
420        metric: str = "nll",
421        exposure: Optional[np.ndarray] = None,
422    ) -> float:
423        """
424        Evaluate model on held-out data.
425
426        Parameters
427        ----------
428        X : DataFrame or array
429        y : array-like
430        metric : str
431            'nll' (negative log-likelihood), 'deviance', or 'crps'.
432        exposure : array-like, optional
433
434        Returns
435        -------
436        float
437            Metric value. Lower is better for nll and deviance.
438        """
439        self._check_fitted()
440        y = np.asarray(y, dtype=float)
441
442        param_arrays = {
443            pname: self.predict(X, parameter=pname, exposure=exposure)
444            for pname in self.family.param_names
445        }
446        ll = self.family.log_likelihood(y, param_arrays)
447
448        if metric == "nll":
449            return float(-np.mean(ll[np.isfinite(ll)]))
450        elif metric == "deviance":
451            # Saturated model: each obs gets its own parameter
452            ll_sat = self._saturated_loglik(y, param_arrays)
453            return float(2.0 * np.sum(ll_sat - ll))
454        elif metric == "crps":
455            dists = self.predict_distribution(X, exposure=exposure)
456            crps_vals = []
457            for dist, yi in zip(dists, y):
458                if dist is not None:
459                    # Monte Carlo CRPS approximation using the standard estimator:
460                    #   CRPS = E|X - y| - 0.5 * E|X - X'|
461                    # where X, X' are independent draws from the predicted distribution.
462                    # We draw n_samples and use a random split for the second term —
463                    # this avoids the O(n^2) full pairwise calculation and does not
464                    # depend on drawing exactly 1000 samples.
465                    n_samples = 500
466                    s1 = dist.rvs(n_samples)
467                    s2 = dist.rvs(n_samples)
468                    crps_vals.append(
469                        np.mean(np.abs(s1 - yi)) - 0.5 * np.mean(np.abs(s1 - s2))
470                    )
471            return float(np.mean(crps_vals))
472        else:
473            raise ValueError(f"Unknown metric: {metric}. Use 'nll', 'deviance', or 'crps'.")

Evaluate model on held-out data.

Parameters

X : DataFrame or array y : array-like metric : str 'nll' (negative log-likelihood), 'deviance', or 'crps'. exposure : array-like, optional

Returns

float Metric value. Lower is better for nll and deviance.

coefficients: Dict[str, numpy.ndarray]
489    @property
490    def coefficients(self) -> Dict[str, np.ndarray]:
491        """Fitted coefficients for all parameters."""
492        self._check_fitted()
493        return {k: v.copy() for k, v in self._betas.items()}

Fitted coefficients for all parameters.

n_observations: int
495    @property
496    def n_observations(self) -> int:
497        return self._n
loglikelihood: float
499    @property
500    def loglikelihood(self) -> float:
501        self._check_fitted()
502        return self._loglik
converged: bool
504    @property
505    def converged(self) -> bool:
506        self._check_fitted()
507        return self._converged
def choose_distribution( X, y: numpy.ndarray, families: List[insurance_distributional_glm.families.GamlssFamily], formulas: Optional[Dict] = None, exposure: Optional[numpy.ndarray] = None, weights: Optional[numpy.ndarray] = None, penalty: float = 2.0, max_iter: int = 100, tol: float = 1e-06, verbose: bool = False) -> List[SelectionResult]:
 58def choose_distribution(
 59    X,
 60    y: np.ndarray,
 61    families: List[GamlssFamily],
 62    formulas: Optional[Dict] = None,
 63    exposure: Optional[np.ndarray] = None,
 64    weights: Optional[np.ndarray] = None,
 65    penalty: float = 2.0,
 66    max_iter: int = 100,
 67    tol: float = 1e-6,
 68    verbose: bool = False,
 69) -> List[SelectionResult]:
 70    """
 71    Fit multiple families and rank by GAIC.
 72
 73    Parameters
 74    ----------
 75    X : DataFrame (polars or pandas)
 76        Covariate matrix.
 77    y : ndarray
 78        Response observations.
 79    families : list of GamlssFamily
 80        Candidate distribution families to compare.
 81    formulas : dict, optional
 82        Passed to DistributionalGLM. If None, all columns for mu,
 83        intercept-only for other parameters.
 84    exposure : ndarray, optional
 85        Exposure offset (values, not log). Applied to mu.
 86    weights : ndarray, optional
 87        Observation weights.
 88    penalty : float
 89        GAIC penalty. Default 2 (AIC).
 90    max_iter : int
 91        Maximum RS iterations per family.
 92    tol : float
 93        Convergence tolerance.
 94    verbose : bool
 95        Print progress.
 96
 97    Returns
 98    -------
 99    list of SelectionResult
100        Sorted ascending by GAIC (best first).
101    """
102    # Import here to avoid circular dependency
103    from .model import DistributionalGLM
104
105    results = []
106    for fam in families:
107        fname = fam.__class__.__name__
108        if verbose:
109            print(f"Fitting {fname}...")
110        try:
111            mdl = DistributionalGLM(family=fam, formulas=formulas)
112            mdl.fit(
113                X, y,
114                exposure=exposure,
115                weights=weights,
116                max_iter=max_iter,
117                tol=tol,
118            )
119            ll = mdl._loglik
120            df = sum(len(b) for b in mdl._betas.values())
121            g = gaic(ll, df, penalty=penalty)
122            results.append(SelectionResult(
123                family=fam,
124                family_name=fname,
125                gaic=g,
126                loglik=ll,
127                df=df,
128                converged=mdl._converged,
129                model=mdl,
130            ))
131        except (ValueError, ArithmeticError, np.linalg.LinAlgError) as e:
132            if verbose:
133                print(f"  {fname} failed: {e}")
134
135    results.sort(key=lambda r: r.gaic)
136    return results

Fit multiple families and rank by GAIC.

Parameters

X : DataFrame (polars or pandas) Covariate matrix. y : ndarray Response observations. families : list of GamlssFamily Candidate distribution families to compare. formulas : dict, optional Passed to DistributionalGLM. If None, all columns for mu, intercept-only for other parameters. exposure : ndarray, optional Exposure offset (values, not log). Applied to mu. weights : ndarray, optional Observation weights. penalty : float GAIC penalty. Default 2 (AIC). max_iter : int Maximum RS iterations per family. tol : float Convergence tolerance. verbose : bool Print progress.

Returns

list of SelectionResult Sorted ascending by GAIC (best first).

def gaic(loglik: float, n_params: int, penalty: float = 2.0) -> float:
36def gaic(loglik: float, n_params: int, penalty: float = 2.0) -> float:
37    """
38    Generalised AIC.
39
40    Parameters
41    ----------
42    loglik : float
43        Total log-likelihood at convergence.
44    n_params : int
45        Total number of estimated parameters (sum of df across all
46        distribution components).
47    penalty : float
48        Penalty per parameter. Use 2 for AIC, log(n) for BIC.
49
50    Returns
51    -------
52    float
53        GAIC value. Lower is better.
54    """
55    return -2.0 * loglik + penalty * n_params

Generalised AIC.

Parameters

loglik : float Total log-likelihood at convergence. n_params : int Total number of estimated parameters (sum of df across all distribution components). penalty : float Penalty per parameter. Use 2 for AIC, log(n) for BIC.

Returns

float GAIC value. Lower is better.

@dataclass
class SelectionResult:
24@dataclass
25class SelectionResult:
26    """Result from choose_distribution for one family."""
27    family: GamlssFamily
28    family_name: str
29    gaic: float
30    loglik: float
31    df: int
32    converged: bool
33    model: object  # DistributionalGLM, typed as object to avoid circular import

Result from choose_distribution for one family.

SelectionResult( family: insurance_distributional_glm.families.GamlssFamily, family_name: str, gaic: float, loglik: float, df: int, converged: bool, model: object)
family_name: str
gaic: float
loglik: float
df: int
converged: bool
model: object
def quantile_residuals( model, X, y: numpy.ndarray, exposure: Optional[numpy.ndarray] = None, n_random: int = 1, seed: Optional[int] = None) -> numpy.ndarray:
23def quantile_residuals(
24    model,
25    X,
26    y: np.ndarray,
27    exposure: Optional[np.ndarray] = None,
28    n_random: int = 1,
29    seed: Optional[int] = None,
30) -> np.ndarray:
31    """
32    Compute randomised quantile residuals (Dunn & Smyth 1996).
33
34    For a continuous response, the residual is:
35      r_i = Phi^{-1}(F(y_i; theta_hat_i))
36
37    For discrete responses, F(y_i) is not uniquely defined at y_i, so we
38    randomise within the probability interval:
39      u_i ~ Uniform(F(y_i - 1; theta_hat_i), F(y_i; theta_hat_i))
40      r_i = Phi^{-1}(u_i)
41
42    A correctly specified model produces r_i ~ iid N(0,1).
43
44    Parameters
45    ----------
46    model : DistributionalGLM
47        A fitted model.
48    X : DataFrame
49        Covariates (same format as fit).
50    y : ndarray
51        Observed responses.
52    exposure : ndarray, optional
53        Exposure for offset.
54    n_random : int
55        For discrete families, number of randomisations to average over.
56    seed : int, optional
57        Random seed for reproducibility.
58
59    Returns
60    -------
61    ndarray, shape (n,)
62        Normalised quantile residuals.
63    """
64    rng = np.random.default_rng(seed)
65    y = np.asarray(y)
66    dists = model.predict_distribution(X, exposure=exposure)
67
68    family = model.family
69    is_discrete = family.__class__.__name__ in ("Poisson", "NBI", "ZIP")
70
71    residuals = np.zeros(len(y))
72
73    for i, (dist, yi) in enumerate(zip(dists, y)):
74        if dist is None:
75            residuals[i] = np.nan
76            continue
77
78        if is_discrete:
79            # Randomise within [F(y-1), F(y)]
80            f_lower = dist.cdf(yi - 1) if yi > 0 else 0.0
81            f_upper = dist.cdf(yi)
82            f_lower = np.clip(f_lower, 1e-10, 1 - 1e-10)
83            f_upper = np.clip(f_upper, f_lower + 1e-10, 1 - 1e-10)
84            u_vals = rng.uniform(f_lower, f_upper, size=n_random)
85            residuals[i] = np.mean(stats.norm.ppf(u_vals))
86        else:
87            p = dist.cdf(yi)
88            p = np.clip(p, 1e-10, 1 - 1e-10)
89            residuals[i] = stats.norm.ppf(p)
90
91    return residuals

Compute randomised quantile residuals (Dunn & Smyth 1996).

For a continuous response, the residual is: r_i = Phi^{-1}(F(y_i; theta_hat_i))

For discrete responses, F(y_i) is not uniquely defined at y_i, so we randomise within the probability interval: u_i ~ Uniform(F(y_i - 1; theta_hat_i), F(y_i; theta_hat_i)) r_i = Phi^{-1}(u_i)

A correctly specified model produces r_i ~ iid N(0,1).

Parameters

model : DistributionalGLM A fitted model. X : DataFrame Covariates (same format as fit). y : ndarray Observed responses. exposure : ndarray, optional Exposure for offset. n_random : int For discrete families, number of randomisations to average over. seed : int, optional Random seed for reproducibility.

Returns

ndarray, shape (n,) Normalised quantile residuals.

def worm_plot( model, X, y: numpy.ndarray, exposure: Optional[numpy.ndarray] = None, n_groups: int = 4, seed: int = 42, ax=None):
 94def worm_plot(
 95    model,
 96    X,
 97    y: np.ndarray,
 98    exposure: Optional[np.ndarray] = None,
 99    n_groups: int = 4,
100    seed: int = 42,
101    ax=None,
102):
103    """
104    Worm plot: detrended QQ plot of quantile residuals.
105
106    Shows deviation from N(0,1) per quantile group of the fitted values.
107    A flat worm near zero indicates good fit. Systematic deviations indicate:
108      - Upward shift: underdispersion
109      - Downward shift: overdispersion
110      - S-shape: wrong variance-mean relationship
111      - U/inverted-U: wrong kurtosis
112
113    Requires matplotlib (optional dependency).
114
115    Parameters
116    ----------
117    model : DistributionalGLM
118        Fitted model.
119    X, y : as for quantile_residuals.
120    exposure : ndarray, optional
121    n_groups : int
122        Number of groups to split fitted mu into (default 4).
123    seed : int
124        Random seed for quantile residuals.
125    ax : matplotlib.axes.Axes or None
126        Axes to draw on. Creates new figure if None.
127
128    Returns
129    -------
130    ax : matplotlib.axes.Axes
131    """
132    try:
133        import matplotlib.pyplot as plt
134    except ImportError:
135        raise ImportError(
136            "matplotlib is required for worm_plot. "
137            "Install it with: pip install insurance-distributional-glm[plots]"
138        )
139
140    resids = quantile_residuals(model, X, y, exposure=exposure, seed=seed)
141    mu_hat = model.predict(X, parameter="mu", exposure=exposure)
142
143    if ax is None:
144        fig, axes = plt.subplots(1, n_groups, figsize=(4 * n_groups, 4), sharey=True)
145    else:
146        axes = [ax] * n_groups
147
148    group_bounds = np.quantile(mu_hat, np.linspace(0, 1, n_groups + 1))
149
150    for g in range(n_groups):
151        mask = (mu_hat >= group_bounds[g]) & (mu_hat <= group_bounds[g + 1])
152        r_group = resids[mask]
153        r_group = r_group[np.isfinite(r_group)]
154
155        if len(r_group) < 2:
156            continue
157
158        ax_g = axes[g] if n_groups > 1 else ax
159
160        # Normal quantiles
161        n = len(r_group)
162        probs = (np.arange(1, n + 1) - 0.5) / n
163        theoretical = stats.norm.ppf(probs)
164
165        # Sort residuals
166        observed = np.sort(r_group)
167
168        # Detrend: residual minus theoretical (worm = deviation from diagonal)
169        worm = observed - theoretical
170
171        ax_g.scatter(theoretical, worm, s=10, alpha=0.5, color="steelblue")
172        ax_g.axhline(0, color="black", linewidth=0.8)
173        ax_g.set_title(
174            f"mu: [{group_bounds[g]:.2f}, {group_bounds[g+1]:.2f}]",
175            fontsize=9,
176        )
177        ax_g.set_xlabel("Theoretical N(0,1) quantiles")
178        if g == 0:
179            ax_g.set_ylabel("Residual - Theoretical")
180
181    if ax is None:
182        plt.suptitle("Worm Plot (detrended QQ by fitted mu group)", y=1.02)
183        plt.tight_layout()
184        return axes
185    return ax

Worm plot: detrended QQ plot of quantile residuals.

Shows deviation from N(0,1) per quantile group of the fitted values. A flat worm near zero indicates good fit. Systematic deviations indicate:

  • Upward shift: underdispersion
  • Downward shift: overdispersion
  • S-shape: wrong variance-mean relationship
  • U/inverted-U: wrong kurtosis

Requires matplotlib (optional dependency).

Parameters

model : DistributionalGLM Fitted model. X, y : as for quantile_residuals. exposure : ndarray, optional n_groups : int Number of groups to split fitted mu into (default 4). seed : int Random seed for quantile residuals. ax : matplotlib.axes.Axes or None Axes to draw on. Creates new figure if None.

Returns

ax : matplotlib.axes.Axes