insurance_distributional_glm
insurance-distributional-glm
GAMLSS (Generalised Additive Models for Location, Scale and Shape) for insurance pricing in Python.
Models the full conditional distribution p(Y|X), not just E[Y|X]. Each distribution parameter (mean, variance, shape) can depend on covariates.
Quick start
>>> import numpy as np
>>> import polars as pl
>>> from insurance_distributional_glm import DistributionalGLM
>>> from insurance_distributional_glm.families import Gamma
>>>
>>> rng = np.random.default_rng(42)
>>> df = pl.DataFrame({"age_band": rng.integers(0, 3, 500).astype(float)})
>>> y = rng.gamma(shape=2.0, scale=500.0, size=500)
>>>
>>> model = DistributionalGLM(
... family=Gamma(),
... formulas={"mu": ["age_band"], "sigma": []},
... )
>>> model.fit(df, y)
>>> model.predict(df, parameter="mu")[:5]
1""" 2insurance-distributional-glm 3============================= 4 5GAMLSS (Generalised Additive Models for Location, Scale and Shape) for 6insurance pricing in Python. 7 8Models the full conditional distribution p(Y|X), not just E[Y|X]. 9Each distribution parameter (mean, variance, shape) can depend on covariates. 10 11Quick start 12----------- 13>>> import numpy as np 14>>> import polars as pl 15>>> from insurance_distributional_glm import DistributionalGLM 16>>> from insurance_distributional_glm.families import Gamma 17>>> 18>>> rng = np.random.default_rng(42) 19>>> df = pl.DataFrame({"age_band": rng.integers(0, 3, 500).astype(float)}) 20>>> y = rng.gamma(shape=2.0, scale=500.0, size=500) 21>>> 22>>> model = DistributionalGLM( 23... family=Gamma(), 24... formulas={"mu": ["age_band"], "sigma": []}, 25... ) 26>>> model.fit(df, y) 27>>> model.predict(df, parameter="mu")[:5] 28""" 29 30from .model import DistributionalGLM 31from .selection import choose_distribution, gaic, SelectionResult 32from .diagnostics import quantile_residuals, worm_plot 33from . import families 34 35from importlib.metadata import version, PackageNotFoundError 36 37try: 38 __version__ = version("insurance-distributional-glm") 39except PackageNotFoundError: 40 __version__ = "0.0.0" # not installed 41 42__all__ = [ 43 "DistributionalGLM", 44 "choose_distribution", 45 "gaic", 46 "SelectionResult", 47 "quantile_residuals", 48 "worm_plot", 49 "families", 50]
84class DistributionalGLM: 85 """ 86 GAMLSS model: model ALL distribution parameters as functions of covariates. 87 88 Standard GLM models only E[Y|X]. This models the full conditional 89 distribution p(Y|X) by allowing each parameter (mean, variance, shape) 90 to depend on covariates. 91 92 Example 93 ------- 94 >>> import polars as pl 95 >>> from insurance_distributional_glm import DistributionalGLM 96 >>> from insurance_distributional_glm.families import Gamma 97 >>> 98 >>> df = pl.DataFrame({"age": [25, 35, 45], "vehicle_value": [5000, 15000, 30000]}) 99 >>> y = np.array([500.0, 1200.0, 3500.0]) 100 >>> 101 >>> model = DistributionalGLM( 102 ... family=Gamma(), 103 ... formulas={"mu": ["age", "vehicle_value"], "sigma": ["age"]}, 104 ... ) 105 >>> model.fit(df, y) 106 >>> model.predict(df, parameter="mu") 107 """ 108 109 def __init__( 110 self, 111 family: GamlssFamily, 112 formulas: Optional[Dict[str, List[str]]] = None, 113 ): 114 """ 115 Parameters 116 ---------- 117 family : GamlssFamily 118 Distribution family. Determines which parameters exist and their 119 default link functions. 120 formulas : dict, optional 121 Maps parameter name -> list of column names to include. 122 Intercept is always added automatically. 123 If None: mu gets all columns, other parameters get intercept-only. 124 If {'mu': [], 'sigma': []}: intercept-only for all parameters. 125 """ 126 self.family = family 127 self.formulas = formulas 128 self._betas: Dict[str, np.ndarray] = {} 129 self._params: Dict[str, np.ndarray] = {} 130 self._columns: List[str] = [] 131 self._loglik: float = float("nan") 132 self._loglik_history: List[float] = [] 133 self._converged: bool = False 134 self._n: int = 0 135 self._fitted = False 136 137 def fit( 138 self, 139 X, 140 y, 141 exposure: Optional[np.ndarray] = None, 142 weights: Optional[np.ndarray] = None, 143 max_iter: int = 100, 144 tol: float = 1e-6, 145 verbose: bool = False, 146 ) -> "DistributionalGLM": 147 """ 148 Fit the model via the RS algorithm. 149 150 Parameters 151 ---------- 152 X : polars/pandas DataFrame or numpy array 153 Covariate matrix. Column names must match the formulas dict. 154 y : array-like 155 Response observations. Must be positive for Gamma/LogNormal/IG. 156 exposure : array-like, optional 157 Exposure values (not log). Applied as log-offset on mu. 158 weights : array-like, optional 159 Observation weights. 160 max_iter : int 161 Maximum RS outer iterations (default 100). 162 tol : float 163 Convergence tolerance on log-likelihood change (default 1e-6). 164 verbose : bool 165 Print log-likelihood at each RS iteration. 166 167 Returns 168 ------- 169 self 170 """ 171 y = np.asarray(y, dtype=float) 172 self._n = len(y) 173 self._columns = _get_columns(X) 174 175 # Resolve formulas 176 formulas = self._resolve_formulas() 177 178 # Build design matrices 179 design_matrices = {} 180 for pname in self.family.param_names: 181 cols = formulas.get(pname, []) 182 design_matrices[pname] = _build_design_matrix(X, cols) 183 184 # Exposure offset 185 log_offset = None 186 if exposure is not None: 187 exposure = np.asarray(exposure, dtype=float) 188 log_offset = np.log(np.clip(exposure, 1e-10, None)) 189 190 if weights is not None: 191 weights = np.asarray(weights, dtype=float) 192 193 # Run RS algorithm 194 betas, params, ll_history, converged = rs_fit( 195 family=self.family, 196 design_matrices=design_matrices, 197 y=y, 198 log_offset=log_offset, 199 weights=weights, 200 max_iter=max_iter, 201 tol=tol, 202 verbose=verbose, 203 ) 204 205 self._betas = betas 206 self._params = params 207 self._loglik_history = ll_history 208 self._loglik = ll_history[-1] if ll_history else float("nan") 209 self._converged = converged 210 self._design_matrices = design_matrices 211 self._formulas = formulas 212 self._fitted = True 213 214 if not converged and verbose: 215 print(f"Warning: RS algorithm did not converge in {max_iter} iterations.") 216 217 return self 218 219 def _resolve_formulas(self) -> Dict[str, List[str]]: 220 """Apply default formula logic if formulas is None.""" 221 param_names = self.family.param_names 222 if self.formulas is None: 223 result = {} 224 for i, pname in enumerate(param_names): 225 result[pname] = self._columns if i == 0 else [] 226 return result 227 # Fill in missing params with intercept-only 228 result = {} 229 for pname in param_names: 230 result[pname] = self.formulas.get(pname, []) 231 return result 232 233 def predict( 234 self, 235 X, 236 parameter: str = "mu", 237 exposure: Optional[np.ndarray] = None, 238 ) -> np.ndarray: 239 """ 240 Predict a distribution parameter on the response scale. 241 242 Parameters 243 ---------- 244 X : DataFrame or array 245 Covariates. 246 parameter : str 247 Which parameter to predict (e.g. 'mu', 'sigma', 'pi'). 248 exposure : array-like, optional 249 Exposure offset for mu predictions. 250 251 Returns 252 ------- 253 ndarray, shape (n,) 254 Predicted parameter values on response scale. 255 """ 256 self._check_fitted() 257 cols = self._formulas.get(parameter, []) 258 X_mat = _build_design_matrix(X, cols) 259 link = self.family.default_links[parameter] 260 261 eta = X_mat @ self._betas[parameter] 262 if parameter == "mu" and exposure is not None: 263 exposure = np.asarray(exposure, dtype=float) 264 eta = eta + np.log(np.clip(exposure, 1e-10, None)) 265 266 return link.inverse(eta) 267 268 def predict_distribution(self, X, exposure: Optional[np.ndarray] = None): 269 """ 270 Return a list of scipy.stats frozen distribution objects, one per row. 271 272 Parameters 273 ---------- 274 X : DataFrame or array 275 exposure : array-like, optional 276 277 Returns 278 ------- 279 list of scipy.stats frozen distributions (or None if unsupported) 280 """ 281 self._check_fitted() 282 from scipy import stats 283 284 param_arrays = {} 285 for pname in self.family.param_names: 286 param_arrays[pname] = self.predict(X, parameter=pname, exposure=exposure) 287 288 n = len(param_arrays[self.family.param_names[0]]) 289 family_name = self.family.__class__.__name__ 290 dists = [] 291 292 for i in range(n): 293 p = {k: v[i] for k, v in param_arrays.items()} 294 dist = _make_scipy_dist(family_name, p, self.family) 295 dists.append(dist) 296 297 return dists 298 299 def predict_mean(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray: 300 """E[Y|X] — convenience wrapper. 301 302 For ZIP, E[Y] = (1-pi)*mu, not mu alone. 303 """ 304 mu = self.predict(X, parameter="mu", exposure=exposure) 305 if self.family.__class__.__name__ == "ZIP": 306 pi = self.predict(X, parameter="pi") 307 return (1.0 - pi) * mu 308 return mu 309 310 def predict_variance(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray: 311 """ 312 Var[Y|X] from the fitted distribution parameters. 313 314 Uses the distribution's variance formula rather than empirical variance. 315 """ 316 self._check_fitted() 317 family_name = self.family.__class__.__name__ 318 param_arrays = { 319 pname: self.predict(X, parameter=pname, exposure=exposure) 320 for pname in self.family.param_names 321 } 322 return _compute_variance(family_name, param_arrays, self.family) 323 324 def volatility_score(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray: 325 """ 326 Coefficient of variation CV = sqrt(Var[Y|X]) / E[Y|X] per risk. 327 328 Insurance interpretation: risks with CV > 1 have larger standard 329 deviation than mean — typical for heavy-tailed severity. 330 Useful for pricing loading decisions and risk selection. 331 """ 332 mu = self.predict_mean(X, exposure=exposure) 333 var = self.predict_variance(X, exposure=exposure) 334 return np.sqrt(np.clip(var, 0, None)) / np.clip(mu, 1e-10, None) 335 336 def relativities(self, parameter: str = "mu"): 337 """ 338 Return coefficient relativities for a given parameter. 339 340 For log-linked parameters (mu, sigma on most families): 341 - Intercept relativity = exp(beta_intercept) — the fitted baseline 342 on the response scale. 343 - All other term relativities = exp(beta_j) — the multiplicative 344 effect of a one-unit increase in that term on the response scale. 345 346 For logit-linked parameters (pi in ZIP): 347 Returns odds ratios: exp(beta_j). 348 349 Returns 350 ------- 351 polars DataFrame with columns: param, term, coefficient, relativity, link 352 """ 353 self._check_fitted() 354 if not _HAS_POLARS: 355 raise ImportError("polars required for relativities(). Install polars.") 356 357 betas = self._betas[parameter] 358 cols = self._formulas.get(parameter, []) 359 link = self.family.default_links[parameter] 360 361 terms = ["(Intercept)"] + cols 362 intercept = betas[0] 363 364 rows = [] 365 for term, beta in zip(terms, betas): 366 if link.name == "log": 367 rel = float(np.exp(beta)) 368 if term == "(Intercept)": 369 base = float(np.exp(intercept)) 370 rows.append({"param": parameter, "term": term, "coefficient": float(beta), 371 "relativity": base, "link": "log"}) 372 else: 373 rows.append({"param": parameter, "term": term, "coefficient": float(beta), 374 "relativity": rel, "link": "log"}) 375 elif link.name == "logit": 376 rows.append({"param": parameter, "term": term, "coefficient": float(beta), 377 "relativity": float(np.exp(beta)), "link": "logit (odds ratio)"}) 378 else: 379 rows.append({"param": parameter, "term": term, "coefficient": float(beta), 380 "relativity": float(beta), "link": link.name}) 381 382 return pl.DataFrame(rows) 383 384 def summary(self) -> None: 385 """Print coefficient summary for all parameters.""" 386 self._check_fitted() 387 print(f"\nDistributionalGLM — {self.family.__class__.__name__}") 388 print(f" n = {self._n}, loglik = {self._loglik:.4f}") 389 print(f" Converged: {self._converged}") 390 print(f" GAIC(2): {self.gaic(penalty=2):.4f}") 391 print() 392 393 for pname in self.family.param_names: 394 betas = self._betas[pname] 395 cols = self._formulas.get(pname, []) 396 terms = ["(Intercept)"] + cols 397 link = self.family.default_links[pname] 398 print(f" Parameter: {pname} (link: {link.name})") 399 print(f" {'Term':<30} {'Coef':>12}") 400 print(f" {'-'*44}") 401 for term, beta in zip(terms, betas): 402 print(f" {term:<30} {beta:>12.5f}") 403 print() 404 405 def gaic(self, penalty: float = 2.0) -> float: 406 """ 407 Generalised AIC with given penalty. 408 409 penalty=2 -> standard AIC 410 penalty=log(n) -> BIC 411 """ 412 self._check_fitted() 413 total_params = sum(len(b) for b in self._betas.values()) 414 return _gaic(self._loglik, total_params, penalty=penalty) 415 416 def score( 417 self, 418 X, 419 y, 420 metric: str = "nll", 421 exposure: Optional[np.ndarray] = None, 422 ) -> float: 423 """ 424 Evaluate model on held-out data. 425 426 Parameters 427 ---------- 428 X : DataFrame or array 429 y : array-like 430 metric : str 431 'nll' (negative log-likelihood), 'deviance', or 'crps'. 432 exposure : array-like, optional 433 434 Returns 435 ------- 436 float 437 Metric value. Lower is better for nll and deviance. 438 """ 439 self._check_fitted() 440 y = np.asarray(y, dtype=float) 441 442 param_arrays = { 443 pname: self.predict(X, parameter=pname, exposure=exposure) 444 for pname in self.family.param_names 445 } 446 ll = self.family.log_likelihood(y, param_arrays) 447 448 if metric == "nll": 449 return float(-np.mean(ll[np.isfinite(ll)])) 450 elif metric == "deviance": 451 # Saturated model: each obs gets its own parameter 452 ll_sat = self._saturated_loglik(y, param_arrays) 453 return float(2.0 * np.sum(ll_sat - ll)) 454 elif metric == "crps": 455 dists = self.predict_distribution(X, exposure=exposure) 456 crps_vals = [] 457 for dist, yi in zip(dists, y): 458 if dist is not None: 459 # Monte Carlo CRPS approximation using the standard estimator: 460 # CRPS = E|X - y| - 0.5 * E|X - X'| 461 # where X, X' are independent draws from the predicted distribution. 462 # We draw n_samples and use a random split for the second term — 463 # this avoids the O(n^2) full pairwise calculation and does not 464 # depend on drawing exactly 1000 samples. 465 n_samples = 500 466 s1 = dist.rvs(n_samples) 467 s2 = dist.rvs(n_samples) 468 crps_vals.append( 469 np.mean(np.abs(s1 - yi)) - 0.5 * np.mean(np.abs(s1 - s2)) 470 ) 471 return float(np.mean(crps_vals)) 472 else: 473 raise ValueError(f"Unknown metric: {metric}. Use 'nll', 'deviance', or 'crps'.") 474 475 def _saturated_loglik(self, y, params): 476 """Saturated log-likelihood (each obs perfectly fitted to itself).""" 477 sat_params = {k: v.copy() for k, v in params.items()} 478 sat_params["mu"] = y.copy() 479 try: 480 ll = self.family.log_likelihood(y, sat_params) 481 return np.where(np.isfinite(ll), ll, 0.0) 482 except Exception: 483 return np.zeros(len(y)) 484 485 def _check_fitted(self): 486 if not self._fitted: 487 raise RuntimeError("Model not fitted. Call fit() first.") 488 489 @property 490 def coefficients(self) -> Dict[str, np.ndarray]: 491 """Fitted coefficients for all parameters.""" 492 self._check_fitted() 493 return {k: v.copy() for k, v in self._betas.items()} 494 495 @property 496 def n_observations(self) -> int: 497 return self._n 498 499 @property 500 def loglikelihood(self) -> float: 501 self._check_fitted() 502 return self._loglik 503 504 @property 505 def converged(self) -> bool: 506 self._check_fitted() 507 return self._converged 508 509 def __repr__(self) -> str: 510 status = "fitted" if self._fitted else "unfitted" 511 return ( 512 f"DistributionalGLM({self.family.__class__.__name__}, " 513 f"n={self._n}, {status})" 514 )
GAMLSS model: model ALL distribution parameters as functions of covariates.
Standard GLM models only E[Y|X]. This models the full conditional distribution p(Y|X) by allowing each parameter (mean, variance, shape) to depend on covariates.
Example
>>> import polars as pl
>>> from insurance_distributional_glm import DistributionalGLM
>>> from insurance_distributional_glm.families import Gamma
>>>
>>> df = pl.DataFrame({"age": [25, 35, 45], "vehicle_value": [5000, 15000, 30000]})
>>> y = np.array([500.0, 1200.0, 3500.0])
>>>
>>> model = DistributionalGLM(
... family=Gamma(),
... formulas={"mu": ["age", "vehicle_value"], "sigma": ["age"]},
... )
>>> model.fit(df, y)
>>> model.predict(df, parameter="mu")
109 def __init__( 110 self, 111 family: GamlssFamily, 112 formulas: Optional[Dict[str, List[str]]] = None, 113 ): 114 """ 115 Parameters 116 ---------- 117 family : GamlssFamily 118 Distribution family. Determines which parameters exist and their 119 default link functions. 120 formulas : dict, optional 121 Maps parameter name -> list of column names to include. 122 Intercept is always added automatically. 123 If None: mu gets all columns, other parameters get intercept-only. 124 If {'mu': [], 'sigma': []}: intercept-only for all parameters. 125 """ 126 self.family = family 127 self.formulas = formulas 128 self._betas: Dict[str, np.ndarray] = {} 129 self._params: Dict[str, np.ndarray] = {} 130 self._columns: List[str] = [] 131 self._loglik: float = float("nan") 132 self._loglik_history: List[float] = [] 133 self._converged: bool = False 134 self._n: int = 0 135 self._fitted = False
Parameters
family : GamlssFamily Distribution family. Determines which parameters exist and their default link functions. formulas : dict, optional Maps parameter name -> list of column names to include. Intercept is always added automatically. If None: mu gets all columns, other parameters get intercept-only. If {'mu': [], 'sigma': []}: intercept-only for all parameters.
137 def fit( 138 self, 139 X, 140 y, 141 exposure: Optional[np.ndarray] = None, 142 weights: Optional[np.ndarray] = None, 143 max_iter: int = 100, 144 tol: float = 1e-6, 145 verbose: bool = False, 146 ) -> "DistributionalGLM": 147 """ 148 Fit the model via the RS algorithm. 149 150 Parameters 151 ---------- 152 X : polars/pandas DataFrame or numpy array 153 Covariate matrix. Column names must match the formulas dict. 154 y : array-like 155 Response observations. Must be positive for Gamma/LogNormal/IG. 156 exposure : array-like, optional 157 Exposure values (not log). Applied as log-offset on mu. 158 weights : array-like, optional 159 Observation weights. 160 max_iter : int 161 Maximum RS outer iterations (default 100). 162 tol : float 163 Convergence tolerance on log-likelihood change (default 1e-6). 164 verbose : bool 165 Print log-likelihood at each RS iteration. 166 167 Returns 168 ------- 169 self 170 """ 171 y = np.asarray(y, dtype=float) 172 self._n = len(y) 173 self._columns = _get_columns(X) 174 175 # Resolve formulas 176 formulas = self._resolve_formulas() 177 178 # Build design matrices 179 design_matrices = {} 180 for pname in self.family.param_names: 181 cols = formulas.get(pname, []) 182 design_matrices[pname] = _build_design_matrix(X, cols) 183 184 # Exposure offset 185 log_offset = None 186 if exposure is not None: 187 exposure = np.asarray(exposure, dtype=float) 188 log_offset = np.log(np.clip(exposure, 1e-10, None)) 189 190 if weights is not None: 191 weights = np.asarray(weights, dtype=float) 192 193 # Run RS algorithm 194 betas, params, ll_history, converged = rs_fit( 195 family=self.family, 196 design_matrices=design_matrices, 197 y=y, 198 log_offset=log_offset, 199 weights=weights, 200 max_iter=max_iter, 201 tol=tol, 202 verbose=verbose, 203 ) 204 205 self._betas = betas 206 self._params = params 207 self._loglik_history = ll_history 208 self._loglik = ll_history[-1] if ll_history else float("nan") 209 self._converged = converged 210 self._design_matrices = design_matrices 211 self._formulas = formulas 212 self._fitted = True 213 214 if not converged and verbose: 215 print(f"Warning: RS algorithm did not converge in {max_iter} iterations.") 216 217 return self
Fit the model via the RS algorithm.
Parameters
X : polars/pandas DataFrame or numpy array Covariate matrix. Column names must match the formulas dict. y : array-like Response observations. Must be positive for Gamma/LogNormal/IG. exposure : array-like, optional Exposure values (not log). Applied as log-offset on mu. weights : array-like, optional Observation weights. max_iter : int Maximum RS outer iterations (default 100). tol : float Convergence tolerance on log-likelihood change (default 1e-6). verbose : bool Print log-likelihood at each RS iteration.
Returns
self
233 def predict( 234 self, 235 X, 236 parameter: str = "mu", 237 exposure: Optional[np.ndarray] = None, 238 ) -> np.ndarray: 239 """ 240 Predict a distribution parameter on the response scale. 241 242 Parameters 243 ---------- 244 X : DataFrame or array 245 Covariates. 246 parameter : str 247 Which parameter to predict (e.g. 'mu', 'sigma', 'pi'). 248 exposure : array-like, optional 249 Exposure offset for mu predictions. 250 251 Returns 252 ------- 253 ndarray, shape (n,) 254 Predicted parameter values on response scale. 255 """ 256 self._check_fitted() 257 cols = self._formulas.get(parameter, []) 258 X_mat = _build_design_matrix(X, cols) 259 link = self.family.default_links[parameter] 260 261 eta = X_mat @ self._betas[parameter] 262 if parameter == "mu" and exposure is not None: 263 exposure = np.asarray(exposure, dtype=float) 264 eta = eta + np.log(np.clip(exposure, 1e-10, None)) 265 266 return link.inverse(eta)
Predict a distribution parameter on the response scale.
Parameters
X : DataFrame or array Covariates. parameter : str Which parameter to predict (e.g. 'mu', 'sigma', 'pi'). exposure : array-like, optional Exposure offset for mu predictions.
Returns
ndarray, shape (n,) Predicted parameter values on response scale.
268 def predict_distribution(self, X, exposure: Optional[np.ndarray] = None): 269 """ 270 Return a list of scipy.stats frozen distribution objects, one per row. 271 272 Parameters 273 ---------- 274 X : DataFrame or array 275 exposure : array-like, optional 276 277 Returns 278 ------- 279 list of scipy.stats frozen distributions (or None if unsupported) 280 """ 281 self._check_fitted() 282 from scipy import stats 283 284 param_arrays = {} 285 for pname in self.family.param_names: 286 param_arrays[pname] = self.predict(X, parameter=pname, exposure=exposure) 287 288 n = len(param_arrays[self.family.param_names[0]]) 289 family_name = self.family.__class__.__name__ 290 dists = [] 291 292 for i in range(n): 293 p = {k: v[i] for k, v in param_arrays.items()} 294 dist = _make_scipy_dist(family_name, p, self.family) 295 dists.append(dist) 296 297 return dists
Return a list of scipy.stats frozen distribution objects, one per row.
Parameters
X : DataFrame or array exposure : array-like, optional
Returns
list of scipy.stats frozen distributions (or None if unsupported)
299 def predict_mean(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray: 300 """E[Y|X] — convenience wrapper. 301 302 For ZIP, E[Y] = (1-pi)*mu, not mu alone. 303 """ 304 mu = self.predict(X, parameter="mu", exposure=exposure) 305 if self.family.__class__.__name__ == "ZIP": 306 pi = self.predict(X, parameter="pi") 307 return (1.0 - pi) * mu 308 return mu
E[Y|X] — convenience wrapper.
For ZIP, E[Y] = (1-pi)*mu, not mu alone.
310 def predict_variance(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray: 311 """ 312 Var[Y|X] from the fitted distribution parameters. 313 314 Uses the distribution's variance formula rather than empirical variance. 315 """ 316 self._check_fitted() 317 family_name = self.family.__class__.__name__ 318 param_arrays = { 319 pname: self.predict(X, parameter=pname, exposure=exposure) 320 for pname in self.family.param_names 321 } 322 return _compute_variance(family_name, param_arrays, self.family)
Var[Y|X] from the fitted distribution parameters.
Uses the distribution's variance formula rather than empirical variance.
324 def volatility_score(self, X, exposure: Optional[np.ndarray] = None) -> np.ndarray: 325 """ 326 Coefficient of variation CV = sqrt(Var[Y|X]) / E[Y|X] per risk. 327 328 Insurance interpretation: risks with CV > 1 have larger standard 329 deviation than mean — typical for heavy-tailed severity. 330 Useful for pricing loading decisions and risk selection. 331 """ 332 mu = self.predict_mean(X, exposure=exposure) 333 var = self.predict_variance(X, exposure=exposure) 334 return np.sqrt(np.clip(var, 0, None)) / np.clip(mu, 1e-10, None)
Coefficient of variation CV = sqrt(Var[Y|X]) / E[Y|X] per risk.
Insurance interpretation: risks with CV > 1 have larger standard deviation than mean — typical for heavy-tailed severity. Useful for pricing loading decisions and risk selection.
336 def relativities(self, parameter: str = "mu"): 337 """ 338 Return coefficient relativities for a given parameter. 339 340 For log-linked parameters (mu, sigma on most families): 341 - Intercept relativity = exp(beta_intercept) — the fitted baseline 342 on the response scale. 343 - All other term relativities = exp(beta_j) — the multiplicative 344 effect of a one-unit increase in that term on the response scale. 345 346 For logit-linked parameters (pi in ZIP): 347 Returns odds ratios: exp(beta_j). 348 349 Returns 350 ------- 351 polars DataFrame with columns: param, term, coefficient, relativity, link 352 """ 353 self._check_fitted() 354 if not _HAS_POLARS: 355 raise ImportError("polars required for relativities(). Install polars.") 356 357 betas = self._betas[parameter] 358 cols = self._formulas.get(parameter, []) 359 link = self.family.default_links[parameter] 360 361 terms = ["(Intercept)"] + cols 362 intercept = betas[0] 363 364 rows = [] 365 for term, beta in zip(terms, betas): 366 if link.name == "log": 367 rel = float(np.exp(beta)) 368 if term == "(Intercept)": 369 base = float(np.exp(intercept)) 370 rows.append({"param": parameter, "term": term, "coefficient": float(beta), 371 "relativity": base, "link": "log"}) 372 else: 373 rows.append({"param": parameter, "term": term, "coefficient": float(beta), 374 "relativity": rel, "link": "log"}) 375 elif link.name == "logit": 376 rows.append({"param": parameter, "term": term, "coefficient": float(beta), 377 "relativity": float(np.exp(beta)), "link": "logit (odds ratio)"}) 378 else: 379 rows.append({"param": parameter, "term": term, "coefficient": float(beta), 380 "relativity": float(beta), "link": link.name}) 381 382 return pl.DataFrame(rows)
Return coefficient relativities for a given parameter.
For log-linked parameters (mu, sigma on most families):
- Intercept relativity = exp(beta_intercept) — the fitted baseline on the response scale.
- All other term relativities = exp(beta_j) — the multiplicative effect of a one-unit increase in that term on the response scale.
For logit-linked parameters (pi in ZIP): Returns odds ratios: exp(beta_j).
Returns
polars DataFrame with columns: param, term, coefficient, relativity, link
384 def summary(self) -> None: 385 """Print coefficient summary for all parameters.""" 386 self._check_fitted() 387 print(f"\nDistributionalGLM — {self.family.__class__.__name__}") 388 print(f" n = {self._n}, loglik = {self._loglik:.4f}") 389 print(f" Converged: {self._converged}") 390 print(f" GAIC(2): {self.gaic(penalty=2):.4f}") 391 print() 392 393 for pname in self.family.param_names: 394 betas = self._betas[pname] 395 cols = self._formulas.get(pname, []) 396 terms = ["(Intercept)"] + cols 397 link = self.family.default_links[pname] 398 print(f" Parameter: {pname} (link: {link.name})") 399 print(f" {'Term':<30} {'Coef':>12}") 400 print(f" {'-'*44}") 401 for term, beta in zip(terms, betas): 402 print(f" {term:<30} {beta:>12.5f}") 403 print()
Print coefficient summary for all parameters.
405 def gaic(self, penalty: float = 2.0) -> float: 406 """ 407 Generalised AIC with given penalty. 408 409 penalty=2 -> standard AIC 410 penalty=log(n) -> BIC 411 """ 412 self._check_fitted() 413 total_params = sum(len(b) for b in self._betas.values()) 414 return _gaic(self._loglik, total_params, penalty=penalty)
Generalised AIC with given penalty.
penalty=2 -> standard AIC penalty=log(n) -> BIC
416 def score( 417 self, 418 X, 419 y, 420 metric: str = "nll", 421 exposure: Optional[np.ndarray] = None, 422 ) -> float: 423 """ 424 Evaluate model on held-out data. 425 426 Parameters 427 ---------- 428 X : DataFrame or array 429 y : array-like 430 metric : str 431 'nll' (negative log-likelihood), 'deviance', or 'crps'. 432 exposure : array-like, optional 433 434 Returns 435 ------- 436 float 437 Metric value. Lower is better for nll and deviance. 438 """ 439 self._check_fitted() 440 y = np.asarray(y, dtype=float) 441 442 param_arrays = { 443 pname: self.predict(X, parameter=pname, exposure=exposure) 444 for pname in self.family.param_names 445 } 446 ll = self.family.log_likelihood(y, param_arrays) 447 448 if metric == "nll": 449 return float(-np.mean(ll[np.isfinite(ll)])) 450 elif metric == "deviance": 451 # Saturated model: each obs gets its own parameter 452 ll_sat = self._saturated_loglik(y, param_arrays) 453 return float(2.0 * np.sum(ll_sat - ll)) 454 elif metric == "crps": 455 dists = self.predict_distribution(X, exposure=exposure) 456 crps_vals = [] 457 for dist, yi in zip(dists, y): 458 if dist is not None: 459 # Monte Carlo CRPS approximation using the standard estimator: 460 # CRPS = E|X - y| - 0.5 * E|X - X'| 461 # where X, X' are independent draws from the predicted distribution. 462 # We draw n_samples and use a random split for the second term — 463 # this avoids the O(n^2) full pairwise calculation and does not 464 # depend on drawing exactly 1000 samples. 465 n_samples = 500 466 s1 = dist.rvs(n_samples) 467 s2 = dist.rvs(n_samples) 468 crps_vals.append( 469 np.mean(np.abs(s1 - yi)) - 0.5 * np.mean(np.abs(s1 - s2)) 470 ) 471 return float(np.mean(crps_vals)) 472 else: 473 raise ValueError(f"Unknown metric: {metric}. Use 'nll', 'deviance', or 'crps'.")
Evaluate model on held-out data.
Parameters
X : DataFrame or array y : array-like metric : str 'nll' (negative log-likelihood), 'deviance', or 'crps'. exposure : array-like, optional
Returns
float Metric value. Lower is better for nll and deviance.
489 @property 490 def coefficients(self) -> Dict[str, np.ndarray]: 491 """Fitted coefficients for all parameters.""" 492 self._check_fitted() 493 return {k: v.copy() for k, v in self._betas.items()}
Fitted coefficients for all parameters.
58def choose_distribution( 59 X, 60 y: np.ndarray, 61 families: List[GamlssFamily], 62 formulas: Optional[Dict] = None, 63 exposure: Optional[np.ndarray] = None, 64 weights: Optional[np.ndarray] = None, 65 penalty: float = 2.0, 66 max_iter: int = 100, 67 tol: float = 1e-6, 68 verbose: bool = False, 69) -> List[SelectionResult]: 70 """ 71 Fit multiple families and rank by GAIC. 72 73 Parameters 74 ---------- 75 X : DataFrame (polars or pandas) 76 Covariate matrix. 77 y : ndarray 78 Response observations. 79 families : list of GamlssFamily 80 Candidate distribution families to compare. 81 formulas : dict, optional 82 Passed to DistributionalGLM. If None, all columns for mu, 83 intercept-only for other parameters. 84 exposure : ndarray, optional 85 Exposure offset (values, not log). Applied to mu. 86 weights : ndarray, optional 87 Observation weights. 88 penalty : float 89 GAIC penalty. Default 2 (AIC). 90 max_iter : int 91 Maximum RS iterations per family. 92 tol : float 93 Convergence tolerance. 94 verbose : bool 95 Print progress. 96 97 Returns 98 ------- 99 list of SelectionResult 100 Sorted ascending by GAIC (best first). 101 """ 102 # Import here to avoid circular dependency 103 from .model import DistributionalGLM 104 105 results = [] 106 for fam in families: 107 fname = fam.__class__.__name__ 108 if verbose: 109 print(f"Fitting {fname}...") 110 try: 111 mdl = DistributionalGLM(family=fam, formulas=formulas) 112 mdl.fit( 113 X, y, 114 exposure=exposure, 115 weights=weights, 116 max_iter=max_iter, 117 tol=tol, 118 ) 119 ll = mdl._loglik 120 df = sum(len(b) for b in mdl._betas.values()) 121 g = gaic(ll, df, penalty=penalty) 122 results.append(SelectionResult( 123 family=fam, 124 family_name=fname, 125 gaic=g, 126 loglik=ll, 127 df=df, 128 converged=mdl._converged, 129 model=mdl, 130 )) 131 except (ValueError, ArithmeticError, np.linalg.LinAlgError) as e: 132 if verbose: 133 print(f" {fname} failed: {e}") 134 135 results.sort(key=lambda r: r.gaic) 136 return results
Fit multiple families and rank by GAIC.
Parameters
X : DataFrame (polars or pandas) Covariate matrix. y : ndarray Response observations. families : list of GamlssFamily Candidate distribution families to compare. formulas : dict, optional Passed to DistributionalGLM. If None, all columns for mu, intercept-only for other parameters. exposure : ndarray, optional Exposure offset (values, not log). Applied to mu. weights : ndarray, optional Observation weights. penalty : float GAIC penalty. Default 2 (AIC). max_iter : int Maximum RS iterations per family. tol : float Convergence tolerance. verbose : bool Print progress.
Returns
list of SelectionResult Sorted ascending by GAIC (best first).
36def gaic(loglik: float, n_params: int, penalty: float = 2.0) -> float: 37 """ 38 Generalised AIC. 39 40 Parameters 41 ---------- 42 loglik : float 43 Total log-likelihood at convergence. 44 n_params : int 45 Total number of estimated parameters (sum of df across all 46 distribution components). 47 penalty : float 48 Penalty per parameter. Use 2 for AIC, log(n) for BIC. 49 50 Returns 51 ------- 52 float 53 GAIC value. Lower is better. 54 """ 55 return -2.0 * loglik + penalty * n_params
Generalised AIC.
Parameters
loglik : float Total log-likelihood at convergence. n_params : int Total number of estimated parameters (sum of df across all distribution components). penalty : float Penalty per parameter. Use 2 for AIC, log(n) for BIC.
Returns
float GAIC value. Lower is better.
24@dataclass 25class SelectionResult: 26 """Result from choose_distribution for one family.""" 27 family: GamlssFamily 28 family_name: str 29 gaic: float 30 loglik: float 31 df: int 32 converged: bool 33 model: object # DistributionalGLM, typed as object to avoid circular import
Result from choose_distribution for one family.
23def quantile_residuals( 24 model, 25 X, 26 y: np.ndarray, 27 exposure: Optional[np.ndarray] = None, 28 n_random: int = 1, 29 seed: Optional[int] = None, 30) -> np.ndarray: 31 """ 32 Compute randomised quantile residuals (Dunn & Smyth 1996). 33 34 For a continuous response, the residual is: 35 r_i = Phi^{-1}(F(y_i; theta_hat_i)) 36 37 For discrete responses, F(y_i) is not uniquely defined at y_i, so we 38 randomise within the probability interval: 39 u_i ~ Uniform(F(y_i - 1; theta_hat_i), F(y_i; theta_hat_i)) 40 r_i = Phi^{-1}(u_i) 41 42 A correctly specified model produces r_i ~ iid N(0,1). 43 44 Parameters 45 ---------- 46 model : DistributionalGLM 47 A fitted model. 48 X : DataFrame 49 Covariates (same format as fit). 50 y : ndarray 51 Observed responses. 52 exposure : ndarray, optional 53 Exposure for offset. 54 n_random : int 55 For discrete families, number of randomisations to average over. 56 seed : int, optional 57 Random seed for reproducibility. 58 59 Returns 60 ------- 61 ndarray, shape (n,) 62 Normalised quantile residuals. 63 """ 64 rng = np.random.default_rng(seed) 65 y = np.asarray(y) 66 dists = model.predict_distribution(X, exposure=exposure) 67 68 family = model.family 69 is_discrete = family.__class__.__name__ in ("Poisson", "NBI", "ZIP") 70 71 residuals = np.zeros(len(y)) 72 73 for i, (dist, yi) in enumerate(zip(dists, y)): 74 if dist is None: 75 residuals[i] = np.nan 76 continue 77 78 if is_discrete: 79 # Randomise within [F(y-1), F(y)] 80 f_lower = dist.cdf(yi - 1) if yi > 0 else 0.0 81 f_upper = dist.cdf(yi) 82 f_lower = np.clip(f_lower, 1e-10, 1 - 1e-10) 83 f_upper = np.clip(f_upper, f_lower + 1e-10, 1 - 1e-10) 84 u_vals = rng.uniform(f_lower, f_upper, size=n_random) 85 residuals[i] = np.mean(stats.norm.ppf(u_vals)) 86 else: 87 p = dist.cdf(yi) 88 p = np.clip(p, 1e-10, 1 - 1e-10) 89 residuals[i] = stats.norm.ppf(p) 90 91 return residuals
Compute randomised quantile residuals (Dunn & Smyth 1996).
For a continuous response, the residual is: r_i = Phi^{-1}(F(y_i; theta_hat_i))
For discrete responses, F(y_i) is not uniquely defined at y_i, so we randomise within the probability interval: u_i ~ Uniform(F(y_i - 1; theta_hat_i), F(y_i; theta_hat_i)) r_i = Phi^{-1}(u_i)
A correctly specified model produces r_i ~ iid N(0,1).
Parameters
model : DistributionalGLM A fitted model. X : DataFrame Covariates (same format as fit). y : ndarray Observed responses. exposure : ndarray, optional Exposure for offset. n_random : int For discrete families, number of randomisations to average over. seed : int, optional Random seed for reproducibility.
Returns
ndarray, shape (n,) Normalised quantile residuals.
94def worm_plot( 95 model, 96 X, 97 y: np.ndarray, 98 exposure: Optional[np.ndarray] = None, 99 n_groups: int = 4, 100 seed: int = 42, 101 ax=None, 102): 103 """ 104 Worm plot: detrended QQ plot of quantile residuals. 105 106 Shows deviation from N(0,1) per quantile group of the fitted values. 107 A flat worm near zero indicates good fit. Systematic deviations indicate: 108 - Upward shift: underdispersion 109 - Downward shift: overdispersion 110 - S-shape: wrong variance-mean relationship 111 - U/inverted-U: wrong kurtosis 112 113 Requires matplotlib (optional dependency). 114 115 Parameters 116 ---------- 117 model : DistributionalGLM 118 Fitted model. 119 X, y : as for quantile_residuals. 120 exposure : ndarray, optional 121 n_groups : int 122 Number of groups to split fitted mu into (default 4). 123 seed : int 124 Random seed for quantile residuals. 125 ax : matplotlib.axes.Axes or None 126 Axes to draw on. Creates new figure if None. 127 128 Returns 129 ------- 130 ax : matplotlib.axes.Axes 131 """ 132 try: 133 import matplotlib.pyplot as plt 134 except ImportError: 135 raise ImportError( 136 "matplotlib is required for worm_plot. " 137 "Install it with: pip install insurance-distributional-glm[plots]" 138 ) 139 140 resids = quantile_residuals(model, X, y, exposure=exposure, seed=seed) 141 mu_hat = model.predict(X, parameter="mu", exposure=exposure) 142 143 if ax is None: 144 fig, axes = plt.subplots(1, n_groups, figsize=(4 * n_groups, 4), sharey=True) 145 else: 146 axes = [ax] * n_groups 147 148 group_bounds = np.quantile(mu_hat, np.linspace(0, 1, n_groups + 1)) 149 150 for g in range(n_groups): 151 mask = (mu_hat >= group_bounds[g]) & (mu_hat <= group_bounds[g + 1]) 152 r_group = resids[mask] 153 r_group = r_group[np.isfinite(r_group)] 154 155 if len(r_group) < 2: 156 continue 157 158 ax_g = axes[g] if n_groups > 1 else ax 159 160 # Normal quantiles 161 n = len(r_group) 162 probs = (np.arange(1, n + 1) - 0.5) / n 163 theoretical = stats.norm.ppf(probs) 164 165 # Sort residuals 166 observed = np.sort(r_group) 167 168 # Detrend: residual minus theoretical (worm = deviation from diagonal) 169 worm = observed - theoretical 170 171 ax_g.scatter(theoretical, worm, s=10, alpha=0.5, color="steelblue") 172 ax_g.axhline(0, color="black", linewidth=0.8) 173 ax_g.set_title( 174 f"mu: [{group_bounds[g]:.2f}, {group_bounds[g+1]:.2f}]", 175 fontsize=9, 176 ) 177 ax_g.set_xlabel("Theoretical N(0,1) quantiles") 178 if g == 0: 179 ax_g.set_ylabel("Residual - Theoretical") 180 181 if ax is None: 182 plt.suptitle("Worm Plot (detrended QQ by fitted mu group)", y=1.02) 183 plt.tight_layout() 184 return axes 185 return ax
Worm plot: detrended QQ plot of quantile residuals.
Shows deviation from N(0,1) per quantile group of the fitted values. A flat worm near zero indicates good fit. Systematic deviations indicate:
- Upward shift: underdispersion
- Downward shift: overdispersion
- S-shape: wrong variance-mean relationship
- U/inverted-U: wrong kurtosis
Requires matplotlib (optional dependency).
Parameters
model : DistributionalGLM Fitted model. X, y : as for quantile_residuals. exposure : ndarray, optional n_groups : int Number of groups to split fitted mu into (default 4). seed : int Random seed for quantile residuals. ax : matplotlib.axes.Axes or None Axes to draw on. Creates new figure if None.
Returns
ax : matplotlib.axes.Axes