insurance_distributional_glm.families
Distribution families for GAMLSS.
1"""Distribution families for GAMLSS.""" 2 3from .base import ( 4 GamlssFamily, 5 Link, 6 LogLink, 7 LogitLink, 8 IdentityLink, 9 SqrtLink, 10 log, 11 logit, 12 identity, 13 sqrt, 14) 15from .continuous import Gamma, LogNormal, InverseGaussian, Tweedie 16from .count import Poisson, NBI, ZIP 17 18__all__ = [ 19 # Base 20 "GamlssFamily", 21 "Link", 22 "LogLink", 23 "LogitLink", 24 "IdentityLink", 25 "SqrtLink", 26 "log", 27 "logit", 28 "identity", 29 "sqrt", 30 # Continuous 31 "Gamma", 32 "LogNormal", 33 "InverseGaussian", 34 "Tweedie", 35 # Count 36 "Poisson", 37 "NBI", 38 "ZIP", 39]
146class GamlssFamily(ABC): 147 """ 148 Abstract base for GAMLSS distribution families. 149 150 Each subclass implements a specific distribution and its relationship 151 to the RS fitting algorithm. The key quantities are: 152 - log_likelihood: total log-likelihood (sum over obs) 153 - dl_deta: d(log L)/d(eta_k) — the score for the IRLS working response 154 - d2l_deta2: E[d^2(log L)/d(eta_k)^2] — negative for IRLS weights 155 156 These drive the weighted least squares step in the RS algorithm. 157 """ 158 159 @property 160 @abstractmethod 161 def param_names(self) -> List[str]: 162 """Ordered list of distribution parameter names, e.g. ['mu', 'sigma'].""" 163 164 @property 165 @abstractmethod 166 def default_links(self) -> Dict[str, Link]: 167 """Default link function for each parameter.""" 168 169 @abstractmethod 170 def log_likelihood(self, y: np.ndarray, params: Dict[str, np.ndarray]) -> np.ndarray: 171 """ 172 Per-observation log-likelihood. 173 174 Returns array of shape (n,), one value per observation. 175 The fitting engine sums these to get total log-likelihood. 176 """ 177 178 @abstractmethod 179 def dl_deta( 180 self, 181 y: np.ndarray, 182 params: Dict[str, np.ndarray], 183 param_name: str, 184 ) -> np.ndarray: 185 """ 186 d(log L_i)/d(eta_k) for parameter param_name. 187 188 This is the derivative of log-likelihood with respect to the linear 189 predictor eta_k (not theta_k). Used to construct the working response 190 in the IRLS step. 191 192 By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k) 193 194 Raises ValueError for unknown param_name. 195 """ 196 197 @abstractmethod 198 def d2l_deta2( 199 self, 200 y: np.ndarray, 201 params: Dict[str, np.ndarray], 202 param_name: str, 203 ) -> np.ndarray: 204 """ 205 Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name. 206 207 Returns positive values (this is the negative expected second derivative). 208 Used as IRLS weights. If analytic expectation is hard, use the observed 209 Fisher information (negative of actual second derivative). 210 211 Raises ValueError for unknown param_name. 212 """ 213 214 @abstractmethod 215 def starting_values(self, y: np.ndarray) -> Dict[str, np.ndarray]: 216 """ 217 Initial parameter estimates from the marginal distribution of y. 218 219 Called once before the RS loop begins. Should be robust — moment 220 estimates are fine here; precision doesn't matter much. 221 """ 222 223 def _check_param(self, param_name: str) -> None: 224 """Raise ValueError if param_name is not a known parameter.""" 225 if param_name not in self.param_names: 226 raise ValueError( 227 f"Unknown parameter '{param_name}' for {self.__class__.__name__}. " 228 f"Known parameters: {self.param_names}" 229 ) 230 231 def validate_params(self, params: Dict[str, np.ndarray]) -> None: 232 """Optional validation hook. Raise ValueError if params are out of range.""" 233 pass 234 235 def __repr__(self) -> str: 236 return f"{self.__class__.__name__}(links={{{', '.join(f'{k}: {v.name}' for k, v in self.default_links.items())}}})"
Abstract base for GAMLSS distribution families.
Each subclass implements a specific distribution and its relationship to the RS fitting algorithm. The key quantities are:
- log_likelihood: total log-likelihood (sum over obs)
- dl_deta: d(log L)/d(eta_k) — the score for the IRLS working response
- d2l_deta2: E[d^2(log L)/d(eta_k)^2] — negative for IRLS weights
These drive the weighted least squares step in the RS algorithm.
159 @property 160 @abstractmethod 161 def param_names(self) -> List[str]: 162 """Ordered list of distribution parameter names, e.g. ['mu', 'sigma']."""
Ordered list of distribution parameter names, e.g. ['mu', 'sigma'].
164 @property 165 @abstractmethod 166 def default_links(self) -> Dict[str, Link]: 167 """Default link function for each parameter."""
Default link function for each parameter.
169 @abstractmethod 170 def log_likelihood(self, y: np.ndarray, params: Dict[str, np.ndarray]) -> np.ndarray: 171 """ 172 Per-observation log-likelihood. 173 174 Returns array of shape (n,), one value per observation. 175 The fitting engine sums these to get total log-likelihood. 176 """
Per-observation log-likelihood.
Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.
178 @abstractmethod 179 def dl_deta( 180 self, 181 y: np.ndarray, 182 params: Dict[str, np.ndarray], 183 param_name: str, 184 ) -> np.ndarray: 185 """ 186 d(log L_i)/d(eta_k) for parameter param_name. 187 188 This is the derivative of log-likelihood with respect to the linear 189 predictor eta_k (not theta_k). Used to construct the working response 190 in the IRLS step. 191 192 By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k) 193 194 Raises ValueError for unknown param_name. 195 """
d(log L_i)/d(eta_k) for parameter param_name.
This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.
By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)
Raises ValueError for unknown param_name.
197 @abstractmethod 198 def d2l_deta2( 199 self, 200 y: np.ndarray, 201 params: Dict[str, np.ndarray], 202 param_name: str, 203 ) -> np.ndarray: 204 """ 205 Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name. 206 207 Returns positive values (this is the negative expected second derivative). 208 Used as IRLS weights. If analytic expectation is hard, use the observed 209 Fisher information (negative of actual second derivative). 210 211 Raises ValueError for unknown param_name. 212 """
Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.
Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).
Raises ValueError for unknown param_name.
214 @abstractmethod 215 def starting_values(self, y: np.ndarray) -> Dict[str, np.ndarray]: 216 """ 217 Initial parameter estimates from the marginal distribution of y. 218 219 Called once before the RS loop begins. Should be robust — moment 220 estimates are fine here; precision doesn't matter much. 221 """
Initial parameter estimates from the marginal distribution of y.
Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.
26class Link(ABC): 27 """Abstract link function g: parameter space -> real line.""" 28 29 @abstractmethod 30 def link(self, theta: np.ndarray) -> np.ndarray: 31 """Forward: eta = g(theta).""" 32 33 @abstractmethod 34 def inverse(self, eta: np.ndarray) -> np.ndarray: 35 """Inverse: theta = g^{-1}(eta).""" 36 37 @abstractmethod 38 def deriv(self, theta: np.ndarray) -> np.ndarray: 39 """Derivative: d(eta)/d(theta) = g'(theta).""" 40 41 def inverse_deriv(self, eta: np.ndarray) -> np.ndarray: 42 """d(theta)/d(eta) = 1 / g'(g^{-1}(eta)).""" 43 theta = self.inverse(eta) 44 return 1.0 / self.deriv(theta) 45 46 @property 47 @abstractmethod 48 def name(self) -> str: 49 pass
Abstract link function g: parameter space -> real line.
29 @abstractmethod 30 def link(self, theta: np.ndarray) -> np.ndarray: 31 """Forward: eta = g(theta)."""
Forward: eta = g(theta).
33 @abstractmethod 34 def inverse(self, eta: np.ndarray) -> np.ndarray: 35 """Inverse: theta = g^{-1}(eta)."""
Inverse: theta = g^{-1}(eta).
37 @abstractmethod 38 def deriv(self, theta: np.ndarray) -> np.ndarray: 39 """Derivative: d(eta)/d(theta) = g'(theta)."""
Derivative: d(eta)/d(theta) = g'(theta).
52class LogLink(Link): 53 """Log link: eta = log(theta), theta = exp(eta). Used for positive parameters.""" 54 55 def link(self, theta: np.ndarray) -> np.ndarray: 56 return np.log(theta) 57 58 def inverse(self, eta: np.ndarray) -> np.ndarray: 59 return np.exp(np.clip(eta, -30, 30)) 60 61 def deriv(self, theta: np.ndarray) -> np.ndarray: 62 return 1.0 / theta 63 64 def inverse_deriv(self, eta: np.ndarray) -> np.ndarray: 65 return np.exp(np.clip(eta, -30, 30)) 66 67 @property 68 def name(self) -> str: 69 return "log"
Log link: eta = log(theta), theta = exp(eta). Used for positive parameters.
72class LogitLink(Link): 73 """Logit link: eta = log(theta/(1-theta)), theta = expit(eta). Used for probabilities.""" 74 75 def link(self, theta: np.ndarray) -> np.ndarray: 76 theta_clipped = np.clip(theta, 1e-10, 1 - 1e-10) 77 return np.log(theta_clipped / (1.0 - theta_clipped)) 78 79 def inverse(self, eta: np.ndarray) -> np.ndarray: 80 return 1.0 / (1.0 + np.exp(-np.clip(eta, -30, 30))) 81 82 def deriv(self, theta: np.ndarray) -> np.ndarray: 83 theta_clipped = np.clip(theta, 1e-10, 1 - 1e-10) 84 return 1.0 / (theta_clipped * (1.0 - theta_clipped)) 85 86 def inverse_deriv(self, eta: np.ndarray) -> np.ndarray: 87 p = self.inverse(eta) 88 return p * (1.0 - p) 89 90 @property 91 def name(self) -> str: 92 return "logit"
Logit link: eta = log(theta/(1-theta)), theta = expit(eta). Used for probabilities.
75 def link(self, theta: np.ndarray) -> np.ndarray: 76 theta_clipped = np.clip(theta, 1e-10, 1 - 1e-10) 77 return np.log(theta_clipped / (1.0 - theta_clipped))
Forward: eta = g(theta).
79 def inverse(self, eta: np.ndarray) -> np.ndarray: 80 return 1.0 / (1.0 + np.exp(-np.clip(eta, -30, 30)))
Inverse: theta = g^{-1}(eta).
82 def deriv(self, theta: np.ndarray) -> np.ndarray: 83 theta_clipped = np.clip(theta, 1e-10, 1 - 1e-10) 84 return 1.0 / (theta_clipped * (1.0 - theta_clipped))
Derivative: d(eta)/d(theta) = g'(theta).
95class IdentityLink(Link): 96 """Identity link: eta = theta. Used for location parameters on the original scale.""" 97 98 def link(self, theta: np.ndarray) -> np.ndarray: 99 return theta.copy() 100 101 def inverse(self, eta: np.ndarray) -> np.ndarray: 102 return eta.copy() 103 104 def deriv(self, theta: np.ndarray) -> np.ndarray: 105 return np.ones_like(theta) 106 107 def inverse_deriv(self, eta: np.ndarray) -> np.ndarray: 108 return np.ones_like(eta) 109 110 @property 111 def name(self) -> str: 112 return "identity"
Identity link: eta = theta. Used for location parameters on the original scale.
115class SqrtLink(Link): 116 """Square root link: eta = sqrt(theta), theta = eta^2.""" 117 118 def link(self, theta: np.ndarray) -> np.ndarray: 119 return np.sqrt(np.clip(theta, 0, None)) 120 121 def inverse(self, eta: np.ndarray) -> np.ndarray: 122 return np.clip(eta, 0, None) ** 2 123 124 def deriv(self, theta: np.ndarray) -> np.ndarray: 125 return 0.5 / np.sqrt(np.clip(theta, 1e-10, None)) 126 127 def inverse_deriv(self, eta: np.ndarray) -> np.ndarray: 128 return 2.0 * np.clip(eta, 0, None) 129 130 @property 131 def name(self) -> str: 132 return "sqrt"
Square root link: eta = sqrt(theta), theta = eta^2.
124 def deriv(self, theta: np.ndarray) -> np.ndarray: 125 return 0.5 / np.sqrt(np.clip(theta, 1e-10, None))
Derivative: d(eta)/d(theta) = g'(theta).
28class Gamma(GamlssFamily): 29 """ 30 Gamma distribution with mean mu and coefficient of variation sigma. 31 32 PDF: f(y) = (1/sigma^2)^(1/sigma^2) * y^(1/sigma^2 - 1) * exp(-y/(mu*sigma^2)) 33 / (Gamma(1/sigma^2) * (mu*sigma^2)^(1/sigma^2)) 34 35 mu > 0 : mean (log link) 36 sigma > 0 : coefficient of variation, sd/mean (log link) 37 38 Insurance use: claim severity. sigma models heterogeneity of variance 39 — more risky segments have higher CV. 40 """ 41 42 @property 43 def param_names(self): 44 return ["mu", "sigma"] 45 46 @property 47 def default_links(self): 48 return {"mu": LogLink(), "sigma": LogLink()} 49 50 def log_likelihood(self, y: np.ndarray, params: Dict[str, np.ndarray]) -> np.ndarray: 51 mu = params["mu"] 52 sigma = params["sigma"] 53 nu = 1.0 / sigma**2 # shape parameter 54 return ( 55 nu * np.log(nu) 56 - gammaln(nu) 57 + (nu - 1.0) * np.log(y) 58 - nu * np.log(mu) 59 - nu * y / mu 60 ) 61 62 def dl_deta(self, y, params, param_name): 63 self._check_param(param_name) 64 mu = params["mu"] 65 sigma = params["sigma"] 66 nu = 1.0 / sigma**2 67 68 if param_name == "mu": 69 link = self.default_links["mu"] 70 dl_dtheta = nu * (y - mu) / mu**2 71 dtheta_deta = link.inverse_deriv(link.link(mu)) 72 return dl_dtheta * dtheta_deta 73 74 else: # sigma 75 from scipy.special import digamma 76 link = self.default_links["sigma"] 77 dl_dnu = np.log(nu) + 1.0 - digamma(nu) + np.log(y / mu) - y / mu 78 dl_dsigma = dl_dnu * (-2.0 / sigma**3) 79 dsigma_deta = link.inverse_deriv(link.link(sigma)) 80 return dl_dsigma * dsigma_deta 81 82 def d2l_deta2(self, y, params, param_name): 83 self._check_param(param_name) 84 mu = params["mu"] 85 sigma = params["sigma"] 86 nu = 1.0 / sigma**2 87 88 if param_name == "mu": 89 link = self.default_links["mu"] 90 d2l_dmu2 = nu / mu**2 91 dtheta_deta = link.inverse_deriv(link.link(mu)) 92 return d2l_dmu2 * dtheta_deta**2 93 94 else: # sigma 95 from scipy.special import polygamma 96 link = self.default_links["sigma"] 97 d2l_dnu2 = polygamma(1, nu) - 1.0 / nu 98 dnu_dsigma = -2.0 / sigma**3 99 d2l_dsigma2 = d2l_dnu2 * dnu_dsigma**2 100 dsigma_deta = link.inverse_deriv(link.link(sigma)) 101 return d2l_dsigma2 * dsigma_deta**2 102 103 def starting_values(self, y): 104 mu_init = np.full(len(y), np.mean(y)) 105 sigma_init = np.full(len(y), np.std(y) / np.mean(y)) 106 sigma_init = np.clip(sigma_init, 0.01, 5.0) 107 return {"mu": mu_init, "sigma": sigma_init}
Gamma distribution with mean mu and coefficient of variation sigma.
PDF: f(y) = (1/sigma^2)^(1/sigma^2) * y^(1/sigma^2 - 1) * exp(-y/(musigma^2)) / (Gamma(1/sigma^2) * (musigma^2)^(1/sigma^2))
mu > 0 : mean (log link) sigma > 0 : coefficient of variation, sd/mean (log link)
Insurance use: claim severity. sigma models heterogeneity of variance — more risky segments have higher CV.
50 def log_likelihood(self, y: np.ndarray, params: Dict[str, np.ndarray]) -> np.ndarray: 51 mu = params["mu"] 52 sigma = params["sigma"] 53 nu = 1.0 / sigma**2 # shape parameter 54 return ( 55 nu * np.log(nu) 56 - gammaln(nu) 57 + (nu - 1.0) * np.log(y) 58 - nu * np.log(mu) 59 - nu * y / mu 60 )
Per-observation log-likelihood.
Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.
62 def dl_deta(self, y, params, param_name): 63 self._check_param(param_name) 64 mu = params["mu"] 65 sigma = params["sigma"] 66 nu = 1.0 / sigma**2 67 68 if param_name == "mu": 69 link = self.default_links["mu"] 70 dl_dtheta = nu * (y - mu) / mu**2 71 dtheta_deta = link.inverse_deriv(link.link(mu)) 72 return dl_dtheta * dtheta_deta 73 74 else: # sigma 75 from scipy.special import digamma 76 link = self.default_links["sigma"] 77 dl_dnu = np.log(nu) + 1.0 - digamma(nu) + np.log(y / mu) - y / mu 78 dl_dsigma = dl_dnu * (-2.0 / sigma**3) 79 dsigma_deta = link.inverse_deriv(link.link(sigma)) 80 return dl_dsigma * dsigma_deta
d(log L_i)/d(eta_k) for parameter param_name.
This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.
By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)
Raises ValueError for unknown param_name.
82 def d2l_deta2(self, y, params, param_name): 83 self._check_param(param_name) 84 mu = params["mu"] 85 sigma = params["sigma"] 86 nu = 1.0 / sigma**2 87 88 if param_name == "mu": 89 link = self.default_links["mu"] 90 d2l_dmu2 = nu / mu**2 91 dtheta_deta = link.inverse_deriv(link.link(mu)) 92 return d2l_dmu2 * dtheta_deta**2 93 94 else: # sigma 95 from scipy.special import polygamma 96 link = self.default_links["sigma"] 97 d2l_dnu2 = polygamma(1, nu) - 1.0 / nu 98 dnu_dsigma = -2.0 / sigma**3 99 d2l_dsigma2 = d2l_dnu2 * dnu_dsigma**2 100 dsigma_deta = link.inverse_deriv(link.link(sigma)) 101 return d2l_dsigma2 * dsigma_deta**2
Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.
Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).
Raises ValueError for unknown param_name.
103 def starting_values(self, y): 104 mu_init = np.full(len(y), np.mean(y)) 105 sigma_init = np.full(len(y), np.std(y) / np.mean(y)) 106 sigma_init = np.clip(sigma_init, 0.01, 5.0) 107 return {"mu": mu_init, "sigma": sigma_init}
Initial parameter estimates from the marginal distribution of y.
Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.
110class LogNormal(GamlssFamily): 111 """ 112 Log-Normal distribution with log-scale mean mu and log-scale SD sigma. 113 114 If Y ~ LogNormal(mu, sigma), then log(Y) ~ Normal(mu, sigma). 115 Note: mu here is the mean of log(Y), not E[Y]. 116 E[Y] = exp(mu + sigma^2/2). 117 118 mu : real (identity link — this is already on log scale) 119 sigma > 0 : log-scale standard deviation (log link) 120 121 Insurance use: severity when log(claims) is approximately normal. 122 Heavier right tail than Gamma for the same mean. 123 """ 124 125 @property 126 def param_names(self): 127 return ["mu", "sigma"] 128 129 @property 130 def default_links(self): 131 return {"mu": IdentityLink(), "sigma": LogLink()} 132 133 def log_likelihood(self, y, params): 134 mu = params["mu"] 135 sigma = params["sigma"] 136 log_y = np.log(np.clip(y, 1e-300, None)) 137 return ( 138 -np.log(sigma) 139 - 0.5 * np.log(2.0 * np.pi) 140 - log_y 141 - 0.5 * ((log_y - mu) / sigma) ** 2 142 ) 143 144 def dl_deta(self, y, params, param_name): 145 self._check_param(param_name) 146 mu = params["mu"] 147 sigma = params["sigma"] 148 log_y = np.log(np.clip(y, 1e-300, None)) 149 resid = (log_y - mu) / sigma 150 151 if param_name == "mu": 152 link = self.default_links["mu"] 153 dl_dmu = resid / sigma 154 dmu_deta = link.inverse_deriv(link.link(mu)) 155 return dl_dmu * dmu_deta 156 157 else: # sigma 158 link = self.default_links["sigma"] 159 dl_dsigma = (resid**2 - 1.0) / sigma 160 dsigma_deta = link.inverse_deriv(link.link(sigma)) 161 return dl_dsigma * dsigma_deta 162 163 def d2l_deta2(self, y, params, param_name): 164 self._check_param(param_name) 165 mu = params["mu"] 166 sigma = params["sigma"] 167 168 if param_name == "mu": 169 link = self.default_links["mu"] 170 d2l_dmu2 = 1.0 / sigma**2 171 dmu_deta = link.inverse_deriv(link.link(mu)) 172 return d2l_dmu2 * dmu_deta**2 173 174 else: # sigma 175 link = self.default_links["sigma"] 176 d2l_dsigma2 = 2.0 / sigma**2 177 dsigma_deta = link.inverse_deriv(link.link(sigma)) 178 return d2l_dsigma2 * dsigma_deta**2 179 180 def starting_values(self, y): 181 log_y = np.log(np.clip(y, 1e-300, None)) 182 mu_init = np.full(len(y), np.mean(log_y)) 183 sigma_init = np.full(len(y), max(np.std(log_y), 0.01)) 184 return {"mu": mu_init, "sigma": sigma_init}
Log-Normal distribution with log-scale mean mu and log-scale SD sigma.
If Y ~ LogNormal(mu, sigma), then log(Y) ~ Normal(mu, sigma). Note: mu here is the mean of log(Y), not E[Y]. E[Y] = exp(mu + sigma^2/2).
mu : real (identity link — this is already on log scale) sigma > 0 : log-scale standard deviation (log link)
Insurance use: severity when log(claims) is approximately normal. Heavier right tail than Gamma for the same mean.
133 def log_likelihood(self, y, params): 134 mu = params["mu"] 135 sigma = params["sigma"] 136 log_y = np.log(np.clip(y, 1e-300, None)) 137 return ( 138 -np.log(sigma) 139 - 0.5 * np.log(2.0 * np.pi) 140 - log_y 141 - 0.5 * ((log_y - mu) / sigma) ** 2 142 )
Per-observation log-likelihood.
Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.
144 def dl_deta(self, y, params, param_name): 145 self._check_param(param_name) 146 mu = params["mu"] 147 sigma = params["sigma"] 148 log_y = np.log(np.clip(y, 1e-300, None)) 149 resid = (log_y - mu) / sigma 150 151 if param_name == "mu": 152 link = self.default_links["mu"] 153 dl_dmu = resid / sigma 154 dmu_deta = link.inverse_deriv(link.link(mu)) 155 return dl_dmu * dmu_deta 156 157 else: # sigma 158 link = self.default_links["sigma"] 159 dl_dsigma = (resid**2 - 1.0) / sigma 160 dsigma_deta = link.inverse_deriv(link.link(sigma)) 161 return dl_dsigma * dsigma_deta
d(log L_i)/d(eta_k) for parameter param_name.
This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.
By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)
Raises ValueError for unknown param_name.
163 def d2l_deta2(self, y, params, param_name): 164 self._check_param(param_name) 165 mu = params["mu"] 166 sigma = params["sigma"] 167 168 if param_name == "mu": 169 link = self.default_links["mu"] 170 d2l_dmu2 = 1.0 / sigma**2 171 dmu_deta = link.inverse_deriv(link.link(mu)) 172 return d2l_dmu2 * dmu_deta**2 173 174 else: # sigma 175 link = self.default_links["sigma"] 176 d2l_dsigma2 = 2.0 / sigma**2 177 dsigma_deta = link.inverse_deriv(link.link(sigma)) 178 return d2l_dsigma2 * dsigma_deta**2
Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.
Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).
Raises ValueError for unknown param_name.
180 def starting_values(self, y): 181 log_y = np.log(np.clip(y, 1e-300, None)) 182 mu_init = np.full(len(y), np.mean(log_y)) 183 sigma_init = np.full(len(y), max(np.std(log_y), 0.01)) 184 return {"mu": mu_init, "sigma": sigma_init}
Initial parameter estimates from the marginal distribution of y.
Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.
187class InverseGaussian(GamlssFamily): 188 """ 189 Inverse Gaussian distribution with mean mu and dispersion sigma. 190 191 Standard parameterisation: if lambda = 1/sigma^2 is the shape (precision) 192 parameter, then: 193 PDF: f(y) = sqrt(lambda/(2*pi*y^3)) * exp(-lambda*(y-mu)^2/(2*mu^2*y)) 194 Var[Y] = mu^3 / lambda = sigma^2 * mu^3 195 196 mu > 0 : mean (log link) 197 sigma > 0 : dispersion, Var = sigma^2 * mu^3 (log link) 198 199 Log-likelihood: 200 ll = -log(sigma) - 0.5*log(2*pi) - 1.5*log(y) - (y-mu)^2/(2*sigma^2*mu^2*y) 201 202 Insurance use: liability severity with heavy right tails. The variance 203 grows faster with mu than Gamma (mu^3 vs mu^2), suitable for large losses. 204 """ 205 206 @property 207 def param_names(self): 208 return ["mu", "sigma"] 209 210 @property 211 def default_links(self): 212 return {"mu": LogLink(), "sigma": LogLink()} 213 214 def log_likelihood(self, y, params): 215 mu = params["mu"] 216 sigma = params["sigma"] 217 y_safe = np.clip(y, 1e-300, None) 218 return ( 219 -np.log(sigma) 220 - 0.5 * np.log(2.0 * np.pi) 221 - 1.5 * np.log(y_safe) 222 - (y - mu) ** 2 / (2.0 * sigma**2 * mu**2 * y_safe) 223 ) 224 225 def dl_deta(self, y, params, param_name): 226 self._check_param(param_name) 227 mu = params["mu"] 228 sigma = params["sigma"] 229 y_safe = np.clip(y, 1e-300, None) 230 231 if param_name == "mu": 232 link = self.default_links["mu"] 233 # d(ll)/d(mu): 234 # ll contains -(y-mu)^2/(2*sigma^2*mu^2*y) 235 # Expand: -(y^2/mu^2 - 2y/mu + 1)/(2*sigma^2*y) 236 # d/dmu = (2y^2/mu^3 - 2y/mu^2)/(2*sigma^2*y) = (y-mu)/(sigma^2*mu^3) 237 dl_dmu = (y - mu) / (sigma**2 * mu**3) 238 dmu_deta = link.inverse_deriv(link.link(mu)) 239 return dl_dmu * dmu_deta 240 241 else: # sigma 242 link = self.default_links["sigma"] 243 # d(ll)/d(sigma) = -1/sigma + (y-mu)^2/(sigma^3*mu^2*y) 244 dl_dsigma = -1.0 / sigma + (y - mu)**2 / (sigma**3 * mu**2 * y_safe) 245 dsigma_deta = link.inverse_deriv(link.link(sigma)) 246 return dl_dsigma * dsigma_deta 247 248 def d2l_deta2(self, y, params, param_name): 249 self._check_param(param_name) 250 mu = params["mu"] 251 sigma = params["sigma"] 252 253 if param_name == "mu": 254 link = self.default_links["mu"] 255 # E[-d^2(ll)/dmu^2] = 1/(sigma^2*mu^3) 256 d2l_dmu2 = 1.0 / (sigma**2 * mu**3) 257 dmu_deta = link.inverse_deriv(link.link(mu)) 258 return d2l_dmu2 * dmu_deta**2 259 260 else: # sigma 261 link = self.default_links["sigma"] 262 # E[-d^2 ll/d sigma^2] = 2/sigma^2 263 d2l_dsigma2 = 2.0 / sigma**2 264 dsigma_deta = link.inverse_deriv(link.link(sigma)) 265 return d2l_dsigma2 * dsigma_deta**2 266 267 def starting_values(self, y): 268 mu_init = np.full(len(y), np.mean(y)) 269 sigma_sq = max(np.var(y) / np.mean(y)**3, 1e-6) 270 sigma_init = np.full(len(y), np.sqrt(sigma_sq)) 271 return {"mu": mu_init, "sigma": sigma_init}
Inverse Gaussian distribution with mean mu and dispersion sigma.
Standard parameterisation: if lambda = 1/sigma^2 is the shape (precision) parameter, then: PDF: f(y) = sqrt(lambda/(2piy^3)) * exp(-lambda*(y-mu)^2/(2mu^2y)) Var[Y] = mu^3 / lambda = sigma^2 * mu^3
mu > 0 : mean (log link) sigma > 0 : dispersion, Var = sigma^2 * mu^3 (log link)
Log-likelihood: ll = -log(sigma) - 0.5log(2pi) - 1.5log(y) - (y-mu)^2/(2sigma^2mu^2y)
Insurance use: liability severity with heavy right tails. The variance grows faster with mu than Gamma (mu^3 vs mu^2), suitable for large losses.
214 def log_likelihood(self, y, params): 215 mu = params["mu"] 216 sigma = params["sigma"] 217 y_safe = np.clip(y, 1e-300, None) 218 return ( 219 -np.log(sigma) 220 - 0.5 * np.log(2.0 * np.pi) 221 - 1.5 * np.log(y_safe) 222 - (y - mu) ** 2 / (2.0 * sigma**2 * mu**2 * y_safe) 223 )
Per-observation log-likelihood.
Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.
225 def dl_deta(self, y, params, param_name): 226 self._check_param(param_name) 227 mu = params["mu"] 228 sigma = params["sigma"] 229 y_safe = np.clip(y, 1e-300, None) 230 231 if param_name == "mu": 232 link = self.default_links["mu"] 233 # d(ll)/d(mu): 234 # ll contains -(y-mu)^2/(2*sigma^2*mu^2*y) 235 # Expand: -(y^2/mu^2 - 2y/mu + 1)/(2*sigma^2*y) 236 # d/dmu = (2y^2/mu^3 - 2y/mu^2)/(2*sigma^2*y) = (y-mu)/(sigma^2*mu^3) 237 dl_dmu = (y - mu) / (sigma**2 * mu**3) 238 dmu_deta = link.inverse_deriv(link.link(mu)) 239 return dl_dmu * dmu_deta 240 241 else: # sigma 242 link = self.default_links["sigma"] 243 # d(ll)/d(sigma) = -1/sigma + (y-mu)^2/(sigma^3*mu^2*y) 244 dl_dsigma = -1.0 / sigma + (y - mu)**2 / (sigma**3 * mu**2 * y_safe) 245 dsigma_deta = link.inverse_deriv(link.link(sigma)) 246 return dl_dsigma * dsigma_deta
d(log L_i)/d(eta_k) for parameter param_name.
This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.
By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)
Raises ValueError for unknown param_name.
248 def d2l_deta2(self, y, params, param_name): 249 self._check_param(param_name) 250 mu = params["mu"] 251 sigma = params["sigma"] 252 253 if param_name == "mu": 254 link = self.default_links["mu"] 255 # E[-d^2(ll)/dmu^2] = 1/(sigma^2*mu^3) 256 d2l_dmu2 = 1.0 / (sigma**2 * mu**3) 257 dmu_deta = link.inverse_deriv(link.link(mu)) 258 return d2l_dmu2 * dmu_deta**2 259 260 else: # sigma 261 link = self.default_links["sigma"] 262 # E[-d^2 ll/d sigma^2] = 2/sigma^2 263 d2l_dsigma2 = 2.0 / sigma**2 264 dsigma_deta = link.inverse_deriv(link.link(sigma)) 265 return d2l_dsigma2 * dsigma_deta**2
Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.
Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).
Raises ValueError for unknown param_name.
267 def starting_values(self, y): 268 mu_init = np.full(len(y), np.mean(y)) 269 sigma_sq = max(np.var(y) / np.mean(y)**3, 1e-6) 270 sigma_init = np.full(len(y), np.sqrt(sigma_sq)) 271 return {"mu": mu_init, "sigma": sigma_init}
Initial parameter estimates from the marginal distribution of y.
Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.
274class Tweedie(GamlssFamily): 275 """ 276 Tweedie distribution: compound Poisson-Gamma, useful for pure premiums. 277 278 The Tweedie family with power parameter p in (1,2) gives a distribution 279 with a point mass at zero (claim frequency component) and a continuous 280 positive part (severity component). This makes it natural for: 281 - Pure premiums (expected cost per unit exposure) 282 - Combined frequency-severity models 283 284 mu > 0 : mean (log link) 285 phi > 0 : dispersion, Var[Y] = phi * mu^p (log link) 286 p : power parameter, fixed at init (default 1.5) 287 288 The log-likelihood uses the series approximation from Dunn & Smyth (2005). 289 For p=1: Poisson, p=2: Gamma, p in (1,2): compound Poisson-Gamma. 290 291 Note: p is NOT estimated by default — fix it or profile over a grid. 292 """ 293 294 def __init__(self, power: float = 1.5): 295 if not (1.0 < power < 2.0): 296 raise ValueError(f"Tweedie power must be in (1,2), got {power}") 297 self.power = power 298 299 @property 300 def param_names(self): 301 return ["mu", "phi"] 302 303 @property 304 def default_links(self): 305 return {"mu": LogLink(), "phi": LogLink()} 306 307 def log_likelihood(self, y, params): 308 mu = params["mu"] 309 phi = params["phi"] 310 p = self.power 311 312 ll = np.where( 313 y == 0, 314 -mu**(2.0 - p) / ((2.0 - p) * phi), 315 (y * mu**(1.0 - p) / ((1.0 - p) * phi) 316 - mu**(2.0 - p) / ((2.0 - p) * phi) 317 + self._log_w(y, phi, p)), 318 ) 319 return ll 320 321 def _log_w(self, y: np.ndarray, phi: np.ndarray, p: float) -> np.ndarray: 322 """ 323 Log of the Tweedie series sum W(y, phi, p) for y > 0. 324 325 Uses the compound Poisson-Gamma series from Dunn & Smyth (2005). 326 Each term j corresponds to a Poisson count of j severity events. 327 328 For fixed y > 0 the density is: 329 f(y) * exp(ll_front_inverse) = (1/y) * sum_j V_j 330 where: 331 V_j = lam^j * y^{j*alpha - 1} / (j! * scale^{j*alpha} * Gamma(j*alpha)) 332 lam = mu^{2-p} / (phi * (2-p)) [Poisson mean] 333 scale = phi * (p-1) * mu^{p-1} [Gamma scale per event] 334 alpha = (2-p) / (p-1) [> 0 for p in (1,2)] 335 336 The log-offset terms for lam and scale simplify to: 337 j*log(lam) - j*alpha*log(scale) = -j*log(phi*(2-p)) - j*alpha*log(phi*(p-1)) 338 (the mu terms cancel because alpha*(p-1) = 2-p exactly). 339 340 Per-term log formula: 341 log(W_j) = (j*alpha - 1)*log(y) - gammaln(j*alpha) - gammaln(j+1) 342 - j*log(phi*(2-p)) - j*alpha*log(phi*(p-1)) 343 """ 344 j_max = 50 345 alpha = (2.0 - p) / (p - 1.0) # positive for p in (1, 2) 346 347 y_flat = np.asarray(y, dtype=float).ravel() 348 phi_flat = np.asarray(phi, dtype=float).ravel() 349 350 log_phi_2mp = np.log(phi_flat * (2.0 - p)) 351 log_phi_pm1 = np.log(phi_flat * (p - 1.0)) 352 log_y = np.log(np.maximum(y_flat, 1e-300)) 353 354 log_w = np.full(len(y_flat), -np.inf) 355 356 for j in range(1, j_max + 1): 357 log_term = ( 358 (j * alpha - 1.0) * log_y 359 - gammaln(j * alpha) 360 - gammaln(j + 1) 361 - j * log_phi_2mp 362 - j * alpha * log_phi_pm1 363 ) 364 m = np.maximum(log_w, log_term) 365 finite = np.isfinite(m) 366 log_w = np.where( 367 finite, 368 m + np.log( 369 np.exp(np.where(finite, log_w - m, 0)) 370 + np.exp(np.where(finite, log_term - m, 0)) 371 ), 372 np.where(np.isfinite(log_term), log_term, log_w), 373 ) 374 375 return log_w.reshape(np.asarray(y).shape) 376 377 def dl_deta(self, y, params, param_name): 378 self._check_param(param_name) 379 mu = params["mu"] 380 phi = params["phi"] 381 p = self.power 382 383 if param_name == "mu": 384 link = self.default_links["mu"] 385 dl_dmu = (y * mu**(-p) - mu**(1.0 - p)) / phi 386 dmu_deta = link.inverse_deriv(link.link(mu)) 387 return dl_dmu * dmu_deta 388 389 else: # phi 390 link = self.default_links["phi"] 391 dl_dphi = np.where( 392 y == 0, 393 mu**(2.0 - p) / ((2.0 - p) * phi**2), 394 -y * mu**(1.0 - p) / ((1.0 - p) * phi**2) 395 + mu**(2.0 - p) / ((2.0 - p) * phi**2), 396 ) 397 dphi_deta = link.inverse_deriv(link.link(phi)) 398 return dl_dphi * dphi_deta 399 400 def d2l_deta2(self, y, params, param_name): 401 self._check_param(param_name) 402 mu = params["mu"] 403 phi = params["phi"] 404 p = self.power 405 406 if param_name == "mu": 407 link = self.default_links["mu"] 408 # E[-d^2 ll/d mu^2] = mu^{-p} / phi 409 # Derived: d^2ll/dmu^2 = -p*y*mu^{-p-1}/phi + (p-1)*mu^{-p}/phi 410 # Taking expectation with E[y]=mu: E[-d^2ll/dmu^2] = mu^{-p}/phi 411 d2l_dmu2 = mu ** (-p) / phi 412 dmu_deta = link.inverse_deriv(link.link(mu)) 413 return d2l_dmu2 * dmu_deta**2 414 415 else: # phi 416 link = self.default_links["phi"] 417 # E[-d^2 ll/d phi^2] = 2 * mu^{2-p} / (phi^3 * (p-1) * (2-p)) 418 # Derived: d^2ll/dphi^2 = 2*y*mu^{1-p}/((1-p)*phi^3) - 2*mu^{2-p}/((2-p)*phi^3) 419 # Taking expectation with E[y]=mu: 420 # E[-d^2ll/dphi^2] = -2*mu^{2-p}/((1-p)*phi^3) + 2*mu^{2-p}/((2-p)*phi^3) 421 # = 2*mu^{2-p}/phi^3 * [1/(p-1) + 1/(2-p)] 422 # = 2*mu^{2-p}/(phi^3 * (p-1) * (2-p)) 423 d2l_dphi2 = 2.0 * mu ** (2.0 - p) / (phi**3 * (p - 1.0) * (2.0 - p)) 424 dphi_deta = link.inverse_deriv(link.link(phi)) 425 return d2l_dphi2 * dphi_deta**2 426 427 def starting_values(self, y): 428 mu_init = np.full(len(y), np.mean(y[y > 0]) if np.any(y > 0) else 1.0) 429 phi_init = np.full(len(y), 1.0) 430 return {"mu": mu_init, "phi": phi_init}
Tweedie distribution: compound Poisson-Gamma, useful for pure premiums.
The Tweedie family with power parameter p in (1,2) gives a distribution with a point mass at zero (claim frequency component) and a continuous positive part (severity component). This makes it natural for:
- Pure premiums (expected cost per unit exposure)
- Combined frequency-severity models
mu > 0 : mean (log link) phi > 0 : dispersion, Var[Y] = phi * mu^p (log link) p : power parameter, fixed at init (default 1.5)
The log-likelihood uses the series approximation from Dunn & Smyth (2005). For p=1: Poisson, p=2: Gamma, p in (1,2): compound Poisson-Gamma.
Note: p is NOT estimated by default — fix it or profile over a grid.
307 def log_likelihood(self, y, params): 308 mu = params["mu"] 309 phi = params["phi"] 310 p = self.power 311 312 ll = np.where( 313 y == 0, 314 -mu**(2.0 - p) / ((2.0 - p) * phi), 315 (y * mu**(1.0 - p) / ((1.0 - p) * phi) 316 - mu**(2.0 - p) / ((2.0 - p) * phi) 317 + self._log_w(y, phi, p)), 318 ) 319 return ll
Per-observation log-likelihood.
Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.
377 def dl_deta(self, y, params, param_name): 378 self._check_param(param_name) 379 mu = params["mu"] 380 phi = params["phi"] 381 p = self.power 382 383 if param_name == "mu": 384 link = self.default_links["mu"] 385 dl_dmu = (y * mu**(-p) - mu**(1.0 - p)) / phi 386 dmu_deta = link.inverse_deriv(link.link(mu)) 387 return dl_dmu * dmu_deta 388 389 else: # phi 390 link = self.default_links["phi"] 391 dl_dphi = np.where( 392 y == 0, 393 mu**(2.0 - p) / ((2.0 - p) * phi**2), 394 -y * mu**(1.0 - p) / ((1.0 - p) * phi**2) 395 + mu**(2.0 - p) / ((2.0 - p) * phi**2), 396 ) 397 dphi_deta = link.inverse_deriv(link.link(phi)) 398 return dl_dphi * dphi_deta
d(log L_i)/d(eta_k) for parameter param_name.
This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.
By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)
Raises ValueError for unknown param_name.
400 def d2l_deta2(self, y, params, param_name): 401 self._check_param(param_name) 402 mu = params["mu"] 403 phi = params["phi"] 404 p = self.power 405 406 if param_name == "mu": 407 link = self.default_links["mu"] 408 # E[-d^2 ll/d mu^2] = mu^{-p} / phi 409 # Derived: d^2ll/dmu^2 = -p*y*mu^{-p-1}/phi + (p-1)*mu^{-p}/phi 410 # Taking expectation with E[y]=mu: E[-d^2ll/dmu^2] = mu^{-p}/phi 411 d2l_dmu2 = mu ** (-p) / phi 412 dmu_deta = link.inverse_deriv(link.link(mu)) 413 return d2l_dmu2 * dmu_deta**2 414 415 else: # phi 416 link = self.default_links["phi"] 417 # E[-d^2 ll/d phi^2] = 2 * mu^{2-p} / (phi^3 * (p-1) * (2-p)) 418 # Derived: d^2ll/dphi^2 = 2*y*mu^{1-p}/((1-p)*phi^3) - 2*mu^{2-p}/((2-p)*phi^3) 419 # Taking expectation with E[y]=mu: 420 # E[-d^2ll/dphi^2] = -2*mu^{2-p}/((1-p)*phi^3) + 2*mu^{2-p}/((2-p)*phi^3) 421 # = 2*mu^{2-p}/phi^3 * [1/(p-1) + 1/(2-p)] 422 # = 2*mu^{2-p}/(phi^3 * (p-1) * (2-p)) 423 d2l_dphi2 = 2.0 * mu ** (2.0 - p) / (phi**3 * (p - 1.0) * (2.0 - p)) 424 dphi_deta = link.inverse_deriv(link.link(phi)) 425 return d2l_dphi2 * dphi_deta**2
Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.
Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).
Raises ValueError for unknown param_name.
427 def starting_values(self, y): 428 mu_init = np.full(len(y), np.mean(y[y > 0]) if np.any(y > 0) else 1.0) 429 phi_init = np.full(len(y), 1.0) 430 return {"mu": mu_init, "phi": phi_init}
Initial parameter estimates from the marginal distribution of y.
Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.
29class Poisson(GamlssFamily): 30 """ 31 Poisson distribution for claim frequency. 32 33 mu > 0 : rate parameter (log link) 34 35 One-parameter family — the RS algorithm only updates mu. 36 Use NBI if your data shows overdispersion (most insurance data does). 37 """ 38 39 @property 40 def param_names(self): 41 return ["mu"] 42 43 @property 44 def default_links(self): 45 return {"mu": LogLink()} 46 47 def log_likelihood(self, y, params): 48 mu = params["mu"] 49 return xlogy(y, mu) - mu - gammaln(y + 1.0) 50 51 def dl_deta(self, y, params, param_name): 52 self._check_param(param_name) 53 mu = params["mu"] 54 link = self.default_links["mu"] 55 # d(ll)/d(mu) = (y - mu) / mu; with log link, d(mu)/d(eta) = mu 56 # so dl/d(eta) = (y - mu) / mu * mu = y - mu 57 dl_dmu = (y - mu) / mu 58 dmu_deta = link.inverse_deriv(link.link(mu)) 59 return dl_dmu * dmu_deta 60 61 def d2l_deta2(self, y, params, param_name): 62 self._check_param(param_name) 63 mu = params["mu"] 64 link = self.default_links["mu"] 65 # E[-d^2 ll/d mu^2] = 1/mu; with log link dmu/deta=mu: E[-d^2/deta^2] = mu^2/mu = mu 66 d2l_dmu2 = 1.0 / mu 67 dmu_deta = link.inverse_deriv(link.link(mu)) 68 return d2l_dmu2 * dmu_deta**2 69 70 def starting_values(self, y): 71 mu_init = np.full(len(y), max(np.mean(y), 0.01)) 72 return {"mu": mu_init}
Poisson distribution for claim frequency.
mu > 0 : rate parameter (log link)
One-parameter family — the RS algorithm only updates mu. Use NBI if your data shows overdispersion (most insurance data does).
47 def log_likelihood(self, y, params): 48 mu = params["mu"] 49 return xlogy(y, mu) - mu - gammaln(y + 1.0)
Per-observation log-likelihood.
Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.
51 def dl_deta(self, y, params, param_name): 52 self._check_param(param_name) 53 mu = params["mu"] 54 link = self.default_links["mu"] 55 # d(ll)/d(mu) = (y - mu) / mu; with log link, d(mu)/d(eta) = mu 56 # so dl/d(eta) = (y - mu) / mu * mu = y - mu 57 dl_dmu = (y - mu) / mu 58 dmu_deta = link.inverse_deriv(link.link(mu)) 59 return dl_dmu * dmu_deta
d(log L_i)/d(eta_k) for parameter param_name.
This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.
By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)
Raises ValueError for unknown param_name.
61 def d2l_deta2(self, y, params, param_name): 62 self._check_param(param_name) 63 mu = params["mu"] 64 link = self.default_links["mu"] 65 # E[-d^2 ll/d mu^2] = 1/mu; with log link dmu/deta=mu: E[-d^2/deta^2] = mu^2/mu = mu 66 d2l_dmu2 = 1.0 / mu 67 dmu_deta = link.inverse_deriv(link.link(mu)) 68 return d2l_dmu2 * dmu_deta**2
Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.
Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).
Raises ValueError for unknown param_name.
70 def starting_values(self, y): 71 mu_init = np.full(len(y), max(np.mean(y), 0.01)) 72 return {"mu": mu_init}
Initial parameter estimates from the marginal distribution of y.
Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.
75class NBI(GamlssFamily): 76 """ 77 Negative Binomial Type I (NBI) for overdispersed claim counts. 78 79 Parameterisation: Var[Y] = mu + sigma * mu^2 80 mu > 0 : mean (log link) 81 sigma > 0 : overdispersion parameter (log link) 82 83 PMF: P(Y=y) = Gamma(y + 1/sigma) / (y! * Gamma(1/sigma)) 84 * (mu*sigma/(1+mu*sigma))^y * (1/(1+mu*sigma))^(1/sigma) 85 86 Insurance use: the standard choice when claim counts are overdispersed, 87 which is almost always true in motor and property portfolios. sigma 88 captures unobserved heterogeneity not captured by the covariates. 89 """ 90 91 @property 92 def param_names(self): 93 return ["mu", "sigma"] 94 95 @property 96 def default_links(self): 97 return {"mu": LogLink(), "sigma": LogLink()} 98 99 def log_likelihood(self, y, params): 100 mu = params["mu"] 101 sigma = params["sigma"] 102 k = 1.0 / sigma # shape / reciprocal overdispersion 103 104 return ( 105 gammaln(y + k) 106 - gammaln(k) 107 - gammaln(y + 1.0) 108 + k * np.log(k / (k + mu)) 109 + xlogy(y, mu / (mu + k)) 110 ) 111 112 def dl_deta(self, y, params, param_name): 113 self._check_param(param_name) 114 mu = params["mu"] 115 sigma = params["sigma"] 116 k = 1.0 / sigma 117 118 if param_name == "mu": 119 link = self.default_links["mu"] 120 # d(ll)/d(mu) = y/mu - (y+k)/(mu+k) 121 dl_dmu = y / mu - (y + k) / (mu + k) 122 dmu_deta = link.inverse_deriv(link.link(mu)) 123 return dl_dmu * dmu_deta 124 125 else: # sigma 126 from scipy.special import digamma 127 link = self.default_links["sigma"] 128 dl_dk = ( 129 digamma(y + k) 130 - digamma(k) 131 + np.log(k / (k + mu)) 132 + 1.0 133 - (y + k) / (k + mu) 134 ) 135 dk_dsigma = -1.0 / sigma**2 136 dl_dsigma = dl_dk * dk_dsigma 137 dsigma_deta = link.inverse_deriv(link.link(sigma)) 138 return dl_dsigma * dsigma_deta 139 140 def d2l_deta2(self, y, params, param_name): 141 self._check_param(param_name) 142 mu = params["mu"] 143 sigma = params["sigma"] 144 k = 1.0 / sigma 145 146 if param_name == "mu": 147 link = self.default_links["mu"] 148 # E[-d^2 ll/d mu^2] = k/(mu*(mu+k)) 149 d2l_dmu2 = k / (mu * (mu + k)) 150 dmu_deta = link.inverse_deriv(link.link(mu)) 151 return d2l_dmu2 * dmu_deta**2 152 153 else: # sigma 154 from scipy.special import polygamma 155 link = self.default_links["sigma"] 156 # Observed Fisher information per observation: -d^2 ll / d k^2 157 # d^2 ll / d k^2 = polygamma(1, y+k) - polygamma(1, k) - 1/k + 1/(k+mu) + (y-mu)/(k+mu)^2 158 # so -d^2 ll / d k^2 = polygamma(1, k) - polygamma(1, y+k) - 1/k + 1/(k+mu) + (mu-y)/(k+mu)^2 159 # Individual observations can give negative values; clip to floor before chain rule. 160 neg_d2l_dk2 = ( 161 polygamma(1, k) 162 - polygamma(1, y + k) 163 - 1.0 / k 164 + 1.0 / (k + mu) 165 + (mu - y) / (k + mu) ** 2 166 ) 167 neg_d2l_dk2 = np.maximum(neg_d2l_dk2, 1e-8) 168 dk_dsigma = -1.0 / sigma**2 169 d2l_dsigma2 = neg_d2l_dk2 * dk_dsigma**2 170 dsigma_deta = link.inverse_deriv(link.link(sigma)) 171 return d2l_dsigma2 * dsigma_deta**2 172 173 def starting_values(self, y): 174 mu_val = max(np.mean(y), 0.01) 175 var_val = max(np.var(y), mu_val + 1e-6) 176 sigma_val = max((var_val - mu_val) / mu_val**2, 0.01) 177 return { 178 "mu": np.full(len(y), mu_val), 179 "sigma": np.full(len(y), sigma_val), 180 }
Negative Binomial Type I (NBI) for overdispersed claim counts.
Parameterisation: Var[Y] = mu + sigma * mu^2 mu > 0 : mean (log link) sigma > 0 : overdispersion parameter (log link)
PMF: P(Y=y) = Gamma(y + 1/sigma) / (y! * Gamma(1/sigma)) * (musigma/(1+musigma))^y * (1/(1+mu*sigma))^(1/sigma)
Insurance use: the standard choice when claim counts are overdispersed, which is almost always true in motor and property portfolios. sigma captures unobserved heterogeneity not captured by the covariates.
99 def log_likelihood(self, y, params): 100 mu = params["mu"] 101 sigma = params["sigma"] 102 k = 1.0 / sigma # shape / reciprocal overdispersion 103 104 return ( 105 gammaln(y + k) 106 - gammaln(k) 107 - gammaln(y + 1.0) 108 + k * np.log(k / (k + mu)) 109 + xlogy(y, mu / (mu + k)) 110 )
Per-observation log-likelihood.
Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.
112 def dl_deta(self, y, params, param_name): 113 self._check_param(param_name) 114 mu = params["mu"] 115 sigma = params["sigma"] 116 k = 1.0 / sigma 117 118 if param_name == "mu": 119 link = self.default_links["mu"] 120 # d(ll)/d(mu) = y/mu - (y+k)/(mu+k) 121 dl_dmu = y / mu - (y + k) / (mu + k) 122 dmu_deta = link.inverse_deriv(link.link(mu)) 123 return dl_dmu * dmu_deta 124 125 else: # sigma 126 from scipy.special import digamma 127 link = self.default_links["sigma"] 128 dl_dk = ( 129 digamma(y + k) 130 - digamma(k) 131 + np.log(k / (k + mu)) 132 + 1.0 133 - (y + k) / (k + mu) 134 ) 135 dk_dsigma = -1.0 / sigma**2 136 dl_dsigma = dl_dk * dk_dsigma 137 dsigma_deta = link.inverse_deriv(link.link(sigma)) 138 return dl_dsigma * dsigma_deta
d(log L_i)/d(eta_k) for parameter param_name.
This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.
By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)
Raises ValueError for unknown param_name.
140 def d2l_deta2(self, y, params, param_name): 141 self._check_param(param_name) 142 mu = params["mu"] 143 sigma = params["sigma"] 144 k = 1.0 / sigma 145 146 if param_name == "mu": 147 link = self.default_links["mu"] 148 # E[-d^2 ll/d mu^2] = k/(mu*(mu+k)) 149 d2l_dmu2 = k / (mu * (mu + k)) 150 dmu_deta = link.inverse_deriv(link.link(mu)) 151 return d2l_dmu2 * dmu_deta**2 152 153 else: # sigma 154 from scipy.special import polygamma 155 link = self.default_links["sigma"] 156 # Observed Fisher information per observation: -d^2 ll / d k^2 157 # d^2 ll / d k^2 = polygamma(1, y+k) - polygamma(1, k) - 1/k + 1/(k+mu) + (y-mu)/(k+mu)^2 158 # so -d^2 ll / d k^2 = polygamma(1, k) - polygamma(1, y+k) - 1/k + 1/(k+mu) + (mu-y)/(k+mu)^2 159 # Individual observations can give negative values; clip to floor before chain rule. 160 neg_d2l_dk2 = ( 161 polygamma(1, k) 162 - polygamma(1, y + k) 163 - 1.0 / k 164 + 1.0 / (k + mu) 165 + (mu - y) / (k + mu) ** 2 166 ) 167 neg_d2l_dk2 = np.maximum(neg_d2l_dk2, 1e-8) 168 dk_dsigma = -1.0 / sigma**2 169 d2l_dsigma2 = neg_d2l_dk2 * dk_dsigma**2 170 dsigma_deta = link.inverse_deriv(link.link(sigma)) 171 return d2l_dsigma2 * dsigma_deta**2
Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.
Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).
Raises ValueError for unknown param_name.
173 def starting_values(self, y): 174 mu_val = max(np.mean(y), 0.01) 175 var_val = max(np.var(y), mu_val + 1e-6) 176 sigma_val = max((var_val - mu_val) / mu_val**2, 0.01) 177 return { 178 "mu": np.full(len(y), mu_val), 179 "sigma": np.full(len(y), sigma_val), 180 }
Initial parameter estimates from the marginal distribution of y.
Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.
183class ZIP(GamlssFamily): 184 """ 185 Zero-Inflated Poisson (ZIP) for count data with excess zeros. 186 187 Parameterisation: 188 mu > 0 : Poisson rate for the non-zero-inflated component (log link) 189 pi in (0,1) : probability of structural zero (logit link) 190 191 P(Y=0) = pi + (1-pi)*exp(-mu) 192 P(Y=y) = (1-pi) * Poisson(y; mu) for y > 0 193 194 Insurance use: portfolios where a proportion of risks have zero claims 195 due to non-exposure (parked vehicles, seasonal risks). pi can then be 196 modelled as a function of coverage period or risk activity covariates. 197 198 Implementation note: the zero component introduces separate score 199 equations for mu and pi, with different forms for y=0 and y>0. 200 """ 201 202 @property 203 def param_names(self): 204 return ["mu", "pi"] 205 206 @property 207 def default_links(self): 208 return {"mu": LogLink(), "pi": LogitLink()} 209 210 def log_likelihood(self, y, params): 211 mu = params["mu"] 212 pi = params["pi"] 213 pi = np.clip(pi, 1e-10, 1 - 1e-10) 214 215 ll_zero = np.log(pi + (1.0 - pi) * np.exp(-mu)) 216 ll_pos = np.log(1.0 - pi) + xlogy(y, mu) - mu - gammaln(y + 1.0) 217 218 return np.where(y == 0, ll_zero, ll_pos) 219 220 def dl_deta(self, y, params, param_name): 221 self._check_param(param_name) 222 mu = params["mu"] 223 pi = params["pi"] 224 pi = np.clip(pi, 1e-10, 1 - 1e-10) 225 226 p0 = pi + (1.0 - pi) * np.exp(-mu) 227 p0 = np.clip(p0, 1e-300, None) 228 229 if param_name == "mu": 230 link = self.default_links["mu"] 231 dl_dmu = np.where( 232 y == 0, 233 -(1.0 - pi) * np.exp(-mu) / p0, 234 y / mu - 1.0, 235 ) 236 dmu_deta = link.inverse_deriv(link.link(mu)) 237 return dl_dmu * dmu_deta 238 239 else: # pi 240 link = self.default_links["pi"] 241 dl_dpi = np.where( 242 y == 0, 243 (1.0 - np.exp(-mu)) / p0, 244 -1.0 / (1.0 - pi), 245 ) 246 dpi_deta = link.inverse_deriv(link.link(pi)) 247 return dl_dpi * dpi_deta 248 249 def d2l_deta2(self, y, params, param_name): 250 self._check_param(param_name) 251 mu = params["mu"] 252 pi = params["pi"] 253 pi = np.clip(pi, 1e-10, 1 - 1e-10) 254 255 p0 = pi + (1.0 - pi) * np.exp(-mu) 256 p0 = np.clip(p0, 1e-300, None) 257 exp_neg_mu = np.exp(-mu) 258 259 if param_name == "mu": 260 link = self.default_links["mu"] 261 d2_zero = -((1.0 - pi) * exp_neg_mu * p0 - ((1.0 - pi) * exp_neg_mu)**2) / p0**2 262 d2_pos = -y / mu**2 263 d2l_dmu2 = np.where(y == 0, -d2_zero, -d2_pos) 264 d2l_dmu2 = np.maximum(np.abs(d2l_dmu2), 1e-8) 265 dmu_deta = link.inverse_deriv(link.link(mu)) 266 return d2l_dmu2 * dmu_deta**2 267 268 else: # pi 269 link = self.default_links["pi"] 270 d2_zero = -((1.0 - exp_neg_mu) * p0 - (1.0 - exp_neg_mu)**2) / p0**2 271 d2_pos = -1.0 / (1.0 - pi)**2 272 d2l_dpi2 = np.where(y == 0, -d2_zero, -d2_pos) 273 d2l_dpi2 = np.maximum(np.abs(d2l_dpi2), 1e-8) 274 dpi_deta = link.inverse_deriv(link.link(pi)) 275 return d2l_dpi2 * dpi_deta**2 276 277 def starting_values(self, y): 278 prop_zero = np.mean(y == 0) 279 mu_init = max(np.mean(y[y > 0]) if np.any(y > 0) else 1.0, 0.01) 280 pi_init = max(prop_zero - np.exp(-mu_init) * (1 - prop_zero), 0.05) 281 pi_init = min(pi_init, 0.95) 282 return { 283 "mu": np.full(len(y), mu_init), 284 "pi": np.full(len(y), pi_init), 285 }
Zero-Inflated Poisson (ZIP) for count data with excess zeros.
Parameterisation:
mu > 0 : Poisson rate for the non-zero-inflated component (log link) pi in (0,1) : probability of structural zero (logit link)
P(Y=0) = pi + (1-pi)*exp(-mu) P(Y=y) = (1-pi) * Poisson(y; mu) for y > 0
Insurance use: portfolios where a proportion of risks have zero claims due to non-exposure (parked vehicles, seasonal risks). pi can then be modelled as a function of coverage period or risk activity covariates.
Implementation note: the zero component introduces separate score equations for mu and pi, with different forms for y=0 and y>0.
210 def log_likelihood(self, y, params): 211 mu = params["mu"] 212 pi = params["pi"] 213 pi = np.clip(pi, 1e-10, 1 - 1e-10) 214 215 ll_zero = np.log(pi + (1.0 - pi) * np.exp(-mu)) 216 ll_pos = np.log(1.0 - pi) + xlogy(y, mu) - mu - gammaln(y + 1.0) 217 218 return np.where(y == 0, ll_zero, ll_pos)
Per-observation log-likelihood.
Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.
220 def dl_deta(self, y, params, param_name): 221 self._check_param(param_name) 222 mu = params["mu"] 223 pi = params["pi"] 224 pi = np.clip(pi, 1e-10, 1 - 1e-10) 225 226 p0 = pi + (1.0 - pi) * np.exp(-mu) 227 p0 = np.clip(p0, 1e-300, None) 228 229 if param_name == "mu": 230 link = self.default_links["mu"] 231 dl_dmu = np.where( 232 y == 0, 233 -(1.0 - pi) * np.exp(-mu) / p0, 234 y / mu - 1.0, 235 ) 236 dmu_deta = link.inverse_deriv(link.link(mu)) 237 return dl_dmu * dmu_deta 238 239 else: # pi 240 link = self.default_links["pi"] 241 dl_dpi = np.where( 242 y == 0, 243 (1.0 - np.exp(-mu)) / p0, 244 -1.0 / (1.0 - pi), 245 ) 246 dpi_deta = link.inverse_deriv(link.link(pi)) 247 return dl_dpi * dpi_deta
d(log L_i)/d(eta_k) for parameter param_name.
This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.
By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)
Raises ValueError for unknown param_name.
249 def d2l_deta2(self, y, params, param_name): 250 self._check_param(param_name) 251 mu = params["mu"] 252 pi = params["pi"] 253 pi = np.clip(pi, 1e-10, 1 - 1e-10) 254 255 p0 = pi + (1.0 - pi) * np.exp(-mu) 256 p0 = np.clip(p0, 1e-300, None) 257 exp_neg_mu = np.exp(-mu) 258 259 if param_name == "mu": 260 link = self.default_links["mu"] 261 d2_zero = -((1.0 - pi) * exp_neg_mu * p0 - ((1.0 - pi) * exp_neg_mu)**2) / p0**2 262 d2_pos = -y / mu**2 263 d2l_dmu2 = np.where(y == 0, -d2_zero, -d2_pos) 264 d2l_dmu2 = np.maximum(np.abs(d2l_dmu2), 1e-8) 265 dmu_deta = link.inverse_deriv(link.link(mu)) 266 return d2l_dmu2 * dmu_deta**2 267 268 else: # pi 269 link = self.default_links["pi"] 270 d2_zero = -((1.0 - exp_neg_mu) * p0 - (1.0 - exp_neg_mu)**2) / p0**2 271 d2_pos = -1.0 / (1.0 - pi)**2 272 d2l_dpi2 = np.where(y == 0, -d2_zero, -d2_pos) 273 d2l_dpi2 = np.maximum(np.abs(d2l_dpi2), 1e-8) 274 dpi_deta = link.inverse_deriv(link.link(pi)) 275 return d2l_dpi2 * dpi_deta**2
Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.
Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).
Raises ValueError for unknown param_name.
277 def starting_values(self, y): 278 prop_zero = np.mean(y == 0) 279 mu_init = max(np.mean(y[y > 0]) if np.any(y > 0) else 1.0, 0.01) 280 pi_init = max(prop_zero - np.exp(-mu_init) * (1 - prop_zero), 0.05) 281 pi_init = min(pi_init, 0.95) 282 return { 283 "mu": np.full(len(y), mu_init), 284 "pi": np.full(len(y), pi_init), 285 }
Initial parameter estimates from the marginal distribution of y.
Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.