Edit on GitHub

insurance_distributional_glm.families

Distribution families for GAMLSS.

 1"""Distribution families for GAMLSS."""
 2
 3from .base import (
 4    GamlssFamily,
 5    Link,
 6    LogLink,
 7    LogitLink,
 8    IdentityLink,
 9    SqrtLink,
10    log,
11    logit,
12    identity,
13    sqrt,
14)
15from .continuous import Gamma, LogNormal, InverseGaussian, Tweedie
16from .count import Poisson, NBI, ZIP
17
18__all__ = [
19    # Base
20    "GamlssFamily",
21    "Link",
22    "LogLink",
23    "LogitLink",
24    "IdentityLink",
25    "SqrtLink",
26    "log",
27    "logit",
28    "identity",
29    "sqrt",
30    # Continuous
31    "Gamma",
32    "LogNormal",
33    "InverseGaussian",
34    "Tweedie",
35    # Count
36    "Poisson",
37    "NBI",
38    "ZIP",
39]
class GamlssFamily(abc.ABC):
146class GamlssFamily(ABC):
147    """
148    Abstract base for GAMLSS distribution families.
149
150    Each subclass implements a specific distribution and its relationship
151    to the RS fitting algorithm. The key quantities are:
152      - log_likelihood: total log-likelihood (sum over obs)
153      - dl_deta: d(log L)/d(eta_k) — the score for the IRLS working response
154      - d2l_deta2: E[d^2(log L)/d(eta_k)^2] — negative for IRLS weights
155
156    These drive the weighted least squares step in the RS algorithm.
157    """
158
159    @property
160    @abstractmethod
161    def param_names(self) -> List[str]:
162        """Ordered list of distribution parameter names, e.g. ['mu', 'sigma']."""
163
164    @property
165    @abstractmethod
166    def default_links(self) -> Dict[str, Link]:
167        """Default link function for each parameter."""
168
169    @abstractmethod
170    def log_likelihood(self, y: np.ndarray, params: Dict[str, np.ndarray]) -> np.ndarray:
171        """
172        Per-observation log-likelihood.
173
174        Returns array of shape (n,), one value per observation.
175        The fitting engine sums these to get total log-likelihood.
176        """
177
178    @abstractmethod
179    def dl_deta(
180        self,
181        y: np.ndarray,
182        params: Dict[str, np.ndarray],
183        param_name: str,
184    ) -> np.ndarray:
185        """
186        d(log L_i)/d(eta_k) for parameter param_name.
187
188        This is the derivative of log-likelihood with respect to the linear
189        predictor eta_k (not theta_k). Used to construct the working response
190        in the IRLS step.
191
192        By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)
193
194        Raises ValueError for unknown param_name.
195        """
196
197    @abstractmethod
198    def d2l_deta2(
199        self,
200        y: np.ndarray,
201        params: Dict[str, np.ndarray],
202        param_name: str,
203    ) -> np.ndarray:
204        """
205        Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.
206
207        Returns positive values (this is the negative expected second derivative).
208        Used as IRLS weights. If analytic expectation is hard, use the observed
209        Fisher information (negative of actual second derivative).
210
211        Raises ValueError for unknown param_name.
212        """
213
214    @abstractmethod
215    def starting_values(self, y: np.ndarray) -> Dict[str, np.ndarray]:
216        """
217        Initial parameter estimates from the marginal distribution of y.
218
219        Called once before the RS loop begins. Should be robust — moment
220        estimates are fine here; precision doesn't matter much.
221        """
222
223    def _check_param(self, param_name: str) -> None:
224        """Raise ValueError if param_name is not a known parameter."""
225        if param_name not in self.param_names:
226            raise ValueError(
227                f"Unknown parameter '{param_name}' for {self.__class__.__name__}. "
228                f"Known parameters: {self.param_names}"
229            )
230
231    def validate_params(self, params: Dict[str, np.ndarray]) -> None:
232        """Optional validation hook. Raise ValueError if params are out of range."""
233        pass
234
235    def __repr__(self) -> str:
236        return f"{self.__class__.__name__}(links={{{', '.join(f'{k}: {v.name}' for k, v in self.default_links.items())}}})"

Abstract base for GAMLSS distribution families.

Each subclass implements a specific distribution and its relationship to the RS fitting algorithm. The key quantities are:

  • log_likelihood: total log-likelihood (sum over obs)
  • dl_deta: d(log L)/d(eta_k) — the score for the IRLS working response
  • d2l_deta2: E[d^2(log L)/d(eta_k)^2] — negative for IRLS weights

These drive the weighted least squares step in the RS algorithm.

param_names: List[str]
159    @property
160    @abstractmethod
161    def param_names(self) -> List[str]:
162        """Ordered list of distribution parameter names, e.g. ['mu', 'sigma']."""

Ordered list of distribution parameter names, e.g. ['mu', 'sigma'].

@abstractmethod
def log_likelihood( self, y: numpy.ndarray, params: Dict[str, numpy.ndarray]) -> numpy.ndarray:
169    @abstractmethod
170    def log_likelihood(self, y: np.ndarray, params: Dict[str, np.ndarray]) -> np.ndarray:
171        """
172        Per-observation log-likelihood.
173
174        Returns array of shape (n,), one value per observation.
175        The fitting engine sums these to get total log-likelihood.
176        """

Per-observation log-likelihood.

Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.

@abstractmethod
def dl_deta( self, y: numpy.ndarray, params: Dict[str, numpy.ndarray], param_name: str) -> numpy.ndarray:
178    @abstractmethod
179    def dl_deta(
180        self,
181        y: np.ndarray,
182        params: Dict[str, np.ndarray],
183        param_name: str,
184    ) -> np.ndarray:
185        """
186        d(log L_i)/d(eta_k) for parameter param_name.
187
188        This is the derivative of log-likelihood with respect to the linear
189        predictor eta_k (not theta_k). Used to construct the working response
190        in the IRLS step.
191
192        By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)
193
194        Raises ValueError for unknown param_name.
195        """

d(log L_i)/d(eta_k) for parameter param_name.

This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.

By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)

Raises ValueError for unknown param_name.

@abstractmethod
def d2l_deta2( self, y: numpy.ndarray, params: Dict[str, numpy.ndarray], param_name: str) -> numpy.ndarray:
197    @abstractmethod
198    def d2l_deta2(
199        self,
200        y: np.ndarray,
201        params: Dict[str, np.ndarray],
202        param_name: str,
203    ) -> np.ndarray:
204        """
205        Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.
206
207        Returns positive values (this is the negative expected second derivative).
208        Used as IRLS weights. If analytic expectation is hard, use the observed
209        Fisher information (negative of actual second derivative).
210
211        Raises ValueError for unknown param_name.
212        """

Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.

Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).

Raises ValueError for unknown param_name.

@abstractmethod
def starting_values(self, y: numpy.ndarray) -> Dict[str, numpy.ndarray]:
214    @abstractmethod
215    def starting_values(self, y: np.ndarray) -> Dict[str, np.ndarray]:
216        """
217        Initial parameter estimates from the marginal distribution of y.
218
219        Called once before the RS loop begins. Should be robust — moment
220        estimates are fine here; precision doesn't matter much.
221        """

Initial parameter estimates from the marginal distribution of y.

Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.

def validate_params(self, params: Dict[str, numpy.ndarray]) -> None:
231    def validate_params(self, params: Dict[str, np.ndarray]) -> None:
232        """Optional validation hook. Raise ValueError if params are out of range."""
233        pass

Optional validation hook. Raise ValueError if params are out of range.

log = <LogLink object>
logit = <LogitLink object>
identity = <IdentityLink object>
sqrt = <SqrtLink object>
 28class Gamma(GamlssFamily):
 29    """
 30    Gamma distribution with mean mu and coefficient of variation sigma.
 31
 32    PDF: f(y) = (1/sigma^2)^(1/sigma^2) * y^(1/sigma^2 - 1) * exp(-y/(mu*sigma^2))
 33                / (Gamma(1/sigma^2) * (mu*sigma^2)^(1/sigma^2))
 34
 35    mu    > 0 : mean (log link)
 36    sigma > 0 : coefficient of variation, sd/mean (log link)
 37
 38    Insurance use: claim severity. sigma models heterogeneity of variance
 39    — more risky segments have higher CV.
 40    """
 41
 42    @property
 43    def param_names(self):
 44        return ["mu", "sigma"]
 45
 46    @property
 47    def default_links(self):
 48        return {"mu": LogLink(), "sigma": LogLink()}
 49
 50    def log_likelihood(self, y: np.ndarray, params: Dict[str, np.ndarray]) -> np.ndarray:
 51        mu = params["mu"]
 52        sigma = params["sigma"]
 53        nu = 1.0 / sigma**2  # shape parameter
 54        return (
 55            nu * np.log(nu)
 56            - gammaln(nu)
 57            + (nu - 1.0) * np.log(y)
 58            - nu * np.log(mu)
 59            - nu * y / mu
 60        )
 61
 62    def dl_deta(self, y, params, param_name):
 63        self._check_param(param_name)
 64        mu = params["mu"]
 65        sigma = params["sigma"]
 66        nu = 1.0 / sigma**2
 67
 68        if param_name == "mu":
 69            link = self.default_links["mu"]
 70            dl_dtheta = nu * (y - mu) / mu**2
 71            dtheta_deta = link.inverse_deriv(link.link(mu))
 72            return dl_dtheta * dtheta_deta
 73
 74        else:  # sigma
 75            from scipy.special import digamma
 76            link = self.default_links["sigma"]
 77            dl_dnu = np.log(nu) + 1.0 - digamma(nu) + np.log(y / mu) - y / mu
 78            dl_dsigma = dl_dnu * (-2.0 / sigma**3)
 79            dsigma_deta = link.inverse_deriv(link.link(sigma))
 80            return dl_dsigma * dsigma_deta
 81
 82    def d2l_deta2(self, y, params, param_name):
 83        self._check_param(param_name)
 84        mu = params["mu"]
 85        sigma = params["sigma"]
 86        nu = 1.0 / sigma**2
 87
 88        if param_name == "mu":
 89            link = self.default_links["mu"]
 90            d2l_dmu2 = nu / mu**2
 91            dtheta_deta = link.inverse_deriv(link.link(mu))
 92            return d2l_dmu2 * dtheta_deta**2
 93
 94        else:  # sigma
 95            from scipy.special import polygamma
 96            link = self.default_links["sigma"]
 97            d2l_dnu2 = polygamma(1, nu) - 1.0 / nu
 98            dnu_dsigma = -2.0 / sigma**3
 99            d2l_dsigma2 = d2l_dnu2 * dnu_dsigma**2
100            dsigma_deta = link.inverse_deriv(link.link(sigma))
101            return d2l_dsigma2 * dsigma_deta**2
102
103    def starting_values(self, y):
104        mu_init = np.full(len(y), np.mean(y))
105        sigma_init = np.full(len(y), np.std(y) / np.mean(y))
106        sigma_init = np.clip(sigma_init, 0.01, 5.0)
107        return {"mu": mu_init, "sigma": sigma_init}

Gamma distribution with mean mu and coefficient of variation sigma.

PDF: f(y) = (1/sigma^2)^(1/sigma^2) * y^(1/sigma^2 - 1) * exp(-y/(musigma^2)) / (Gamma(1/sigma^2) * (musigma^2)^(1/sigma^2))

mu > 0 : mean (log link) sigma > 0 : coefficient of variation, sd/mean (log link)

Insurance use: claim severity. sigma models heterogeneity of variance — more risky segments have higher CV.

param_names
42    @property
43    def param_names(self):
44        return ["mu", "sigma"]

Ordered list of distribution parameter names, e.g. ['mu', 'sigma'].

def log_likelihood( self, y: numpy.ndarray, params: Dict[str, numpy.ndarray]) -> numpy.ndarray:
50    def log_likelihood(self, y: np.ndarray, params: Dict[str, np.ndarray]) -> np.ndarray:
51        mu = params["mu"]
52        sigma = params["sigma"]
53        nu = 1.0 / sigma**2  # shape parameter
54        return (
55            nu * np.log(nu)
56            - gammaln(nu)
57            + (nu - 1.0) * np.log(y)
58            - nu * np.log(mu)
59            - nu * y / mu
60        )

Per-observation log-likelihood.

Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.

def dl_deta(self, y, params, param_name):
62    def dl_deta(self, y, params, param_name):
63        self._check_param(param_name)
64        mu = params["mu"]
65        sigma = params["sigma"]
66        nu = 1.0 / sigma**2
67
68        if param_name == "mu":
69            link = self.default_links["mu"]
70            dl_dtheta = nu * (y - mu) / mu**2
71            dtheta_deta = link.inverse_deriv(link.link(mu))
72            return dl_dtheta * dtheta_deta
73
74        else:  # sigma
75            from scipy.special import digamma
76            link = self.default_links["sigma"]
77            dl_dnu = np.log(nu) + 1.0 - digamma(nu) + np.log(y / mu) - y / mu
78            dl_dsigma = dl_dnu * (-2.0 / sigma**3)
79            dsigma_deta = link.inverse_deriv(link.link(sigma))
80            return dl_dsigma * dsigma_deta

d(log L_i)/d(eta_k) for parameter param_name.

This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.

By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)

Raises ValueError for unknown param_name.

def d2l_deta2(self, y, params, param_name):
 82    def d2l_deta2(self, y, params, param_name):
 83        self._check_param(param_name)
 84        mu = params["mu"]
 85        sigma = params["sigma"]
 86        nu = 1.0 / sigma**2
 87
 88        if param_name == "mu":
 89            link = self.default_links["mu"]
 90            d2l_dmu2 = nu / mu**2
 91            dtheta_deta = link.inverse_deriv(link.link(mu))
 92            return d2l_dmu2 * dtheta_deta**2
 93
 94        else:  # sigma
 95            from scipy.special import polygamma
 96            link = self.default_links["sigma"]
 97            d2l_dnu2 = polygamma(1, nu) - 1.0 / nu
 98            dnu_dsigma = -2.0 / sigma**3
 99            d2l_dsigma2 = d2l_dnu2 * dnu_dsigma**2
100            dsigma_deta = link.inverse_deriv(link.link(sigma))
101            return d2l_dsigma2 * dsigma_deta**2

Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.

Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).

Raises ValueError for unknown param_name.

def starting_values(self, y):
103    def starting_values(self, y):
104        mu_init = np.full(len(y), np.mean(y))
105        sigma_init = np.full(len(y), np.std(y) / np.mean(y))
106        sigma_init = np.clip(sigma_init, 0.01, 5.0)
107        return {"mu": mu_init, "sigma": sigma_init}

Initial parameter estimates from the marginal distribution of y.

Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.

110class LogNormal(GamlssFamily):
111    """
112    Log-Normal distribution with log-scale mean mu and log-scale SD sigma.
113
114    If Y ~ LogNormal(mu, sigma), then log(Y) ~ Normal(mu, sigma).
115    Note: mu here is the mean of log(Y), not E[Y].
116    E[Y] = exp(mu + sigma^2/2).
117
118    mu    : real (identity link — this is already on log scale)
119    sigma > 0 : log-scale standard deviation (log link)
120
121    Insurance use: severity when log(claims) is approximately normal.
122    Heavier right tail than Gamma for the same mean.
123    """
124
125    @property
126    def param_names(self):
127        return ["mu", "sigma"]
128
129    @property
130    def default_links(self):
131        return {"mu": IdentityLink(), "sigma": LogLink()}
132
133    def log_likelihood(self, y, params):
134        mu = params["mu"]
135        sigma = params["sigma"]
136        log_y = np.log(np.clip(y, 1e-300, None))
137        return (
138            -np.log(sigma)
139            - 0.5 * np.log(2.0 * np.pi)
140            - log_y
141            - 0.5 * ((log_y - mu) / sigma) ** 2
142        )
143
144    def dl_deta(self, y, params, param_name):
145        self._check_param(param_name)
146        mu = params["mu"]
147        sigma = params["sigma"]
148        log_y = np.log(np.clip(y, 1e-300, None))
149        resid = (log_y - mu) / sigma
150
151        if param_name == "mu":
152            link = self.default_links["mu"]
153            dl_dmu = resid / sigma
154            dmu_deta = link.inverse_deriv(link.link(mu))
155            return dl_dmu * dmu_deta
156
157        else:  # sigma
158            link = self.default_links["sigma"]
159            dl_dsigma = (resid**2 - 1.0) / sigma
160            dsigma_deta = link.inverse_deriv(link.link(sigma))
161            return dl_dsigma * dsigma_deta
162
163    def d2l_deta2(self, y, params, param_name):
164        self._check_param(param_name)
165        mu = params["mu"]
166        sigma = params["sigma"]
167
168        if param_name == "mu":
169            link = self.default_links["mu"]
170            d2l_dmu2 = 1.0 / sigma**2
171            dmu_deta = link.inverse_deriv(link.link(mu))
172            return d2l_dmu2 * dmu_deta**2
173
174        else:  # sigma
175            link = self.default_links["sigma"]
176            d2l_dsigma2 = 2.0 / sigma**2
177            dsigma_deta = link.inverse_deriv(link.link(sigma))
178            return d2l_dsigma2 * dsigma_deta**2
179
180    def starting_values(self, y):
181        log_y = np.log(np.clip(y, 1e-300, None))
182        mu_init = np.full(len(y), np.mean(log_y))
183        sigma_init = np.full(len(y), max(np.std(log_y), 0.01))
184        return {"mu": mu_init, "sigma": sigma_init}

Log-Normal distribution with log-scale mean mu and log-scale SD sigma.

If Y ~ LogNormal(mu, sigma), then log(Y) ~ Normal(mu, sigma). Note: mu here is the mean of log(Y), not E[Y]. E[Y] = exp(mu + sigma^2/2).

mu : real (identity link — this is already on log scale) sigma > 0 : log-scale standard deviation (log link)

Insurance use: severity when log(claims) is approximately normal. Heavier right tail than Gamma for the same mean.

param_names
125    @property
126    def param_names(self):
127        return ["mu", "sigma"]

Ordered list of distribution parameter names, e.g. ['mu', 'sigma'].

def log_likelihood(self, y, params):
133    def log_likelihood(self, y, params):
134        mu = params["mu"]
135        sigma = params["sigma"]
136        log_y = np.log(np.clip(y, 1e-300, None))
137        return (
138            -np.log(sigma)
139            - 0.5 * np.log(2.0 * np.pi)
140            - log_y
141            - 0.5 * ((log_y - mu) / sigma) ** 2
142        )

Per-observation log-likelihood.

Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.

def dl_deta(self, y, params, param_name):
144    def dl_deta(self, y, params, param_name):
145        self._check_param(param_name)
146        mu = params["mu"]
147        sigma = params["sigma"]
148        log_y = np.log(np.clip(y, 1e-300, None))
149        resid = (log_y - mu) / sigma
150
151        if param_name == "mu":
152            link = self.default_links["mu"]
153            dl_dmu = resid / sigma
154            dmu_deta = link.inverse_deriv(link.link(mu))
155            return dl_dmu * dmu_deta
156
157        else:  # sigma
158            link = self.default_links["sigma"]
159            dl_dsigma = (resid**2 - 1.0) / sigma
160            dsigma_deta = link.inverse_deriv(link.link(sigma))
161            return dl_dsigma * dsigma_deta

d(log L_i)/d(eta_k) for parameter param_name.

This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.

By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)

Raises ValueError for unknown param_name.

def d2l_deta2(self, y, params, param_name):
163    def d2l_deta2(self, y, params, param_name):
164        self._check_param(param_name)
165        mu = params["mu"]
166        sigma = params["sigma"]
167
168        if param_name == "mu":
169            link = self.default_links["mu"]
170            d2l_dmu2 = 1.0 / sigma**2
171            dmu_deta = link.inverse_deriv(link.link(mu))
172            return d2l_dmu2 * dmu_deta**2
173
174        else:  # sigma
175            link = self.default_links["sigma"]
176            d2l_dsigma2 = 2.0 / sigma**2
177            dsigma_deta = link.inverse_deriv(link.link(sigma))
178            return d2l_dsigma2 * dsigma_deta**2

Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.

Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).

Raises ValueError for unknown param_name.

def starting_values(self, y):
180    def starting_values(self, y):
181        log_y = np.log(np.clip(y, 1e-300, None))
182        mu_init = np.full(len(y), np.mean(log_y))
183        sigma_init = np.full(len(y), max(np.std(log_y), 0.01))
184        return {"mu": mu_init, "sigma": sigma_init}

Initial parameter estimates from the marginal distribution of y.

Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.

class InverseGaussian(insurance_distributional_glm.families.GamlssFamily):
187class InverseGaussian(GamlssFamily):
188    """
189    Inverse Gaussian distribution with mean mu and dispersion sigma.
190
191    Standard parameterisation: if lambda = 1/sigma^2 is the shape (precision)
192    parameter, then:
193      PDF: f(y) = sqrt(lambda/(2*pi*y^3)) * exp(-lambda*(y-mu)^2/(2*mu^2*y))
194      Var[Y] = mu^3 / lambda = sigma^2 * mu^3
195
196    mu    > 0 : mean (log link)
197    sigma > 0 : dispersion, Var = sigma^2 * mu^3 (log link)
198
199    Log-likelihood:
200      ll = -log(sigma) - 0.5*log(2*pi) - 1.5*log(y) - (y-mu)^2/(2*sigma^2*mu^2*y)
201
202    Insurance use: liability severity with heavy right tails. The variance
203    grows faster with mu than Gamma (mu^3 vs mu^2), suitable for large losses.
204    """
205
206    @property
207    def param_names(self):
208        return ["mu", "sigma"]
209
210    @property
211    def default_links(self):
212        return {"mu": LogLink(), "sigma": LogLink()}
213
214    def log_likelihood(self, y, params):
215        mu = params["mu"]
216        sigma = params["sigma"]
217        y_safe = np.clip(y, 1e-300, None)
218        return (
219            -np.log(sigma)
220            - 0.5 * np.log(2.0 * np.pi)
221            - 1.5 * np.log(y_safe)
222            - (y - mu) ** 2 / (2.0 * sigma**2 * mu**2 * y_safe)
223        )
224
225    def dl_deta(self, y, params, param_name):
226        self._check_param(param_name)
227        mu = params["mu"]
228        sigma = params["sigma"]
229        y_safe = np.clip(y, 1e-300, None)
230
231        if param_name == "mu":
232            link = self.default_links["mu"]
233            # d(ll)/d(mu):
234            # ll contains -(y-mu)^2/(2*sigma^2*mu^2*y)
235            # Expand: -(y^2/mu^2 - 2y/mu + 1)/(2*sigma^2*y)
236            # d/dmu = (2y^2/mu^3 - 2y/mu^2)/(2*sigma^2*y) = (y-mu)/(sigma^2*mu^3)
237            dl_dmu = (y - mu) / (sigma**2 * mu**3)
238            dmu_deta = link.inverse_deriv(link.link(mu))
239            return dl_dmu * dmu_deta
240
241        else:  # sigma
242            link = self.default_links["sigma"]
243            # d(ll)/d(sigma) = -1/sigma + (y-mu)^2/(sigma^3*mu^2*y)
244            dl_dsigma = -1.0 / sigma + (y - mu)**2 / (sigma**3 * mu**2 * y_safe)
245            dsigma_deta = link.inverse_deriv(link.link(sigma))
246            return dl_dsigma * dsigma_deta
247
248    def d2l_deta2(self, y, params, param_name):
249        self._check_param(param_name)
250        mu = params["mu"]
251        sigma = params["sigma"]
252
253        if param_name == "mu":
254            link = self.default_links["mu"]
255            # E[-d^2(ll)/dmu^2] = 1/(sigma^2*mu^3)
256            d2l_dmu2 = 1.0 / (sigma**2 * mu**3)
257            dmu_deta = link.inverse_deriv(link.link(mu))
258            return d2l_dmu2 * dmu_deta**2
259
260        else:  # sigma
261            link = self.default_links["sigma"]
262            # E[-d^2 ll/d sigma^2] = 2/sigma^2
263            d2l_dsigma2 = 2.0 / sigma**2
264            dsigma_deta = link.inverse_deriv(link.link(sigma))
265            return d2l_dsigma2 * dsigma_deta**2
266
267    def starting_values(self, y):
268        mu_init = np.full(len(y), np.mean(y))
269        sigma_sq = max(np.var(y) / np.mean(y)**3, 1e-6)
270        sigma_init = np.full(len(y), np.sqrt(sigma_sq))
271        return {"mu": mu_init, "sigma": sigma_init}

Inverse Gaussian distribution with mean mu and dispersion sigma.

Standard parameterisation: if lambda = 1/sigma^2 is the shape (precision) parameter, then: PDF: f(y) = sqrt(lambda/(2piy^3)) * exp(-lambda*(y-mu)^2/(2mu^2y)) Var[Y] = mu^3 / lambda = sigma^2 * mu^3

mu > 0 : mean (log link) sigma > 0 : dispersion, Var = sigma^2 * mu^3 (log link)

Log-likelihood: ll = -log(sigma) - 0.5log(2pi) - 1.5log(y) - (y-mu)^2/(2sigma^2mu^2y)

Insurance use: liability severity with heavy right tails. The variance grows faster with mu than Gamma (mu^3 vs mu^2), suitable for large losses.

param_names
206    @property
207    def param_names(self):
208        return ["mu", "sigma"]

Ordered list of distribution parameter names, e.g. ['mu', 'sigma'].

def log_likelihood(self, y, params):
214    def log_likelihood(self, y, params):
215        mu = params["mu"]
216        sigma = params["sigma"]
217        y_safe = np.clip(y, 1e-300, None)
218        return (
219            -np.log(sigma)
220            - 0.5 * np.log(2.0 * np.pi)
221            - 1.5 * np.log(y_safe)
222            - (y - mu) ** 2 / (2.0 * sigma**2 * mu**2 * y_safe)
223        )

Per-observation log-likelihood.

Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.

def dl_deta(self, y, params, param_name):
225    def dl_deta(self, y, params, param_name):
226        self._check_param(param_name)
227        mu = params["mu"]
228        sigma = params["sigma"]
229        y_safe = np.clip(y, 1e-300, None)
230
231        if param_name == "mu":
232            link = self.default_links["mu"]
233            # d(ll)/d(mu):
234            # ll contains -(y-mu)^2/(2*sigma^2*mu^2*y)
235            # Expand: -(y^2/mu^2 - 2y/mu + 1)/(2*sigma^2*y)
236            # d/dmu = (2y^2/mu^3 - 2y/mu^2)/(2*sigma^2*y) = (y-mu)/(sigma^2*mu^3)
237            dl_dmu = (y - mu) / (sigma**2 * mu**3)
238            dmu_deta = link.inverse_deriv(link.link(mu))
239            return dl_dmu * dmu_deta
240
241        else:  # sigma
242            link = self.default_links["sigma"]
243            # d(ll)/d(sigma) = -1/sigma + (y-mu)^2/(sigma^3*mu^2*y)
244            dl_dsigma = -1.0 / sigma + (y - mu)**2 / (sigma**3 * mu**2 * y_safe)
245            dsigma_deta = link.inverse_deriv(link.link(sigma))
246            return dl_dsigma * dsigma_deta

d(log L_i)/d(eta_k) for parameter param_name.

This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.

By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)

Raises ValueError for unknown param_name.

def d2l_deta2(self, y, params, param_name):
248    def d2l_deta2(self, y, params, param_name):
249        self._check_param(param_name)
250        mu = params["mu"]
251        sigma = params["sigma"]
252
253        if param_name == "mu":
254            link = self.default_links["mu"]
255            # E[-d^2(ll)/dmu^2] = 1/(sigma^2*mu^3)
256            d2l_dmu2 = 1.0 / (sigma**2 * mu**3)
257            dmu_deta = link.inverse_deriv(link.link(mu))
258            return d2l_dmu2 * dmu_deta**2
259
260        else:  # sigma
261            link = self.default_links["sigma"]
262            # E[-d^2 ll/d sigma^2] = 2/sigma^2
263            d2l_dsigma2 = 2.0 / sigma**2
264            dsigma_deta = link.inverse_deriv(link.link(sigma))
265            return d2l_dsigma2 * dsigma_deta**2

Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.

Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).

Raises ValueError for unknown param_name.

def starting_values(self, y):
267    def starting_values(self, y):
268        mu_init = np.full(len(y), np.mean(y))
269        sigma_sq = max(np.var(y) / np.mean(y)**3, 1e-6)
270        sigma_init = np.full(len(y), np.sqrt(sigma_sq))
271        return {"mu": mu_init, "sigma": sigma_init}

Initial parameter estimates from the marginal distribution of y.

Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.

274class Tweedie(GamlssFamily):
275    """
276    Tweedie distribution: compound Poisson-Gamma, useful for pure premiums.
277
278    The Tweedie family with power parameter p in (1,2) gives a distribution
279    with a point mass at zero (claim frequency component) and a continuous
280    positive part (severity component). This makes it natural for:
281      - Pure premiums (expected cost per unit exposure)
282      - Combined frequency-severity models
283
284    mu  > 0 : mean (log link)
285    phi > 0 : dispersion, Var[Y] = phi * mu^p (log link)
286    p       : power parameter, fixed at init (default 1.5)
287
288    The log-likelihood uses the series approximation from Dunn & Smyth (2005).
289    For p=1: Poisson, p=2: Gamma, p in (1,2): compound Poisson-Gamma.
290
291    Note: p is NOT estimated by default — fix it or profile over a grid.
292    """
293
294    def __init__(self, power: float = 1.5):
295        if not (1.0 < power < 2.0):
296            raise ValueError(f"Tweedie power must be in (1,2), got {power}")
297        self.power = power
298
299    @property
300    def param_names(self):
301        return ["mu", "phi"]
302
303    @property
304    def default_links(self):
305        return {"mu": LogLink(), "phi": LogLink()}
306
307    def log_likelihood(self, y, params):
308        mu = params["mu"]
309        phi = params["phi"]
310        p = self.power
311
312        ll = np.where(
313            y == 0,
314            -mu**(2.0 - p) / ((2.0 - p) * phi),
315            (y * mu**(1.0 - p) / ((1.0 - p) * phi)
316             - mu**(2.0 - p) / ((2.0 - p) * phi)
317             + self._log_w(y, phi, p)),
318        )
319        return ll
320
321    def _log_w(self, y: np.ndarray, phi: np.ndarray, p: float) -> np.ndarray:
322        """
323        Log of the Tweedie series sum W(y, phi, p) for y > 0.
324
325        Uses the compound Poisson-Gamma series from Dunn & Smyth (2005).
326        Each term j corresponds to a Poisson count of j severity events.
327
328        For fixed y > 0 the density is:
329          f(y) * exp(ll_front_inverse) = (1/y) * sum_j V_j
330        where:
331          V_j = lam^j * y^{j*alpha - 1} / (j! * scale^{j*alpha} * Gamma(j*alpha))
332          lam   = mu^{2-p} / (phi * (2-p))   [Poisson mean]
333          scale = phi * (p-1) * mu^{p-1}      [Gamma scale per event]
334          alpha = (2-p) / (p-1)               [> 0 for p in (1,2)]
335
336        The log-offset terms for lam and scale simplify to:
337          j*log(lam) - j*alpha*log(scale) = -j*log(phi*(2-p)) - j*alpha*log(phi*(p-1))
338        (the mu terms cancel because alpha*(p-1) = 2-p exactly).
339
340        Per-term log formula:
341          log(W_j) = (j*alpha - 1)*log(y) - gammaln(j*alpha) - gammaln(j+1)
342                     - j*log(phi*(2-p)) - j*alpha*log(phi*(p-1))
343        """
344        j_max = 50
345        alpha = (2.0 - p) / (p - 1.0)  # positive for p in (1, 2)
346
347        y_flat = np.asarray(y, dtype=float).ravel()
348        phi_flat = np.asarray(phi, dtype=float).ravel()
349
350        log_phi_2mp = np.log(phi_flat * (2.0 - p))
351        log_phi_pm1 = np.log(phi_flat * (p - 1.0))
352        log_y = np.log(np.maximum(y_flat, 1e-300))
353
354        log_w = np.full(len(y_flat), -np.inf)
355
356        for j in range(1, j_max + 1):
357            log_term = (
358                (j * alpha - 1.0) * log_y
359                - gammaln(j * alpha)
360                - gammaln(j + 1)
361                - j * log_phi_2mp
362                - j * alpha * log_phi_pm1
363            )
364            m = np.maximum(log_w, log_term)
365            finite = np.isfinite(m)
366            log_w = np.where(
367                finite,
368                m + np.log(
369                    np.exp(np.where(finite, log_w - m, 0))
370                    + np.exp(np.where(finite, log_term - m, 0))
371                ),
372                np.where(np.isfinite(log_term), log_term, log_w),
373            )
374
375        return log_w.reshape(np.asarray(y).shape)
376
377    def dl_deta(self, y, params, param_name):
378        self._check_param(param_name)
379        mu = params["mu"]
380        phi = params["phi"]
381        p = self.power
382
383        if param_name == "mu":
384            link = self.default_links["mu"]
385            dl_dmu = (y * mu**(-p) - mu**(1.0 - p)) / phi
386            dmu_deta = link.inverse_deriv(link.link(mu))
387            return dl_dmu * dmu_deta
388
389        else:  # phi
390            link = self.default_links["phi"]
391            dl_dphi = np.where(
392                y == 0,
393                mu**(2.0 - p) / ((2.0 - p) * phi**2),
394                -y * mu**(1.0 - p) / ((1.0 - p) * phi**2)
395                + mu**(2.0 - p) / ((2.0 - p) * phi**2),
396            )
397            dphi_deta = link.inverse_deriv(link.link(phi))
398            return dl_dphi * dphi_deta
399
400    def d2l_deta2(self, y, params, param_name):
401        self._check_param(param_name)
402        mu = params["mu"]
403        phi = params["phi"]
404        p = self.power
405
406        if param_name == "mu":
407            link = self.default_links["mu"]
408            # E[-d^2 ll/d mu^2] = mu^{-p} / phi
409            # Derived: d^2ll/dmu^2 = -p*y*mu^{-p-1}/phi + (p-1)*mu^{-p}/phi
410            # Taking expectation with E[y]=mu: E[-d^2ll/dmu^2] = mu^{-p}/phi
411            d2l_dmu2 = mu ** (-p) / phi
412            dmu_deta = link.inverse_deriv(link.link(mu))
413            return d2l_dmu2 * dmu_deta**2
414
415        else:  # phi
416            link = self.default_links["phi"]
417            # E[-d^2 ll/d phi^2] = 2 * mu^{2-p} / (phi^3 * (p-1) * (2-p))
418            # Derived: d^2ll/dphi^2 = 2*y*mu^{1-p}/((1-p)*phi^3) - 2*mu^{2-p}/((2-p)*phi^3)
419            # Taking expectation with E[y]=mu:
420            #   E[-d^2ll/dphi^2] = -2*mu^{2-p}/((1-p)*phi^3) + 2*mu^{2-p}/((2-p)*phi^3)
421            #                    = 2*mu^{2-p}/phi^3 * [1/(p-1) + 1/(2-p)]
422            #                    = 2*mu^{2-p}/(phi^3 * (p-1) * (2-p))
423            d2l_dphi2 = 2.0 * mu ** (2.0 - p) / (phi**3 * (p - 1.0) * (2.0 - p))
424            dphi_deta = link.inverse_deriv(link.link(phi))
425            return d2l_dphi2 * dphi_deta**2
426
427    def starting_values(self, y):
428        mu_init = np.full(len(y), np.mean(y[y > 0]) if np.any(y > 0) else 1.0)
429        phi_init = np.full(len(y), 1.0)
430        return {"mu": mu_init, "phi": phi_init}

Tweedie distribution: compound Poisson-Gamma, useful for pure premiums.

The Tweedie family with power parameter p in (1,2) gives a distribution with a point mass at zero (claim frequency component) and a continuous positive part (severity component). This makes it natural for:

  • Pure premiums (expected cost per unit exposure)
  • Combined frequency-severity models

mu > 0 : mean (log link) phi > 0 : dispersion, Var[Y] = phi * mu^p (log link) p : power parameter, fixed at init (default 1.5)

The log-likelihood uses the series approximation from Dunn & Smyth (2005). For p=1: Poisson, p=2: Gamma, p in (1,2): compound Poisson-Gamma.

Note: p is NOT estimated by default — fix it or profile over a grid.

Tweedie(power: float = 1.5)
294    def __init__(self, power: float = 1.5):
295        if not (1.0 < power < 2.0):
296            raise ValueError(f"Tweedie power must be in (1,2), got {power}")
297        self.power = power
power
param_names
299    @property
300    def param_names(self):
301        return ["mu", "phi"]

Ordered list of distribution parameter names, e.g. ['mu', 'sigma'].

def log_likelihood(self, y, params):
307    def log_likelihood(self, y, params):
308        mu = params["mu"]
309        phi = params["phi"]
310        p = self.power
311
312        ll = np.where(
313            y == 0,
314            -mu**(2.0 - p) / ((2.0 - p) * phi),
315            (y * mu**(1.0 - p) / ((1.0 - p) * phi)
316             - mu**(2.0 - p) / ((2.0 - p) * phi)
317             + self._log_w(y, phi, p)),
318        )
319        return ll

Per-observation log-likelihood.

Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.

def dl_deta(self, y, params, param_name):
377    def dl_deta(self, y, params, param_name):
378        self._check_param(param_name)
379        mu = params["mu"]
380        phi = params["phi"]
381        p = self.power
382
383        if param_name == "mu":
384            link = self.default_links["mu"]
385            dl_dmu = (y * mu**(-p) - mu**(1.0 - p)) / phi
386            dmu_deta = link.inverse_deriv(link.link(mu))
387            return dl_dmu * dmu_deta
388
389        else:  # phi
390            link = self.default_links["phi"]
391            dl_dphi = np.where(
392                y == 0,
393                mu**(2.0 - p) / ((2.0 - p) * phi**2),
394                -y * mu**(1.0 - p) / ((1.0 - p) * phi**2)
395                + mu**(2.0 - p) / ((2.0 - p) * phi**2),
396            )
397            dphi_deta = link.inverse_deriv(link.link(phi))
398            return dl_dphi * dphi_deta

d(log L_i)/d(eta_k) for parameter param_name.

This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.

By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)

Raises ValueError for unknown param_name.

def d2l_deta2(self, y, params, param_name):
400    def d2l_deta2(self, y, params, param_name):
401        self._check_param(param_name)
402        mu = params["mu"]
403        phi = params["phi"]
404        p = self.power
405
406        if param_name == "mu":
407            link = self.default_links["mu"]
408            # E[-d^2 ll/d mu^2] = mu^{-p} / phi
409            # Derived: d^2ll/dmu^2 = -p*y*mu^{-p-1}/phi + (p-1)*mu^{-p}/phi
410            # Taking expectation with E[y]=mu: E[-d^2ll/dmu^2] = mu^{-p}/phi
411            d2l_dmu2 = mu ** (-p) / phi
412            dmu_deta = link.inverse_deriv(link.link(mu))
413            return d2l_dmu2 * dmu_deta**2
414
415        else:  # phi
416            link = self.default_links["phi"]
417            # E[-d^2 ll/d phi^2] = 2 * mu^{2-p} / (phi^3 * (p-1) * (2-p))
418            # Derived: d^2ll/dphi^2 = 2*y*mu^{1-p}/((1-p)*phi^3) - 2*mu^{2-p}/((2-p)*phi^3)
419            # Taking expectation with E[y]=mu:
420            #   E[-d^2ll/dphi^2] = -2*mu^{2-p}/((1-p)*phi^3) + 2*mu^{2-p}/((2-p)*phi^3)
421            #                    = 2*mu^{2-p}/phi^3 * [1/(p-1) + 1/(2-p)]
422            #                    = 2*mu^{2-p}/(phi^3 * (p-1) * (2-p))
423            d2l_dphi2 = 2.0 * mu ** (2.0 - p) / (phi**3 * (p - 1.0) * (2.0 - p))
424            dphi_deta = link.inverse_deriv(link.link(phi))
425            return d2l_dphi2 * dphi_deta**2

Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.

Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).

Raises ValueError for unknown param_name.

def starting_values(self, y):
427    def starting_values(self, y):
428        mu_init = np.full(len(y), np.mean(y[y > 0]) if np.any(y > 0) else 1.0)
429        phi_init = np.full(len(y), 1.0)
430        return {"mu": mu_init, "phi": phi_init}

Initial parameter estimates from the marginal distribution of y.

Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.

29class Poisson(GamlssFamily):
30    """
31    Poisson distribution for claim frequency.
32
33    mu > 0 : rate parameter (log link)
34
35    One-parameter family — the RS algorithm only updates mu.
36    Use NBI if your data shows overdispersion (most insurance data does).
37    """
38
39    @property
40    def param_names(self):
41        return ["mu"]
42
43    @property
44    def default_links(self):
45        return {"mu": LogLink()}
46
47    def log_likelihood(self, y, params):
48        mu = params["mu"]
49        return xlogy(y, mu) - mu - gammaln(y + 1.0)
50
51    def dl_deta(self, y, params, param_name):
52        self._check_param(param_name)
53        mu = params["mu"]
54        link = self.default_links["mu"]
55        # d(ll)/d(mu) = (y - mu) / mu; with log link, d(mu)/d(eta) = mu
56        # so dl/d(eta) = (y - mu) / mu * mu = y - mu
57        dl_dmu = (y - mu) / mu
58        dmu_deta = link.inverse_deriv(link.link(mu))
59        return dl_dmu * dmu_deta
60
61    def d2l_deta2(self, y, params, param_name):
62        self._check_param(param_name)
63        mu = params["mu"]
64        link = self.default_links["mu"]
65        # E[-d^2 ll/d mu^2] = 1/mu; with log link dmu/deta=mu: E[-d^2/deta^2] = mu^2/mu = mu
66        d2l_dmu2 = 1.0 / mu
67        dmu_deta = link.inverse_deriv(link.link(mu))
68        return d2l_dmu2 * dmu_deta**2
69
70    def starting_values(self, y):
71        mu_init = np.full(len(y), max(np.mean(y), 0.01))
72        return {"mu": mu_init}

Poisson distribution for claim frequency.

mu > 0 : rate parameter (log link)

One-parameter family — the RS algorithm only updates mu. Use NBI if your data shows overdispersion (most insurance data does).

param_names
39    @property
40    def param_names(self):
41        return ["mu"]

Ordered list of distribution parameter names, e.g. ['mu', 'sigma'].

def log_likelihood(self, y, params):
47    def log_likelihood(self, y, params):
48        mu = params["mu"]
49        return xlogy(y, mu) - mu - gammaln(y + 1.0)

Per-observation log-likelihood.

Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.

def dl_deta(self, y, params, param_name):
51    def dl_deta(self, y, params, param_name):
52        self._check_param(param_name)
53        mu = params["mu"]
54        link = self.default_links["mu"]
55        # d(ll)/d(mu) = (y - mu) / mu; with log link, d(mu)/d(eta) = mu
56        # so dl/d(eta) = (y - mu) / mu * mu = y - mu
57        dl_dmu = (y - mu) / mu
58        dmu_deta = link.inverse_deriv(link.link(mu))
59        return dl_dmu * dmu_deta

d(log L_i)/d(eta_k) for parameter param_name.

This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.

By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)

Raises ValueError for unknown param_name.

def d2l_deta2(self, y, params, param_name):
61    def d2l_deta2(self, y, params, param_name):
62        self._check_param(param_name)
63        mu = params["mu"]
64        link = self.default_links["mu"]
65        # E[-d^2 ll/d mu^2] = 1/mu; with log link dmu/deta=mu: E[-d^2/deta^2] = mu^2/mu = mu
66        d2l_dmu2 = 1.0 / mu
67        dmu_deta = link.inverse_deriv(link.link(mu))
68        return d2l_dmu2 * dmu_deta**2

Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.

Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).

Raises ValueError for unknown param_name.

def starting_values(self, y):
70    def starting_values(self, y):
71        mu_init = np.full(len(y), max(np.mean(y), 0.01))
72        return {"mu": mu_init}

Initial parameter estimates from the marginal distribution of y.

Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.

 75class NBI(GamlssFamily):
 76    """
 77    Negative Binomial Type I (NBI) for overdispersed claim counts.
 78
 79    Parameterisation: Var[Y] = mu + sigma * mu^2
 80      mu    > 0 : mean (log link)
 81      sigma > 0 : overdispersion parameter (log link)
 82
 83    PMF: P(Y=y) = Gamma(y + 1/sigma) / (y! * Gamma(1/sigma))
 84                  * (mu*sigma/(1+mu*sigma))^y * (1/(1+mu*sigma))^(1/sigma)
 85
 86    Insurance use: the standard choice when claim counts are overdispersed,
 87    which is almost always true in motor and property portfolios. sigma
 88    captures unobserved heterogeneity not captured by the covariates.
 89    """
 90
 91    @property
 92    def param_names(self):
 93        return ["mu", "sigma"]
 94
 95    @property
 96    def default_links(self):
 97        return {"mu": LogLink(), "sigma": LogLink()}
 98
 99    def log_likelihood(self, y, params):
100        mu = params["mu"]
101        sigma = params["sigma"]
102        k = 1.0 / sigma  # shape / reciprocal overdispersion
103
104        return (
105            gammaln(y + k)
106            - gammaln(k)
107            - gammaln(y + 1.0)
108            + k * np.log(k / (k + mu))
109            + xlogy(y, mu / (mu + k))
110        )
111
112    def dl_deta(self, y, params, param_name):
113        self._check_param(param_name)
114        mu = params["mu"]
115        sigma = params["sigma"]
116        k = 1.0 / sigma
117
118        if param_name == "mu":
119            link = self.default_links["mu"]
120            # d(ll)/d(mu) = y/mu - (y+k)/(mu+k)
121            dl_dmu = y / mu - (y + k) / (mu + k)
122            dmu_deta = link.inverse_deriv(link.link(mu))
123            return dl_dmu * dmu_deta
124
125        else:  # sigma
126            from scipy.special import digamma
127            link = self.default_links["sigma"]
128            dl_dk = (
129                digamma(y + k)
130                - digamma(k)
131                + np.log(k / (k + mu))
132                + 1.0
133                - (y + k) / (k + mu)
134            )
135            dk_dsigma = -1.0 / sigma**2
136            dl_dsigma = dl_dk * dk_dsigma
137            dsigma_deta = link.inverse_deriv(link.link(sigma))
138            return dl_dsigma * dsigma_deta
139
140    def d2l_deta2(self, y, params, param_name):
141        self._check_param(param_name)
142        mu = params["mu"]
143        sigma = params["sigma"]
144        k = 1.0 / sigma
145
146        if param_name == "mu":
147            link = self.default_links["mu"]
148            # E[-d^2 ll/d mu^2] = k/(mu*(mu+k))
149            d2l_dmu2 = k / (mu * (mu + k))
150            dmu_deta = link.inverse_deriv(link.link(mu))
151            return d2l_dmu2 * dmu_deta**2
152
153        else:  # sigma
154            from scipy.special import polygamma
155            link = self.default_links["sigma"]
156            # Observed Fisher information per observation: -d^2 ll / d k^2
157            # d^2 ll / d k^2 = polygamma(1, y+k) - polygamma(1, k) - 1/k + 1/(k+mu) + (y-mu)/(k+mu)^2
158            # so -d^2 ll / d k^2 = polygamma(1, k) - polygamma(1, y+k) - 1/k + 1/(k+mu) + (mu-y)/(k+mu)^2
159            # Individual observations can give negative values; clip to floor before chain rule.
160            neg_d2l_dk2 = (
161                polygamma(1, k)
162                - polygamma(1, y + k)
163                - 1.0 / k
164                + 1.0 / (k + mu)
165                + (mu - y) / (k + mu) ** 2
166            )
167            neg_d2l_dk2 = np.maximum(neg_d2l_dk2, 1e-8)
168            dk_dsigma = -1.0 / sigma**2
169            d2l_dsigma2 = neg_d2l_dk2 * dk_dsigma**2
170            dsigma_deta = link.inverse_deriv(link.link(sigma))
171            return d2l_dsigma2 * dsigma_deta**2
172
173    def starting_values(self, y):
174        mu_val = max(np.mean(y), 0.01)
175        var_val = max(np.var(y), mu_val + 1e-6)
176        sigma_val = max((var_val - mu_val) / mu_val**2, 0.01)
177        return {
178            "mu": np.full(len(y), mu_val),
179            "sigma": np.full(len(y), sigma_val),
180        }

Negative Binomial Type I (NBI) for overdispersed claim counts.

Parameterisation: Var[Y] = mu + sigma * mu^2 mu > 0 : mean (log link) sigma > 0 : overdispersion parameter (log link)

PMF: P(Y=y) = Gamma(y + 1/sigma) / (y! * Gamma(1/sigma)) * (musigma/(1+musigma))^y * (1/(1+mu*sigma))^(1/sigma)

Insurance use: the standard choice when claim counts are overdispersed, which is almost always true in motor and property portfolios. sigma captures unobserved heterogeneity not captured by the covariates.

param_names
91    @property
92    def param_names(self):
93        return ["mu", "sigma"]

Ordered list of distribution parameter names, e.g. ['mu', 'sigma'].

def log_likelihood(self, y, params):
 99    def log_likelihood(self, y, params):
100        mu = params["mu"]
101        sigma = params["sigma"]
102        k = 1.0 / sigma  # shape / reciprocal overdispersion
103
104        return (
105            gammaln(y + k)
106            - gammaln(k)
107            - gammaln(y + 1.0)
108            + k * np.log(k / (k + mu))
109            + xlogy(y, mu / (mu + k))
110        )

Per-observation log-likelihood.

Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.

def dl_deta(self, y, params, param_name):
112    def dl_deta(self, y, params, param_name):
113        self._check_param(param_name)
114        mu = params["mu"]
115        sigma = params["sigma"]
116        k = 1.0 / sigma
117
118        if param_name == "mu":
119            link = self.default_links["mu"]
120            # d(ll)/d(mu) = y/mu - (y+k)/(mu+k)
121            dl_dmu = y / mu - (y + k) / (mu + k)
122            dmu_deta = link.inverse_deriv(link.link(mu))
123            return dl_dmu * dmu_deta
124
125        else:  # sigma
126            from scipy.special import digamma
127            link = self.default_links["sigma"]
128            dl_dk = (
129                digamma(y + k)
130                - digamma(k)
131                + np.log(k / (k + mu))
132                + 1.0
133                - (y + k) / (k + mu)
134            )
135            dk_dsigma = -1.0 / sigma**2
136            dl_dsigma = dl_dk * dk_dsigma
137            dsigma_deta = link.inverse_deriv(link.link(sigma))
138            return dl_dsigma * dsigma_deta

d(log L_i)/d(eta_k) for parameter param_name.

This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.

By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)

Raises ValueError for unknown param_name.

def d2l_deta2(self, y, params, param_name):
140    def d2l_deta2(self, y, params, param_name):
141        self._check_param(param_name)
142        mu = params["mu"]
143        sigma = params["sigma"]
144        k = 1.0 / sigma
145
146        if param_name == "mu":
147            link = self.default_links["mu"]
148            # E[-d^2 ll/d mu^2] = k/(mu*(mu+k))
149            d2l_dmu2 = k / (mu * (mu + k))
150            dmu_deta = link.inverse_deriv(link.link(mu))
151            return d2l_dmu2 * dmu_deta**2
152
153        else:  # sigma
154            from scipy.special import polygamma
155            link = self.default_links["sigma"]
156            # Observed Fisher information per observation: -d^2 ll / d k^2
157            # d^2 ll / d k^2 = polygamma(1, y+k) - polygamma(1, k) - 1/k + 1/(k+mu) + (y-mu)/(k+mu)^2
158            # so -d^2 ll / d k^2 = polygamma(1, k) - polygamma(1, y+k) - 1/k + 1/(k+mu) + (mu-y)/(k+mu)^2
159            # Individual observations can give negative values; clip to floor before chain rule.
160            neg_d2l_dk2 = (
161                polygamma(1, k)
162                - polygamma(1, y + k)
163                - 1.0 / k
164                + 1.0 / (k + mu)
165                + (mu - y) / (k + mu) ** 2
166            )
167            neg_d2l_dk2 = np.maximum(neg_d2l_dk2, 1e-8)
168            dk_dsigma = -1.0 / sigma**2
169            d2l_dsigma2 = neg_d2l_dk2 * dk_dsigma**2
170            dsigma_deta = link.inverse_deriv(link.link(sigma))
171            return d2l_dsigma2 * dsigma_deta**2

Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.

Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).

Raises ValueError for unknown param_name.

def starting_values(self, y):
173    def starting_values(self, y):
174        mu_val = max(np.mean(y), 0.01)
175        var_val = max(np.var(y), mu_val + 1e-6)
176        sigma_val = max((var_val - mu_val) / mu_val**2, 0.01)
177        return {
178            "mu": np.full(len(y), mu_val),
179            "sigma": np.full(len(y), sigma_val),
180        }

Initial parameter estimates from the marginal distribution of y.

Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.

183class ZIP(GamlssFamily):
184    """
185    Zero-Inflated Poisson (ZIP) for count data with excess zeros.
186
187    Parameterisation:
188      mu  > 0       : Poisson rate for the non-zero-inflated component (log link)
189      pi  in (0,1)  : probability of structural zero (logit link)
190
191    P(Y=0) = pi + (1-pi)*exp(-mu)
192    P(Y=y) = (1-pi) * Poisson(y; mu)  for y > 0
193
194    Insurance use: portfolios where a proportion of risks have zero claims
195    due to non-exposure (parked vehicles, seasonal risks). pi can then be
196    modelled as a function of coverage period or risk activity covariates.
197
198    Implementation note: the zero component introduces separate score
199    equations for mu and pi, with different forms for y=0 and y>0.
200    """
201
202    @property
203    def param_names(self):
204        return ["mu", "pi"]
205
206    @property
207    def default_links(self):
208        return {"mu": LogLink(), "pi": LogitLink()}
209
210    def log_likelihood(self, y, params):
211        mu = params["mu"]
212        pi = params["pi"]
213        pi = np.clip(pi, 1e-10, 1 - 1e-10)
214
215        ll_zero = np.log(pi + (1.0 - pi) * np.exp(-mu))
216        ll_pos = np.log(1.0 - pi) + xlogy(y, mu) - mu - gammaln(y + 1.0)
217
218        return np.where(y == 0, ll_zero, ll_pos)
219
220    def dl_deta(self, y, params, param_name):
221        self._check_param(param_name)
222        mu = params["mu"]
223        pi = params["pi"]
224        pi = np.clip(pi, 1e-10, 1 - 1e-10)
225
226        p0 = pi + (1.0 - pi) * np.exp(-mu)
227        p0 = np.clip(p0, 1e-300, None)
228
229        if param_name == "mu":
230            link = self.default_links["mu"]
231            dl_dmu = np.where(
232                y == 0,
233                -(1.0 - pi) * np.exp(-mu) / p0,
234                y / mu - 1.0,
235            )
236            dmu_deta = link.inverse_deriv(link.link(mu))
237            return dl_dmu * dmu_deta
238
239        else:  # pi
240            link = self.default_links["pi"]
241            dl_dpi = np.where(
242                y == 0,
243                (1.0 - np.exp(-mu)) / p0,
244                -1.0 / (1.0 - pi),
245            )
246            dpi_deta = link.inverse_deriv(link.link(pi))
247            return dl_dpi * dpi_deta
248
249    def d2l_deta2(self, y, params, param_name):
250        self._check_param(param_name)
251        mu = params["mu"]
252        pi = params["pi"]
253        pi = np.clip(pi, 1e-10, 1 - 1e-10)
254
255        p0 = pi + (1.0 - pi) * np.exp(-mu)
256        p0 = np.clip(p0, 1e-300, None)
257        exp_neg_mu = np.exp(-mu)
258
259        if param_name == "mu":
260            link = self.default_links["mu"]
261            d2_zero = -((1.0 - pi) * exp_neg_mu * p0 - ((1.0 - pi) * exp_neg_mu)**2) / p0**2
262            d2_pos = -y / mu**2
263            d2l_dmu2 = np.where(y == 0, -d2_zero, -d2_pos)
264            d2l_dmu2 = np.maximum(np.abs(d2l_dmu2), 1e-8)
265            dmu_deta = link.inverse_deriv(link.link(mu))
266            return d2l_dmu2 * dmu_deta**2
267
268        else:  # pi
269            link = self.default_links["pi"]
270            d2_zero = -((1.0 - exp_neg_mu) * p0 - (1.0 - exp_neg_mu)**2) / p0**2
271            d2_pos = -1.0 / (1.0 - pi)**2
272            d2l_dpi2 = np.where(y == 0, -d2_zero, -d2_pos)
273            d2l_dpi2 = np.maximum(np.abs(d2l_dpi2), 1e-8)
274            dpi_deta = link.inverse_deriv(link.link(pi))
275            return d2l_dpi2 * dpi_deta**2
276
277    def starting_values(self, y):
278        prop_zero = np.mean(y == 0)
279        mu_init = max(np.mean(y[y > 0]) if np.any(y > 0) else 1.0, 0.01)
280        pi_init = max(prop_zero - np.exp(-mu_init) * (1 - prop_zero), 0.05)
281        pi_init = min(pi_init, 0.95)
282        return {
283            "mu": np.full(len(y), mu_init),
284            "pi": np.full(len(y), pi_init),
285        }

Zero-Inflated Poisson (ZIP) for count data with excess zeros.

Parameterisation:

mu > 0 : Poisson rate for the non-zero-inflated component (log link) pi in (0,1) : probability of structural zero (logit link)

P(Y=0) = pi + (1-pi)*exp(-mu) P(Y=y) = (1-pi) * Poisson(y; mu) for y > 0

Insurance use: portfolios where a proportion of risks have zero claims due to non-exposure (parked vehicles, seasonal risks). pi can then be modelled as a function of coverage period or risk activity covariates.

Implementation note: the zero component introduces separate score equations for mu and pi, with different forms for y=0 and y>0.

param_names
202    @property
203    def param_names(self):
204        return ["mu", "pi"]

Ordered list of distribution parameter names, e.g. ['mu', 'sigma'].

def log_likelihood(self, y, params):
210    def log_likelihood(self, y, params):
211        mu = params["mu"]
212        pi = params["pi"]
213        pi = np.clip(pi, 1e-10, 1 - 1e-10)
214
215        ll_zero = np.log(pi + (1.0 - pi) * np.exp(-mu))
216        ll_pos = np.log(1.0 - pi) + xlogy(y, mu) - mu - gammaln(y + 1.0)
217
218        return np.where(y == 0, ll_zero, ll_pos)

Per-observation log-likelihood.

Returns array of shape (n,), one value per observation. The fitting engine sums these to get total log-likelihood.

def dl_deta(self, y, params, param_name):
220    def dl_deta(self, y, params, param_name):
221        self._check_param(param_name)
222        mu = params["mu"]
223        pi = params["pi"]
224        pi = np.clip(pi, 1e-10, 1 - 1e-10)
225
226        p0 = pi + (1.0 - pi) * np.exp(-mu)
227        p0 = np.clip(p0, 1e-300, None)
228
229        if param_name == "mu":
230            link = self.default_links["mu"]
231            dl_dmu = np.where(
232                y == 0,
233                -(1.0 - pi) * np.exp(-mu) / p0,
234                y / mu - 1.0,
235            )
236            dmu_deta = link.inverse_deriv(link.link(mu))
237            return dl_dmu * dmu_deta
238
239        else:  # pi
240            link = self.default_links["pi"]
241            dl_dpi = np.where(
242                y == 0,
243                (1.0 - np.exp(-mu)) / p0,
244                -1.0 / (1.0 - pi),
245            )
246            dpi_deta = link.inverse_deriv(link.link(pi))
247            return dl_dpi * dpi_deta

d(log L_i)/d(eta_k) for parameter param_name.

This is the derivative of log-likelihood with respect to the linear predictor eta_k (not theta_k). Used to construct the working response in the IRLS step.

By chain rule: d(log L)/d(eta_k) = d(log L)/d(theta_k) * d(theta_k)/d(eta_k)

Raises ValueError for unknown param_name.

def d2l_deta2(self, y, params, param_name):
249    def d2l_deta2(self, y, params, param_name):
250        self._check_param(param_name)
251        mu = params["mu"]
252        pi = params["pi"]
253        pi = np.clip(pi, 1e-10, 1 - 1e-10)
254
255        p0 = pi + (1.0 - pi) * np.exp(-mu)
256        p0 = np.clip(p0, 1e-300, None)
257        exp_neg_mu = np.exp(-mu)
258
259        if param_name == "mu":
260            link = self.default_links["mu"]
261            d2_zero = -((1.0 - pi) * exp_neg_mu * p0 - ((1.0 - pi) * exp_neg_mu)**2) / p0**2
262            d2_pos = -y / mu**2
263            d2l_dmu2 = np.where(y == 0, -d2_zero, -d2_pos)
264            d2l_dmu2 = np.maximum(np.abs(d2l_dmu2), 1e-8)
265            dmu_deta = link.inverse_deriv(link.link(mu))
266            return d2l_dmu2 * dmu_deta**2
267
268        else:  # pi
269            link = self.default_links["pi"]
270            d2_zero = -((1.0 - exp_neg_mu) * p0 - (1.0 - exp_neg_mu)**2) / p0**2
271            d2_pos = -1.0 / (1.0 - pi)**2
272            d2l_dpi2 = np.where(y == 0, -d2_zero, -d2_pos)
273            d2l_dpi2 = np.maximum(np.abs(d2l_dpi2), 1e-8)
274            dpi_deta = link.inverse_deriv(link.link(pi))
275            return d2l_dpi2 * dpi_deta**2

Expected -d^2(log L_i)/d(eta_k)^2 for parameter param_name.

Returns positive values (this is the negative expected second derivative). Used as IRLS weights. If analytic expectation is hard, use the observed Fisher information (negative of actual second derivative).

Raises ValueError for unknown param_name.

def starting_values(self, y):
277    def starting_values(self, y):
278        prop_zero = np.mean(y == 0)
279        mu_init = max(np.mean(y[y > 0]) if np.any(y > 0) else 1.0, 0.01)
280        pi_init = max(prop_zero - np.exp(-mu_init) * (1 - prop_zero), 0.05)
281        pi_init = min(pi_init, 0.95)
282        return {
283            "mu": np.full(len(y), mu_init),
284            "pi": np.full(len(y), pi_init),
285        }

Initial parameter estimates from the marginal distribution of y.

Called once before the RS loop begins. Should be robust — moment estimates are fine here; precision doesn't matter much.