insurance_frequency_severity

insurance-frequency-severity: Sarmanov copula joint frequency-severity modelling, plus neural two-part dependent model.

Challenges the independence assumption in the standard two-model GLM framework. Implements Sarmanov copula for mixed discrete-continuous margins, Gaussian copula as a comparison, and Garrido's conditional method as the simplest baseline.

The neural two-part dependent model lives in the dependent subpackage and requires torch. Install with: pip install insurance-frequency-severity[neural]

For the neural two-part dependent model with shared encoder trunk:

>>> from insurance_frequency_severity.dependent import DependentFSModel

Typical usage (Sarmanov)

>>> from insurance_frequency_severity import JointFreqSev
>>> model = JointFreqSev(freq_glm=my_nb_glm, sev_glm=my_gamma_glm)
>>> model.fit(claims_df, n_col="claim_count", s_col="avg_severity")
>>> corrections = model.premium_correction(X_new)
 1"""
 2insurance-frequency-severity: Sarmanov copula joint frequency-severity modelling,
 3plus neural two-part dependent model.
 4
 5Challenges the independence assumption in the standard two-model GLM framework.
 6Implements Sarmanov copula for mixed discrete-continuous margins, Gaussian copula
 7as a comparison, and Garrido's conditional method as the simplest baseline.
 8
 9The neural two-part dependent model lives in the ``dependent`` subpackage and
10requires torch. Install with: pip install insurance-frequency-severity[neural]
11
12For the neural two-part dependent model with shared encoder trunk:
13
14>>> from insurance_frequency_severity.dependent import DependentFSModel
15
16Typical usage (Sarmanov)
17------------------------
18>>> from insurance_frequency_severity import JointFreqSev
19>>> model = JointFreqSev(freq_glm=my_nb_glm, sev_glm=my_gamma_glm)
20>>> model.fit(claims_df, n_col="claim_count", s_col="avg_severity")
21>>> corrections = model.premium_correction(X_new)
22"""
23
24from insurance_frequency_severity.copula import (
25    SarmanovCopula,
26    GaussianCopulaMixed,
27    FGMCopula,
28)
29from insurance_frequency_severity.joint import (
30    JointFreqSev,
31    ConditionalFreqSev,
32)
33from insurance_frequency_severity.diagnostics import (
34    DependenceTest,
35    CopulaGOF,
36    compare_copulas,
37)
38from insurance_frequency_severity.report import JointModelReport
39
40# ``dependent`` is loaded lazily so that importing this package does not
41# require torch.  Access it as insurance_frequency_severity.dependent or
42# import directly from the subpackage.
43def __getattr__(name: str):
44    if name == "dependent":
45        from insurance_frequency_severity import dependent
46        return dependent
47    raise AttributeError(f"module 'insurance_frequency_severity' has no attribute {name!r}")
48
49
50from importlib.metadata import version, PackageNotFoundError
51
52try:
53    __version__ = version("insurance-frequency-severity")
54except PackageNotFoundError:
55    __version__ = "0.0.0"  # not installed
56
57__all__ = [
58    "SarmanovCopula",
59    "GaussianCopulaMixed",
60    "FGMCopula",
61    "JointFreqSev",
62    "ConditionalFreqSev",
63    "DependenceTest",
64    "CopulaGOF",
65    "compare_copulas",
66    "JointModelReport",
67    "dependent",
68]
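The lazy ``__getattr__`` hook above is what keeps torch optional. A minimal sketch of what that means in practice (assuming the package is installed; only the last line imports the neural subpackage and therefore needs torch):

import insurance_frequency_severity as ifs

ifs.JointFreqSev   # core classes: no torch required
ifs.__version__    # resolved via importlib.metadata

# First attribute access triggers the lazy import; this is the point
# where torch becomes a hard requirement.
neural = ifs.dependent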
@dataclass
class SarmanovCopula:
@dataclass
class SarmanovCopula:
    """
    Sarmanov bivariate distribution for mixed discrete-continuous margins.

    Joint density/PMF:
        f(n, s) = f_N(n) * f_S(s) * [1 + omega * phi_1(n) * phi_2(s)]

    Parameters
    ----------
    freq_family : 'nb' | 'poisson'
        Marginal family for claim frequency.
    sev_family : 'gamma' | 'lognormal'
        Marginal family for claim severity.
    omega : float
        Dependence parameter. Must keep 1 + omega * phi_1(n) * phi_2(s) >= 0
        on the joint support (see ``omega_bounds``). Negative omega indicates
        that high-frequency claims tend to have lower severity (the typical
        finding in UK motor via NCD suppression).
    kernel_theta : float
        Laplace exponent for the frequency kernel phi_1.
    kernel_alpha : float
        Laplace exponent for the severity kernel phi_2.

    Notes
    -----
    omega=0 corresponds to independence: the joint density reduces to the
    product of the marginals.

    The parameter space for omega is bounded:
        -1 / (sup_phi1 * sup_phi2) <= omega <= 1 / (sup_phi1 * sup_phi2)

    In practice, for moderate mu_N and mu_S, the feasible omega range is
    roughly [-10, 10] or wider, so the bound rarely binds.
    """

    freq_family: Literal["nb", "poisson"] = "nb"
    sev_family: Literal["gamma", "lognormal"] = "gamma"
    omega: float = 0.0
    kernel_theta: float = 0.5
    kernel_alpha: float = 0.001

    def __post_init__(self):
        self._freq_kernel: Optional[LaplaceKernelNB | LaplaceKernelPoisson] = None
        self._sev_kernel: Optional[LaplaceKernelGamma | LaplaceKernelLognormal] = None
        self._build_kernels()

    def _build_kernels(self):
        if self.freq_family == "nb":
            self._freq_kernel = LaplaceKernelNB(theta=self.kernel_theta)
        else:
            self._freq_kernel = LaplaceKernelPoisson(theta=self.kernel_theta)

        if self.sev_family == "gamma":
            self._sev_kernel = LaplaceKernelGamma(alpha=self.kernel_alpha)
        else:
            self._sev_kernel = LaplaceKernelLognormal(alpha=self.kernel_alpha)

    def _phi1(self, n: np.ndarray, freq_params: dict) -> np.ndarray:
        """Centred frequency kernel phi_1(n)."""
        if self.freq_family == "nb":
            return self._freq_kernel.centred(n, freq_params["mu"], freq_params["alpha"])
        else:
            return self._freq_kernel.centred(n, freq_params["mu"])

    def _phi2(self, s: np.ndarray, sev_params: dict) -> np.ndarray:
        """Centred severity kernel phi_2(s)."""
        if self.sev_family == "gamma":
            return self._sev_kernel.centred(s, sev_params["mu"], sev_params["shape"])
        else:
            return self._sev_kernel.centred(s, sev_params["log_mu"], sev_params["log_sigma"])

    def _log_freq_pmf(self, n: np.ndarray, freq_params: dict) -> np.ndarray:
        """log P(N=n) under marginal frequency distribution."""
        mu = freq_params["mu"]
        if self.freq_family == "nb":
            alpha = freq_params["alpha"]
            r = 1.0 / alpha
            p = 1.0 / (1.0 + mu * alpha)
            n_arr = np.asarray(n, dtype=float)
            # NB PMF: Gamma(n+r)/(n! Gamma(r)) * p^r * (1-p)^n
            log_pmf = (
                gammaln(n_arr + r) - gammaln(r) - gammaln(n_arr + 1)
                + r * np.log(p)
                + n_arr * np.log1p(-p)
            )
        else:
            n_arr = np.asarray(n, dtype=float)
            log_pmf = n_arr * np.log(mu) - mu - gammaln(n_arr + 1)
        return log_pmf

    def _log_sev_pdf(self, s: np.ndarray, sev_params: dict) -> np.ndarray:
        """log f_S(s) under marginal severity distribution."""
        s_arr = np.asarray(s, dtype=float)
        if self.sev_family == "gamma":
            mu_s = sev_params["mu"]
            shape = sev_params["shape"]
            scale = mu_s / shape
            log_pdf = stats.gamma.logpdf(s_arr, a=shape, scale=scale)
        else:
            log_mu = sev_params["log_mu"]
            log_sigma = sev_params["log_sigma"]
            log_pdf = stats.lognorm.logpdf(s_arr, s=log_sigma, scale=np.exp(log_mu))
        return log_pdf

    def log_joint_density(
        self,
        n: np.ndarray,
        s: np.ndarray,
        freq_params: dict,
        sev_params: dict,
    ) -> np.ndarray:
        """
        log f(n, s) = log f_N(n) + log f_S(s) + log(1 + omega * phi_1(n) * phi_2(s))

        Parameters
        ----------
        n : array-like
            Claim counts (non-negative integers).
        s : array-like
            Average claim severities (positive reals). Only used for policies
            with n > 0.
        freq_params : dict
            Distribution parameters for frequency margin.
            For NB: {'mu': float, 'alpha': float}
            For Poisson: {'mu': float}
        sev_params : dict
            Distribution parameters for severity margin.
            For Gamma: {'mu': float, 'shape': float}
            For Lognormal: {'log_mu': float, 'log_sigma': float}

        Returns
        -------
        ndarray of log joint densities. Shape matches n/s.
        """
        n = np.asarray(n, dtype=float)
        s = np.asarray(s, dtype=float)

        log_f_n = self._log_freq_pmf(n, freq_params)
        log_f_s = self._log_sev_pdf(s, sev_params)
        phi1 = self._phi1(n, freq_params)
        phi2 = self._phi2(s, sev_params)

        interaction = 1.0 + self.omega * phi1 * phi2
        # Guard against numerical zero or negative values
        interaction = np.clip(interaction, 1e-300, None)

        return log_f_n + log_f_s + np.log(interaction)

    def log_likelihood(
        self,
        n: np.ndarray,
        s: np.ndarray,
        freq_params: dict | list,
        sev_params: dict | list,
    ) -> float:
        """
        Total log-likelihood over a sample.

        For observations with n=0, the severity does not exist; the contribution
        is just log f_N(0). Pass s=0 or any positive value for these rows;
        the function handles them correctly by treating the joint as a marginal
        frequency contribution only when n=0.

        Implementation note: when n=0, the joint is f(0, s) = f_N(0) * f_S(s) *
        [1 + omega*phi_1(0)*phi_2(s)]. But since s is unobserved for n=0, we
        must integrate over s, recovering f_N(0). This is correct automatically
        since E[phi_2(S)] = 0, so:

            integral f_S(s) * [1 + omega*phi_1(0)*phi_2(s)] ds
            = 1 + omega * phi_1(0) * E[phi_2(S)]
            = 1 + omega * phi_1(0) * 0  = 1

        So for n=0 rows: contribution = log f_N(0).
        For n>0 rows: contribution = log f_N(n) + log f_S(s) + log(1 + omega*phi1*phi2).
        """
        n = np.asarray(n, dtype=float)
        s = np.asarray(s, dtype=float)

        # Handle per-observation parameters (arrays of dicts) or single dict
        if isinstance(freq_params, dict):
            fp_mu = np.full_like(n, freq_params["mu"])
            if self.freq_family == "nb":
                fp_alpha = np.full_like(n, freq_params["alpha"])
        else:
            fp_mu = np.array([p["mu"] for p in freq_params])
            if self.freq_family == "nb":
                fp_alpha = np.array([p["alpha"] for p in freq_params])

        if isinstance(sev_params, dict):
            if self.sev_family == "gamma":
                sp_mu = np.full_like(s, sev_params["mu"])
                sp_shape = np.full_like(s, sev_params["shape"])
            else:
                sp_lmu = np.full_like(s, sev_params["log_mu"])
                sp_lsigma = np.full_like(s, sev_params["log_sigma"])
        else:
            if self.sev_family == "gamma":
                sp_mu = np.array([p["mu"] for p in sev_params])
                sp_shape = np.array([p["shape"] for p in sev_params])
            else:
                sp_lmu = np.array([p["log_mu"] for p in sev_params])
                sp_lsigma = np.array([p["log_sigma"] for p in sev_params])

        # Frequency log-PMF for all observations
        if self.freq_family == "nb":
            fp = {"mu": fp_mu, "alpha": fp_alpha}
        else:
            fp = {"mu": fp_mu}
        log_f_n = self._log_freq_pmf(n, fp)

        # For n=0 rows: contribution is just log f_N(0)
        mask_pos = n > 0
        ll = np.where(mask_pos, 0.0, log_f_n)

        if mask_pos.any():
            n_pos = n[mask_pos]
            s_pos = s[mask_pos]

            if self.freq_family == "nb":
                fp_pos = {"mu": fp_mu[mask_pos], "alpha": fp_alpha[mask_pos]}
            else:
                fp_pos = {"mu": fp_mu[mask_pos]}

            if self.sev_family == "gamma":
                sp_pos = {"mu": sp_mu[mask_pos], "shape": sp_shape[mask_pos]}
            else:
                sp_pos = {"log_mu": sp_lmu[mask_pos], "log_sigma": sp_lsigma[mask_pos]}

            log_jd = self.log_joint_density(n_pos, s_pos, fp_pos, sp_pos)
            ll[mask_pos] = log_jd

        return float(np.sum(ll))

    def omega_bounds(
        self,
        freq_params: dict,
        sev_params: dict,
        n_grid: int = 50,
        s_grid: int = 200,
    ) -> Tuple[float, float]:
        """
        Compute feasible omega range for given marginal parameters.

        Returns (omega_min, omega_max) such that
            1 + omega * phi_1(n) * phi_2(s) >= 0 everywhere.

        Uses a grid search over (n, s) to find the extremes of phi1*phi2.
        """
        # Frequency support: n = 0, 1, ..., n_grid - 1
        n_vals = np.arange(n_grid)
        phi1 = self._phi1(n_vals, freq_params)

        # Severity support: quantiles of marginal distribution
        if self.sev_family == "gamma":
            mu_s = sev_params["mu"]
            shape = sev_params["shape"]
            scale = mu_s / shape
            s_vals = stats.gamma.ppf(np.linspace(0.001, 0.999, s_grid), a=shape, scale=scale)
        else:
            log_mu = sev_params["log_mu"]
            log_sigma = sev_params["log_sigma"]
            s_vals = stats.lognorm.ppf(
                np.linspace(0.001, 0.999, s_grid),
                s=log_sigma, scale=np.exp(log_mu)
            )
        phi2 = self._phi2(s_vals, sev_params)

        # Product grid
        products = np.outer(phi1, phi2)
        max_prod = float(np.max(products))
        min_prod = float(np.min(products))

        # A positive product caps omega from below (omega >= -1/max_prod);
        # a negative product caps it from above (omega <= -1/min_prod).
        omega_min = (-1.0 / max_prod) if max_prod > 0 else -np.inf
        omega_max = (-1.0 / min_prod) if min_prod < 0 else np.inf

        return (omega_min, omega_max)

    def spearman_rho(
        self,
        freq_params: dict,
        sev_params: dict,
        n_mc: int = 50_000,
        rng: Optional[np.random.Generator] = None,
    ) -> float:
        """
        Monte Carlo estimate of Spearman's rho for this Sarmanov distribution.

        Simulates n_mc samples and computes the rank correlation.
        """
        n_samp, s_samp = self.sample(n_mc, freq_params, sev_params, rng=rng)
        rho = float(np.corrcoef(
            stats.rankdata(n_samp),
            stats.rankdata(s_samp)
        )[0, 1])
        return rho

    def kendall_tau(
        self,
        freq_params: dict,
        sev_params: dict,
        n_mc: int = 20_000,
        rng: Optional[np.random.Generator] = None,
    ) -> float:
        """Monte Carlo estimate of Kendall's tau."""
        from scipy.stats import kendalltau
        n_samp, s_samp = self.sample(n_mc, freq_params, sev_params, rng=rng)
        tau, _ = kendalltau(n_samp, s_samp)
        return float(tau)

    def sample(
        self,
        size: int,
        freq_params: dict,
        sev_params: dict,
        rng: Optional[np.random.Generator] = None,
        max_iter: int = 10,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Sample from the Sarmanov distribution using acceptance-rejection.

        The proposal distribution is the product f_N * f_S. The acceptance
        probability is proportional to 1 + omega * phi1(n) * phi2(s).

        The bound on the acceptance ratio is:
            M = 1 + |omega| * sup|phi1| * sup|phi2|

        For typical insurance parameters with small omega, acceptance rate is
        very high (> 90%).

        Returns
        -------
        (n_samples, s_samples) : arrays of shape (size,)
        """
        if rng is None:
            rng = np.random.default_rng()

        # Compute acceptance bound M
        if self.freq_family == "nb":
            sup1 = self._freq_kernel.sup_abs(freq_params["mu"], freq_params["alpha"])
        else:
            sup1 = self._freq_kernel.sup_abs(freq_params["mu"])

        if self.sev_family == "gamma":
            sup2 = self._sev_kernel.sup_abs(sev_params["mu"], sev_params["shape"])
        else:
            sup2 = self._sev_kernel.sup_abs(sev_params["log_mu"], sev_params["log_sigma"])

        M = 1.0 + abs(self.omega) * sup1 * sup2

        n_collected = np.zeros(size, dtype=int)
        s_collected = np.zeros(size, dtype=float)
        filled = 0

        for _ in range(max_iter):
            remaining = size - filled
            if remaining <= 0:
                break
            # Oversample to reduce iterations
            n_propose = int(remaining * M * 1.5) + 100

            # Sample from marginals
            if self.freq_family == "nb":
                mu = freq_params["mu"]
                alpha = freq_params["alpha"]
                r = 1.0 / alpha
                p = r / (r + mu)
                n_prop = rng.negative_binomial(r, p, size=n_propose)
            else:
                n_prop = rng.poisson(freq_params["mu"], size=n_propose)

            if self.sev_family == "gamma":
                mu_s = sev_params["mu"]
                shape = sev_params["shape"]
                scale = mu_s / shape
                s_prop = rng.gamma(shape, scale, size=n_propose)
            else:
                log_mu = sev_params["log_mu"]
                log_sigma = sev_params["log_sigma"]
                s_prop = np.exp(rng.normal(log_mu, log_sigma, size=n_propose))

            # Acceptance ratio; _phi1/_phi2 already dispatch on the marginal family
            p1 = self._phi1(n_prop, freq_params)
            p2 = self._phi2(s_prop, sev_params)

            ratio = (1.0 + self.omega * p1 * p2) / M
            ratio = np.clip(ratio, 0, 1)

            accept = rng.uniform(size=n_propose) < ratio
            n_acc = n_prop[accept]
            s_acc = s_prop[accept]

            take = min(len(n_acc), remaining)
            n_collected[filled:filled + take] = n_acc[:take]
            s_collected[filled:filled + take] = s_acc[:take]
            filled += take

        if filled < size:
            warnings.warn(
                f"Acceptance-rejection sampler: only filled {filled}/{size} samples "
                f"after {max_iter} iterations. Acceptance rate may be very low. "
                f"Check omega bounds (M={M:.2f})."
            )

        return n_collected[:filled], s_collected[:filled]

Sarmanov bivariate distribution for mixed discrete-continuous margins.

Joint density/PMF: f(n, s) = f_N(n) * f_S(s) * [1 + omega * phi_1(n) * phi_2(s)]

Parameters

freq_family : 'nb' | 'poisson'
    Marginal family for claim frequency.
sev_family : 'gamma' | 'lognormal'
    Marginal family for claim severity.
omega : float
    Dependence parameter. Must keep 1 + omega * phi_1(n) * phi_2(s) >= 0 on the joint support (see omega_bounds). Negative omega indicates that high-frequency claims tend to have lower severity (the typical finding in UK motor via NCD suppression).
kernel_theta : float
    Laplace exponent for the frequency kernel phi_1.
kernel_alpha : float
    Laplace exponent for the severity kernel phi_2.

Notes

omega=0 corresponds to independence: the joint density reduces to the product of the marginals.

The parameter space for omega is bounded:

-1 / (sup_phi1 * sup_phi2) <= omega <= 1 / (sup_phi1 * sup_phi2)

In practice, for moderate mu_N and mu_S, the feasible omega range is roughly [-10, 10] or wider, so the bound rarely binds.

SarmanovCopula( freq_family: Literal['nb', 'poisson'] = 'nb', sev_family: Literal['gamma', 'lognormal'] = 'gamma', omega: float = 0.0, kernel_theta: float = 0.5, kernel_alpha: float = 0.001)
freq_family: Literal['nb', 'poisson'] = 'nb'
sev_family: Literal['gamma', 'lognormal'] = 'gamma'
omega: float = 0.0
kernel_theta: float = 0.5
kernel_alpha: float = 0.001
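
A minimal usage sketch tying the dataclass fields to the bound discussion above; the marginal parameter values are illustrative, not from the package:

from insurance_frequency_severity import SarmanovCopula

cop = SarmanovCopula(freq_family="nb", sev_family="gamma", omega=0.05)

freq_params = {"mu": 0.12, "alpha": 0.8}   # illustrative NB mean/dispersion
sev_params = {"mu": 2500.0, "shape": 1.4}  # illustrative gamma mean/shape

# The chosen omega should sit inside the feasible range, otherwise the
# "density" 1 + omega * phi1 * phi2 can go negative somewhere.
lo, hi = cop.omega_bounds(freq_params, sev_params)
assert lo <= cop.omega <= hi
print(f"feasible omega in [{lo:.2f}, {hi:.2f}]")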
def log_joint_density( self, n: numpy.ndarray, s: numpy.ndarray, freq_params: dict, sev_params: dict) -> numpy.ndarray:

log f(n, s) = log f_N(n) + log f_S(s) + log(1 + omega * phi_1(n) * phi_2(s))

Parameters

n : array-like
    Claim counts (non-negative integers).
s : array-like
    Average claim severities (positive reals). Only used for policies with n > 0.
freq_params : dict
    Distribution parameters for frequency margin. For NB: {'mu': float, 'alpha': float}; for Poisson: {'mu': float}.
sev_params : dict
    Distribution parameters for severity margin. For Gamma: {'mu': float, 'shape': float}; for Lognormal: {'log_mu': float, 'log_sigma': float}.

Returns

ndarray of log joint densities. Shape matches n/s.
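
A toy evaluation sketch (parameter values illustrative). Differencing against an omega=0 copy isolates the interaction term, since the marginal terms cancel:

import numpy as np
from insurance_frequency_severity import SarmanovCopula

n = np.array([1, 2, 3])
s = np.array([1800.0, 2400.0, 5100.0])
freq_params = {"mu": 0.12, "alpha": 0.8}
sev_params = {"mu": 2500.0, "shape": 1.4}

dep = SarmanovCopula(omega=0.05)
ind = SarmanovCopula(omega=0.0)

log_f = dep.log_joint_density(n, s, freq_params, sev_params)
log_f0 = ind.log_joint_density(n, s, freq_params, sev_params)
print(log_f - log_f0)  # per-row log(1 + omega * phi_1(n) * phi_2(s))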

def log_likelihood( self, n: numpy.ndarray, s: numpy.ndarray, freq_params: dict | list, sev_params: dict | list) -> float:

Total log-likelihood over a sample.

For observations with n=0, the severity does not exist; the contribution is just log f_N(0). Pass s=0 or any positive value for these rows; the function handles them correctly by treating the joint as a marginal frequency contribution only when n=0.

Implementation note: when n=0, the joint is f(0, s) = f_N(0) * f_S(s) * [1 + omega*phi_1(0)*phi_2(s)]. But since s is unobserved for n=0, we must integrate over s, recovering f_N(0). This is correct automatically since E[phi_2(S)] = 0, so:

integral f_S(s) * [1 + omega*phi_1(0)*phi_2(s)] ds
= 1 + omega * phi_1(0) * E[phi_2(S)]
= 1 + omega * phi_1(0) * 0  = 1

So for n=0 rows: contribution = log f_N(0). For n>0 rows: contribution = log f_N(n) + log f_S(s) + log(1 + omega*phi1*phi2).
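
A short sketch of the zero-claim handling described above; the placeholder severities on the n=0 rows are ignored (parameter values illustrative):

import numpy as np
from insurance_frequency_severity import SarmanovCopula

cop = SarmanovCopula(omega=0.05)

n = np.array([0.0, 0.0, 1.0, 2.0])
s = np.array([1.0, 1.0, 3200.0, 1500.0])  # 1.0 = placeholder for n=0 rows

ll = cop.log_likelihood(
    n, s,
    freq_params={"mu": 0.12, "alpha": 0.8},
    sev_params={"mu": 2500.0, "shape": 1.4},
)
print(ll)  # n=0 rows contribute log f_N(0) only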

def omega_bounds( self, freq_params: dict, sev_params: dict, n_grid: int = 50, s_grid: int = 200) -> Tuple[float, float]:

Compute feasible omega range for given marginal parameters.

Returns (omega_min, omega_max) such that 1 + omega * phi_1(n) * phi_2(s) >= 0 everywhere.

Uses a grid search over (n, s) to find the extremes of phi1*phi2.

def spearman_rho( self, freq_params: dict, sev_params: dict, n_mc: int = 50000, rng: Optional[numpy.random._generator.Generator] = None) -> float:

Monte Carlo estimate of Spearman's rho for this Sarmanov distribution.

Simulates n_mc samples and computes the rank correlation.

def kendall_tau( self, freq_params: dict, sev_params: dict, n_mc: int = 20000, rng: Optional[numpy.random._generator.Generator] = None) -> float:

Monte Carlo estimate of Kendall's tau.

def sample( self, size: int, freq_params: dict, sev_params: dict, rng: Optional[numpy.random._generator.Generator] = None, max_iter: int = 10) -> Tuple[numpy.ndarray, numpy.ndarray]:

Sample from the Sarmanov distribution using acceptance-rejection.

The proposal distribution is the product f_N * f_S. The acceptance probability is proportional to 1 + omega * phi1(n) * phi2(s).

The bound on the acceptance ratio is:

M = 1 + |omega| * sup|phi1| * sup|phi2|

For typical insurance parameters with small omega, acceptance rate is very high (> 90%).

Returns

(n_samples, s_samples) : arrays of shape (size,)
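
A sampling sketch (illustrative parameters): the empirical rank correlation of the draws should agree with the Monte Carlo spearman_rho estimate up to sampling error:

import numpy as np
from scipy import stats
from insurance_frequency_severity import SarmanovCopula

rng = np.random.default_rng(0)
cop = SarmanovCopula(omega=0.05)
freq_params = {"mu": 0.12, "alpha": 0.8}
sev_params = {"mu": 2500.0, "shape": 1.4}

n_samp, s_samp = cop.sample(50_000, freq_params, sev_params, rng=rng)

rho_emp, _ = stats.spearmanr(n_samp, s_samp)
print(rho_emp)  # compare with cop.spearman_rho(freq_params, sev_params)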

@dataclass
class GaussianCopulaMixed:
@dataclass
class GaussianCopulaMixed:
    """
    Gaussian copula linking discrete frequency and continuous severity.

    This is the standard approach of Czado et al. (2012) and the R package
    CopulaRegression. The discrete margin creates an identifiability issue
    (Sklar's theorem is not unique for discrete distributions), addressed by
    the mid-point PIT approximation:

        U_N = (F_N(n) + F_N(n-1)) / 2    for n > 0
        U_N = F_N(0) / 2                  for n = 0

    This maps count data onto (0,1) for use with the Gaussian copula.
    The approximation is accurate when the Poisson or NB mass at each point
    is small (not too much probability on any single value).

    Parameters
    ----------
    rho : float
        Gaussian copula correlation parameter in (-1, 1).
        This is NOT Spearman's rho (though they are closely related for the
        Gaussian copula: rho_S ≈ (6/pi) * arcsin(rho/2)).
    """

    rho: float = 0.0

    def _pit_freq(self, n: np.ndarray, freq_params: dict, freq_family: str) -> np.ndarray:
        """Mid-point PIT for discrete frequency margin."""
        n = np.asarray(n, dtype=float)
        if freq_family == "nb":
            mu = freq_params["mu"]
            alpha = freq_params["alpha"]
            r = 1.0 / alpha
            p = 1.0 / (1.0 + mu * alpha)
            cdf_n = stats.nbinom.cdf(n, r, p)
            cdf_nm1 = np.where(n > 0, stats.nbinom.cdf(n - 1, r, p), 0.0)
        else:
            mu = freq_params["mu"]
            cdf_n = stats.poisson.cdf(n, mu)
            cdf_nm1 = np.where(n > 0, stats.poisson.cdf(n - 1, mu), 0.0)
        return 0.5 * (cdf_n + cdf_nm1)

    def _pit_sev(self, s: np.ndarray, sev_params: dict, sev_family: str) -> np.ndarray:
        """Standard PIT for continuous severity margin."""
        s = np.asarray(s, dtype=float)
        if sev_family == "gamma":
            mu_s = sev_params["mu"]
            shape = sev_params["shape"]
            scale = mu_s / shape
            return stats.gamma.cdf(s, a=shape, scale=scale)
        else:
            log_mu = sev_params["log_mu"]
            log_sigma = sev_params["log_sigma"]
            return stats.lognorm.cdf(s, s=log_sigma, scale=np.exp(log_mu))

    def log_likelihood(
        self,
        n: np.ndarray,
        s: np.ndarray,
        freq_params: dict | list,
        sev_params: dict | list,
        freq_family: str = "nb",
        sev_family: str = "gamma",
    ) -> float:
        """
        Gaussian copula log-likelihood over positive-claim observations.

        Returns the copula density term only; under IFM the marginal
        contributions come from the fitted GLMs. For n=0 observations there
        is no severity, so they contribute nothing here.
        """
        n = np.asarray(n, dtype=float)
        s = np.asarray(s, dtype=float)

        mask_pos = n > 0
        if not mask_pos.any():
            return 0.0

        n_pos = n[mask_pos]
        s_pos = s[mask_pos]

        if isinstance(freq_params, dict):
            fp_pos = freq_params
        else:
            fp_pos = {
                "mu": np.array([p["mu"] for p in freq_params])[mask_pos],
                **({
                    "alpha": np.array([p["alpha"] for p in freq_params])[mask_pos]
                } if freq_family == "nb" else {}),
            }

        if isinstance(sev_params, dict):
            sp_pos = sev_params
        else:
            if sev_family == "gamma":
                sp_pos = {
                    "mu": np.array([p["mu"] for p in sev_params])[mask_pos],
                    "shape": np.array([p["shape"] for p in sev_params])[mask_pos],
                }
            else:
                sp_pos = {
                    "log_mu": np.array([p["log_mu"] for p in sev_params])[mask_pos],
                    "log_sigma": np.array([p["log_sigma"] for p in sev_params])[mask_pos],
                }

        u = self._pit_freq(n_pos, fp_pos, freq_family)
        v = self._pit_sev(s_pos, sp_pos, sev_family)

        # Clip to avoid numerical issues at boundaries
        u = np.clip(u, 1e-8, 1 - 1e-8)
        v = np.clip(v, 1e-8, 1 - 1e-8)

        # Gaussian copula density: c(u,v) = phi2(Phi^{-1}(u), Phi^{-1}(v); rho) /
        #                                    (phi(Phi^{-1}(u)) * phi(Phi^{-1}(v)))
        z1 = stats.norm.ppf(u)
        z2 = stats.norm.ppf(v)

        rho = np.clip(self.rho, -0.9999, 0.9999)
        rho2 = rho ** 2

        log_c = (
            -0.5 * np.log(1 - rho2)
            - (rho2 * (z1**2 + z2**2) - 2 * rho * z1 * z2) / (2 * (1 - rho2))
        )

        return float(np.sum(log_c))

    def spearman_rho(self) -> float:
        """Approximate Spearman rho from Gaussian copula parameter."""
        return float(6.0 / np.pi * np.arcsin(self.rho / 2.0))

    def sample(
        self,
        size: int,
        freq_params: dict,
        sev_params: dict,
        freq_family: str = "nb",
        sev_family: str = "gamma",
        rng: Optional[np.random.Generator] = None,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Sample using Gaussian copula via conditional normal."""
        if rng is None:
            rng = np.random.default_rng()

        rho = np.clip(self.rho, -0.9999, 0.9999)
        z1 = rng.standard_normal(size)
        z2 = rho * z1 + np.sqrt(1 - rho**2) * rng.standard_normal(size)
        u = stats.norm.cdf(z1)
        v = stats.norm.cdf(z2)

        # Invert marginal CDFs
        if freq_family == "nb":
            mu = freq_params["mu"]
            alpha = freq_params["alpha"]
            r = 1.0 / alpha
            p = 1.0 / (1.0 + mu * alpha)
            n_samp = stats.nbinom.ppf(u, r, p).astype(int)
        else:
            n_samp = stats.poisson.ppf(u, freq_params["mu"]).astype(int)

        if sev_family == "gamma":
            mu_s = sev_params["mu"]
            shape = sev_params["shape"]
            scale = mu_s / shape
            s_samp = stats.gamma.ppf(v, a=shape, scale=scale)
        else:
            log_mu = sev_params["log_mu"]
            log_sigma = sev_params["log_sigma"]
            s_samp = stats.lognorm.ppf(v, s=log_sigma, scale=np.exp(log_mu))

        return n_samp, s_samp

Gaussian copula linking discrete frequency and continuous severity.

This is the standard approach of Czado et al. (2012) and the R package CopulaRegression. The discrete margin creates an identifiability issue (Sklar's theorem is not unique for discrete distributions), addressed by the mid-point PIT approximation:

U_N = (F_N(n) + F_N(n-1)) / 2    for n > 0
U_N = F_N(0) / 2                  for n = 0

This maps count data onto (0,1) for use with the Gaussian copula. The approximation is accurate when the Poisson or NB mass at each point is small (not too much probability on any single value).

Parameters

rho : float
    Gaussian copula correlation parameter in (-1, 1). This is NOT Spearman's rho (though they are closely related for the Gaussian copula: rho_S ≈ (6/pi) * arcsin(rho/2)).

GaussianCopulaMixed(rho: float = 0.0)
rho: float = 0.0
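
A quick numeric sketch of the mid-point PIT described above, for a Poisson margin (mu is an arbitrary illustrative value); this mirrors what _pit_freq computes:

import numpy as np
from scipy import stats

mu = 0.5
n = np.array([0, 1, 2, 3])

cdf_n = stats.poisson.cdf(n, mu)
cdf_nm1 = np.where(n > 0, stats.poisson.cdf(n - 1, mu), 0.0)
u = 0.5 * (cdf_n + cdf_nm1)  # mid-point PIT: strictly inside (0, 1)
print(u)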
def log_likelihood( self, n: numpy.ndarray, s: numpy.ndarray, freq_params: dict | list, sev_params: dict | list, freq_family: str = 'nb', sev_family: str = 'gamma') -> float:

Gaussian copula log-likelihood over positive-claim observations.

Returns the copula density term only; under IFM the marginal contributions come from the fitted GLMs. For n=0 observations there is no severity, so they contribute nothing here.

def spearman_rho(self) -> float:

Approximate Spearman rho from Gaussian copula parameter.

def sample( self, size: int, freq_params: dict, sev_params: dict, freq_family: str = 'nb', sev_family: str = 'gamma', rng: Optional[numpy.random._generator.Generator] = None) -> Tuple[numpy.ndarray, numpy.ndarray]:

Sample using Gaussian copula via conditional normal.
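
A sketch of sampling and checking the implied Spearman correlation; the frequency mean is set moderately high so ties in the discrete margin do not swamp the rank correlation (values illustrative):

import numpy as np
from scipy import stats
from insurance_frequency_severity import GaussianCopulaMixed

rng = np.random.default_rng(0)
cop = GaussianCopulaMixed(rho=0.3)

n_samp, s_samp = cop.sample(
    100_000,
    freq_params={"mu": 2.0, "alpha": 0.5},
    sev_params={"mu": 2500.0, "shape": 1.4},
    rng=rng,
)

rho_emp, _ = stats.spearmanr(n_samp, s_samp)
print(rho_emp, cop.spearman_rho())  # roughly comparable; ties bias the first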

@dataclass
class FGMCopula:
@dataclass
class FGMCopula:
    """
    Farlie-Gumbel-Morgenstern (FGM) copula.

    C(u, v) = u * v * (1 + theta * (1-u) * (1-v))

    theta in [-1, 1] corresponds to Spearman rho in [-1/3, 1/3].

    The FGM copula is a special case of the Sarmanov family with
    phi_1(x) = F_1(x) - E[F_1(X)] and phi_2(y) = F_2(y) - E[F_2(Y)].

    This is implemented here as a simple sanity-check baseline. If the FGM
    copula fits as well as Sarmanov, the dependence is weak and the correction
    factor is small. If FGM is inadequate (theta hits ±1), the data likely
    has moderate dependence and Sarmanov is justified.
    """

    theta: float = 0.0

    def __post_init__(self):
        if not -1.0 <= self.theta <= 1.0:
            raise ValueError(f"FGM theta must be in [-1, 1], got {self.theta}")

    def cdf(self, u: np.ndarray, v: np.ndarray) -> np.ndarray:
        """C(u, v) = uv[1 + theta(1-u)(1-v)]"""
        u = np.asarray(u)
        v = np.asarray(v)
        return u * v * (1.0 + self.theta * (1.0 - u) * (1.0 - v))

    def pdf(self, u: np.ndarray, v: np.ndarray) -> np.ndarray:
        """c(u, v) = 1 + theta(1 - 2u)(1 - 2v)"""
        u = np.asarray(u)
        v = np.asarray(v)
        return 1.0 + self.theta * (1.0 - 2.0 * u) * (1.0 - 2.0 * v)

    def log_likelihood(
        self,
        u: np.ndarray,
        v: np.ndarray,
    ) -> float:
        """Log-likelihood given PIT-transformed observations."""
        density = self.pdf(u, v)
        density = np.clip(density, 1e-300, None)
        return float(np.sum(np.log(density)))

    def spearman_rho(self) -> float:
        """Exact Spearman rho = theta / 3."""
        return self.theta / 3.0

    def kendall_tau(self) -> float:
        """Exact Kendall tau = 2*theta/9."""
        return 2.0 * self.theta / 9.0

    def sample(
        self,
        size: int,
        rng: Optional[np.random.Generator] = None,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Sample (U, V) from FGM copula using conditional CDF method.

        C_{V|U}(v|u) = v * [1 + theta*(1-2u)*(1-v)]
        Solved analytically via quadratic: b*v^2 - (1+b)*v + w = 0,
        where b = theta*(1-2u).
        """
        if rng is None:
            rng = np.random.default_rng()
        u = rng.uniform(size=size)
        w = rng.uniform(size=size)
        # C_{V|U}(v|u) = w => solve quadratic in v
        # b*v^2 - (1+b)*v + w = 0, where b = theta*(1-2u)
        # v = [(1+b) - sqrt((1+b)^2 - 4*b*w)] / (2*b)
        b = self.theta * (1.0 - 2.0 * u)
        with np.errstate(invalid="ignore"):
            disc = (1.0 + b) ** 2 - 4.0 * b * w
            disc = np.maximum(disc, 0.0)
            v = np.where(
                np.abs(b) < 1e-10,
                w,
                ((1.0 + b) - np.sqrt(disc)) / (2.0 * b),
            )
        return u, np.clip(v, 0, 1)

Farlie-Gumbel-Morgenstern (FGM) copula.

C(u, v) = u * v * (1 + theta * (1-u) * (1-v))

theta in [-1, 1] corresponds to Spearman rho in [-1/3, 1/3].

The FGM copula is a special case of the Sarmanov family with phi_1(x) = F_1(x) - E[F_1(X)] and phi_2(y) = F_2(y) - E[F_2(Y)].

This is implemented here as a simple sanity-check baseline. If the FGM copula fits as well as Sarmanov, the dependence is weak and the correction factor is small. If FGM is inadequate (theta hits ±1), the data likely has moderate dependence and Sarmanov is justified.

FGMCopula(theta: float = 0.0)
theta: float = 0.0
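
A simulation sketch of the exact relations above (seed and sample size arbitrary):

import numpy as np
from scipy import stats
from insurance_frequency_severity import FGMCopula

rng = np.random.default_rng(0)
cop = FGMCopula(theta=0.9)

u, v = cop.sample(200_000, rng=rng)
rho_emp, _ = stats.spearmanr(u, v)
print(rho_emp, cop.spearman_rho())  # both close to 0.9 / 3 = 0.3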
def cdf(self, u: numpy.ndarray, v: numpy.ndarray) -> numpy.ndarray:

C(u, v) = uv[1 + theta(1-u)(1-v)]

def pdf(self, u: numpy.ndarray, v: numpy.ndarray) -> numpy.ndarray:

c(u, v) = 1 + theta(1 - 2u)(1 - 2v)

def log_likelihood(self, u: numpy.ndarray, v: numpy.ndarray) -> float:

Log-likelihood given PIT-transformed observations.

def spearman_rho(self) -> float:

Exact Spearman rho = theta / 3.

def kendall_tau(self) -> float:

Exact Kendall tau = 2*theta/9.

def sample( self, size: int, rng: Optional[numpy.random._generator.Generator] = None) -> Tuple[numpy.ndarray, numpy.ndarray]:

Sample (U, V) from FGM copula using conditional CDF method.

C_{V|U}(v|u) = v * [1 + theta*(1-2u)*(1-v)]

Solved analytically via the quadratic b*v^2 - (1+b)*v + w = 0, where b = theta*(1-2u).

class JointFreqSev:
185class JointFreqSev:
186    """
187    Joint frequency-severity model using Sarmanov copula (or alternatives).
188
189    Accepts fitted GLM objects from statsmodels (or any compatible object with
190    .predict() and .fittedvalues). Does not refit the marginal models.
191
192    Parameters
193    ----------
194    freq_glm : fitted GLM
195        Frequency model (Poisson or NegativeBinomial GLM).
196    sev_glm : fitted GLM
197        Severity model (Gamma or lognormal GLM). Should be fitted on
198        positive-claim observations only.
199    copula : 'sarmanov' | 'gaussian' | 'fgm'
200        Copula family to use. Default 'sarmanov'.
201    kernel_theta : float
202        Laplace exponent for the frequency kernel in Sarmanov copula.
203    kernel_alpha : float
204        Laplace exponent for the severity kernel in Sarmanov copula.
205
206    Examples
207    --------
208    >>> model = JointFreqSev(freq_glm=nb_glm, sev_glm=gamma_glm)
209    >>> model.fit(
210    ...     claims_df,
211    ...     n_col="claim_count",
212    ...     s_col="avg_severity",
213    ...     freq_X=X_freq,
214    ...     sev_X=X_sev,
215    ... )
216    >>> corrections = model.premium_correction(X_new)
217    >>> model.dependence_summary()
218    """
219
220    def __init__(
221        self,
222        freq_glm: Any,
223        sev_glm: Any,
224        copula: Literal["sarmanov", "gaussian", "fgm"] = "sarmanov",
225        kernel_theta: float = 0.5,
226        kernel_alpha: float = 0.001,
227    ):
228        self.freq_glm = freq_glm
229        self.sev_glm = sev_glm
230        self.copula_family = copula
231        self.kernel_theta = kernel_theta
232        self.kernel_alpha = kernel_alpha
233
234        # Set after .fit()
235        self.omega_: Optional[float] = None
236        self.rho_: Optional[float] = None
237        self.omega_ci_: Optional[Tuple[float, float]] = None
238        self.aic_: Optional[float] = None
239        self.bic_: Optional[float] = None
240        self._copula_obj: Optional[SarmanovCopula | GaussianCopulaMixed | FGMCopula] = None
241        self._freq_family: Optional[str] = None
242        self._sev_family: Optional[str] = None
243        self._alpha: Optional[float] = None
244        self._shape: Optional[float] = None
245        self._n_obs: Optional[int] = None
246        self._n_claims: Optional[int] = None
247        self._mu_n_fitted: Optional[np.ndarray] = None
248        self._mu_s_fitted: Optional[np.ndarray] = None
249
250    def fit(
251        self,
252        data: pd.DataFrame,
253        n_col: str = "claim_count",
254        s_col: str = "avg_severity",
255        freq_X: Optional[pd.DataFrame] = None,
256        sev_X: Optional[pd.DataFrame] = None,
257        exposure_col: Optional[str] = None,
258        method: Literal["ifm", "mle"] = "ifm",
259        ci_method: Literal["profile", "bootstrap"] = "profile",
260        n_bootstrap: int = 200,
261        rng: Optional[np.random.Generator] = None,
262    ) -> "JointFreqSev":
263        """
264        Estimate the dependence parameter omega.
265
266        Parameters
267        ----------
268        data : DataFrame
269            Policy-level data. Must contain n_col (claim counts) and s_col
270            (average severity for claims, 0 or NaN for zero-claim policies).
271        n_col : str
272            Column name for claim counts.
273        s_col : str
274            Column name for average claim severity.
275        freq_X : DataFrame, optional
276            Covariates for frequency GLM prediction. If None, uses
277            glm.fittedvalues.
278        sev_X : DataFrame, optional
279            Covariates for severity GLM prediction (on claims rows only). If
280            None, uses glm.fittedvalues.
281        exposure_col : str, optional
282            Column of exposure (e.g., years at risk) for the frequency GLM.
283        method : 'ifm' | 'mle'
284            IFM (default): profile likelihood for omega given fitted marginals.
285            MLE: joint estimation (experimental; currently unused, IFM always runs).
286        ci_method : 'profile' | 'bootstrap'
287            Method for omega confidence intervals.
288        n_bootstrap : int
289            Number of bootstrap replicates (ci_method='bootstrap' only).
290        rng : numpy Generator, optional
291            Random state for bootstrap.
292        """
293        n = np.asarray(data[n_col], dtype=float)
294        s_raw = np.asarray(data[s_col], dtype=float)
295
296        # Replace NaN/0 severity for zero-claim rows with a placeholder
297        # (1.0 works; those rows won't contribute severity to the likelihood)
298        s = np.where((n == 0) | np.isnan(s_raw) | (s_raw <= 0), 1.0, s_raw)
299
300        n_policies = len(n)
301        n_claims = int(np.sum(n > 0))
302
303        self._n_obs = n_policies
304        self._n_claims = n_claims
305
306        if n_policies < 1000:
307            warnings.warn(
308                f"Only {n_policies} policies. Omega estimation needs at least 1,000 "
309                f"observations for reliable inference. Results may be unstable.",
310                UserWarning,
311                stacklevel=2,
312            )
313        if n_claims < 500:
314            warnings.warn(
315                f"Only {n_claims} claim events. Standard errors on omega will be "
316                f"large. Consider the Garrido conditional method (ConditionalFreqSev) "
317                f"as a more stable alternative.",
318                UserWarning,
319                stacklevel=2,
320            )
321
322        # Extract marginal parameters
323        exposure = (
324            np.asarray(data[exposure_col], dtype=float) if exposure_col else None
325        )
326        mu_n, alpha, freq_family = _extract_freq_params(self.freq_glm, freq_X, exposure)
327        mu_s, shape, sev_family = _extract_sev_params(self.sev_glm, sev_X, None)
328
329        # mu_n has n_policies entries; mu_s may be shorter if the severity
330        # GLM was fitted on claims-only rows; handle the alignment below.
331        if len(mu_s) != n_policies:
332            # Severity GLM was fitted on positive-claim rows only.
333            # We need E[S|x] for all policies (even zero-claim ones) for prediction.
334            # For zero-claim policies, E[S|x] is still well-defined as the GLM
335            # prediction under their covariates.
336            if sev_X is not None:
337                mu_s_all = np.asarray(self.sev_glm.predict(sev_X), dtype=float)
338            else:
339                # Cannot align — use a common mean
340                warnings.warn(
341                    "Severity GLM fittedvalues length does not match n_policies and "
342                    "no sev_X provided. Using mean severity for all policies.",
343                    UserWarning,
344                    stacklevel=2,
345                )
346                mu_s_all = np.full(n_policies, float(np.mean(mu_s)))
347        else:
348            mu_s_all = mu_s
349
350        self._freq_family = freq_family
351        self._sev_family = sev_family
352        self._alpha = alpha
353        self._shape = shape
354        self._mu_n_fitted = mu_n
355        self._mu_s_fitted = mu_s_all
356
357        # Build per-observation parameter arrays
358        if freq_family == "nb":
359            freq_params = [{"mu": float(mu_n[i]), "alpha": float(alpha)} for i in range(n_policies)]
360        else:
361            freq_params = [{"mu": float(mu_n[i])} for i in range(n_policies)]
362
363        if sev_family == "gamma":
364            sev_params = [{"mu": float(mu_s_all[i]), "shape": float(shape)} for i in range(n_policies)]
365        else:
366            # Lognormal: log_mu = log(E[S]) - log_sigma^2/2
367            # We treat shape as 1/phi; log_sigma^2 = log(1 + 1/shape)
368            log_sigma = float(np.sqrt(np.log(1.0 + 1.0 / shape)))
369            log_mu_arr = np.log(mu_s_all) - 0.5 * log_sigma**2
370            sev_params = [{"log_mu": float(log_mu_arr[i]), "log_sigma": log_sigma} for i in range(n_policies)]
371
372        # --- Fit copula ---
373        if self.copula_family == "sarmanov":
374            self._fit_sarmanov(n, s, freq_params, sev_params, freq_family, sev_family, ci_method, n_bootstrap, rng)
375        elif self.copula_family == "gaussian":
376            self._fit_gaussian(n, s, freq_params, sev_params, freq_family, sev_family, ci_method, n_bootstrap, rng)
377        elif self.copula_family == "fgm":
378            self._fit_fgm(n, s, freq_params, sev_params, freq_family, sev_family, ci_method, n_bootstrap, rng)
379        else:
380            raise ValueError(f"Unknown copula family: {self.copula_family}")
381
382        return self
383
384    def _profile_ll_sarmanov(
385        self,
386        omega: float,
387        n: np.ndarray,
388        s: np.ndarray,
389        freq_params: list,
390        sev_params: list,
391        freq_family: str,
392        sev_family: str,
393    ) -> float:
394        copula = SarmanovCopula(
395            freq_family=freq_family,
396            sev_family=sev_family,
397            omega=omega,
398            kernel_theta=self.kernel_theta,
399            kernel_alpha=self.kernel_alpha,
400        )
401        return copula.log_likelihood(n, s, freq_params, sev_params)
402
403    def _fit_sarmanov(self, n, s, freq_params, sev_params, freq_family, sev_family,
404                      ci_method, n_bootstrap, rng):
405        # Compute omega bounds from a representative observation
406        ref_fp = freq_params[0]
407        ref_sp = sev_params[0]
408        copula_ref = SarmanovCopula(
409            freq_family=freq_family,
410            sev_family=sev_family,
411            omega=0.0,
412            kernel_theta=self.kernel_theta,
413            kernel_alpha=self.kernel_alpha,
414        )
415        omega_min, omega_max = copula_ref.omega_bounds(ref_fp, ref_sp)
416        # Clamp to a reasonable search range
417        omega_min = max(omega_min, -50.0)
418        omega_max = min(omega_max, 50.0)
419
420        def neg_ll(omega):
421            return -self._profile_ll_sarmanov(omega, n, s, freq_params, sev_params,
422                                               freq_family, sev_family)
423
424        result = minimize_scalar(neg_ll, bounds=(omega_min, omega_max), method="bounded")
425        self.omega_ = float(result.x)
426
427        copula = SarmanovCopula(
428            freq_family=freq_family,
429            sev_family=sev_family,
430            omega=self.omega_,
431            kernel_theta=self.kernel_theta,
432            kernel_alpha=self.kernel_alpha,
433        )
434        self._copula_obj = copula
435
436        ll_hat = -float(result.fun)
437        ll_indep = self._profile_ll_sarmanov(0.0, n, s, freq_params, sev_params,
438                                              freq_family, sev_family)
439
440        # AIC/BIC: count omega as the extra parameter
441        self.aic_ = -2 * ll_hat + 2 * 1
442        self.bic_ = -2 * ll_hat + np.log(self._n_obs) * 1
443
444        # Spearman rho (MC estimate)
445        self.rho_ = copula.spearman_rho(ref_fp, ref_sp, n_mc=10_000)
446
447        # Confidence intervals
448        if ci_method == "profile":
449            self.omega_ci_ = self._profile_ci_sarmanov(
450                n, s, freq_params, sev_params, freq_family, sev_family,
451                omega_min, omega_max, ll_hat
452            )
453        elif ci_method == "bootstrap":
454            self.omega_ci_ = self._bootstrap_ci_sarmanov(
455                n, s, freq_params, sev_params, freq_family, sev_family,
456                omega_min, omega_max, n_bootstrap, rng
457            )
458
459    def _profile_ci_sarmanov(self, n, s, freq_params, sev_params, freq_family, sev_family,
460                               omega_min, omega_max, ll_hat):
461        """95% CI via profile likelihood (chi-squared approximation, df=1)."""
462        threshold = ll_hat - 0.5 * stats.chi2.ppf(0.95, df=1)
463
464        def above_threshold(omega):
465            ll = self._profile_ll_sarmanov(omega, n, s, freq_params, sev_params,
466                                           freq_family, sev_family)
467            return ll - threshold
468
469        from scipy.optimize import brentq
470
471        omega_hat = self.omega_
472
473        try:
474            lo = brentq(above_threshold, omega_min, omega_hat - 1e-8)
475        except (ValueError, RuntimeError):
476            lo = omega_min
477
478        try:
479            hi = brentq(above_threshold, omega_hat + 1e-8, omega_max)
480        except (ValueError, RuntimeError):
481            hi = omega_max
482
483        return (lo, hi)
484
485    def _bootstrap_ci_sarmanov(self, n, s, freq_params, sev_params, freq_family, sev_family,
486                                omega_min, omega_max, n_bootstrap, rng):
487        if rng is None:
488            rng = np.random.default_rng()
489        n_obs = len(n)
490        boot_omegas = []
491        for _ in range(n_bootstrap):
492            idx = rng.integers(0, n_obs, size=n_obs)
493            n_b = n[idx]
494            s_b = s[idx]
495            fp_b = [freq_params[i] for i in idx]
496            sp_b = [sev_params[i] for i in idx]
497
498            def neg_ll(omega):
499                return -self._profile_ll_sarmanov(omega, n_b, s_b, fp_b, sp_b,
500                                                  freq_family, sev_family)
501            try:
502                res = minimize_scalar(neg_ll, bounds=(omega_min, omega_max), method="bounded")
503                boot_omegas.append(float(res.x))
504            except Exception:
505                pass
506
507        if len(boot_omegas) < 10:
508            warnings.warn("Bootstrap CI estimation failed for most replicates.", UserWarning)
509            return (omega_min, omega_max)
510
511        boot_arr = np.array(boot_omegas)
512        return (float(np.percentile(boot_arr, 2.5)), float(np.percentile(boot_arr, 97.5)))
513
514    def _fit_gaussian(self, n, s, freq_params, sev_params, freq_family, sev_family,
515                      ci_method, n_bootstrap, rng):
516        """Fit Gaussian copula rho via profile likelihood."""
517        gc = GaussianCopulaMixed(rho=0.0)
518
519        def neg_ll(rho):
520            gc.rho = float(rho)
521            return -gc.log_likelihood(n, s, freq_params, sev_params, freq_family, sev_family)
522
523        result = minimize_scalar(neg_ll, bounds=(-0.999, 0.999), method="bounded")
524        self.omega_ = float(result.x)  # store rho as omega for consistency
525
526        gc.rho = self.omega_
527        self._copula_obj = gc
528        self.rho_ = gc.spearman_rho()
529
530        ll_hat = -float(result.fun)
531        self.aic_ = -2 * ll_hat + 2 * 1
532        self.bic_ = -2 * ll_hat + np.log(self._n_obs) * 1
533
534        # Simple CI via bootstrap
535        if ci_method == "bootstrap" and rng is not None:
536            boot_rhos = []
537            n_obs = len(n)
538            for _ in range(n_bootstrap):
539                idx = rng.integers(0, n_obs, size=n_obs)
540                n_b, s_b = n[idx], s[idx]
541                fp_b = [freq_params[i] for i in idx]
542                sp_b = [sev_params[i] for i in idx]
543                try:
544                    def nll_b(rho):
545                        gc.rho = float(rho)
546                        return -gc.log_likelihood(n_b, s_b, fp_b, sp_b, freq_family, sev_family)
547                    res = minimize_scalar(nll_b, bounds=(-0.999, 0.999), method="bounded")
548                    boot_rhos.append(float(res.x))
549                except Exception:
550                    pass
551            if boot_rhos:
552                self.omega_ci_ = (
553                    float(np.percentile(boot_rhos, 2.5)),
554                    float(np.percentile(boot_rhos, 97.5)),
555                )
556        if self.omega_ci_ is None:
557            self.omega_ci_ = (-0.999, 0.999)
558
559    def _fit_fgm(self, n, s, freq_params, sev_params, freq_family, sev_family,
560                 ci_method, n_bootstrap, rng):
561        """Fit FGM copula theta via PIT + profile likelihood."""
562        gc_ref = GaussianCopulaMixed(rho=0.0)
563
564        def pit_u(fp):
565            return gc_ref._pit_freq(
566                np.array([fp["mu"]]), {"mu": fp["mu"], **({} if freq_family == "poisson" else {"alpha": fp.get("alpha", 1.0)})},
567                freq_family
568            )[0]
569
570        def pit_v(sp):
571            return gc_ref._pit_sev(
572                np.array([sp["mu"] if sev_family == "gamma" else np.exp(sp["log_mu"])]),
573                sp, sev_family
574            )[0]
575
576        # Compute PIT values for positive-claim rows only
577        mask = np.asarray([ni > 0 for ni in n])
578        if not mask.any():
579            self.omega_ = 0.0
580            self._copula_obj = FGMCopula(theta=0.0)
581            self.rho_ = 0.0
582            self.omega_ci_ = (-1.0, 1.0)
583            return
584
585        n_pos = n[mask]
586        s_pos = s[mask]
587
588        # Build PIT arrays using GaussianCopulaMixed._pit methods
589        gc = GaussianCopulaMixed(rho=0.0)
590        fp_mask = [freq_params[i] for i in range(len(n)) if mask[i]]
591        sp_mask = [sev_params[i] for i in range(len(n)) if mask[i]]
592
593        fp_dict = {
594            "mu": np.array([p["mu"] for p in fp_mask]),
595            **({
596                "alpha": np.array([p["alpha"] for p in fp_mask])
597            } if freq_family == "nb" else {}),
598        }
599        sp_dict_gamma = sev_family == "gamma"
600        if sp_dict_gamma:
601            sp_dict = {
602                "mu": np.array([p["mu"] for p in sp_mask]),
603                "shape": np.array([p["shape"] for p in sp_mask]),
604            }
605        else:
606            sp_dict = {
607                "log_mu": np.array([p["log_mu"] for p in sp_mask]),
608                "log_sigma": np.array([p["log_sigma"] for p in sp_mask]),
609            }
610
611        u = gc._pit_freq(n_pos, fp_dict, freq_family)
612        v = gc._pit_sev(s_pos, sp_dict, sev_family)
613        u = np.clip(u, 1e-8, 1 - 1e-8)
614        v = np.clip(v, 1e-8, 1 - 1e-8)
615
616        fgm = FGMCopula(theta=0.0)
617
618        def neg_ll(theta):
619            fgm.theta = float(theta)
620            return -fgm.log_likelihood(u, v)
621
622        result = minimize_scalar(neg_ll, bounds=(-1.0, 1.0), method="bounded")
623        self.omega_ = float(result.x)
624        fgm.theta = self.omega_
625        self._copula_obj = fgm
626        self.rho_ = fgm.spearman_rho()
627
628        ll_hat = -float(result.fun)
629        self.aic_ = -2 * ll_hat + 2 * 1
630        self.bic_ = -2 * ll_hat + np.log(self._n_obs) * 1
631        self.omega_ci_ = (-1.0, 1.0)
632
633    def premium_correction(
634        self,
635        X: Optional[pd.DataFrame] = None,
636        exposure: Optional[np.ndarray] = None,
637        n_mc: int = 50_000,
638        rng: Optional[np.random.Generator] = None,
639    ) -> pd.DataFrame:
640        """
641        Compute per-policy premium correction factors.
642
643        The correction factor is: E[N*S] / (E[N] * E[S])
644
645        Under independence, E[N*S] = E[N]*E[S] so the factor equals 1.
646        With negative dependence (typical in UK motor), the factor < 1.
647
648        For the Sarmanov copula, we have the analytical result:
649            E[N*S] = E[N]*E[S] + omega * E[N*phi_1(N)] * E[S*phi_2(S)]
650
651        For Gaussian copula and FGM, we use Monte Carlo simulation.
652
653        Parameters
654        ----------
655        X : DataFrame, optional
656            New covariate data for prediction. If None, uses training fitted values.
657        exposure : array-like, optional
658            Exposure for frequency prediction.
659        n_mc : int
660            Monte Carlo samples for Gaussian/FGM copulas.
661
662        Returns
663        -------
664        DataFrame with columns: mu_n, mu_s, mu_ns_independent, mu_ns_joint,
665            correction_factor, premium_independent, premium_joint.
666        """
667        if self.omega_ is None:
668            raise RuntimeError("Must call .fit() before .premium_correction()")
669
670        if X is not None:
671            mu_n = np.asarray(self.freq_glm.predict(X), dtype=float)
672            mu_s = np.asarray(self.sev_glm.predict(X), dtype=float)
673        else:
674            mu_n = self._mu_n_fitted
675            mu_s = self._mu_s_fitted
676
677        if exposure is not None:
678            mu_n = mu_n * np.asarray(exposure, dtype=float)
679
680        n_policies = len(mu_n)
681        correction = np.ones(n_policies)
682
683        if self.copula_family == "sarmanov":
684            correction = self._sarmanov_correction(mu_n, mu_s)
685        else:
686            # Monte Carlo for Gaussian/FGM
687            correction = self._mc_correction(mu_n, mu_s, n_mc, rng)
688
689        mu_ns_ind = mu_n * mu_s
690        mu_ns_joint = mu_ns_ind * correction
691
692        return pd.DataFrame({
693            "mu_n": mu_n,
694            "mu_s": mu_s,
695            "mu_ns_independent": mu_ns_ind,
696            "mu_ns_joint": mu_ns_joint,
697            "correction_factor": correction,
698            "premium_independent": mu_ns_ind,
699            "premium_joint": mu_ns_joint,
700        })
701
702    def _sarmanov_correction(self, mu_n: np.ndarray, mu_s: np.ndarray) -> np.ndarray:
703        """
704        Analytical premium correction for Sarmanov copula.
705
706        E[N*S] = E[N]*E[S] + Cov(N, S)
707               = E[N]*E[S] + omega * E[N*phi_1(N)] * E[S*phi_2(S)]
708
709        E[N*phi_1(N)] for NB with Laplace kernel phi_1(n) = exp(-theta*n) - M_N(theta):
710            = E[N * exp(-theta*N)] - mu_n * M_N(theta)
711            = -d/dtheta M_N(theta) - mu_n * M_N(theta)   [by differentiation of MGF]
712
713        For NB: M_N(theta) = (p/(1-(1-p)*e^{-theta}))^r
714            d/dtheta M_N(theta) = -r*(1-p)*e^{-theta}/(1-(1-p)*e^{-theta}) * M_N(theta)
715
716        E[S*phi_2(S)] for Gamma with Laplace kernel phi_2(s) = exp(-alpha*s) - M_S(alpha):
717            = E[S * exp(-alpha*S)] - mu_s * M_S(alpha)
718            = -d/dalpha M_S(alpha) - mu_s * M_S(alpha)
719
720        For Gamma(shape, scale=mu_s/shape):
721            M_S(alpha) = (1/(1+alpha*scale))^shape
722            d/dalpha M_S(alpha) = -shape * scale * M_S(alpha) / (1+alpha*scale)
723        """
724        omega = self.omega_
725        theta = self.kernel_theta
726        alpha_k = self.kernel_alpha
727
728        n_pol = len(mu_n)
729        correction = np.ones(n_pol)
730
731        for i in range(n_pol):
732            mu_ni = float(mu_n[i])
733            mu_si = float(mu_s[i])
734
735            if self._freq_family == "nb":
736                alpha_disp = self._alpha
737                r = 1.0 / alpha_disp
738                p = 1.0 / (1.0 + mu_ni * alpha_disp)
739                M1 = (p / (1.0 - (1.0 - p) * np.exp(-theta))) ** r
740                dM1 = -r * (1.0 - p) * np.exp(-theta) / (1.0 - (1.0 - p) * np.exp(-theta)) * M1
741                E_n_phi1 = -dM1 - mu_ni * M1
742            else:
743                # Poisson
744                M1 = np.exp(mu_ni * (np.exp(-theta) - 1.0))
745                dM1 = mu_ni * (-np.exp(-theta)) * M1
746                E_n_phi1 = -dM1 - mu_ni * M1
747
748            if self._sev_family == "gamma":
749                shape = self._shape
750                scale = mu_si / shape
751                M2 = (1.0 / (1.0 + alpha_k * scale)) ** shape
752                dM2 = -shape * scale / (1.0 + alpha_k * scale) * M2
753                E_s_phi2 = -dM2 - mu_si * M2
754            else:
755                # Lognormal: numerical
756                log_sigma = float(np.sqrt(np.log(1.0 + 1.0 / self._shape)))
757                log_mu_i = np.log(mu_si) - 0.5 * log_sigma**2
758                # E[S*exp(-alpha*S)] = integral s*exp(-alpha*s)*f_S(s) ds
759                # Approximate via: -d/dalpha M_S(alpha) (numerical derivative)
760                from insurance_frequency_severity.copula import LaplaceKernelLognormal
761                kern = LaplaceKernelLognormal(alpha=alpha_k)
762                da = 1e-5
763                kern_p = LaplaceKernelLognormal(alpha=alpha_k + da)
764                kern_m = LaplaceKernelLognormal(alpha=alpha_k - da)
765                M2_p = kern_p.mgf(log_mu_i, log_sigma)
766                M2_m = kern_m.mgf(log_mu_i, log_sigma)
767                dM2 = (M2_p - M2_m) / (2 * da)
768                M2 = kern.mgf(log_mu_i, log_sigma)
769                E_s_phi2 = -dM2 - mu_si * M2
770
771            covariance_term = omega * E_n_phi1 * E_s_phi2
772            E_NS_ind = mu_ni * mu_si
773            E_NS_joint = E_NS_ind + covariance_term
774
775            if E_NS_ind > 0:
776                correction[i] = E_NS_joint / E_NS_ind
777            else:
778                correction[i] = 1.0
779
780        return correction
781
782    def _mc_correction(
783        self,
784        mu_n: np.ndarray,
785        mu_s: np.ndarray,
786        n_mc: int,
787        rng: Optional[np.random.Generator],
788    ) -> np.ndarray:
789        """
790        Monte Carlo correction factor for Gaussian/FGM copulas.
791
792        .. note:: Portfolio-average approximation
793            This method computes a single correction factor at the portfolio-mean
794            marginal parameters (mean mu_n, mean mu_s) and stamps it identically
795            across all policies. This is a deliberate approximation: computing a
796            separate n_mc-sample MC simulation per policy would be prohibitively
797            slow for large books, and the copula correction factor is relatively
798            insensitive to moderate variation in mu_n and mu_s when the copula
799            parameter is small.
800
801            For books where mu_n varies by more than an order of magnitude across
802            policies, or where per-policy accuracy is critical, use the Sarmanov
803            copula instead (which has an analytical per-policy correction via
804            _sarmanov_correction).
805        """
806        if rng is None:
807            rng = np.random.default_rng()
808
809        # Use representative marginal parameters for MC
810        mu_n_bar = float(np.mean(mu_n))
811        mu_s_bar = float(np.mean(mu_s))
812
813        if self._freq_family == "nb":
814            fp = {"mu": mu_n_bar, "alpha": float(self._alpha)}
815        else:
816            fp = {"mu": mu_n_bar}
817
818        if self._sev_family == "gamma":
819            sp = {"mu": mu_s_bar, "shape": float(self._shape)}
820        else:
821            log_sigma = float(np.sqrt(np.log(1.0 + 1.0 / self._shape)))
822            sp = {
823                "log_mu": float(np.log(mu_s_bar) - 0.5 * log_sigma**2),
824                "log_sigma": log_sigma,
825            }
826
827        if self.copula_family == "gaussian":
828            copula_obj: GaussianCopulaMixed = self._copula_obj
829            n_samp, s_samp = copula_obj.sample(n_mc, fp, sp, self._freq_family, self._sev_family, rng=rng)
830        else:
831            # FGM: sample u, v from copula then invert marginals
832            fgm: FGMCopula = self._copula_obj
833            u, v = fgm.sample(n_mc, rng=rng)
834            # Invert frequency CDF
835            if self._freq_family == "nb":
836                r = 1.0 / float(self._alpha)
837                p = 1.0 / (1.0 + mu_n_bar * float(self._alpha))
838                n_samp = stats.nbinom.ppf(u, r, p).astype(int)
839            else:
840                n_samp = stats.poisson.ppf(u, mu_n_bar).astype(int)
841            # Invert severity CDF
842            if self._sev_family == "gamma":
843                scale = mu_s_bar / float(self._shape)
844                s_samp = stats.gamma.ppf(v, a=float(self._shape), scale=scale)
845            else:
846                log_sigma = float(np.sqrt(np.log(1.0 + 1.0 / self._shape)))
847                log_mu_bar = float(np.log(mu_s_bar) - 0.5 * log_sigma**2)
848                s_samp = stats.lognorm.ppf(v, s=log_sigma, scale=np.exp(log_mu_bar))
849
850        E_NS_joint = float(np.mean(n_samp * s_samp))
851        E_N_E_S = mu_n_bar * mu_s_bar
852
853        if E_N_E_S > 0:
854            factor_global = E_NS_joint / E_N_E_S
855        else:
856            factor_global = 1.0
857
858        # Stamp the single portfolio-average factor on every policy
859        # (a deliberate approximation; see the docstring note above).
860        return np.full(len(mu_n), factor_global)
861
862    def loss_cost(
863        self,
864        X: pd.DataFrame,
865        exposure: Optional[np.ndarray] = None,
866        n_mc: int = 50_000,
867        rng: Optional[np.random.Generator] = None,
868    ) -> np.ndarray:
869        """
870        Corrected pure premium predictions for new data.
871
872        Returns E[N*S|x] for each row of X, incorporating the dependence
873        correction.
874
875        Parameters
876        ----------
877        X : DataFrame
878            Feature matrix for new policies.
879        exposure : array-like, optional
880            Exposure vector for frequency scaling.
881        n_mc : int
882            Monte Carlo samples (Gaussian/FGM copulas only).
883
884        Returns
885        -------
886        ndarray of corrected pure premiums.
887        """
888        corrections_df = self.premium_correction(X, exposure=exposure, n_mc=n_mc, rng=rng)
889        return corrections_df["premium_joint"].to_numpy()
890
891    def dependence_summary(self) -> pd.DataFrame:
892        """
893        Summary of estimated dependence parameters.
894
895        Returns
896        -------
897        DataFrame with omega (or rho for Gaussian), Spearman's rho estimate,
898        95% confidence interval, AIC, BIC.
899        """
900        if self.omega_ is None:
901            raise RuntimeError("Must call .fit() first")
902
903        ci_lo, ci_hi = self.omega_ci_ if self.omega_ci_ else (np.nan, np.nan)
904
905        param_name = "omega" if self.copula_family == "sarmanov" else "rho"
906
907        return pd.DataFrame([{
908            "copula": self.copula_family,
909            param_name: self.omega_,
910            "ci_95_lo": ci_lo,
911            "ci_95_hi": ci_hi,
912            "spearman_rho": self.rho_,
913            "aic": self.aic_,
914            "bic": self.bic_,
915            "n_policies": self._n_obs,
916            "n_claims": self._n_claims,
917        }])
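
Since JointFreqSev consumes already-fitted marginals, an end-to-end sketch with statsmodels looks like this (claims_df, the covariate names, and the NB dispersion alpha=1.2 are all hypothetical; any object exposing .predict() and .fittedvalues will do):

>>> import statsmodels.api as sm
>>> import statsmodels.formula.api as smf
>>> from insurance_frequency_severity import JointFreqSev
>>> nb_glm = smf.glm("claim_count ~ age + region", data=claims_df,
...                  family=sm.families.NegativeBinomial(alpha=1.2)).fit()
>>> claimants = claims_df[claims_df["claim_count"] > 0]
>>> gamma_glm = smf.glm("avg_severity ~ age + region", data=claimants,
...                     family=sm.families.Gamma(link=sm.families.links.Log())).fit()
>>> model = JointFreqSev(freq_glm=nb_glm, sev_glm=gamma_glm)
>>> model.fit(claims_df, n_col="claim_count", s_col="avg_severity",
...           sev_X=claims_df)  # full frame, so E[S|x] aligns to every policy
>>> model.dependence_summary()
>>> model.premium_correction()["correction_factor"].describe()

Passing the full claims_df as sev_X matters: the severity GLM is fitted on claimants only, but fit() needs an E[S|x] prediction for every policy, and the formula interface predicts happily on zero-claim rows.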


class ConditionalFreqSev:
 924class ConditionalFreqSev:
 925    """
 926    Garrido, Genest, Schulz (2016) conditional frequency-severity model.
 927
 928    The method adds claim count N (or the multi-claim indicator I(N>1)) as a covariate in
 929    the severity GLM. The intuition: if high-count years have systematically
 930    different severity, this covariate captures the dependence directly.
 931
 932    Under Poisson frequency with log-link severity:
 933        log E[S|x, N] = beta'x + gamma * N
 934
 935    The pure premium becomes:
 936        E[N * S | x] = E[N|x] * E[S|x, N=1] * exp(E[N|x] * (exp(gamma) - 1))
 937
 938    The correction factor is exp(gamma) * exp(mu_n * (exp(gamma) - 1)).
 939
 940    This is simpler and more robust than copula estimation. It uses only
 941    standard GLM infrastructure. The tradeoff is that it restricts the
 942    dependence structure to a specific parametric form.
 943
 944    Parameters
 945    ----------
 946    freq_glm : fitted GLM
 947        Frequency model. Used to predict E[N|x].
 948    sev_glm_base : fitted GLM
 949        Base severity model (fitted without N covariate). Used as starting
 950        point; we refit with N added.
 951    n_as_indicator : bool
 952        If True, use the multi-claim indicator I(N>1) rather than N itself
 953        as the covariate; more robust when some policies have very high N.
 954    """
 955
 956    def __init__(
 957        self,
 958        freq_glm: Any,
 959        sev_glm_base: Any,
 960        n_as_indicator: bool = False,
 961    ):
 962        self.freq_glm = freq_glm
 963        self.sev_glm_base = sev_glm_base
 964        self.n_as_indicator = n_as_indicator
 965
 966        self.gamma_: Optional[float] = None
 967        self.gamma_se_: Optional[float] = None
 968        self.sev_glm_conditional_: Optional[Any] = None
 969        self._freq_family: Optional[str] = None
 970        self._mu_n_fitted: Optional[np.ndarray] = None
 971        self._mu_s_fitted: Optional[np.ndarray] = None
 972
 973    def fit(
 974        self,
 975        data: pd.DataFrame,
 976        n_col: str = "claim_count",
 977        s_col: str = "avg_severity",
 978        sev_feature_cols: Optional[list] = None,
 979        freq_X: Optional[pd.DataFrame] = None,
 980        exposure_col: Optional[str] = None,
 981    ) -> "ConditionalFreqSev":
 982        """
 983        Fit the conditional severity model with N as covariate.
 984
 985        Parameters
 986        ----------
 987        data : DataFrame
 988            Policy-level data.
 989        n_col : str
 990            Claim count column.
 991        s_col : str
 992            Average severity column.
 993        sev_feature_cols : list, optional
 994            Feature columns to include in severity GLM alongside N.
 995        freq_X : DataFrame, optional
 996            Feature matrix for frequency GLM prediction.
 997        exposure_col : str, optional
 998            Exposure column for frequency.
 999        """
1000        import statsmodels.api as sm
1001
1002        n = np.asarray(data[n_col], dtype=float)
1003        s = np.asarray(data[s_col], dtype=float)
1004
1005        exposure = (
1006            np.asarray(data[exposure_col], dtype=float) if exposure_col else None
1007        )
1008        mu_n, alpha, freq_family = _extract_freq_params(self.freq_glm, freq_X, exposure)
1009        self._freq_family = freq_family
1010        self._mu_n_fitted = mu_n
1011
1012        # Restrict to positive-claim observations
1013        mask_pos = (n > 0) & (s > 0) & np.isfinite(s)
1014        n_pos = n[mask_pos]
1015        s_pos = s[mask_pos]
1016
1017        if mask_pos.sum() < 20:
1018            raise ValueError(
1019                f"Only {mask_pos.sum()} positive-claim observations. Cannot fit conditional model."
1020            )
1021
1022        # Build covariate matrix for conditional severity GLM
1023        if sev_feature_cols:
1024            X_sev = data.loc[mask_pos, sev_feature_cols].copy()
1025        else:
1026            X_sev = pd.DataFrame(index=data.index[mask_pos])
1027
1028        n_covariate = (n_pos > 1).astype(float) if self.n_as_indicator else n_pos
1029        X_sev = X_sev.copy()
1030        X_sev["_n_covariate"] = np.asarray(n_covariate)
1031        X_sev_const = sm.add_constant(X_sev, has_constant="add")
1032
1033        # Fit Gamma GLM with log link
1034        gamma_family = sm.families.Gamma(link=sm.families.links.Log())
1035        try:
1036            sev_cond = sm.GLM(
1037                s_pos,
1038                X_sev_const,
1039                family=gamma_family,
1040                var_weights=n_pos,
1041            ).fit(disp=True)
1042        except Exception as e:
1043            raise RuntimeError(f"Conditional severity GLM fit failed: {e}") from e
1044
1045        self.sev_glm_conditional_ = sev_cond
1046        self.gamma_ = float(sev_cond.params["_n_covariate"])
1047        self.gamma_se_ = float(sev_cond.bse["_n_covariate"])
1048
1049        # Baseline severity (base-GLM prediction, fitted without the N covariate)
1050        # If freq_X is provided, predict on the full feature matrix so that
1051        # premium_correction() returns predictions for all policies (not just claimants).
1052        if freq_X is not None:
1053            self._mu_s_fitted = np.asarray(self.sev_glm_base.predict(freq_X), dtype=float)
1054        else:
1055            self._mu_s_fitted = np.asarray(self.sev_glm_base.fittedvalues, dtype=float)
1056
1057        return self
1058
1059    def premium_correction(
1060        self,
1061        X: Optional[pd.DataFrame] = None,
1062        exposure: Optional[np.ndarray] = None,
1063    ) -> pd.DataFrame:
1064        """
1065        Compute premium correction factors using the Garrido formula.
1066
1067        Correction = exp(gamma) * exp(mu_n * (exp(gamma) - 1))
1068
1069        For a log-link severity model with Poisson frequency, this is the
1070        exact closed-form correction. For general link functions or non-Poisson
1071        frequency, it is an approximation.
1072
1073        Returns
1074        -------
1075        DataFrame with mu_n, correction_factor, premium columns.
1076        """
1077        if self.gamma_ is None:
1078            raise RuntimeError("Must call .fit() first")
1079
1080        if X is not None:
1081            mu_n = np.asarray(self.freq_glm.predict(X), dtype=float)
1082            mu_s = np.asarray(self.sev_glm_base.predict(X), dtype=float)
1083        else:
1084            mu_n = self._mu_n_fitted
1085            mu_s = self._mu_s_fitted
1086
1087        if exposure is not None:
1088            mu_n = mu_n * np.asarray(exposure, dtype=float)
1089
1090        correction = np.exp(self.gamma_ + mu_n * (np.exp(self.gamma_) - 1.0))
1091        premium_ind = mu_n * mu_s
1092        premium_joint = premium_ind * correction
1093
1094        return pd.DataFrame({
1095            "mu_n": mu_n,
1096            "mu_s": mu_s,
1097            "correction_factor": correction,
1098            "premium_independent": premium_ind,
1099            "premium_joint": premium_joint,
1100        })
1101
1102    def loss_cost(
1103        self,
1104        X: pd.DataFrame,
1105        exposure: Optional[np.ndarray] = None,
1106    ) -> np.ndarray:
1107        """Corrected pure premium predictions."""
1108        return self.premium_correction(X, exposure)["premium_joint"].to_numpy()
1109
1110    def dependence_summary(self) -> pd.DataFrame:
1111        """Summary of estimated gamma (N covariate in severity GLM)."""
1112        if self.gamma_ is None:
1113            raise RuntimeError("Must call .fit() first")
1114        return pd.DataFrame([{
1115            "method": "garrido_conditional",
1116            "gamma": self.gamma_,
1117            "gamma_se": self.gamma_se_,
1118            "gamma_t": self.gamma_ / self.gamma_se_ if self.gamma_se_ else np.nan,
1119            "n_as_indicator": self.n_as_indicator,
1120        }])
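
A matching sketch for the conditional baseline, reusing the hypothetical nb_glm, gamma_glm, and claims_df from the JointFreqSev example above ("age" is assumed numeric, since the conditional refit goes through the plain sm.GLM interface rather than a formula):

>>> import numpy as np
>>> cond = ConditionalFreqSev(freq_glm=nb_glm, sev_glm_base=gamma_glm)
>>> cond.fit(claims_df, n_col="claim_count", s_col="avg_severity",
...          sev_feature_cols=["age"], freq_X=claims_df)
>>> cond.dependence_summary()
>>> # Closed form at, say, gamma = -0.05 and mu_n = 0.10:
>>> float(np.exp(-0.05 + 0.10 * (np.exp(-0.05) - 1.0)))   # ~0.947

Because the factor is exp(gamma + mu_n*(exp(gamma) - 1)) and mu_n is small on most personal-lines books, the correction sits close to exp(gamma) alone; here it is roughly a 5% discount on the independent pure premium.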

Garrido, Genest, Schulz (2016) conditional frequency-severity model.

The method adds claim count N (or the indicator I(N>0)) as a covariate in the severity GLM. The intuition: if high-count years have systematically different severity, this covariate captures the dependence directly.

Under Poisson frequency with log-link severity: log E[S|x, N] = beta'x + gamma * N

The pure premium becomes:

E[N * S | x] = E[N|x] * E[S|x, N=1] * exp(gamma * E[N|x])

The correction factor is exp(gamma) * exp(mu_n * (exp(gamma) - 1)).

This is simpler and more robust than copula estimation. It uses only standard GLM infrastructure. The tradeoff is that it restricts the dependence structure to a specific parametric form.

Parameters

freq_glm : fitted GLM Frequency model. Used to predict E[N|x]. sev_glm_base : fitted GLM Base severity model (fitted without N covariate). Used as starting point; we refit with N added. n_as_indicator : bool If True, use I(N>0) rather than N itself as the covariate. Indicator is more robust when some policies have very high N.

ConditionalFreqSev(freq_glm: Any, sev_glm_base: Any, n_as_indicator: bool = False)
956    def __init__(
957        self,
958        freq_glm: Any,
959        sev_glm_base: Any,
960        n_as_indicator: bool = False,
961    ):
962        self.freq_glm = freq_glm
963        self.sev_glm_base = sev_glm_base
964        self.n_as_indicator = n_as_indicator
965
966        self.gamma_: Optional[float] = None
967        self.gamma_se_: Optional[float] = None
968        self.sev_glm_conditional_: Optional[Any] = None
969        self._freq_family: Optional[str] = None
970        self._mu_n_fitted: Optional[np.ndarray] = None
971        self._mu_s_fitted: Optional[np.ndarray] = None
freq_glm
sev_glm_base
n_as_indicator
gamma_: Optional[float]
gamma_se_: Optional[float]
sev_glm_conditional_: Optional[Any]
def fit( self, data: pandas.DataFrame, n_col: str = 'claim_count', s_col: str = 'avg_severity', sev_feature_cols: Optional[list] = None, freq_X: Optional[pandas.DataFrame] = None, exposure_col: Optional[str] = None) -> ConditionalFreqSev:
 973    def fit(
 974        self,
 975        data: pd.DataFrame,
 976        n_col: str = "claim_count",
 977        s_col: str = "avg_severity",
 978        sev_feature_cols: Optional[list] = None,
 979        freq_X: Optional[pd.DataFrame] = None,
 980        exposure_col: Optional[str] = None,
 981    ) -> "ConditionalFreqSev":
 982        """
 983        Fit the conditional severity model with N as covariate.
 984
 985        Parameters
 986        ----------
 987        data : DataFrame
 988            Policy-level data.
 989        n_col : str
 990            Claim count column.
 991        s_col : str
 992            Average severity column.
 993        sev_feature_cols : list, optional
 994            Feature columns to include in severity GLM alongside N.
 995        freq_X : DataFrame, optional
 996            Feature matrix for frequency GLM prediction.
 997        exposure_col : str, optional
 998            Exposure column for frequency.
 999        """
1000        import statsmodels.api as sm
1001
1002        n = np.asarray(data[n_col], dtype=float)
1003        s = np.asarray(data[s_col], dtype=float)
1004
1005        exposure = (
1006            np.asarray(data[exposure_col], dtype=float) if exposure_col else None
1007        )
1008        mu_n, alpha, freq_family = _extract_freq_params(self.freq_glm, freq_X, exposure)
1009        self._freq_family = freq_family
1010        self._mu_n_fitted = mu_n
1011
1012        # Restrict to positive-claim observations
1013        mask_pos = (n > 0) & (s > 0) & np.isfinite(s)
1014        n_pos = n[mask_pos]
1015        s_pos = s[mask_pos]
1016
1017        if mask_pos.sum() < 20:
1018            raise ValueError(
1019                f"Only {mask_pos.sum()} positive-claim observations. Cannot fit conditional model."
1020            )
1021
1022        # Build covariate matrix for conditional severity GLM
1023        if sev_feature_cols:
1024            X_sev = data.loc[mask_pos, sev_feature_cols].copy()
1025        else:
1026            X_sev = pd.DataFrame(index=data.index[mask_pos])
1027
1028        n_covariate = (n_pos > 0).astype(float) if self.n_as_indicator else n_pos  # NB: with n_as_indicator=True this is identically 1.0 here (all rows have n_pos > 0)
1029        X_sev = X_sev.copy()
1030        X_sev["_n_covariate"] = np.asarray(n_covariate)
1031        X_sev_const = sm.add_constant(X_sev, has_constant="add")
1032
1033        # Fit Gamma GLM with log link
1034        gamma_family = sm.families.Gamma(link=sm.families.links.Log())
1035        try:
1036            sev_cond = sm.GLM(
1037                s_pos,
1038                X_sev_const,
1039                family=gamma_family,
1040                var_weights=n_pos,
1041            ).fit(disp=True)
1042        except Exception as e:
1043            raise RuntimeError(f"Conditional severity GLM fit failed: {e}") from e
1044
1045        self.sev_glm_conditional_ = sev_cond
1046        self.gamma_ = float(sev_cond.params["_n_covariate"])
1047        self.gamma_se_ = float(sev_cond.bse["_n_covariate"])
1048
1049        # Baseline severity from sev_glm_base (fitted without the N covariate)
1050        # If freq_X is provided, predict on the full feature matrix so that
1051        # premium_correction() returns predictions for all policies (not just claimants).
1052        if freq_X is not None:
1053            self._mu_s_fitted = np.asarray(self.sev_glm_base.predict(freq_X), dtype=float)
1054        else:
1055            self._mu_s_fitted = np.asarray(self.sev_glm_base.fittedvalues, dtype=float)
1056
1057        return self

Fit the conditional severity model with N as covariate.

Parameters

data : DataFrame
    Policy-level data.
n_col : str
    Claim count column.
s_col : str
    Average severity column.
sev_feature_cols : list, optional
    Feature columns to include in the severity GLM alongside N.
freq_X : DataFrame, optional
    Feature matrix for frequency GLM prediction.
exposure_col : str, optional
    Exposure column for frequency.
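
For example, to include rating factors in the conditional severity GLM and an exposure offset in the frequency prediction (column names hypothetical):

>>> cond.fit(
...     claims_df,
...     sev_feature_cols=["driver_age", "region"],
...     freq_X=X,
...     exposure_col="exposure",
... )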

def premium_correction( self, X: Optional[pandas.DataFrame] = None, exposure: Optional[numpy.ndarray] = None) -> pandas.DataFrame:
1059    def premium_correction(
1060        self,
1061        X: Optional[pd.DataFrame] = None,
1062        exposure: Optional[np.ndarray] = None,
1063    ) -> pd.DataFrame:
1064        """
1065        Compute premium correction factors using the Garrido formula.
1066
1067        Correction = exp(gamma) * exp(mu_n * (exp(gamma) - 1))
1068
1069        For a log-link severity model with Poisson frequency, this is the
1070        exact closed-form correction. For general link functions or non-Poisson
1071        frequency, it is an approximation.
1072
1073        Returns
1074        -------
1075        DataFrame with mu_n, mu_s, correction_factor, premium_independent, and premium_joint columns.
1076        """
1077        if self.gamma_ is None:
1078            raise RuntimeError("Must call .fit() first")
1079
1080        if X is not None:
1081            mu_n = np.asarray(self.freq_glm.predict(X), dtype=float)
1082            mu_s = np.asarray(self.sev_glm_base.predict(X), dtype=float)
1083        else:
1084            mu_n = self._mu_n_fitted
1085            mu_s = self._mu_s_fitted
1086
1087        if exposure is not None:
1088            mu_n = mu_n * np.asarray(exposure, dtype=float)
1089
1090        correction = np.exp(self.gamma_ + mu_n * (np.exp(self.gamma_) - 1.0))
1091        premium_ind = mu_n * mu_s
1092        premium_joint = premium_ind * correction
1093
1094        return pd.DataFrame({
1095            "mu_n": mu_n,
1096            "mu_s": mu_s,
1097            "correction_factor": correction,
1098            "premium_independent": premium_ind,
1099            "premium_joint": premium_joint,
1100        })

Compute premium correction factors using the Garrido formula.

Correction = exp(gamma) * exp(mu_n * (exp(gamma) - 1))

For a log-link severity model with Poisson frequency, this is the exact closed-form correction. For general link functions or non-Poisson frequency, it is an approximation.

Returns

DataFrame with mu_n, mu_s, correction_factor, premium_independent, and premium_joint columns.
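
Continuing the sketch above: the joint-to-independent ratio in the output reproduces the correction factor, which is a useful sanity check.

>>> import numpy as np
>>> out = cond.premium_correction(X_new)
>>> bool(np.allclose(out["premium_joint"] / out["premium_independent"],
...                  out["correction_factor"]))
True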

def loss_cost( self, X: pandas.DataFrame, exposure: Optional[numpy.ndarray] = None) -> numpy.ndarray:
1102    def loss_cost(
1103        self,
1104        X: pd.DataFrame,
1105        exposure: Optional[np.ndarray] = None,
1106    ) -> np.ndarray:
1107        """Corrected pure premium predictions."""
1108        return self.premium_correction(X, exposure)["premium_joint"].to_numpy()

Corrected pure premium predictions.

def dependence_summary(self) -> pandas.DataFrame:
1110    def dependence_summary(self) -> pd.DataFrame:
1111        """Summary of estimated gamma (N covariate in severity GLM)."""
1112        if self.gamma_ is None:
1113            raise RuntimeError("Must call .fit() first")
1114        return pd.DataFrame([{
1115            "method": "garrido_conditional",
1116            "gamma": self.gamma_,
1117            "gamma_se": self.gamma_se_,
1118            "gamma_t": self.gamma_ / self.gamma_se_ if self.gamma_se_ else np.nan,
1119            "n_as_indicator": self.n_as_indicator,
1120        }])

Summary of estimated gamma (N covariate in severity GLM).
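
A rough significance read on the fitted gamma (assuming the fitted cond from the sketches above; 1.96 is the usual two-sided 5% normal cutoff):

>>> ds = cond.dependence_summary()
>>> abs(float(ds.loc[0, "gamma_t"])) > 1.96  # True suggests material dependence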

class DependenceTest:
 29class DependenceTest:
 30    """
 31    Tests the null hypothesis of independence between claim frequency and severity.
 32
 33    Three test statistics are computed:
 34    - Kendall tau, with an asymptotic p-value
 35    - Spearman rho, with an asymptotic p-value
 36    - a permutation test for Kendall tau (optional; see n_permutations)
 37
 38    Only positive-claim observations are used (n=0 policies contribute no
 39    severity information).
 40
 41    Parameters
 42    ----------
 43    n_permutations : int
 44        Number of permutations for the permutation test. Set to 0 to use
 45        asymptotic approximation only.
 46
 47    Examples
 48    --------
 49    >>> test = DependenceTest()
 50    >>> result = test.fit(n_positive, s_positive)
 51    >>> result.summary()
 52    """
 53
 54    def __init__(self, n_permutations: int = 1000):
 55        self.n_permutations = n_permutations
 56        self.tau_: Optional[float] = None
 57        self.tau_pval_: Optional[float] = None
 58        self.rho_s_: Optional[float] = None
 59        self.rho_s_pval_: Optional[float] = None
 60        self.n_obs_: Optional[int] = None
 61
 62    def fit(
 63        self,
 64        n: np.ndarray,
 65        s: np.ndarray,
 66        rng: Optional[np.random.Generator] = None,
 67    ) -> "DependenceTest":
 68        """
 69        Run the dependence tests.
 70
 71        Parameters
 72        ----------
 73        n : array-like
 74            Claim counts for positive-claim policies only.
 75        s : array-like
 76            Average claim severities (must match n in length).
 77        rng : numpy Generator, optional
 78            For reproducible permutation tests.
 79        """
 80        n = np.asarray(n, dtype=float)
 81        s = np.asarray(s, dtype=float)
 82
 83        # Filter to valid positive-claim observations
 84        mask = (n > 0) & (s > 0) & np.isfinite(n) & np.isfinite(s)
 85        n_valid = n[mask]
 86        s_valid = s[mask]
 87
 88        if len(n_valid) < 10:
 89            raise ValueError(
 90                f"Only {len(n_valid)} valid positive-claim observations. "
 91                "Need at least 10 for dependence testing."
 92            )
 93
 94        self.n_obs_ = int(len(n_valid))
 95
 96        # Kendall tau
 97        tau, pval_tau = stats.kendalltau(n_valid, s_valid)
 98        self.tau_ = float(tau)
 99        self.tau_pval_ = float(pval_tau)
100
101        # Spearman rho
102        rho_s, pval_rho = stats.spearmanr(n_valid, s_valid)
103        self.rho_s_ = float(rho_s)
104        self.rho_s_pval_ = float(pval_rho)
105
106        # Permutation test for Kendall tau
107        if self.n_permutations > 0:
108            if rng is None:
109                rng = np.random.default_rng()
110            perm_taus = np.empty(self.n_permutations)
111            for i in range(self.n_permutations):
112                s_perm = rng.permutation(s_valid)
113                t, _ = stats.kendalltau(n_valid, s_perm)
114                perm_taus[i] = t
115            self.tau_pval_perm_ = float(np.mean(np.abs(perm_taus) >= np.abs(self.tau_)))
116        else:
117            self.tau_pval_perm_ = None
118
119        return self
120
121    def summary(self) -> pd.DataFrame:
122        """Return test results as a DataFrame."""
123        if self.tau_ is None:
124            raise RuntimeError("Must call .fit() first")
125
126        rows = [
127            {
128                "test": "Kendall tau (asymptotic)",
129                "statistic": self.tau_,
130                "p_value": self.tau_pval_,
131                "n_obs": self.n_obs_,
132                "conclusion": "reject H0 (dependence)" if self.tau_pval_ < 0.05 else "fail to reject H0 (independence)",
133            },
134            {
135                "test": "Spearman rho (asymptotic)",
136                "statistic": self.rho_s_,
137                "p_value": self.rho_s_pval_,
138                "n_obs": self.n_obs_,
139                "conclusion": "reject H0 (dependence)" if self.rho_s_pval_ < 0.05 else "fail to reject H0 (independence)",
140            },
141        ]
142
143        if hasattr(self, "tau_pval_perm_") and self.tau_pval_perm_ is not None:
144            rows.append({
145                "test": f"Kendall tau (permutation, B={self.n_permutations})",
146                "statistic": self.tau_,
147                "p_value": self.tau_pval_perm_,
148                "n_obs": self.n_obs_,
149                "conclusion": "reject H0 (dependence)" if self.tau_pval_perm_ < 0.05 else "fail to reject H0 (independence)",
150            })
151
152        return pd.DataFrame(rows)

Tests the null hypothesis of independence between claim frequency and severity.

Three test statistics are computed:

  • Kendall tau, with an asymptotic p-value
  • Spearman rho, with an asymptotic p-value
  • a permutation test for Kendall tau (optional; controlled by n_permutations)

Only positive-claim observations are used (n=0 policies contribute no severity information).
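
Building the positive-claim arrays from a policy-level frame, using the package's default column names (claims_df hypothetical):

>>> mask = (claims_df["claim_count"] > 0) & (claims_df["avg_severity"] > 0)
>>> n_positive = claims_df.loc[mask, "claim_count"].to_numpy()
>>> s_positive = claims_df.loc[mask, "avg_severity"].to_numpy()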

Parameters

n_permutations : int
    Number of permutations for the permutation test. Set to 0 to use the asymptotic approximation only.

Examples

>>> test = DependenceTest()
>>> result = test.fit(n_positive, s_positive)
>>> result.summary()
DependenceTest(n_permutations: int = 1000)
54    def __init__(self, n_permutations: int = 1000):
55        self.n_permutations = n_permutations
56        self.tau_: Optional[float] = None
57        self.tau_pval_: Optional[float] = None
58        self.rho_s_: Optional[float] = None
59        self.rho_s_pval_: Optional[float] = None
60        self.n_obs_: Optional[int] = None
n_permutations
tau_: Optional[float]
tau_pval_: Optional[float]
rho_s_: Optional[float]
rho_s_pval_: Optional[float]
n_obs_: Optional[int]
def fit( self, n: numpy.ndarray, s: numpy.ndarray, rng: Optional[numpy.random._generator.Generator] = None) -> DependenceTest:
 62    def fit(
 63        self,
 64        n: np.ndarray,
 65        s: np.ndarray,
 66        rng: Optional[np.random.Generator] = None,
 67    ) -> "DependenceTest":
 68        """
 69        Run the dependence tests.
 70
 71        Parameters
 72        ----------
 73        n : array-like
 74            Claim counts for positive-claim policies only.
 75        s : array-like
 76            Average claim severities (must match n in length).
 77        rng : numpy Generator, optional
 78            For reproducible permutation tests.
 79        """
 80        n = np.asarray(n, dtype=float)
 81        s = np.asarray(s, dtype=float)
 82
 83        # Filter to valid positive-claim observations
 84        mask = (n > 0) & (s > 0) & np.isfinite(n) & np.isfinite(s)
 85        n_valid = n[mask]
 86        s_valid = s[mask]
 87
 88        if len(n_valid) < 10:
 89            raise ValueError(
 90                f"Only {len(n_valid)} valid positive-claim observations. "
 91                "Need at least 10 for dependence testing."
 92            )
 93
 94        self.n_obs_ = int(len(n_valid))
 95
 96        # Kendall tau
 97        tau, pval_tau = stats.kendalltau(n_valid, s_valid)
 98        self.tau_ = float(tau)
 99        self.tau_pval_ = float(pval_tau)
100
101        # Spearman rho
102        rho_s, pval_rho = stats.spearmanr(n_valid, s_valid)
103        self.rho_s_ = float(rho_s)
104        self.rho_s_pval_ = float(pval_rho)
105
106        # Permutation test for Kendall tau
107        if self.n_permutations > 0:
108            if rng is None:
109                rng = np.random.default_rng()
110            perm_taus = np.empty(self.n_permutations)
111            for i in range(self.n_permutations):
112                s_perm = rng.permutation(s_valid)
113                t, _ = stats.kendalltau(n_valid, s_perm)
114                perm_taus[i] = t
115            self.tau_pval_perm_ = float(np.mean(np.abs(perm_taus) >= np.abs(self.tau_)))
116        else:
117            self.tau_pval_perm_ = None
118
119        return self

Run the dependence tests.

Parameters

n : array-like
    Claim counts for positive-claim policies only.
s : array-like
    Average claim severities (must match n in length).
rng : numpy Generator, optional
    For reproducible permutation tests.
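
A reproducible run, seeding the permutation test explicitly (fit returns self, so the call chains):

>>> import numpy as np
>>> rng = np.random.default_rng(42)
>>> result = DependenceTest(n_permutations=500).fit(n_positive, s_positive, rng=rng)
>>> result.summary()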

def summary(self) -> pandas.DataFrame:
121    def summary(self) -> pd.DataFrame:
122        """Return test results as a DataFrame."""
123        if self.tau_ is None:
124            raise RuntimeError("Must call .fit() first")
125
126        rows = [
127            {
128                "test": "Kendall tau (asymptotic)",
129                "statistic": self.tau_,
130                "p_value": self.tau_pval_,
131                "n_obs": self.n_obs_,
132                "conclusion": "reject H0 (dependence)" if self.tau_pval_ < 0.05 else "fail to reject H0 (independence)",
133            },
134            {
135                "test": "Spearman rho (asymptotic)",
136                "statistic": self.rho_s_,
137                "p_value": self.rho_s_pval_,
138                "n_obs": self.n_obs_,
139                "conclusion": "reject H0 (dependence)" if self.rho_s_pval_ < 0.05 else "fail to reject H0 (independence)",
140            },
141        ]
142
143        if hasattr(self, "tau_pval_perm_") and self.tau_pval_perm_ is not None:
144            rows.append({
145                "test": f"Kendall tau (permutation, B={self.n_permutations})",
146                "statistic": self.tau_,
147                "p_value": self.tau_pval_perm_,
148                "n_obs": self.n_obs_,
149                "conclusion": "reject H0 (dependence)" if self.tau_pval_perm_ < 0.05 else "fail to reject H0 (independence)",
150            })
151
152        return pd.DataFrame(rows)

Return test results as a DataFrame.

class CopulaGOF:
155class CopulaGOF:
156    """
157    Goodness-of-fit testing for a fitted copula model.
158
159    Uses the Rosenblatt probability integral transform (RPIT) to map the
160    bivariate sample onto uniform margins, then tests uniformity of the
161    transformed data using a Kolmogorov-Smirnov test on each margin
162    separately.
163
164    For the Sarmanov copula, the RPIT involves computing the conditional CDF
165    F_{S|N}(s|n) = f(n,s) / f_N(n), which is tractable analytically.
166
167    Parameters
168    ----------
169    joint_model : JointFreqSev
170        A fitted JointFreqSev model.
171    """
172
173    def __init__(self, joint_model: Any):
174        self.joint_model = joint_model
175        self.ks_stat_u_: Optional[float] = None
176        self.ks_stat_v_: Optional[float] = None
177        self.ks_pval_u_: Optional[float] = None
178        self.ks_pval_v_: Optional[float] = None
179        self.n_obs_: Optional[int] = None
180
181    def fit(
182        self,
183        n: np.ndarray,
184        s: np.ndarray,
185        freq_params: list,
186        sev_params: list,
187    ) -> "CopulaGOF":
188        """
189        Compute RPIT and test uniformity.
190
191        For positive-claim observations only.
192        """
193        n = np.asarray(n, dtype=float)
194        s = np.asarray(s, dtype=float)
195        mask = (n > 0) & (s > 0) & np.isfinite(s)
196        n_pos = n[mask]
197        s_pos = s[mask]
198        fp_pos = [freq_params[i] for i in range(len(n)) if mask[i]]
199        sp_pos = [sev_params[i] for i in range(len(n)) if mask[i]]
200        self.n_obs_ = int(len(n_pos))
201
202        # Marginal CDF transforms (PIT for each margin)
203        # U_N = mid-point PIT for discrete N
204        # V_S = CDF of S
205        from insurance_frequency_severity.copula import GaussianCopulaMixed
206        gc = GaussianCopulaMixed(rho=0.0)
207
208        freq_family = self.joint_model._freq_family
209        sev_family = self.joint_model._sev_family
210
211        fp_dict = {
212            "mu": np.array([p["mu"] for p in fp_pos]),
213            **({
214                "alpha": np.array([p["alpha"] for p in fp_pos])
215            } if freq_family == "nb" else {}),
216        }
217        if sev_family == "gamma":
218            sp_dict = {
219                "mu": np.array([p["mu"] for p in sp_pos]),
220                "shape": np.array([p["shape"] for p in sp_pos]),
221            }
222        else:
223            sp_dict = {
224                "log_mu": np.array([p["log_mu"] for p in sp_pos]),
225                "log_sigma": np.array([p["log_sigma"] for p in sp_pos]),
226            }
227
228        u = gc._pit_freq(n_pos, fp_dict, freq_family)
229        v = gc._pit_sev(s_pos, sp_dict, sev_family)
230        u = np.clip(u, 1e-8, 1 - 1e-8)
231        v = np.clip(v, 1e-8, 1 - 1e-8)
232
233        self._u = u
234        self._v = v
235
236        # KS test against Uniform(0,1)
237        ks_u = stats.kstest(u, "uniform")
238        ks_v = stats.kstest(v, "uniform")
239
240        self.ks_stat_u_ = float(ks_u.statistic)
241        self.ks_stat_v_ = float(ks_v.statistic)
242        self.ks_pval_u_ = float(ks_u.pvalue)
243        self.ks_pval_v_ = float(ks_v.pvalue)
244
245        return self
246
247    def summary(self) -> pd.DataFrame:
248        """Return GOF test results."""
249        if self.ks_stat_u_ is None:
250            raise RuntimeError("Must call .fit() first")
251
252        return pd.DataFrame([
253            {
254                "margin": "frequency (PIT)",
255                "ks_statistic": self.ks_stat_u_,
256                "ks_pvalue": self.ks_pval_u_,
257                "assessment": "acceptable" if self.ks_pval_u_ > 0.05 else "poor fit",
258            },
259            {
260                "margin": "severity (CDF)",
261                "ks_statistic": self.ks_stat_v_,
262                "ks_pvalue": self.ks_pval_v_,
263                "assessment": "acceptable" if self.ks_pval_v_ > 0.05 else "poor fit",
264            },
265        ])

Goodness-of-fit testing for a fitted copula model.

Uses the Rosenblatt probability integral transform (RPIT) to map the bivariate sample onto uniform margins, then tests uniformity of the transformed data using a Kolmogorov-Smirnov test on each margin separately.

For the Sarmanov copula, the RPIT involves computing the conditional CDF F_{S|N}(s|n) = f(n,s) / f_N(n), which is tractable analytically.
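
The uniformity check at the heart of the test is plain scipy. A standalone sketch of the idea on simulated data, where the PIT uses the true Gamma margin so the transformed values are uniform by construction:

>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng(0)
>>> s_sim = rng.gamma(shape=2.0, scale=500.0, size=2000)
>>> v = stats.gamma.cdf(s_sim, a=2.0, scale=500.0)  # PIT with the true margin
>>> stats.kstest(v, "uniform")  # expect a large p-value here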

Parameters

joint_model : JointFreqSev
    A fitted JointFreqSev model.

CopulaGOF(joint_model: Any)
173    def __init__(self, joint_model: Any):
174        self.joint_model = joint_model
175        self.ks_stat_u_: Optional[float] = None
176        self.ks_stat_v_: Optional[float] = None
177        self.ks_pval_u_: Optional[float] = None
178        self.ks_pval_v_: Optional[float] = None
179        self.n_obs_: Optional[int] = None
joint_model
ks_stat_u_: Optional[float]
ks_stat_v_: Optional[float]
ks_pval_u_: Optional[float]
ks_pval_v_: Optional[float]
n_obs_: Optional[int]
def fit( self, n: numpy.ndarray, s: numpy.ndarray, freq_params: list, sev_params: list) -> CopulaGOF:
181    def fit(
182        self,
183        n: np.ndarray,
184        s: np.ndarray,
185        freq_params: list,
186        sev_params: list,
187    ) -> "CopulaGOF":
188        """
189        Compute RPIT and test uniformity.
190
191        For positive-claim observations only.
192        """
193        n = np.asarray(n, dtype=float)
194        s = np.asarray(s, dtype=float)
195        mask = (n > 0) & (s > 0) & np.isfinite(s)
196        n_pos = n[mask]
197        s_pos = s[mask]
198        fp_pos = [freq_params[i] for i in range(len(n)) if mask[i]]
199        sp_pos = [sev_params[i] for i in range(len(n)) if mask[i]]
200        self.n_obs_ = int(len(n_pos))
201
202        # Marginal CDF transforms (PIT for each margin)
203        # U_N = mid-point PIT for discrete N
204        # V_S = CDF of S
205        from insurance_frequency_severity.copula import GaussianCopulaMixed
206        gc = GaussianCopulaMixed(rho=0.0)
207
208        freq_family = self.joint_model._freq_family
209        sev_family = self.joint_model._sev_family
210
211        fp_dict = {
212            "mu": np.array([p["mu"] for p in fp_pos]),
213            **({
214                "alpha": np.array([p["alpha"] for p in fp_pos])
215            } if freq_family == "nb" else {}),
216        }
217        if sev_family == "gamma":
218            sp_dict = {
219                "mu": np.array([p["mu"] for p in sp_pos]),
220                "shape": np.array([p["shape"] for p in sp_pos]),
221            }
222        else:
223            sp_dict = {
224                "log_mu": np.array([p["log_mu"] for p in sp_pos]),
225                "log_sigma": np.array([p["log_sigma"] for p in sp_pos]),
226            }
227
228        u = gc._pit_freq(n_pos, fp_dict, freq_family)
229        v = gc._pit_sev(s_pos, sp_dict, sev_family)
230        u = np.clip(u, 1e-8, 1 - 1e-8)
231        v = np.clip(v, 1e-8, 1 - 1e-8)
232
233        self._u = u
234        self._v = v
235
236        # KS test against Uniform(0,1)
237        ks_u = stats.kstest(u, "uniform")
238        ks_v = stats.kstest(v, "uniform")
239
240        self.ks_stat_u_ = float(ks_u.statistic)
241        self.ks_stat_v_ = float(ks_v.statistic)
242        self.ks_pval_u_ = float(ks_u.pvalue)
243        self.ks_pval_v_ = float(ks_v.pvalue)
244
245        return self

Compute RPIT and test uniformity.

For positive-claim observations only.
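
The freq_params / sev_params arguments are per-policy dicts keyed as in the code above ("mu", plus "alpha" for negative binomial frequency and "shape" for gamma severity). A sketch, assuming a fitted JointFreqSev `model` and arrays mu_n / mu_s of fitted marginal means, with the gamma shape chosen purely for illustration:

>>> freq_params = [{"mu": m} for m in mu_n]               # Poisson margins
>>> sev_params = [{"mu": m, "shape": 2.0} for m in mu_s]  # Gamma margins
>>> gof = CopulaGOF(model).fit(n, s, freq_params, sev_params)
>>> gof.summary()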

def summary(self) -> pandas.DataFrame:
247    def summary(self) -> pd.DataFrame:
248        """Return GOF test results."""
249        if self.ks_stat_u_ is None:
250            raise RuntimeError("Must call .fit() first")
251
252        return pd.DataFrame([
253            {
254                "margin": "frequency (PIT)",
255                "ks_statistic": self.ks_stat_u_,
256                "ks_pvalue": self.ks_pval_u_,
257                "assessment": "acceptable" if self.ks_pval_u_ > 0.05 else "poor fit",
258            },
259            {
260                "margin": "severity (CDF)",
261                "ks_statistic": self.ks_stat_v_,
262                "ks_pvalue": self.ks_pval_v_,
263                "assessment": "acceptable" if self.ks_pval_v_ > 0.05 else "poor fit",
264            },
265        ])

Return GOF test results.

def compare_copulas( n: numpy.ndarray, s: numpy.ndarray, freq_glm: Any, sev_glm: Any, freq_X: Optional[pandas.DataFrame] = None, sev_X: Optional[pandas.DataFrame] = None, exposure: Optional[numpy.ndarray] = None, rng: Optional[numpy.random._generator.Generator] = None) -> pandas.DataFrame:
268def compare_copulas(
269    n: np.ndarray,
270    s: np.ndarray,
271    freq_glm: Any,
272    sev_glm: Any,
273    freq_X: Optional[pd.DataFrame] = None,
274    sev_X: Optional[pd.DataFrame] = None,
275    exposure: Optional[np.ndarray] = None,
276    rng: Optional[np.random.Generator] = None,
277) -> pd.DataFrame:
278    """
279    Fit all three copula families and compare by AIC/BIC.
280
281    This is the primary diagnostic for deciding whether dependence modelling
282    is warranted. The workflow:
283
284    1. Fit Sarmanov, Gaussian, and FGM copulas on the same fitted margins.
285    2. Compare AIC/BIC. The independence model is nested in each family at
286       omega = 0 (or rho = 0), so a dependence parameter near zero with no
287       AIC improvement means dependence modelling is not warranted.
288
289    Parameters
290    ----------
291    n : array-like
292        Claim counts for all policies.
293    s : array-like
294        Average severities (0 or NaN for zero-claim policies).
295    freq_glm : fitted GLM
296        Frequency model.
297    sev_glm : fitted GLM
298        Severity model.
299    freq_X, sev_X : DataFrame, optional
300        Feature matrices for prediction.
301    exposure : array-like, optional
302        Exposure for frequency.
303    rng : numpy Generator, optional
304
305    Returns
306    -------
307    DataFrame with copula, omega_or_rho, spearman_rho, aic, bic, converged, and delta_aic columns.
308    """
309    from insurance_frequency_severity.joint import JointFreqSev
310
311    data = pd.DataFrame({"claim_count": n, "avg_severity": s})
312    if exposure is not None:
313        data["_exposure"] = exposure
314
315    results = []
316
317    for copula_family in ["sarmanov", "gaussian", "fgm"]:
318        model = JointFreqSev(freq_glm=freq_glm, sev_glm=sev_glm, copula=copula_family)
319        try:
320            import warnings
321            with warnings.catch_warnings():
322                warnings.simplefilter("ignore")
323                model.fit(
324                    data,
325                    n_col="claim_count",
326                    s_col="avg_severity",
327                    freq_X=freq_X,
328                    sev_X=sev_X,
329                    exposure_col="_exposure" if exposure is not None else None,
330                    ci_method="profile",
331                    rng=rng,
332                )
333            summary = model.dependence_summary()
334            results.append({
335                "copula": copula_family,
336                "omega_or_rho": float(model.omega_),
337                "spearman_rho": float(model.rho_) if model.rho_ is not None else np.nan,
338                "aic": float(model.aic_),
339                "bic": float(model.bic_),
340                "converged": True,
341            })
342        except Exception as e:
343            results.append({
344                "copula": copula_family,
345                "omega_or_rho": np.nan,
346                "spearman_rho": np.nan,
347                "aic": np.nan,
348                "bic": np.nan,
349                "converged": False,
350            })
351
352    df = pd.DataFrame(results)
353    if df["aic"].notna().any():
354        min_aic = df["aic"].min()
355        df["delta_aic"] = df["aic"] - min_aic
356        df = df.sort_values("aic")
357
358    return df.reset_index(drop=True)

Fit all three copula families and compare by AIC/BIC.

This is the primary diagnostic for deciding whether dependence modelling is warranted. The workflow:

  1. Fit Sarmanov, Gaussian, and FGM copulas on the same fitted margins.
  2. Compare AIC/BIC. The independence model is nested in each family at omega = 0 (or rho = 0), so a dependence parameter near zero with no AIC improvement means dependence modelling is not warranted.

Parameters

n : array-like
    Claim counts for all policies.
s : array-like
    Average severities (0 or NaN for zero-claim policies).
freq_glm : fitted GLM
    Frequency model.
sev_glm : fitted GLM
    Severity model.
freq_X, sev_X : DataFrame, optional
    Feature matrices for prediction.
exposure : array-like, optional
    Exposure for frequency.
rng : numpy Generator, optional

Returns

DataFrame with copula, omega_or_rho, spearman_rho, aic, bic, converged, and delta_aic columns.
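
Typical call, assuming claim-count and severity arrays n and s over all policies, the two fitted marginal GLMs, and a shared feature matrix X (names hypothetical):

>>> comparison = compare_copulas(n, s, freq_glm, sev_glm, freq_X=X, sev_X=X)
>>> comparison[["copula", "omega_or_rho", "aic", "delta_aic", "converged"]]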

class JointModelReport:
 30class JointModelReport:
 31    """
 32    Generates an HTML report for a fitted JointFreqSev model.
 33
 34    Parameters
 35    ----------
 36    joint_model : JointFreqSev
 37        A fitted joint model.
 38    dependence_test : DependenceTest, optional
 39        Pre-fitted dependence test results.
 40    copula_comparison : DataFrame, optional
 41        Output from compare_copulas().
 42
 43    Examples
 44    --------
 45    >>> report = JointModelReport(model, test)
 46    >>> report.to_html("model_report.html")
 47    >>> d = report.to_dict()
 48    """
 49
 50    def __init__(
 51        self,
 52        joint_model: Any,
 53        dependence_test: Optional[Any] = None,
 54        copula_comparison: Optional[pd.DataFrame] = None,
 55    ):
 56        self.joint_model = joint_model
 57        self.dependence_test = dependence_test
 58        self.copula_comparison = copula_comparison
 59
 60    def to_dict(self) -> Dict:
 61        """
 62        Return report data as a plain dictionary.
 63
 64        Useful for programmatic consumption without generating HTML.
 65        """
 66        result = {}
 67
 68        model = self.joint_model
 69        if hasattr(model, "omega_") and model.omega_ is not None:
 70            result["copula_family"] = model.copula_family
 71            result["omega"] = model.omega_
 72            result["spearman_rho"] = model.rho_
 73            result["omega_ci"] = model.omega_ci_
 74            result["aic"] = model.aic_
 75            result["bic"] = model.bic_
 76            result["n_policies"] = model._n_obs
 77            result["n_claims"] = model._n_claims
 78
 79        if self.dependence_test is not None and hasattr(self.dependence_test, "tau_"):
 80            dt = self.dependence_test
 81            result["kendall_tau"] = dt.tau_
 82            result["kendall_pval"] = dt.tau_pval_
 83            result["spearman_rho_test"] = dt.rho_s_
 84            result["spearman_pval"] = dt.rho_s_pval_
 85
 86        if self.copula_comparison is not None:
 87            result["copula_comparison"] = self.copula_comparison.to_dict(orient="records")
 88
 89        return result
 90
 91    def _make_fig_base64(self, fig) -> str:
 92        """Convert matplotlib figure to base64 PNG string."""
 93        buf = io.BytesIO()
 94        fig.savefig(buf, format="png", bbox_inches="tight", dpi=100)
 95        buf.seek(0)
 96        return base64.b64encode(buf.read()).decode("utf-8")
 97
 98    def _plot_correction_distribution(
 99        self, correction_df: pd.DataFrame
100    ) -> Optional[str]:
101        """Histogram of premium correction factors."""
102        try:
103            import matplotlib
104            matplotlib.use("Agg")
105            import matplotlib.pyplot as plt
106
107            factors = correction_df["correction_factor"].values
108            fig, ax = plt.subplots(figsize=(8, 4))
109
110            ax.hist(factors, bins=40, edgecolor="white", color="#2271b5", alpha=0.85)
111            ax.axvline(1.0, color="#cc0000", linewidth=1.5, linestyle="--", label="Independence (1.0)")
112            ax.axvline(float(np.mean(factors)), color="#f28c28", linewidth=1.5,
113                       linestyle="-", label=f"Mean = {np.mean(factors):.4f}")
114
115            ax.set_xlabel("Premium correction factor")
116            ax.set_ylabel("Number of policies")
117            ax.set_title("Distribution of premium correction factors")
118            ax.legend()
119            ax.grid(True, alpha=0.3)
120
121            result = self._make_fig_base64(fig)
122            plt.close(fig)
123            return result
124        except Exception:
125            return None
126
127    def _plot_dependence_scatter(
128        self, n: np.ndarray, s: np.ndarray
129    ) -> Optional[str]:
130        """Scatter of N vs S for positive-claim observations."""
131        try:
132            import matplotlib
133            matplotlib.use("Agg")
134            import matplotlib.pyplot as plt
135
136            mask = (n > 0) & (s > 0) & np.isfinite(s)
137            n_pos = n[mask]
138            s_pos = s[mask]
139
140            fig, ax = plt.subplots(figsize=(7, 5))
141            ax.scatter(n_pos, s_pos, alpha=0.3, s=10, color="#2271b5")
142            ax.set_xlabel("Claim count (N)")
143            ax.set_ylabel("Average claim severity (S)")
144            ax.set_title("Claim count vs. average severity")
145
146            if self.dependence_test and hasattr(self.dependence_test, "tau_"):
147                tau = self.dependence_test.tau_
148                rho = self.dependence_test.rho_s_
149                ax.text(
150                    0.05, 0.95,
151                    f"Kendall tau = {tau:.3f}\nSpearman rho = {rho:.3f}",
152                    transform=ax.transAxes, va="top",
153                    bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.7),
154                )
155
156            ax.grid(True, alpha=0.3)
157            result = self._make_fig_base64(fig)
158            plt.close(fig)
159            return result
160        except Exception:
161            return None
162
163    def to_html(
164        self,
165        output_path: Optional[str] = None,
166        n: Optional[np.ndarray] = None,
167        s: Optional[np.ndarray] = None,
168        correction_df: Optional[pd.DataFrame] = None,
169    ) -> str:
170        """
171        Generate HTML report.
172
173        Parameters
174        ----------
175        output_path : str, optional
176            If provided, write HTML to this file.
177        n : array-like, optional
178            Claim counts for dependence scatter plot.
179        s : array-like, optional
180            Severities for scatter plot.
181        correction_df : DataFrame, optional
182            Output from model.premium_correction(). If None, no histogram.
183
184        Returns
185        -------
186        HTML string.
187        """
188        data = self.to_dict()
189        model = self.joint_model
190
191        sections = []
192
193        # --- Header ---
194        sections.append("""
195        <h1>Joint Frequency-Severity Model Report</h1>
196        <p style="color: #666;">Burning Cost | insurance-frequency-severity</p>
197        <hr>
198        """)
199
200        # --- Dependence test ---
201        if self.dependence_test is not None and hasattr(self.dependence_test, "tau_"):
202            dt = self.dependence_test
203            summary_df = dt.summary()
204            sections.append("<h2>Dependence Tests</h2>")
205            sections.append("<p>Tests H<sub>0</sub>: independence between N and S "
206                           "(positive-claim observations only).</p>")
207            sections.append(summary_df.to_html(index=False, classes="table"))
208
209        # --- Copula fit results ---
210        if hasattr(model, "omega_") and model.omega_ is not None:
211            dep_df = model.dependence_summary()
212            sections.append("<h2>Fitted Dependence Parameter</h2>")
213            sections.append(dep_df.to_html(index=False, classes="table"))
214
215            ci = model.omega_ci_
216            if ci:
217                direction = "negative" if model.omega_ < 0 else "positive"
218                sections.append(
219                    f"<p><strong>Interpretation:</strong> The estimated dependence parameter "
220                    f"({model.copula_family} omega = {model.omega_:.4f}) indicates "
221                    f"<strong>{direction} frequency-severity dependence</strong>. "
222                    f"95% CI: ({ci[0]:.4f}, {ci[1]:.4f}). "
223                )
224                if model.rho_ is not None:
225                    sections.append(
226                        f"Spearman's rank correlation (Monte Carlo) = {model.rho_:.4f}.</p>"
227                    )
228
229        # --- Copula comparison ---
230        if self.copula_comparison is not None:
231            sections.append("<h2>Copula Family Comparison</h2>")
232            sections.append(self.copula_comparison.to_html(index=False, classes="table"))
233            best = self.copula_comparison.iloc[0]["copula"]
234            sections.append(f"<p>Best fitting copula by AIC: <strong>{best}</strong>.</p>")
235
236        # --- Plots ---
237        if n is not None and s is not None:
238            n = np.asarray(n)
239            s = np.asarray(s)
240            scatter_b64 = self._plot_dependence_scatter(n, s)
241            if scatter_b64:
242                sections.append("<h2>Frequency vs Severity</h2>")
243                sections.append(
244                    f'<img src="data:image/png;base64,{scatter_b64}" '
245                    f'style="max-width:700px;"><br>'
246                )
247
248        if correction_df is not None:
249            hist_b64 = self._plot_correction_distribution(correction_df)
250            if hist_b64:
251                sections.append("<h2>Premium Correction Factor Distribution</h2>")
252                sections.append(
253                    f'<img src="data:image/png;base64,{hist_b64}" '
254                    f'style="max-width:800px;"><br>'
255                )
256                mean_corr = float(correction_df["correction_factor"].mean())
257                pct_gt1 = float((correction_df["correction_factor"] > 1.0).mean() * 100)
258                sections.append(
259                    f"<p>Mean correction factor: {mean_corr:.4f}. "
260                    f"{pct_gt1:.1f}% of policies have correction > 1.0 "
261                    f"(i.e., dependence increases expected cost).</p>"
262                )
263
264        # --- CSS ---
265        css = """
266        <style>
267        body { font-family: Arial, sans-serif; max-width: 900px; margin: 40px auto; }
268        h1 { color: #2271b5; }
269        h2 { color: #444; border-bottom: 1px solid #ddd; padding-bottom: 4px; }
270        table.table { border-collapse: collapse; width: 100%; margin: 12px 0; }
271        table.table th, table.table td {
272            border: 1px solid #ddd; padding: 8px 12px; text-align: left;
273        }
274        table.table th { background: #f5f5f5; font-weight: bold; }
275        table.table tr:nth-child(even) { background: #fafafa; }
276        </style>
277        """
278
279        html = f"""<!DOCTYPE html>
280<html>
281<head>
282<meta charset="utf-8">
283<title>Joint Frequency-Severity Model Report</title>
284{css}
285</head>
286<body>
287{"".join(sections)}
288</body>
289</html>"""
290
291        if output_path:
292            with open(output_path, "w", encoding="utf-8") as f:
293                f.write(html)
294
295        return html

Generates an HTML report for a fitted JointFreqSev model.

Parameters

joint_model : JointFreqSev
    A fitted joint model.
dependence_test : DependenceTest, optional
    Pre-fitted dependence test results.
copula_comparison : DataFrame, optional
    Output from compare_copulas().

Examples

>>> report = JointModelReport(model, test)
>>> report.to_html("model_report.html")
>>> d = report.to_dict()
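
A fuller pipeline sketch, wiring the earlier diagnostics into a single report (all fitted objects assumed from the examples above):

>>> test = DependenceTest().fit(n_positive, s_positive)
>>> comparison = compare_copulas(n, s, freq_glm, sev_glm)
>>> report = JointModelReport(model, dependence_test=test,
...                           copula_comparison=comparison)
>>> html = report.to_html("model_report.html", n=n, s=s,
...                       correction_df=model.premium_correction(X))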
JointModelReport( joint_model: Any, dependence_test: Optional[Any] = None, copula_comparison: Optional[pandas.DataFrame] = None)
50    def __init__(
51        self,
52        joint_model: Any,
53        dependence_test: Optional[Any] = None,
54        copula_comparison: Optional[pd.DataFrame] = None,
55    ):
56        self.joint_model = joint_model
57        self.dependence_test = dependence_test
58        self.copula_comparison = copula_comparison
joint_model
dependence_test
copula_comparison
def to_dict(self) -> Dict:
60    def to_dict(self) -> Dict:
61        """
62        Return report data as a plain dictionary.
63
64        Useful for programmatic consumption without generating HTML.
65        """
66        result = {}
67
68        model = self.joint_model
69        if hasattr(model, "omega_") and model.omega_ is not None:
70            result["copula_family"] = model.copula_family
71            result["omega"] = model.omega_
72            result["spearman_rho"] = model.rho_
73            result["omega_ci"] = model.omega_ci_
74            result["aic"] = model.aic_
75            result["bic"] = model.bic_
76            result["n_policies"] = model._n_obs
77            result["n_claims"] = model._n_claims
78
79        if self.dependence_test is not None and hasattr(self.dependence_test, "tau_"):
80            dt = self.dependence_test
81            result["kendall_tau"] = dt.tau_
82            result["kendall_pval"] = dt.tau_pval_
83            result["spearman_rho_test"] = dt.rho_s_
84            result["spearman_pval"] = dt.rho_s_pval_
85
86        if self.copula_comparison is not None:
87            result["copula_comparison"] = self.copula_comparison.to_dict(orient="records")
88
89        return result

Return report data as a plain dictionary.

Useful for programmatic consumption without generating HTML.

def to_html( self, output_path: Optional[str] = None, n: Optional[numpy.ndarray] = None, s: Optional[numpy.ndarray] = None, correction_df: Optional[pandas.DataFrame] = None) -> str:
163    def to_html(
164        self,
165        output_path: Optional[str] = None,
166        n: Optional[np.ndarray] = None,
167        s: Optional[np.ndarray] = None,
168        correction_df: Optional[pd.DataFrame] = None,
169    ) -> str:
170        """
171        Generate HTML report.
172
173        Parameters
174        ----------
175        output_path : str, optional
176            If provided, write HTML to this file.
177        n : array-like, optional
178            Claim counts for dependence scatter plot.
179        s : array-like, optional
180            Severities for scatter plot.
181        correction_df : DataFrame, optional
182            Output from model.premium_correction(). If None, no histogram.
183
184        Returns
185        -------
186        HTML string.
187        """
188        data = self.to_dict()
189        model = self.joint_model
190
191        sections = []
192
193        # --- Header ---
194        sections.append("""
195        <h1>Joint Frequency-Severity Model Report</h1>
196        <p style="color: #666;">Burning Cost | insurance-frequency-severity</p>
197        <hr>
198        """)
199
200        # --- Dependence test ---
201        if self.dependence_test is not None and hasattr(self.dependence_test, "tau_"):
202            dt = self.dependence_test
203            summary_df = dt.summary()
204            sections.append("<h2>Dependence Tests</h2>")
205            sections.append("<p>Tests H<sub>0</sub>: independence between N and S "
206                           "(positive-claim observations only).</p>")
207            sections.append(summary_df.to_html(index=False, classes="table"))
208
209        # --- Copula fit results ---
210        if hasattr(model, "omega_") and model.omega_ is not None:
211            dep_df = model.dependence_summary()
212            sections.append("<h2>Fitted Dependence Parameter</h2>")
213            sections.append(dep_df.to_html(index=False, classes="table"))
214
215            ci = model.omega_ci_
216            if ci:
217                direction = "negative" if model.omega_ < 0 else "positive"
218                sections.append(
219                    f"<p><strong>Interpretation:</strong> The estimated dependence parameter "
220                    f"({model.copula_family} omega = {model.omega_:.4f}) indicates "
221                    f"<strong>{direction} frequency-severity dependence</strong>. "
222                    f"95% CI: ({ci[0]:.4f}, {ci[1]:.4f}). "
223                )
224                if model.rho_ is not None:
225                    sections.append(
226                        f"Spearman's rank correlation (Monte Carlo) = {model.rho_:.4f}.</p>"
227                    )
228
229        # --- Copula comparison ---
230        if self.copula_comparison is not None:
231            sections.append("<h2>Copula Family Comparison</h2>")
232            sections.append(self.copula_comparison.to_html(index=False, classes="table"))
233            best = self.copula_comparison.iloc[0]["copula"]
234            sections.append(f"<p>Best fitting copula by AIC: <strong>{best}</strong>.</p>")
235
236        # --- Plots ---
237        if n is not None and s is not None:
238            n = np.asarray(n)
239            s = np.asarray(s)
240            scatter_b64 = self._plot_dependence_scatter(n, s)
241            if scatter_b64:
242                sections.append("<h2>Frequency vs Severity</h2>")
243                sections.append(
244                    f'<img src="data:image/png;base64,{scatter_b64}" '
245                    f'style="max-width:700px;"><br>'
246                )
247
248        if correction_df is not None:
249            hist_b64 = self._plot_correction_distribution(correction_df)
250            if hist_b64:
251                sections.append("<h2>Premium Correction Factor Distribution</h2>")
252                sections.append(
253                    f'<img src="data:image/png;base64,{hist_b64}" '
254                    f'style="max-width:800px;"><br>'
255                )
256                mean_corr = float(correction_df["correction_factor"].mean())
257                pct_gt1 = float((correction_df["correction_factor"] > 1.0).mean() * 100)
258                sections.append(
259                    f"<p>Mean correction factor: {mean_corr:.4f}. "
260                    f"{pct_gt1:.1f}% of policies have correction > 1.0 "
261                    f"(i.e., dependence increases expected cost).</p>"
262                )
263
264        # --- CSS ---
265        css = """
266        <style>
267        body { font-family: Arial, sans-serif; max-width: 900px; margin: 40px auto; }
268        h1 { color: #2271b5; }
269        h2 { color: #444; border-bottom: 1px solid #ddd; padding-bottom: 4px; }
270        table.table { border-collapse: collapse; width: 100%; margin: 12px 0; }
271        table.table th, table.table td {
272            border: 1px solid #ddd; padding: 8px 12px; text-align: left;
273        }
274        table.table th { background: #f5f5f5; font-weight: bold; }
275        table.table tr:nth-child(even) { background: #fafafa; }
276        </style>
277        """
278
279        html = f"""<!DOCTYPE html>
280<html>
281<head>
282<meta charset="utf-8">
283<title>Joint Frequency-Severity Model Report</title>
284{css}
285</head>
286<body>
287{"".join(sections)}
288</body>
289</html>"""
290
291        if output_path:
292            with open(output_path, "w", encoding="utf-8") as f:
293                f.write(html)
294
295        return html

Generate HTML report.

Parameters

output_path : str, optional
    If provided, write HTML to this file.
n : array-like, optional
    Claim counts for dependence scatter plot.
s : array-like, optional
    Severities for scatter plot.
correction_df : DataFrame, optional
    Output from model.premium_correction(). If None, no histogram.

Returns

HTML string.