from html import escape
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import HTML, display
import crabbymetrics as cm
np.set_printoptions(precision=4, suppress=True)
def html_table(headers, rows):
parts = [
"<table>",
"<thead>",
"<tr>",
*[f"<th>{escape(str(header))}</th>" for header in headers],
"</tr>",
"</thead>",
"<tbody>",
]
for row in rows:
parts.append("<tr>")
for cell in row:
parts.append(f"<td>{cell}</td>")
parts.append("</tr>")
parts.extend(["</tbody>", "</table>"])
return "".join(parts)
def hc2_covariance(design, residuals):
xtx_inv = np.linalg.inv(design.T @ design)
leverage = np.sum(design * (design @ xtx_inv), axis=1)
scale = residuals**2 / np.clip(1.0 - leverage, 1e-12, None)
meat = design.T @ (design * scale[:, None])
return xtx_inv @ meat @ xtx_inv
def bridging_gmm_moments(theta, data):
x = data["x"]
z = data["z"]
y = data["y"]
p = x.shape[1]
mu = theta[:p]
alpha = theta[p]
tau = theta[p + 1]
beta = theta[p + 2 : p + 2 + p]
gamma = theta[p + 2 + p :]
w = x - mu
resid = y - alpha - tau * z - w @ beta - (w * z[:, None]) @ gamma
l = np.column_stack([np.ones(x.shape[0]), z, w, w * z[:, None]])
return np.column_stack(
[
x - mu,
l * resid[:, None],
]
)
def fit_tau_from_raw_rows(z, y, x):
mu = x.mean(axis=0)
w = x - mu
interacted_design = np.column_stack([z, w, z[:, None] * w])
model = cm.OLS()
model.fit(interacted_design, y)
return float(np.asarray(model.summary(vcov="hc1")["coef"])[0])
def raw_row_bootstrap_se(z, y, x, n_bootstrap, seed):
rng = np.random.default_rng(seed)
n = z.shape[0]
tau_draws = np.empty(n_bootstrap)
for draw in range(n_bootstrap):
idx = rng.integers(0, n, size=n)
tau_draws[draw] = fit_tau_from_raw_rows(z[idx], y[idx], x[idx])
return float(tau_draws.std(ddof=1))
def fit_bridging_methods(z, y, x, n_bootstrap=300, bootstrap_seed=0):
p = x.shape[1]
mu = x.mean(axis=0)
w = x - mu
interacted_design = np.column_stack([z, w, z[:, None] * w])
ols = cm.OLS()
ols.fit(interacted_design, y)
ols_summary = ols.summary(vcov="hc1")
coef = np.asarray(ols_summary["coef"])
params = np.concatenate([[ols_summary["intercept"]], coef])
full_design = np.column_stack([np.ones(z.shape[0]), interacted_design])
residuals = y - full_design @ params
hc2_vcov = hc2_covariance(full_design, residuals)
tau_hat = float(coef[0])
se_hc2 = float(np.sqrt(hc2_vcov[1, 1]))
gamma_hat = coef[-p:]
superpop_correction = float(gamma_hat @ np.cov(w.T) @ gamma_hat / z.shape[0])
se_closed_form = float(np.sqrt(hc2_vcov[1, 1] + superpop_correction))
se_bootstrap = raw_row_bootstrap_se(z, y, x, n_bootstrap=n_bootstrap, seed=bootstrap_seed)
theta0 = np.concatenate([mu, [params[0], tau_hat], coef[1:]])
gmm = cm.GMM(
bridging_gmm_moments,
max_iterations=200,
tolerance=1e-8,
fd_eps=1e-5,
)
gmm.fit({"x": x, "z": z, "y": y}, theta0, weighting="identity")
gmm_summary = gmm.summary(vcov="sandwich", omega="iid")
tau_index = p + 1
tau_gmm = float(np.asarray(gmm_summary["coef"])[tau_index])
se_gmm = float(np.asarray(gmm_summary["se"])[tau_index])
return {
"tau_ols": tau_hat,
"se_hc2": se_hc2,
"se_closed_form": se_closed_form,
"se_bootstrap": se_bootstrap,
"tau_gmm": tau_gmm,
"se_gmm": se_gmm,
}
def simulate_bridging(reps, n, seed, n_bootstrap):
tau_hc2 = []
tau_closed = []
tau_bootstrap = []
tau_gmm = []
se_hc2 = []
se_closed = []
se_bootstrap = []
se_gmm = []
sate_targets = []
for rep in range(reps):
rng = np.random.default_rng(seed + rep)
x = rng.normal(size=(n, 2))
y0 = x[:, 0] + x[:, 0] ** 2 + rng.uniform(-0.5, 0.5, n)
y1 = x[:, 1] + x[:, 1] ** 2 + rng.uniform(-1.0, 1.0, n)
z = rng.binomial(1, 0.6, n).astype(float)
y = y0 * (1.0 - z) + y1 * z
fit = fit_bridging_methods(z, y, x, n_bootstrap=n_bootstrap, bootstrap_seed=rep)
tau_hc2.append(fit["tau_ols"])
tau_closed.append(fit["tau_ols"])
tau_bootstrap.append(fit["tau_ols"])
tau_gmm.append(fit["tau_gmm"])
se_hc2.append(fit["se_hc2"])
se_closed.append(fit["se_closed_form"])
se_bootstrap.append(fit["se_bootstrap"])
se_gmm.append(fit["se_gmm"])
sate_targets.append(float(np.mean(y1 - y0)))
def summarize(estimates, ses):
estimates = np.asarray(estimates)
ses = np.asarray(ses)
sate_targets_arr = np.asarray(sate_targets)
sate_errors = estimates - sate_targets_arr
pate_errors = estimates
return {
"mean_estimate": float(estimates.mean()),
"sate_error_sd": float(sate_errors.std(ddof=1)),
"pate_error_sd": float(pate_errors.std(ddof=1)),
"mean_se": float(ses.mean()),
"coverage_sate": float((np.abs(sate_errors) <= 1.96 * ses).mean()),
"coverage_pate": float((np.abs(pate_errors) <= 1.96 * ses).mean()),
}
return [
{"method": "HC2 finite-population", **summarize(tau_hc2, se_hc2)},
{"method": "Closed-form superpopulation", **summarize(tau_closed, se_closed)},
{"method": "Bootstrap superpopulation", **summarize(tau_bootstrap, se_bootstrap)},
{"method": "Stacked GMM superpopulation", **summarize(tau_gmm, se_gmm)},
]