import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import warnings
# Install a blanket 'ignore' filter: no Python warnings are shown from here on.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'ggplot' style sheet to every figure created below.
matplotlib.style.use('ggplot')
# IPython magic for interactive figures, left disabled in this export.
# %matplotlib notebook
from cmfsapy.data import gen_ncube
# Parameters of the synthetic benchmark used throughout this script.
n = 2500 # sample size
d = 5 # Dimensionality
# Evaluation grid for the analytical pdf: starts at 0.01, step 0.01, up to about 30.
w = np.arange(0.01, 30, 0.01) # axis for analytical pdf
# Generate the data set from the sample size and dimensionality
# (presumably n points in a d-dimensional cube -- confirm in cmfsapy.data).
X = gen_ncube(n, d)
# set neighborhood size
k1 = 5
from cmfsapy.dimension.fsa import fsa
from cmfsapy.theoretical import theoretical_fsa_pdf
We apply periodic boundary conditions (`boxsize=1`) to avoid edge effects:
# Local FSA dimension estimates computed with boxsize=1, i.e. with periodic
# boundary conditions so that edge effects are avoided. The result of fsa()
# is indexed with [0]; only that first element is used here.
dims_fsa = fsa(X, k1, boxsize=1)[0]
# Analytical pdf evaluated on the grid w for neighborhood size k1 and dimension d.
pdf = theoretical_fsa_pdf(w, k1, d)
# Histogram bin edges: 0 to 100 in steps of 0.2.
bins = np.arange(0, 100, 0.2)

fig1, ax1 = plt.subplots(1, 1)
# Only the last column of dims_fsa is plotted
# (presumably the estimate for the largest neighborhood, k = k1 -- TODO confirm).
_ = ax1.hist(dims_fsa[:, -1], density=True, bins=bins, alpha=0.3, label='FSA estimates (local)')
# Draw on ax1 explicitly instead of relying on pyplot's implicit "current
# axes", so every artist of this figure goes through the same Axes object.
ax1.plot(w, pdf, label='theoretical pdf')
# Global summaries of the local estimates: NaN-ignoring median and mean.
ax1.axvline(np.nanmedian(dims_fsa[:, -1]), color='b', label='mFSA estimate (global)', zorder=101, ls='--')
ax1.axvline(np.nanmean(dims_fsa[:, -1]), color='grey', label='FSA estimate (global)')
# Reference line at the true dimensionality.
ax1.axvline(d, color='k', label='d')
ax1.legend()
ax1.set_xlim(0, 15)
(0.0, 15.0)
Let's check these results when edge effects are present, i.e. without periodic boundary conditions (`boxsize=None`):
# Same estimator, now with boxsize=None: no periodic boundaries, so edge
# effects are present. Only the first element of fsa()'s result is kept.
dims_fsa_edge = fsa(X, k1, boxsize=None)[0]
bins = np.arange(0, 100, 0.2)

fig1, ax1 = plt.subplots(nrows=1, ncols=1)
# Last column of the local estimates; this is what gets plotted and summarized.
local_edge = dims_fsa_edge[:, -1]
_ = ax1.hist(local_edge, density=True, bins=bins, alpha=0.3, label='FSA estimates (local)')
ax1.plot(w, pdf, label='theoretical pdf')
# Vertical reference lines, in legend order: median, mean, true dimension.
for position, line_kwargs in (
    (np.nanmedian(local_edge), dict(color='blue', label='mFSA estimate (global)', ls='--')),
    (np.nanmean(local_edge), dict(color='grey', label='FSA estimate (global)')),
    (d, dict(color='k', label='d')),
):
    ax1.axvline(position, **line_kwargs)
ax1.legend()
ax1.set_xlim(0, 15)
(0.0, 15.0)
from cmfsapy.dimension.cmfsa import calibrate, cmfsa
# Calibration step; the result is later handed to cmfsa() as `alphas`.
# Inputs: the sample size n, the neighborhood size k1, the integers 1..9
# (presumably the dimensions to calibrate on -- TODO confirm) and N_realiz=15
# (presumably the number of random realizations per setting -- TODO confirm).
coefs = calibrate(n, k1, np.arange(1,10), N_realiz=15)
100%|██████████| 9/9 [00:00<00:00, 9.75it/s]
# Corrected estimate using the calibrated coefficients. The result of cmfsa()
# is indexed with [-1]; only that last element is kept and drawn as a single
# vertical line below.
d_cmfsa = cmfsa(X, k1, powers=[-1, 1, 2, 3], alphas=coefs)[-1]

fig2, ax2 = plt.subplots(1, 1)
# Histogram and summaries of the uncorrected estimates computed without
# periodic boundaries (last column of dims_fsa_edge).
_ = ax2.hist(dims_fsa_edge[:, -1], density=True, bins=bins, alpha=0.3, label='FSA estimates (local)')
ax2.plot(w, pdf, label='theoretical pdf')
ax2.axvline(np.nanmedian(dims_fsa_edge[:, -1]), color='blue', label='mFSA estimate (global)', ls='--')
ax2.axvline(np.nanmean(dims_fsa_edge[:, -1]), color='grey', label='FSA estimate (global)')
# All artists go through ax2 explicitly rather than through pyplot's implicit
# "current axes", so the figure does not depend on which axes is active.
_ = ax2.axvline(d_cmfsa, color='r', linestyle='--', zorder=101, label='cmFSA')
_ = ax2.axvline(d, color='k', label='d')
ax2.legend()
ax2.set_xlim(0, 15)
(0.0, 15.0)
from cmfsapy.dimension.fsa import ml_dims
from scipy.stats import hmean
# Local maximum-likelihood dimension estimates with neighborhood size k1;
# only the first element of ml_dims()'s result is used.
d_ml = ml_dims(X, k2=k1)[0]

plt.figure()
# Bin edges are the FSA bin edges multiplied by two (0 to ~200 in steps of 0.4).
ml_bins = 2 * bins
plt.hist(d_ml, bins=ml_bins, density=True, alpha=0.5, label='local ML estimates')

# Global summaries of the local estimates: arithmetic mean, harmonic mean
# (reciprocal of the mean reciprocal) and NaN-ignoring median.
arithmetic_mean = d_ml.mean()
harmonic_mean = 1 / np.mean(1 / d_ml)
median_estimate = np.nanmedian(d_ml)

# Vertical reference lines, in legend order; the last one marks the true dimension.
for x_pos, line_color, line_label in (
    (arithmetic_mean, 'r', 'Levina & Bickel'),
    (harmonic_mean, 'grey', 'MacKay & Ghahramani'),
    (median_estimate, 'blue', 'median'),
    (d, 'k', 'd'),
):
    plt.axvline(x_pos, color=line_color, label=line_label)

plt.legend()
plt.title('Levina-Bickel estimator')
plt.xlim(0, 20)
plt.tight_layout()