""" A collection of functions for generating synthetic datasets."""
import numpy as np
from scipy import stats
# Utility functions
from . import utilities
from .utilities import shift_until_PSD, chol2inv, cov2corr
# Tree methods
import scipy.cluster.hierarchy as hierarchy
import scipy.spatial.distance as ssd
DEFAULT_DF_T = 3
def Wishart(d=100, p=100, tol=1e-2):
"""
    Let W be a random `d` x `p` matrix with i.i.d. standard Gaussian
    entries. Returns Sigma = ``cov2corr``(W^T W), shifted to ensure it
    is positive semidefinite.
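    Examples
    --------
    A minimal sketch (illustrative sizes)::

        Sigma = Wishart(d=200, p=100)   # (100, 100) correlation matrix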
"""
W = np.random.randn(d, p)
V = np.dot(W.T, W)
V = cov2corr(V)
return cov2corr(shift_until_PSD(V, tol=tol))
def DirichletCorr(p=100, temp=1, tol=1e-6):
"""
    Generates a correlation matrix by sampling `p` eigenvalues
    from a Dirichlet distribution whose `p` parameters are i.i.d.
    uniform on [`tol`, `temp`], then sampling a random correlation
    matrix with those eigenvalues.
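    Examples
    --------
    A minimal sketch::

        V = DirichletCorr(p=50)   # (50, 50) correlation matrix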
"""
    alpha = np.random.uniform(tol, temp, size=p)
d = stats.dirichlet(alpha=alpha).rvs().reshape(-1)
    # We have to round to prevent errors from random_correlation,
    # which is super sensitive to d.sum() != p even when the
    # discrepancy is just floating-point error.
d = np.around(d + tol, 6)
d = p * d / d.sum()
    d[0] += p - d.sum()  # correction is ~1e-10, but random_correlation errors without it
# Create and return matrix
V = stats.random_correlation.rvs(d)
return V
def AR1(p=30, a=1, b=1, tol=None, max_corr=1, rho=None):
"""
    Generates a `p`-dimensional correlation matrix for an
    AR(1) Gaussian process, where successive correlations
    are drawn independently from Beta(`a`, `b`). If `rho` is
    specified, the process is instead stationary with
    correlation `rho` between successive variables.
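    Examples
    --------
    A minimal sketch (illustrative values)::

        Sigma = AR1(p=5, rho=0.5)
        # Stationary AR(1): Sigma[i, j] is (up to float precision) 0.5 ** abs(i - j)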
"""
# Generate rhos, take log to make multiplication easier
if rho is None:
rhos = np.log(np.minimum(
stats.beta.rvs(size=p, a=a, b=b), max_corr
))
else:
if np.abs(rho) > 1:
raise ValueError(f"rho {rho} must be a correlation between -1 and 1")
rhos = np.log(np.array([rho for _ in range(p)]))
rhos[0] = 0
# Log correlations between x_1 and x_i for each i
cumrhos = np.cumsum(rhos).reshape(p, 1)
# Use cumsum tricks to calculate all correlations
log_corrs = -1 * np.abs(cumrhos - cumrhos.transpose())
corr_matrix = np.exp(log_corrs.astype(np.float32))
# Ensure PSD-ness
if tol is not None:
corr_matrix = cov2corr(shift_until_PSD(corr_matrix, tol))
return corr_matrix
def NestedAR1(p=500, a=7, b=1, tol=1e-3, num_nests=5, nest_size=2):
"""
    Generates a correlation matrix for an AR(1) Gaussian process
    with a hierarchical (nested) correlation structure.
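    Examples
    --------
    A minimal sketch (illustrative values)::

        V = NestedAR1(p=64, num_nests=3)   # (64, 64) correlation matrix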
"""
# First set of correlations
rhos = np.log(stats.beta.rvs(size=p, a=a, b=b))
rhos[0] = 0
# Create nested structure
for j in range(1, num_nests + 1):
rho_samples = np.log(stats.beta.rvs(size=p, a=a, b=b))
# These indexes will get smaller correlations
nest_inds = np.array(
[x for x in range(p) if x % (nest_size ** j) == int(nest_size / 2)]
).astype("int")
mask = np.zeros(p)
mask[nest_inds] = 1
rhos += mask * rho_samples
# Generate cov matrix
rhos[0] = 0
cumrhos = np.cumsum(rhos).reshape(p, 1)
# Use cumsum tricks to calculate all correlations
log_corrs = -1 * np.abs(cumrhos - cumrhos.transpose())
corr_matrix = np.exp(log_corrs)
# Ensure PSD-ness
    corr_matrix = cov2corr(shift_until_PSD(corr_matrix, tol))
return corr_matrix
def ErdosRenyi(p=300, delta=0.2, lower=0.1, upper=1, tol=1e-1):
"""
    Randomly samples Bernoulli flags as well as values
    for partial correlations to generate sparse, symmetric,
    positive-definite square matrices.
    Follows https://arxiv.org/pdf/1908.11611.pdf.
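    Examples
    --------
    A minimal sketch (illustrative values)::

        V = ErdosRenyi(p=50, delta=0.2)   # sparse positive-definite matrix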
"""
# Initialization
V = np.zeros((p, p))
    triang_size = int((p ** 2 + p) / 2)  # upper-triangle size, including the diagonal
# Sample the upper triangle
mask = stats.bernoulli.rvs(delta, size=triang_size)
vals = np.random.uniform(lower, upper, size=triang_size)
flags = 2 * np.random.binomial(1, 0.5, triang_size) - 1
triang = mask * flags * vals
# Set values and add diagonal
upper_inds = np.triu_indices(p, 0)
V[upper_inds] = triang
V = V + V.T
V += np.eye(p) - np.diag(np.diag(V))
    # Force the matrix to be positive definite
V = shift_until_PSD(V, tol=tol)
return V
def FactorModel(
p=500, rank=2,
):
"""
    Generates a random correlation matrix from a factor model
    with dimension `p` and rank `rank`. Returns a tuple ``(V, None)``,
    where ``None`` is a placeholder for the precision matrix.
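    Examples
    --------
    A minimal sketch; note the placeholder second return value::

        V, _ = FactorModel(p=100, rank=3)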
"""
diag_entries = np.random.uniform(low=0.01, high=1, size=p)
noise = np.random.randn(p, rank) / np.sqrt(rank)
V = np.diag(diag_entries) + np.dot(noise, noise.T)
V = utilities.cov2corr(V)
return V, None
def PartialCorr(
p=300, rho=0.3,
):
"""
Creates a correlation matrix of dimension `p` with partial correlation `rho`.
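    Examples
    --------
    A minimal sketch::

        V, Q = PartialCorr(p=100, rho=0.3)   # correlation and precision matrices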
"""
Q_init = rho * np.ones((p, p)) + (1 - rho) * np.eye(p)
V = np.linalg.inv(Q_init)
V, Q = utilities.cov2corr(V, Q_init)
return V, Q
def block_equi_graph(
n=3000,
p=1000,
block_size=5,
sparsity=0.1,
rho=0.5,
gamma=0,
coeff_size=3.5,
coeff_dist=None,
sign_prob=0.5,
iid_signs=True,
corr_signals=False,
beta=None,
mu=None,
**kwargs,
):
"""
Samples data according to a block-equicorrelated Gaussian design,
where ``rho`` is the within-block correlation and ``gamma * rho``
is the between-block correlation.
Parameters
----------
n : int
The number of data points
p : int
The dimensionality of the data
block_size : int
The size of blocks. Defaults to 5.
sparsity : float
        The proportion of groups which are null. Defaults to 0.1.
rho : float
The within-group correlation
gamma : float
The between-group correlation is ``gamma * rho``
beta : np.ndarray
If supplied, the ``(p,)``-shaped set of coefficients. Else,
will be generated by calling
        ``knockpy.dgp.create_sparse_coefficients``.
mu : np.ndarray
The ``p``-dimensional mean of the covariates. Defaults to 0.
kwargs : dict
Args passed to ``knockpy.dgp.sample_response``.
Notes
-----
This defaults to the same data-generating process as
Dai and Barber 2016 (see https://arxiv.org/abs/1602.03589).
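    Examples
    --------
    A minimal sketch (illustrative sizes)::

        X, y, beta, Q, Sigma, groups = block_equi_graph(n=100, p=50)
        # X is (100, 50); groups takes values in 1, ..., 10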
"""
# Set default values
num_groups = int(p / block_size)
    # Assign each variable to a block
groups = np.array([int(i / (p / num_groups)) for i in range(p)])
# Helper fn for covariance matrix
def get_corr(g1, g2):
if g1 == g2:
return rho
else:
return gamma * rho
get_corr = np.vectorize(get_corr)
# Create correlation matrix, invert
Xcoords, Ycoords = np.meshgrid(groups, groups)
Sigma = get_corr(Xcoords, Ycoords)
Sigma = Sigma + np.eye(p) - np.diagflat(np.diag(Sigma))
Q = np.linalg.inv(Sigma)
# Create beta
if beta is None:
beta = create_sparse_coefficients(
p=p,
sparsity=sparsity,
coeff_size=coeff_size,
groups=groups,
sign_prob=sign_prob,
iid_signs=iid_signs,
coeff_dist=coeff_dist,
corr_signals=corr_signals,
n=n,
)
# Sample design matrix
if mu is None:
mu = np.zeros(p)
X = np.dot(
np.random.randn(n, p),
np.linalg.cholesky(Sigma).T
) + mu.reshape(1, -1)
# Sample y
y = sample_response(X, beta, **kwargs)
return X, y, beta, Q, Sigma, groups + 1
def create_sparse_coefficients(
p,
sparsity=0.5,
groups=None,
coeff_size=1,
coeff_dist=None,
sign_prob=0.5,
iid_signs=True,
corr_signals=False,
n=None,
):
"""
    Generate a set of sparse coefficients for single-index or sparse additive models.
    Parameters
    ----------
p : int
Dimensionality of coefficients
sparsity : float
Proportion of non-null coefficients. Generates ``np.floor(p*sparsity)`` non-nulls.
coeff_size: float
Size of non-zero coefficients
groups : np.ndarray
A p-length array of integers from 1 to num_groups such that
``groups[j] == i`` indicates that variable `j` is a member of group `i`.
Defaults to ``None``. Else, floor(sparsity * num_groups) groups will be
chosen to be non-null, where all elements of each group are non-null.
sign_prob : float
The probability that each nonzero coefficient will be positive.
iid_signs : bool
If True, the signs of the coeffs are assigned independently.
Else, exactly sign_prob*sparsity*p coefficients will be positive.
    coeff_dist : str
        Specifies the distribution of nonzero coefficients. Main options:
        - ``None``: all coefficients have absolute value coeff_size.
        - ``normal``: nonzero coefficients are drawn from Normal(0, coeff_size).
        - ``uniform``: nonzero coeffs have absolute value drawn from Unif(coeff_size/2, coeff_size).
        The options ``dsliu2020`` and ``gmliu2019`` replicate the settings
        of their respective papers.
corr_signals : bool
If true, all of the nonzero coefficients will lie in a consecutive block.
Returns
-------
beta : np.ndarray
``(p,)``-shaped array of sparse coefficients
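    Examples
    --------
    A minimal sketch (illustrative values)::

        beta = create_sparse_coefficients(p=100, sparsity=0.1, coeff_size=2)
        # beta has floor(100 * 0.1) = 10 nonzero entries, each +/- 2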
"""
# First, decide which coefficients are nonzero, one of two methods
if groups is not None:
# Method one: a certain number of groups are nonzero
num_groups = np.unique(groups).shape[0]
num_nonzero_groups = int(np.floor(sparsity * num_groups))
chosen_groups = np.random.choice(
np.unique(groups), num_nonzero_groups, replace=False
)
beta = np.array([coeff_size if i in chosen_groups else 0 for i in groups])
else:
# Method two (default): a certain percentage of coefficients are nonzero
num_nonzero = int(np.floor(sparsity * p))
if corr_signals:
nonnull_start = np.random.randint(0, p - num_nonzero + 1)
beta = np.zeros(p)
beta[nonnull_start : nonnull_start + num_nonzero] = coeff_size
else:
beta = np.array([coeff_size] * num_nonzero + [0] * (p - num_nonzero))
np.random.shuffle(beta)
beta_nonzeros = beta != 0
num_nonzero = beta_nonzeros.sum()
# Now draw random signs
    if iid_signs:
        # Each coefficient is positive with probability sign_prob
        signs = 2 * stats.bernoulli.rvs(sign_prob, size=p) - 1
        beta = beta * signs
else:
num_pos = int(np.floor(num_nonzero * sign_prob))
signs = np.concatenate(
[np.ones((num_pos)), -1 * np.ones((num_nonzero - num_pos))]
)
np.random.shuffle(signs)
beta[beta_nonzeros] = beta[beta_nonzeros] * signs
# Possibly change the absolute values of beta
if coeff_dist is not None:
if str(coeff_dist).lower() == "normal":
beta = (np.sqrt(coeff_size) * np.random.randn(p)) * beta_nonzeros
elif str(coeff_dist).lower() == "uniform":
beta = beta * np.random.uniform(size=p) / 2 + beta / 2
elif str(coeff_dist).lower() == "dsliu2020":
            if num_nonzero != 50:
                raise ValueError(
                    f"To replicate the dsliu2020 paper, num_nonzero must equal 50, not {num_nonzero}"
                )
variance = 10 * np.sqrt(np.log(p) / n)
beta = beta_nonzeros * np.sqrt(variance) * np.random.randn(p)
elif str(coeff_dist).lower() == "gmliu2019":
            if num_nonzero != 60:
                raise ValueError(
                    f"To replicate the gmliu2019 paper, num_nonzero must equal 60, not {num_nonzero}"
                )
variance = 20 / np.sqrt(n)
beta = beta_nonzeros * np.sqrt(variance) * np.random.randn(p)
elif str(coeff_dist).lower() == "none":
pass
else:
            raise ValueError(
                f"coeff_dist ({coeff_dist}) must be one of 'none', 'normal', 'uniform', 'dsliu2020', 'gmliu2019'"
            )
return beta
def sample_response(X, beta, cond_mean="linear", y_dist="gaussian"):
"""
    Given a design matrix X and coefficients beta, samples a response y.
    Parameters
    ----------
X : np.ndarray
``(n, p)``-shaped design matrix
beta : np.ndarray
``(p, )``-shaped coefficient vector
cond_mean : str
How to calculate the conditional mean of y given X, denoted mu(X).
Six options:
1. "linear" denotes ``np.dot(X, beta)``
2. "cubic" denotes ``np.dot(X**3, beta) - np.dot(X, beta)``
3. "trunclinear" ``((X * beta >= 1).sum(axis = 1))``
Stands for truncated linear.
4. "pairint": pairs up non-null coefficients according to the
order of beta, multiplies them and their beta values, then
sums. "pairint" stands for pairwise-interactions.
5. "cos": ``mu(X) = sign(beta) * (beta != 0) * np.cos(X)``
6. "quadratic": ``mu(X) = np.dot(np.power(X, 2), beta)``
    y_dist : str
        If "gaussian", y is the conditional mean plus Gaussian noise.
        If "binomial", Pr(y=1) equals the logistic function applied to the conditional mean.
Returns
-------
y : np.ndarray
``(n,)``-shaped response vector
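    Examples
    --------
    A minimal sketch of a binary response (illustrative values)::

        X = np.random.randn(100, 20)
        beta = create_sparse_coefficients(p=20, sparsity=0.5)
        y = sample_response(X, beta, y_dist="binomial")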
"""
n = X.shape[0]
p = X.shape[1]
if cond_mean == "linear":
cond_mean = np.dot(X, beta)
elif cond_mean == "quadratic":
cond_mean = np.dot(np.power(X, 2), beta)
elif cond_mean == "cubic":
cond_mean = np.dot(np.power(X, 3), beta) - np.dot(X, beta)
elif cond_mean == "trunclinear":
cond_mean = ((X * beta >= 1) * np.sign(beta)).sum(axis=1)
elif cond_mean == "cos":
cond_mean = (np.sign(beta) * (beta != 0) * np.cos(X)).sum(axis=1)
elif cond_mean == "pairint":
# Pair up the coefficients
pairs = [[]]
for j in range(p):
if beta[j] != 0:
if len(pairs[-1]) < 2:
pairs[-1].append(j)
else:
pairs.append([j])
# Multiply pairs and add to cond_mean
cond_mean = np.zeros(n)
for pair in pairs:
interaction_term = np.ones(n)
for j in pair:
interaction_term = interaction_term * beta[j]
interaction_term = interaction_term * X[:, j]
cond_mean += interaction_term
else:
        raise ValueError(
            f"cond_mean must be one of 'linear', 'quadratic', 'cubic', 'trunclinear', 'cos', 'pairint', not {cond_mean}"
        )
# Create y, one of two families
if y_dist == "gaussian":
        y = cond_mean + np.random.standard_normal(n)
elif y_dist == "binomial":
probs = 1 / (1 + np.exp(np.minimum(-1 * cond_mean, 30)))
y = stats.bernoulli.rvs(probs)
else:
raise ValueError(f"y_dist must be one of 'gaussian', 'binomial', not {y_dist}")
return y
def sample_ar1t(
rhos, n=50, df_t=DEFAULT_DF_T,
):
"""
Samples t-distributed variables according to a Markov chain.
Parameters
----------
rhos : np.ndarray
``(p-1, )``-length array of correlations between consecutive
variables.
n : int
The number of data points to sample
df_t : float
The degrees of freedom of the t variables
Returns
-------
X : np.ndarray
``(n, p)``-shaped design matrix
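    Examples
    --------
    A minimal sketch (illustrative values)::

        rhos = np.repeat(0.5, 29)       # p - 1 = 29 consecutive correlations
        X = sample_ar1t(rhos, n=100)    # (100, 30) design matrix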
"""
# Initial t samples
p = rhos.shape[0] + 1
tvars = stats.t(df=df_t).rvs(size=(n, p))
# Initialize X
X = np.zeros((n, p))
    scale = np.sqrt((df_t - 2) / df_t)  # rescales t variables to unit variance
X[:, 0] = scale * tvars[:, 0]
    # Loop through variables according to the Markov chain
conjugates = np.sqrt(1 - rhos ** 2)
for j in range(1, p):
X[:, j] = rhos[j - 1] * X[:, j - 1] + conjugates[j - 1] * scale * tvars[:, j]
return X
def cov2blocks(V, tol=1e-5):
"""
Decomposes a PREORDERED block-diagonal matrix V into its blocks.
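    Examples
    --------
    A minimal sketch with two 2 x 2 blocks::

        V = np.kron(np.eye(2), np.array([[1.0, 0.5], [0.5, 1.0]]))
        blocks, block_inds = cov2blocks(V)   # two (2, 2) blocks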
"""
p = V.shape[0]
blocks = []
block_start = 0
block_inds = []
for j in range(p + 1):
# Detect if we have exited the block
if j == p:
blocks.append(V[block_start:j, block_start:j])
block_inds.append(list(range(block_start, j)))
elif np.abs(V[block_start, j]) < tol:
# If so, reset the block_start
blocks.append(V[block_start:j, block_start:j])
block_inds.append(list(range(block_start, j)))
block_start = j
return blocks, block_inds
def sample_block_tmvn(
blocks, n=50, df_t=DEFAULT_DF_T,
):
"""
    Samples blocks of multivariate t-distributed variables
    according to a list of covariance matrices called ``blocks``.
Parameters
----------
blocks : list
A list of square, symmetric numpy arrays. These are
the covariance matrices for each block of variables.
n : int
The number of data points to sample
df_t : float
The degrees of freedom of the t-distribution
Notes
-----
The variables are scaled such that the marginal variance
of each variable equals a diagonal element in one of the blocks.
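    Examples
    --------
    A minimal sketch (illustrative values)::

        blocks = [np.array([[1.0, 0.5], [0.5, 1.0]])] * 2
        X = sample_block_tmvn(blocks, n=100)   # (100, 4) design matrix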
"""
    # Compute the Cholesky square root of each block
    block_sqrts = [np.linalg.cholesky(block) for block in blocks]
# Loop through blocks and sample multivariate t
X = []
for i, block_sqrt in enumerate(block_sqrts):
        # Block dimensionality; sample chi-squared variables
p_block = block_sqrt.shape[0]
chi_block = stats.chi2.rvs(df=df_t, size=(n, 1))
        # Linear transformation + chi-squared rescaling
Z_block = np.random.randn(n, p_block) # n x p
t_block = np.dot(Z_block, block_sqrt.T) # n x p
t_block = np.sqrt(df_t / chi_block) * t_block
X.append(t_block)
    # Concatenate blocks and scale so each marginal variance equals 1
scale = np.sqrt((df_t - 2) / (df_t))
X = scale * np.concatenate(X, axis=1)
return X
### Helper functions for Gibbs Sampling
def num2coords(i, gridwidth=10):
"""
Coordinates of variable i in a Gibbs grid
Parameters
----------
i : int
Position of variable in ordering
gridwidth : int
Width of the grid
Returns
-------
    (length_coord, width_coord) : tuple
        The grid coordinates of variable `i`.
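    Examples
    --------
    A minimal sketch::

        num2coords(23, gridwidth=10)   # returns (3, 2)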
"""
length_coord = i % gridwidth
width_coord = i // gridwidth
return int(length_coord), int(width_coord)
def coords2num(l, w, gridwidth=10):
""" Takes coordinates of variable in a Gibbs grid, returns position"""
if l < 0 or w < 0:
return -1
if l >= gridwidth or w >= gridwidth:
return -1
return int(w * gridwidth + l)
def graph2cliques(Q):
"""
    Turns a graph Q of connections into pairwise (binary) cliques for a Gibbs grid
"""
p = Q.shape[0]
clique_dict = {i: [] for i in range(p)}
for i in range(p):
for j in range(i):
if Q[i, j] != 0:
clique_dict[i].append((i, j))
clique_dict[j].append((j, i))
# Remove duplicates
for i in range(p):
clique_dict[i] = list(set(clique_dict[i]))
return clique_dict
def construct_gibbs_grid(n, p, temp=1):
"""
Creates gridlike ``gibbs_graph`` parameter for ``sample_gibbs``.
See ``sample_gibbs``.
"""
# Infer dimensionality
gridwidth = int(np.sqrt(p))
# Generate
gibbs_graph = np.zeros((p, p))
for i1 in range(p):
lc, wc = num2coords(i1, gridwidth=gridwidth)
for ladd in [-1, 1]:
i2 = coords2num(lc + ladd, wc, gridwidth=gridwidth)
if i2 != -1:
sign = 1 - 2 * np.random.binomial(1, 0.5)
gibbs_graph[i1, i2] = temp * sign
gibbs_graph[i2, i1] = temp * sign
for wadd in [-1, 1]:
i2 = coords2num(lc, wc + wadd, gridwidth=gridwidth)
if i2 != -1:
sign = 1 - 2 * np.random.binomial(1, 0.5)
gibbs_graph[i1, i2] = temp * sign
gibbs_graph[i2, i1] = temp * sign
return gibbs_graph
def sample_gibbs(
n, p, gibbs_graph=None, temp=1, num_iter=15, K=20, max_val=2.5,
):
""" Samples from a discrete Gibbs measure using a gibbs sampler.
The joint likelihood for the `p`-dimensional data X1 through Xp
is the product of all terms of the following form:
```np.exp(-1*gibbs_graph[i,j]*np.abs(Xi - Xj))```
where `gibbs_graph` is assumed to be symmetric (and is usually
sparse). Each feature takes values on an evenly spaced grid from
``-1*max_val`` to ``max_val`` with ``K`` values.
Parameters
----------
n : int
How many data points to sample
p : int
Dimensionality of the data
gibbs_graph : np.ndarray
A symmetric ``(p, p)``-shaped array. See likelihood equation.
        By default, this corresponds to an undirected graphical
        model on a square grid (like an Ising model) with nonzero
        entries set to ``temp`` or ``-temp`` with equal probability.
temp : float
        Governs the strength of interactions between features; see
        the likelihood equation.
num_iter : int
Number of iterations in the Gibbs sampler; defaults to 15.
K : int
Number of discrete values each sampled feature can take.
max_val : float
The maximum absolute value each feature can take.
Returns
-------
X : np.ndarray
``(n, p)``-shaped array of data
gibbs_graph : np.ndarray
The generated `gibbs_graph`.
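    Examples
    --------
    A minimal sketch; when ``gibbs_graph`` is not provided, ``p`` should be
    a perfect square so the variables form a grid::

        X, gibbs_graph = sample_gibbs(n=50, p=25, temp=0.5)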
"""
# Create buckets from (approximately) -max_val to max_val
buckets = np.arange(-K + 1, K + 1, 2) / (K / max_val)
# Log potentials generator
def log_potential(X1, X2=None, temp=1):
if X2 is None:
X2 = X1[:, 1]
X1 = X1[:, 0]
return -1 * temp * np.abs(X1 - X2)
# Construct cliques from gibbs_graph
if gibbs_graph is None:
gibbs_graph = construct_gibbs_grid(n=n, p=p, temp=temp)
clique_dict = graph2cliques(gibbs_graph)
# Initialize
X = np.random.randn(n, p, 1)
dists = np.abs(X - buckets.reshape(1, 1, -1))
indices = dists.argmin(axis=-1)
X = buckets[indices]
# Marginalize / gibbs sample
for _ in range(num_iter):
for j in range(p):
# Cliques and marginals for this node
cliques = clique_dict[j]
marginals = np.zeros((n, K))
for clique in cliques:
# Calculate log-potential from this clique
X1 = buckets.reshape(1, -1)
X2 = X[:, clique[-1]].reshape(-1, 1)
marginals += log_potential(
X1=X1, X2=X2, temp=gibbs_graph[j, clique[-1]]
)
            # Conditional distribution of feature j given its neighbors.
            # Note the exp is the efficiency bottleneck here.
            marginals = np.exp(marginals.astype(np.float32))
marginals = marginals / marginals.sum(axis=-1, keepdims=True)
marginals = marginals.cumsum(axis=-1)
# Batched multinomial sampling using uniforms
# (faster than for loops + numpy)
unifs = np.random.uniform(size=(n, 1))
Xnew = buckets[np.argmax(unifs <= marginals, axis=-1)]
X[:, j] = Xnew
return X, gibbs_graph
class DGP:
"""
A utility class which creates a (random) data-generating process
for a design matrix X and a response y. If the parameters are
not specified, they will be randomly generated during the
``sample_data`` method.
Parameters
----------
mu : np.ndarray
``(p,)``-shaped mean vector for X data
Sigma : np.ndarray
``(p, p)``-shaped covariance matrix for X data
invSigma : np.ndarray
``(p, p)``-shaped precision matrix for X data
beta : np.ndarray
coefficients used to generate y from X in a single
index or sparse additive model.
gibbs_graph : np.ndarray
``(p, p)``-shaped matrix of coefficients for gibbs
grid method.
Attributes
----------
mu : np.ndarray
See above
Sigma : np.ndarray
See above
invSigma : np.ndarray
See above
beta : np.ndarray
See above
    gibbs_graph : np.ndarray
See above
X : np.ndarray
``(n, p)``-shaped design matrix
y : np.ndarray
``(n, )``-shaped response vector
"""
def __init__(
self,
mu=None,
Sigma=None,
invSigma=None,
beta=None,
gibbs_graph=None
):
self.mu = mu
self.Sigma = Sigma
self.invSigma = invSigma
self.beta = beta
self.gibbs_graph = gibbs_graph
    def sample_data(
self,
p=100,
n=50,
x_dist="gaussian",
method="AR1",
y_dist="gaussian",
cond_mean="linear",
coeff_size=1,
coeff_dist=None,
sparsity=0.5,
groups=None,
sign_prob=0.5,
iid_signs=True,
corr_signals=False,
df_t=DEFAULT_DF_T,
**kwargs,
):
"""
(Possibly) generates random data-generating parameters
and then samples data using those parameters. By default,
(X, y) are jointly Gaussian and y is a linear response to
X.
Parameters
----------
n : int
The number of data points
p : int
The dimensionality of the data
        x_dist : str
            Specifies the distribution of X. One of "gaussian", "blockt", "ar1t", or "gibbs".
        method : str
            How to generate the covariance matrix of X. One of
            "AR1", "NestedAR1", "partialcorr", "factor", "equi",
            "blockequi", "ver", "qer", "dirichlet", "wishart", or
            "uniformdot". See the docs for each of these methods.
        y_dist : str
            Specifies the distribution of y. One of "gaussian" or "binomial".
cond_mean : str
How to calculate the conditional mean of y given X. Defaults to
"linear". See ``knockpy.dgp.sample_response`` for more options.
coeff_size: float
Size of non-zero coefficients
coeff_dist : str
Specifies the distribution of nonzero coefficients. Three options:
            - ``None``: all coefficients have absolute value coeff_size.
            - ``normal``: nonzero coefficients are drawn from Normal(0, coeff_size).
            - ``uniform``: nonzero coeffs have absolute value drawn from Unif(coeff_size/2, coeff_size).
sparsity : float
Proportion of non-null coefficients.
Generates ``np.floor(p*sparsity)`` non-nulls.
groups : np.ndarray
A p-length array of integers from 1 to num_groups such that
``groups[j] == i`` indicates that variable `j` is a member of group `i`.
Defaults to ``None``. Else, floor(sparsity * num_groups) groups will be
chosen to be non-null, where all elements of each group are non-null.
sign_prob : float
The probability that each nonzero coefficient will be positive.
iid_signs : bool
If True, the signs of the coeffs are assigned independently.
Else, exactly sign_prob*sparsity*p coefficients will be positive.
corr_signals : bool
If true, all of the nonzero coefficients will lie in a consecutive block.
df_t : float
If the X variables are marginally t-distributed, the degrees of freedom.
kwargs: dict
keyword arguments to pass to method for generating the covariance matrix.
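        Examples
        --------
        A minimal sketch (illustrative values)::

            dgp = DGP()
            X, y, beta, invSigma, Sigma = dgp.sample_data(n=100, p=50, method="ar1")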
"""
# Defaults
if self.mu is None:
self.mu = np.zeros(p)
# grid / Gibbs Sampling
if x_dist == "gibbs":
# Sample X, Q
self.X, self.gibbs_graph = sample_gibbs(
n=n, p=p, gibbs_graph=self.gibbs_graph, **kwargs
)
# Estimate cov matrix
if self.Sigma is None:
self.Sigma, self.invSigma = utilities.estimate_covariance(
self.X, tol=1e-6, shrinkage=None
)
# Create covariance matrix
if self.invSigma is None and self.Sigma is None:
method = str(method).lower()
if method == "ar1":
self.Sigma = AR1(p=p, **kwargs)
elif method == "nestedar1":
self.Sigma = NestedAR1(p=p, **kwargs)
elif method == "partialcorr":
self.Sigma, self.invSigma = PartialCorr(p=p, **kwargs)
elif method == "factor":
self.Sigma, self.invSigma = FactorModel(p=p, **kwargs)
elif method == "equi" or method == "blockequi":
_,_,self.beta,self.invSigma,self.Sigma,_ = block_equi_graph(
p=p,
n=n,
coeff_size=coeff_size,
coeff_dist=coeff_dist,
sparsity=sparsity,
sign_prob=sign_prob,
iid_signs=iid_signs,
corr_signals=corr_signals,
beta=self.beta,
**kwargs,
)
elif method == "ver":
max_corr = kwargs.pop('max_corr', 1) # enforce maximum corr
self.Sigma = cov2corr(ErdosRenyi(p=p, **kwargs))
self.Sigma = np.maximum(np.minimum(self.Sigma, max_corr), -1*max_corr)
self.Sigma += np.eye(p) - np.diag(np.diag(self.Sigma))
elif method == "qer":
invSigma = ErdosRenyi(p=p, **kwargs)
Sigma = np.linalg.inv(invSigma)
self.Sigma, self.invSigma = utilities.cov2corr(Sigma, invSigma)
elif method == "dirichlet":
self.Sigma = DirichletCorr(p=p, **kwargs)
elif method == "wishart":
self.Sigma = Wishart(p=p, **kwargs)
elif method == "uniformdot":
self.Sigma = UniformDot(p=p, **kwargs)
            else:
                raise ValueError(f"method {method} not recognized")
        elif self.Sigma is None:
            self.Sigma = np.linalg.inv(self.invSigma)
            self.Sigma, self.invSigma = cov2corr(self.Sigma, self.invSigma)
        # Ensure the precision matrix is available
        if self.invSigma is None:
            self.invSigma = chol2inv(self.Sigma)
# Create sparse coefficients
if self.beta is None:
self.beta = create_sparse_coefficients(
p=p,
sparsity=sparsity,
coeff_size=coeff_size,
groups=groups,
sign_prob=sign_prob,
iid_signs=iid_signs,
coeff_dist=coeff_dist,
corr_signals=corr_signals,
n=n,
)
# Sample design matrix
if x_dist == "gibbs":
pass
elif x_dist == "gaussian":
self.X = np.dot(
np.random.randn(n, p),
np.linalg.cholesky(self.Sigma).T
) + self.mu.reshape(1, -1)
elif x_dist == "ar1t":
if str(method).lower() != "ar1":
raise ValueError(
f"For x_dist={x_dist}, method ({method}) should equal 'ar1'"
)
self.X = sample_ar1t(n=n, rhos=np.diag(self.Sigma, 1), df_t=df_t)
elif x_dist == "blockt":
blocks, _ = cov2blocks(self.Sigma)
self.X = sample_block_tmvn(blocks, n=n, df_t=df_t)
        else:
            raise ValueError(
                f"x_dist ({x_dist}) must be one of 'gaussian', 'gibbs', 'ar1t', 'blockt'"
            )
# Sample y
self.y = sample_response(
X=self.X, beta=self.beta, y_dist=y_dist, cond_mean=cond_mean
)
return self.X, self.y, self.beta, self.invSigma, self.Sigma
def create_correlation_tree(corr_matrix, method="single"):
""" Creates hierarchical clustering (correlation tree)
from a correlation matrix
Parameters
----------
corr_matrix : np.ndarray
``(p, p)``-shaped correlation matrix
method : str
the method of hierarchical clustering:
'single', 'average', 'fro', or 'complete'.
Defaults to 'single'.
Returns
-------
    link : np.ndarray
        The linkage matrix of the correlation tree, as in
        ``scipy.cluster.hierarchy``.
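    Examples
    --------
    A minimal sketch::

        link = create_correlation_tree(AR1(p=30), method="average")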
"""
# Distance matrix for tree method
if method == "fro":
dist_matrix = np.around(1 - np.power(corr_matrix, 2), decimals=7)
else:
dist_matrix = np.around(1 - np.abs(corr_matrix), decimals=7)
dist_matrix -= np.diagflat(np.diag(dist_matrix))
condensed_dist_matrix = ssd.squareform(dist_matrix)
# Create linkage
if method == "single":
link = hierarchy.single(condensed_dist_matrix)
elif method == "average" or method == "fro":
link = hierarchy.average(condensed_dist_matrix)
elif method == "complete":
link = hierarchy.complete(condensed_dist_matrix)
else:
raise ValueError(
f'Only "single", "complete", "average", "fro" are valid methods, not {method}'
)
return link
def create_grouping(Sigma, cutoff=0.95, **kwargs):
""" Creates a knockoff grouping from a covariance matrix
using hierarchical clustering.
Parameters
----------
Sigma : np.ndarray
``(p, p)``-shaped covariance matrix
    cutoff : float
The correlation at which to "cut" the correlation tree.
For method='single', cutoff=0.95 ensures that
no two variables in different groups have absolute correlation
greater than 0.95.
Returns
-------
groups : np.ndarray
`(p,)`-shaped array of groups for each variable
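    Examples
    --------
    A minimal sketch (illustrative values)::

        Sigma = AR1(p=50, rho=0.99)
        groups = create_grouping(Sigma, cutoff=0.95)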
"""
# Create correlation tree
corr_matrix = cov2corr(Sigma)
link = create_correlation_tree(
corr_matrix, **kwargs
)
return hierarchy.fcluster(link, 1-cutoff, criterion="distance")