Source code for knockpy.knockoffs

import warnings
import numpy as np
import scipy as sp
from scipy import stats
import scipy.linalg

from .utilities import calc_group_sizes, preprocess_groups
from .utilities import shift_until_PSD, scale_until_PSD
from . import utilities, smatrix, constants


### Base Knockoff Class and Gaussian Knockoffs


[docs]class KnockoffSampler: """ Base class for sampling knockoffs.""" def __init__(self): pass
[docs] def sample_knockoffs(self): raise NotImplementedError()
[docs] def fetch_S(self): """ Fetches knockoff S-matrix.""" raise NotImplementedError
[docs] def check_PSD_condition(self, Sigma, S): """ Checks that the feature-knockoff cov matrix is PSD. Parameters ---------- Sigma : np.ndarray ``(p, p)``-shaped covariance matrix of the features. If None, this is estimated using the ``shrinkage`` option. This is ignored for fixed-X knockoffs. S : np.ndarray the ``(p, p)``-shaped knockoff S-matrix used to generate knockoffs. Raises ------ Raises an error if S is not PSD or 2 Sigma - S is not PSD. """ # Check PSD condition try: np.linalg.cholesky(2 * Sigma - S) except np.linalg.LinAlgError: min_eig = utilities.calc_mineig(2 * Sigma - S) raise np.linalg.LinAlgError( f"Minimum eigenvalue of 2Sigma - S is {min_eig}, meaning FDR control violations are extremely likely" )
[docs] def many_ks_tests(self, sample1s, sample2s): """ Samples1s, Sample2s = list of arrays Gets p values by running ks tests and then does a multiple testing correction. """ # KS tests pvals = [] for s, sk in zip(sample1s, sample2s): result = stats.ks_2samp(s, sk) pvals.append(result.pvalue) pvals = np.array(pvals) # Naive Bonferroni correction adj_pvals = np.minimum(pvals.shape[0]*pvals, 1) return pvals, adj_pvals
[docs] def check_xk_validity(self, X, Xk, testname="", alpha=0.001): """ Runs a variety of KS tests on X and Xk to (informally) check that Xk are valid knockoffs for X. Uses the BHQ adjustment for multiple testing. Parameters ---------- X : np.ndarray the ``(n, p)``-shaped design Xk : np.ndarray the ``(n, p)``-shaped matrix of knockoffs testname : str a testname that shows up in the error alpha : float The significance level. Defaults to 0.001 """ n = X.shape[0] p = X.shape[1] # Marginal KS tests marg_pvals, marg_adj_pvals = self.many_ks_tests( sample1s=[X[:, j] for j in range(p)], sample2s=[Xk[:, j] for j in range(p)] ) min_adj_pval = marg_adj_pvals.min() if min_adj_pval < alpha: raise ValueError( f"For testname={testname}, MARGINAL ks tests reject with min_adj_pval={min_adj_pval}" ) # Pairwise KS tests pair_pvals, pair_adj_pvals = self.many_ks_tests( sample1s=[X[:, j] * X[:, j + 1] for j in range(p - 1)], sample2s=[Xk[:, j] * Xk[:, j + 1] for j in range(p - 1)], ) min_adj_pval = pair_adj_pvals.min() if min_adj_pval < alpha: raise ValueError( f"For testname={testname}, PAIRED ks tests reject with min_adj_pval={min_adj_pval}" ) # Pair-swapped KS tests pswap_pvals, pswap_adj_pvals = self.many_ks_tests( sample1s=[X[:, j] * Xk[:, j + 1] for j in range(p - 1)], sample2s=[Xk[:, j] * X[:, j + 1] for j in range(p - 1)], ) min_adj_pval = pswap_adj_pvals.min() if min_adj_pval < alpha: raise ValueError( f"For testname={testname}, PAIR SWAPPED ks tests reject with min_adj_pval={min_adj_pval}" )
[docs]def produce_MX_gaussian_knockoffs(X, mu, invSigma, S, sample_tol, copies): # Calculate MX knockoff moments... n, p = X.shape invSigma_S = np.dot(invSigma, S) mu_k = X - np.dot(X - mu.reshape(1, -1), invSigma_S) # This is a bottleneck?? Vk = 2 * S - np.dot(S, invSigma_S) # Account for numerical errors try: Lk = np.linalg.cholesky(Vk) except np.linalg.LinAlgError: min_eig = utilities.calc_mineig(Vk) warnings.warn( f"Minimum eigenvalue of Vk is {min_eig}, cholesky decomp failed, FDR violations possible" ) Vk = shift_until_PSD(Vk, sample_tol) Lk = np.linalg.cholesky(Vk) # ...and sample MX knockoffs! knockoffs = np.dot(Lk, np.random.randn(n, p, copies)) # result is (p, n, copies) knockoffs = np.transpose(knockoffs, [1, 0, 2]) # Add mu mu_k = np.expand_dims(mu_k, axis=2) knockoffs = knockoffs + mu_k return knockoffs
[docs]class GaussianSampler(KnockoffSampler): """ Samples MX Gaussian (group) knockoffs. Parameters ---------- X : np.ndarray the ``(n, p)``-shaped design mu : np.ndarray ``(p, )``-shaped mean of the features. If None, this defaults to the empirical mean of the features. Sigma : np.ndarray ``(p, p)``-shaped covariance matrix of the features. If None, this is estimated using the ``utilities.estimate_covariance`` function. groups : np.ndarray For group knockoffs, a p-length array of integers from 1 to num_groups such that ``groups[j] == i`` indicates that variable `j` is a member of group `i`. Defaults to None (regular knockoffs). S : np.ndarray the ``(p, p)``-shaped knockoff S-matrix used to generate knockoffs. This is defined such that Cov(X, tilde(X)) = Sigma - S. When None, will be constructed by knockoff generator. Defaults to None. method : str Specifies how to construct S matrix. This will be ignored if ``S`` is not None. There are several options: - 'mvr': Minimum Variance-Based Reconstructability knockoffs. - 'mmi': Minimizes the mutual information between X and the knockoffs. - 'ci': Conditional independence knockoffs. - 'sdp': minimize the mean absolute covariance (MAC) between the features and the knockoffs. - 'equicorrelated': Minimizes the MAC under the constraint that the the correlation between each feature and its knockoff is the same. The default is to use mvr for non-group knockoffs, and to use the group-SDP for grouped knockoffs (the implementation for group mvr knockoffs is currently fairly slow). In both cases we use a block-diagonal approximation if the number if features is greater than 1000. objective : str How to optimize the S matrix if using the SDP for group knockoffs. There are several options: - 'abs': minimize sum(abs(Sigma - S)) between groups and the group knockoffs. - 'pnorm': minimize Lp-th matrix norm. Equivalent to abs when p = 1. - 'norm': minimize different type of matrix norm (see norm_type below). sample_tol : float Minimum eigenvalue allowed for feature-knockoff covariance matrix. Keep this small but nonzero (1e-5) to prevent numerical errors. verbose : bool If True, prints progress over time rec_prop : float The proportion of knockoffs to recycle (see Barber and Candes 2018, https://arxiv.org/abs/1602.03574). If method = 'mvr', then S_generation takes this into account and should increase the power of recycled knockoffs. sparsely-correlated, high-dimensional settings. kwargs : dict Other kwargs for S-matrix solvers. """ def __init__( self, X, mu=None, Sigma=None, invSigma=None, groups=None, sample_tol=1e-5, S=None, method=None, verbose=False, **kwargs, ): # Save parameters with defaults self.X = X self.n = X.shape[0] self.p = X.shape[1] if mu is None: mu = X.mean(axis=0) self.mu = mu if Sigma is None: Sigma, invSigma = utilities.estimate_covariance(X, tol=1e-2) self.Sigma = Sigma if invSigma is None: invSigma = np.linalg.inv(Sigma) self.invSigma = invSigma if groups is None: groups = np.arange(1, self.p + 1, 1) self.groups = groups self.sample_tol = sample_tol self.verbose = verbose # Save S information and possibly compute S matrix self.S = S self.method = method self.kwargs = kwargs if self.S is None: if self.verbose: print(f"Computing knockoff S matrix...") self.S = smatrix.compute_smatrix( Sigma=self.Sigma, groups=self.groups, method=self.method, **self.kwargs )
[docs] def fetch_S(self): return self.S
[docs] def sample_knockoffs(self, check_psd=False): """ Samples knockoffs. returns n x p knockoff matrix. Parameters ---------- check_psd : bool If True, will check and enforce that S is a valid S-matrix. Defalts to False. """ if check_psd: self.check_PSD_condition(self.Sigma, self.S) self.Xk = produce_MX_gaussian_knockoffs( X=self.X, mu=self.mu, invSigma=self.invSigma, S=self.S, sample_tol=self.sample_tol, copies=1 )[:, :, 0] return self.Xk
[docs]def produce_FX_knockoffs(X, invSigma, S, copies=1): """ See equation (1.4) of https://arxiv.org/pdf/1404.5609.pdf """ # Calculate C matrix n, p = X.shape # invSigma_S = np.dot(invSigma, S) CTC = 2 * S - np.dot(S, np.dot(invSigma, S)) C = scipy.linalg.cholesky(CTC) # Calculate U matrix Q, _ = scipy.linalg.qr(np.concatenate([X, np.zeros((n, p))], axis=1)) U = Q[:, p : 2 * p] # Randomize if copies > 1 knockoff_base = np.dot(X, np.eye(p) - np.dot(invSigma, S)) if copies > 1: knockoffs = [] for j in range(copies): # Multiply U by random orthonormal matrix Qj, _ = scipy.linalg.qr(np.random.randn(p, p)) Uj = np.dot(U, Qj) # Calculate knockoffs knockoff_j = knockoff_base + np.dot(Uj, C) knockoffs.append(knockoff_j) else: # Calculate knockoffs and return knockoffs = [(knockoff_base + np.dot(U, C))] knockoffs = np.stack(knockoffs, axis=-1) return knockoffs
[docs]class FXSampler(KnockoffSampler): """ Samples FX knockoffs. See the GaussianSampler documentation for description of the arguments. """ def __init__( self, X, groups=None, sample_tol=1e-5, S=None, method=None, verbose=False, **kwargs, ): # Save data self.X = X.copy() self.n = X.shape[0] self.p = X.shape[1] if self.n < 2 * self.p: raise np.linalg.LinAlgError( f"FX knockoffs can't be generated with n ({self.n}) < 2p ({2*self.p})" ) self.Sigma = np.dot(self.X.T, self.X) self.invSigma = np.linalg.inv(self.Sigma) kwargs.pop("Sigma", None) kwargs.pop("invSigma", None) # Other parameters if groups is None: groups = np.arange(1, self.p + 1, 1) self.groups = groups self.sample_tol = sample_tol self.verbose = verbose # Save S information and possibly compute S matrix self.S = S self.method = method self.kwargs = kwargs if self.S is None: if self.verbose: print(f"Computing knockoff S matrix...") self.kwargs['tol'] = self.kwargs.get('tol', constants.DEFAULT_TOL / self.n) self.S = smatrix.compute_smatrix( Sigma=self.Sigma, groups=self.groups, method=self.method, **self.kwargs )
[docs] def fetch_S(self): """ Rescales S to the same scale as the initial X input """ return self.S
[docs] def sample_knockoffs(self, check_psd=False): """ Samples knockoffs. returns n x p knockoff matrix. Parameters ---------- check_psd : bool If True, will check and enforce that S is a valid S-matrix. Defalts to False. """ if check_psd: self.check_PSD_condition(self.Sigma, self.S) self.Xk = produce_FX_knockoffs( X=self.X, invSigma=self.invSigma, S=self.S, copies=1, )[:, :, 0] return self.Xk