import warnings
import numpy as np
import scipy as sp
import scipy.linalg
import scipy.sparse.linalg
from scipy.sparse.linalg import eigsh
import sklearn.covariance
import itertools
from multiprocessing import Pool
from functools import partial
### Group helpers
def preprocess_groups(groups):
"""
Maps the m unique elements of a 1D "groups" array to the integers from 1 to m.
"""
unique_vals = np.sort(np.unique(groups))
conversion = {unique_vals[i]: i for i in range(unique_vals.shape[0])}
return np.array([conversion[x] + 1 for x in groups])
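# A quick illustrative sketch of preprocess_groups (values are examples, not
# taken from the library's test suite): arbitrary labels map to 1, ..., m.
#   >>> preprocess_groups(np.array([0.5, 0.5, 3, 12]))
#   array([1, 1, 2, 3])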
def fetch_group_nonnulls(non_nulls, groups):
"""
    Combines feature-level null hypotheses into group-level hypotheses.
    Assumes ``groups`` takes values in 1 through m (see ``preprocess_groups``).
"""
if not isinstance(non_nulls, np.ndarray):
non_nulls = np.array(non_nulls)
if not isinstance(groups, np.ndarray):
groups = np.array(groups)
# Initialize
m = np.unique(groups).shape[0]
group_nonnulls = np.zeros(m)
# Calculate and return
for j in range(m):
flag = np.abs(non_nulls[groups == j + 1]).sum() > 0
group_nonnulls[j] = float(flag)
return group_nonnulls
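# Illustrative sketch (example values): with features 0-1 in group 1 and
# feature 2 in group 2, a single nonzero coefficient flags only group 2.
#   >>> fetch_group_nonnulls([0, 0, 1.5], [1, 1, 2])
#   array([0., 1.])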
def calc_group_sizes(groups):
"""
Given a list of groups, finds the sizes of the groups.
Parameters
----------
groups : np.ndarray
``(p, )``-shaped array which takes m integer values from
1 to m. If ``groups[i] == j``, this indicates that coordinate
``i`` belongs to group ``j``.
Returns
-------
sizes : np.ndarray
``(m, )``-length array of group sizes.
"""
if not isinstance(groups, np.ndarray):
groups = np.array(groups)
    if np.any(groups.astype("int32") != groups):
        raise TypeError(
            "groups cannot contain non-integer values: apply preprocess_groups first"
        )
    groups = groups.astype("int32")
if np.min(groups) == 0:
raise ValueError(
"groups cannot contain 0: add one or apply preprocess_groups first"
)
m = groups.max()
group_sizes = np.zeros(m)
for j in groups:
group_sizes[j - 1] += 1
group_sizes = group_sizes.astype("int32")
return group_sizes
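# Illustrative sketch (example values):
#   >>> calc_group_sizes([1, 1, 2, 3, 3, 3])
#   array([2, 1, 3], dtype=int32)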
### Matrix helpers for S-matrix computation
def cov2corr(M, invM=None):
"""
    Rescales a p x p covariance matrix M to be a correlation matrix.
    If invM is not None, also rescales invM to be the inverse of the
    resulting correlation matrix.
"""
scale = np.sqrt(np.diag(M))
if invM is None:
return M / np.outer(scale, scale)
outer = np.outer(scale, scale)
return M / outer, invM * outer
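# Illustrative sketch: rescaling a 2 x 2 covariance matrix with variances
# 4 and 9 (example values):
#   >>> cov2corr(np.array([[4.0, 1.0], [1.0, 9.0]]))
#   array([[1.        , 0.16666667],
#          [0.16666667, 1.        ]])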
def chol2inv(X):
""" Uses cholesky decomp to get inverse of matrix """
triang = np.linalg.cholesky(X)
tinv, _ = scipy.linalg.lapack.dtrtri(
c=triang, lower=True, overwrite_c=False
)
return np.dot(tinv.T, tinv)
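# Note: X must be symmetric positive definite for the Cholesky factorization
# to exist. A minimal round-trip sketch (example values):
#   >>> A = np.array([[2.0, 1.0], [1.0, 2.0]])
#   >>> np.allclose(chol2inv(A), np.linalg.inv(A))
#   True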
def calc_mineig(M):
"""
Calculates the minimum eigenvalue of a square symmetric matrix M
"""
    # The dense solver is faster and more reliable for small matrices;
    # for larger ones, use ARPACK to find only the smallest eigenvalue.
    if M.shape[0] < 1500:
return np.linalg.eigh(M)[0].min()
else:
try:
return eigsh(
M,
1,
which='SA',
return_eigenvectors=False,
maxiter=1000,
tol=1e-5
)[0]
        except scipy.sparse.linalg.ArpackNoConvergence:
return np.linalg.eigh(M)[0].min()
def shift_until_PSD(M, tol):
""" Add the identity until a p x p matrix M has eigenvalues of at least tol"""
p = M.shape[0]
mineig = calc_mineig(M)
if mineig < tol:
M = M + (tol - mineig) * np.eye(p)
return M
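# Illustrative sketch: [[1, 2], [2, 1]] has eigenvalues -1 and 3, so with
# tol=0 the function adds 1 * I (example values):
#   >>> shift_until_PSD(np.array([[1.0, 2.0], [2.0, 1.0]]), tol=0)
#   array([[2., 2.],
#          [2., 2.]])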
def scale_until_PSD(Sigma, S, tol, num_iter):
"""
Perform a binary search to find the largest ``gamma`` such that the minimum
eigenvalue of ``2*Sigma - gamma*S`` is at least ``tol``.
Returns
-------
gamma * S : np.ndarray
See description.
gamma : float
See description
"""
    # Shift S to be PSD if it is not already
try:
np.linalg.cholesky(S)
except np.linalg.LinAlgError:
S = shift_until_PSD(S, tol)
# Binary search to find minimum gamma
    lower_bound = 0  # largest gamma known to be feasible
    upper_bound = 1  # smallest gamma known to be infeasible
for j in range(num_iter):
gamma = (lower_bound + upper_bound) / 2
V = 2 * Sigma - gamma * S
try:
np.linalg.cholesky(V - tol * np.eye(V.shape[0]))
lower_bound = gamma
except np.linalg.LinAlgError:
upper_bound = gamma
# Scale S properly, be a bit conservative
S = lower_bound * S
return S, lower_bound
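# Design note: the binary search maintains the invariant that lower_bound is
# feasible and upper_bound is infeasible (assuming gamma = 0 is feasible,
# i.e. the minimum eigenvalue of 2 * Sigma is at least tol), so the returned
# gamma is within 2 ** (-num_iter) of the largest feasible value.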
def permute_matrix_by_groups(groups):
"""
Create indices which permute a (covariance) matrix according to a list of groups.
"""
# Create sorting indices
inds_and_groups = [(i, group) for i, group in enumerate(groups)]
inds_and_groups = sorted(inds_and_groups, key=lambda x: x[1])
inds = [i for (i, j) in inds_and_groups]
# Make sure we can unsort
p = groups.shape[0]
inv_inds = np.zeros(p)
for i, j in enumerate(inds):
inv_inds[j] = i
inv_inds = inv_inds.astype("int32")
return inds, inv_inds
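# Usage sketch (example values; `Sigma` below stands for any (4, 4) array,
# it is not defined in this module):
#   >>> inds, inv_inds = permute_matrix_by_groups(np.array([2, 1, 2, 1]))
#   >>> inds
#   [1, 3, 0, 2]
#   >>> # Sigma[np.ix_(inds, inds)] makes each group's coordinates contiguous;
#   >>> # indexing the result with np.ix_(inv_inds, inv_inds) recovers Sigma.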
def blockdiag_to_blocks(M, groups):
"""
Given a square array `M`, returns a list of diagonal blocks of `M` as specified by `groups`.
Parameters
----------
M : np.ndarray
``(p, p)``-shaped array
groups : np.ndarray
``(p, )``-shaped array with m unique values. If ``groups[i] == j``,
this indicates that coordinate ``i`` belongs to group ``j``.
Returns
-------
blocks : list
        A list of square np.ndarrays. ``blocks[i]`` corresponds to the group
        identified by the ith smallest unique value of ``groups``.
"""
blocks = []
for j in np.sort(np.unique(groups)):
inds = np.where(groups == j)[0]
full_inds = np.ix_(inds, inds)
blocks.append(M[full_inds].copy())
return blocks
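# Illustrative sketch (example values): groups [1, 2, 2] splits a 3 x 3
# matrix into a 1 x 1 and a 2 x 2 diagonal block.
#   >>> M = np.arange(9.0).reshape(3, 3)
#   >>> [b.shape for b in blockdiag_to_blocks(M, np.array([1, 2, 2]))]
#   [(1, 1), (2, 2)]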
### Feature-statistic helpers
def random_permutation_inds(length):
""" Returns indexes which will randomly permute/unpermute a numpy
array of length `length`. Also returns indices which will
undo this permutation.
Returns
-------
inds : np.ndarray
``(length,)``-shaped ndarray corresponding to a random permutation
from 0 to `length`-1.
rev_inds : np.ndarray
``(length,)``-shaped ndarray such that for any ``(length,)``-shaped
array called ``x``, ``x[inds][rev_inds]`` equals ``x``.
"""
# Create inds and rev inds
inds = np.arange(0, length, 1)
np.random.shuffle(inds)
    rev_inds = np.zeros(length, dtype=np.int32)
    for (i, j) in enumerate(inds):
        rev_inds[j] = i
return inds, rev_inds
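# Round-trip sketch: rev_inds inverts inds, so for any array x of the right
# length, x[inds][rev_inds] equals x elementwise.
#   >>> inds, rev_inds = random_permutation_inds(5)
#   >>> x = np.random.randn(5)
#   >>> np.all(x[inds][rev_inds] == x)
#   True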
### Helper for MX knockoffs when we infer Sigma
def estimate_factor(Sigma, num_factors=20, num_iter=10):
"""
Approximates ``Sigma = np.diag(D) + np.dot(U, U.T)``.
See https://arxiv.org/pdf/2006.08790.pdf.
Parameters
----------
Sigma : np.ndarray
``(p, p)``-shaped covariance matrix of X
    num_factors : int
        Dimensionality of ``U``.
    num_iter : int
        Number of coordinate-ascent iterations.
Notes
-----
TODO: allow X as an input when Sigma does not
fit in memory.
Returns
-------
D : np.ndarray
``(p,)``-shaped array of diagonal elements.
U : np.ndarray
``(p, num_factors)``-shaped array.
"""
p = Sigma.shape[0]
# Problem is trivial in this case
if num_factors >= p:
        return np.zeros(p), sp.linalg.sqrtm(Sigma)
# Coordinate ascent
D = np.zeros(p)
for i in range(num_iter):
evals, evecs = eigsh(Sigma-np.diag(D), num_factors, which='LM')
        # Clip negative eigenvalues before the square root to avoid NaNs
        U = np.dot(evecs, np.diag(np.sqrt(np.maximum(0, evals))))
        D = np.diag(Sigma) - np.power(U, 2).sum(axis=1)
#loss = np.power(Sigma - np.diag(D) - np.dot(U, U.T), 2).sum()
return D, U
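# Usage sketch (illustrative, with a randomly generated covariance matrix):
#   >>> Sigma = cov2corr(np.cov(np.random.randn(500, 30).T))
#   >>> D, U = estimate_factor(Sigma, num_factors=5)
#   >>> approx = np.diag(D) + np.dot(U, U.T)  # low-rank plus diagonal estimate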
def estimate_covariance(X, tol=1e-4, shrinkage="ledoitwolf", **kwargs):
""" Estimates covariance matrix of X.
Parameters
----------
X : np.ndarray
``(n, p)``-shaped design matrix
shrinkage : str
The type of shrinkage to apply during estimation. One of
"ledoitwolf", "graphicallasso", or None (no shrinkage).
tol : float
If shrinkage is None but the minimum eigenvalue of the MLE
is below tol, apply LedoitWolf shrinkage anyway.
kwargs : dict
kwargs to pass to the shrinkage estimator.
Returns
-------
Sigma : np.ndarray
``(p, p)``-shaped estimated covariance matrix of X
invSigma : np.ndarray
``(p, p)``-shaped estimated precision matrix of X
"""
    # Parse "none"/"mle" strings, both of which mean no shrinkage
    if str(shrinkage).lower() in ("none", "mle"):
shrinkage = None
if shrinkage is None:
Sigma = np.cov(X.T)
mineig = calc_mineig(Sigma)
if mineig < tol:
shrinkage = 'ledoitwolf'
# Possibly shrink Sigma
if shrinkage is not None:
# Which shrinkage to use
if str(shrinkage).lower() == "ledoitwolf":
ShrinkEst = sklearn.covariance.LedoitWolf(**kwargs)
elif str(shrinkage).lower() == "graphicallasso":
kwargs['alpha'] = kwargs.get('alpha', 0.1) # Default regularization
ShrinkEst = sklearn.covariance.GraphicalLasso(**kwargs)
else:
raise ValueError(
f"Shrinkage arg must be one of None, 'ledoitwolf', 'graphicallasso', not {shrinkage}"
)
# Fit shrinkage. Sometimes the Graphical Lasso raises errors
# so we handle these here.
try:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
ShrinkEst.fit(X)
except FloatingPointError:
warnings.warn(f"Graphical lasso failed, LedoitWolf matrix")
ShrinkEst = sklearn.covariance.LedoitWolf()
ShrinkEst.fit(X)
# Return
Sigma = ShrinkEst.covariance_
invSigma = ShrinkEst.precision_
return Sigma, invSigma
    # Else return the empirical estimate and its inverse (Sigma is guaranteed
    # to have minimum eigenvalue at least tol at this point)
    return Sigma, chol2inv(Sigma)
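# Usage sketch (illustrative):
#   >>> X = np.random.randn(200, 50)
#   >>> Sigma, invSigma = estimate_covariance(X, shrinkage="ledoitwolf")
#   >>> Sigma.shape, invSigma.shape
#   ((50, 50), (50, 50))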
### Multiprocessing helper
def _one_arg_function(list_of_inputs, args, func, kwargs):
"""
Globally-defined helper function for pickling in multiprocessing.
    :param list_of_inputs: List of inputs to a function
:param args: Names/args for those inputs
:param func: A function
:param kwargs: Other kwargs to pass to the function.
"""
new_kwargs = {}
for i, inp in enumerate(list_of_inputs):
new_kwargs[args[i]] = inp
return func(**new_kwargs, **kwargs)
def apply_pool(func, constant_inputs={}, num_processes=1, **kwargs):
"""
    Spawns up to ``num_processes`` processes to apply ``func`` to many different
    arguments. This wraps ``multiprocessing.Pool`` and ``functools.partial``.
Parameters
----------
func : function
An arbitrary function
constant_inputs : dictionary
A dictionary of arguments to func which do not change in each
of the processes spawned, defaults to {}.
num_processes : int
The maximum number of processes spawned, defaults to 1.
kwargs : dict
Each key should correspond to an argument to func and should
map to a list of different arguments.
Returns
-------
outputs : list
List of outputs for each input, in the order of the inputs.
Examples
--------
If we are varying inputs 'a' and 'b', we might have
``apply_pool(
func=my_func, a=[1,3,5], b=[2,4,6]
)``
which would return ``[my_func(a=1, b=2), my_func(a=3,b=4), my_func(a=5,b=6)]``.
"""
# Construct input sequence
args = sorted(kwargs.keys())
num_inputs = len(kwargs[args[0]])
for arg in args:
if len(kwargs[arg]) != num_inputs:
raise ValueError(f"Number of inputs differs for {args[0]} and {arg}")
inputs = [[] for _ in range(num_inputs)]
for arg in args:
for j in range(num_inputs):
inputs[j].append(kwargs[arg][j])
# Construct partial function
partial_func = partial(
_one_arg_function, args=args, func=func, kwargs=constant_inputs,
)
# Don't use the pool object if num_processes=1
num_processes = min(num_processes, len(inputs))
if num_processes == 1:
all_outputs = []
for inp in inputs:
all_outputs.append(partial_func(inp))
else:
with Pool(num_processes) as thepool:
all_outputs = thepool.map(partial_func, inputs)
return all_outputs
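# Concrete sketch with a hypothetical function `f` (not part of this module):
#   >>> def f(a, b, c):
#   ...     return a + b + c
#   >>> apply_pool(f, constant_inputs={"c": 10}, num_processes=1, a=[1, 2], b=[3, 4])
#   [14, 16]
# Note: with num_processes > 1, `func` must be picklable, i.e. defined at the
# top level of a module.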
### Dependency management
def check_kpytorch_available(purpose):
try:
import torch
except ImportError as err:
raise ValueError(
f"Pytorch is required for {purpose}, but importing torch raised {err}. See https://pytorch.org/get-started/."
)
def check_pyglmnet_available(purpose):
try:
import pyglmnet
except ImportError as err:
raise ValueError(
f"pyglmnet is required for {purpose}, but importing pyglmnet raised {err}. See https://github.com/glm-tools/pyglmnet/."
)