Source code for cocoatree.deconvolution

import numpy as np
from .__params import __freq_regularization_ref
from cocoatree.randomize import _randomize_seqs_conserving_col_compo
from cocoatree.msa import compute_seq_weights
from cocoatree.statistics.pairwise import compute_sca_matrix
from cocoatree.pysca import _compute_ica, _icList


[docs] def extract_independent_components(coevo_matrix, method=None, n_components=3, nrandom_pySCA=10, sequences=None, learnrate_ICA=0.1, nb_iterations_ICA=100000, freq_regul=__freq_regularization_ref, verbose_random_iter=True): """ Extract independent components from a coevolution matrix The current method is fully applicable to SCA analysis. For other metrics, we set n_components = 3 (to improve) Parameters ---------- coevo_matrix : np.ndarray coevolution matrix sequences : list of sequences, optional, default: None when using pySCA's strategy to estimate the number of components, sequences needs to be provided. method : {None, "pysca"}, default=None Methods to use to estimate the number of components to extract. By default, relies on the number of components provided by the user. n_components : int, default=3, Number of independent components to extract nrandom_pySCA : int, default=10, Number of MSA randomizations to perform if method='pySCA' learnrate_ICA : int, default=0.1, Learning rate / relaxation parameter used if method='pySCA' nb_iteration_ICA : int, default=100000, Number of iterations if method='pySCA' freq_regul : regularization parameter (default=__freq_regularization_ref) verbose_random_iter : Boolean Returns ------- idpt_components : ndarray of shape (n_components, n_pos) corresponding to a list of independent components """ if method is not None: if method == 'pySCA': if sequences is None: raise ValueError( "Sequences need to be provided to estimate" "the number of components automatically") n_components = _compute_n_components_as_pySCA( sequences, coevo_matrix, nrandom=nrandom_pySCA, freq_regul=freq_regul, verbose_random_iter=verbose_random_iter) else: raise ValueError( f"{method} is not a valid method. Options are None, 'pySCA'") V, S, Vt = np.linalg.svd(coevo_matrix) Vica, _ = _compute_ica(V, n_components, learnrate=learnrate_ICA, iterations=nb_iterations_ICA) idpt_components = Vica.T return idpt_components
def _compute_n_components_as_pySCA(sequences, coevo_matrix, seq_weights=None, nrandom=10, freq_regul=__freq_regularization_ref, verbose_random_iter=True): """ Compute the number of independent components as in pySCA Given the eigenvalues of the coevolution matrix, and the eigenvalues for the set of randomized matrices, return the number of significant eigenmodes as those above the average second eigenvalue plus 2 standard deviations. Based on S1 text of Rivoire et al. (2016) Rem: it concerns only SCA metrics For other merics (MI, adding corrections) this should be adapted Parameters ---------- sequences : list of sequences coevo_matrix : np.ndarray of shape (n_pos, n_pos) coevolution matrix seq_weights : np.array (nseq, ) of each sequence weight nrandom : int Number of randomizations performed freq_regul : regularization parameter (default=__freq_regularization_ref) verbose_random_iter : boolean, default=True Print the advance of the randomization procedure Returns ------- n_components : int Number of independent components to select """ if seq_weights is None: seq_weights, m_eff = compute_seq_weights(sequences) else: m_eff = np.sum(seq_weights) second_eigen_values_random = [] for irand in range(nrandom): if verbose_random_iter: print('%d/%d randomized msa (to compute number of\ significant components) ' % (irand+1, nrandom), end='\r') rand_sequences = _randomize_seqs_conserving_col_compo(sequences) seq_weights_rand, m_eff_rand = compute_seq_weights(rand_sequences) # to get the correct m_eff seq_weights_rand = seq_weights_rand / m_eff_rand * m_eff SCA_rand = compute_sca_matrix(rand_sequences, seq_weights_rand, freq_regul=freq_regul) _, S, _ = np.linalg.svd(SCA_rand) second_eigen_values_random.append(S[1]) mean_second_ev_rand = np.mean(second_eigen_values_random) std_second_ev_rand = np.std(second_eigen_values_random) _, S_input, _ = np.linalg.svd(coevo_matrix) n_components = len(S_input[S_input > mean_second_ev_rand + 2 * std_second_ev_rand]) return n_components
[docs] def extract_principal_components(coevo_matrix): """ Perform principal component decomposition of a coevolution matrix Parameters ---------- coevo_matrix : np.ndarray coevolution matrix Returns ------- principal_components : np.ndarray (n_pos, n_pos) Principal components obtained from the PCA of the coevolution matrix """ _, _, principal_components = np.linalg.svd(coevo_matrix) return principal_components
[docs] def extract_sectors(idpt_components, coevo_matrix): """ Extract residue positions of sectors Parameters ---------- idpt_components : independent components obtained from an ICA coevo_matrix : coevolution matrix Returns ------- sectors : lists of residue positions on the filtered MSA for each of the n_components sector """ Vica = idpt_components.T _, sector_sizes, sorted_pos, _, _, _ = _icList( Vica, len(idpt_components), coevo_matrix) sectors = [[sorted_pos[i] for i in range(sector_sizes[0])]] ref_index = sector_sizes[0] for isize in range(1, len(sector_sizes)): sectors.append([sorted_pos[i] for i in range(ref_index, ref_index + sector_sizes[isize])]) ref_index += sector_sizes[isize] return sectors
[docs] def substract_first_principal_component(coevo_matrix): """ Remove global correlations In the sector literature (and data analysis), this corresponds to removing global correlations (from e.g. phylogenetic effects) Parameters ---------- coevo_matrix : np.ndarray (n_pos, n_pos), coevolution matrix Returns ------- coevo_matrix_sub : np.ndarray (n_pos, n_pos), coevolution matrix without global correlations """ U, S, Vt = np.linalg.svd(coevo_matrix) S[0] = 0 coevo_matrix_sub = np.maximum(np.linalg.multi_dot([U, np.diag(S), Vt]), 0) return coevo_matrix_sub