Source code for cocoatree.statistics.position

import numpy as np
from ..__params import lett2num, __freq_regularization_ref, __aa_count, __freq0
from ..msa import compute_seq_weights


def _compute_aa_freqs(sequences, seq_weights=None,
                      freq_regul=__freq_regularization_ref):
    """Computes frequencies of amino acids at each position of the alignment.

    .. math::
        f_i^a = (\\sum_s w_s x_{si}^a + \\lambda/21)/(M_{eff} + \\lambda)

    where

    .. math::

        M_{eff} = \\sum_s w_s

    represents the effective number of sequences in the alignment and *lambda*
    is a regularization parameter (pseudocount).

    Parameters
    ----------
    sequences : list of sequences as imported by load_msa()

    seq_weights : numpy 1D array, optional
            Gives more or less importance to certain sequences. If
            seq_weights=None, all sequences are attributed an equal weight
            of 1.

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    Returns
    -------
    aa_freq : np.ndarray of shape (Npos, aa_count)
            frequency of amino acid *a* at position *i*
    """

    tmp = np.array([[char for char in row] for row in sequences])
    binary_array = np.array([tmp == aa for aa in lett2num.keys()]).astype(int)
    # weights
    if seq_weights is None:
        seq_weights = np.ones(len(sequences))
    m_eff = np.sum(seq_weights)
    weighted_binary_array = \
        binary_array * seq_weights[np.newaxis, :, np.newaxis]
    aa_freq = (np.sum(weighted_binary_array, axis=1).T
               + freq_regul * m_eff / __aa_count) / ((1 + freq_regul) * m_eff)

    return aa_freq


def _compute_background_freqs(aa_freqs, sequences, seq_weights=None,
                              freq_regul=__freq_regularization_ref):
    """Computes (regularized) background frequencies of amino acids

    Parameters
    ----------
    aa_freqs : np.ndarray of the positional amino acid frequencies

    sequences : list of sequences for which seq_weights give weights

    seq_weights : numpy 1D array, optional
            Gives more or less importance to certain sequences.
            If seq_weights=None, all sequences are attributed an equal weight
            of 1.

    freq_regul : regularization parameter (default=__freq_regularization_ref)


    Returns
    -------
    bkgd_freqs :  np.ndarray (21, )
        A (21,) np.array containing the background amino acid frequencies
        at each position; it is computed from the mean frequency of amino acid
        *a* in all proteins in the NCBI non-redundant database
        (see Rivoire et al., https://dx.plos.org/10.1371/journal.pcbi.1004817)
    """

    # q0 : fraction of gaps in the alignment
    q0 = np.mean(aa_freqs[:, 0])
    # background_freq : correction factor on __freq0 in order to take the
    # proportion of gaps into account
    bkgd_freqs = list((1 - q0) * __freq0)
    bkgd_freqs.insert(0, q0)
    bkgd_freqs = np.array(bkgd_freqs)

    # weights
    if seq_weights is None:
        seq_weights = np.ones(len(sequences))
    m_eff = np.sum(seq_weights)

    # regularization
    bkgd_freqs = (bkgd_freqs * m_eff +
                  freq_regul * m_eff / __aa_count) / ((1 + freq_regul) * m_eff)

    return bkgd_freqs


def _compute_first_order_freqs(sequences, seq_weights=None,
                               freq_regul=__freq_regularization_ref):
    """
    Compute amino acid frequencies at each position and background frequencies

    Parameters
    ----------
    sequences : list of sequences for which seq_weights gives weights

    seq_weights : numpy 1D array, optional, default=None
        Gives more or less importance to certain sequences.
        If seq_weights=None, will compute sequence weights

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    Returns
    -------
    aa_freqs : np.ndarray of the positional amino acid frequencies

    bkgd_freqs : np.ndarray (21, )
    A (21, ) np.array containing the background amino acid frequencies at each
    position. It is computed from the mean frequency of amino acid *a* in all
    proteins in the NCBI non-redundant database.

    (see Rivoire et al., https://dx.plos.org/10.1371/journal.pcbi.1004817)
    """

    if seq_weights is None:
        seq_weights, _ = compute_seq_weights(sequences)

    aa_freqs = _compute_aa_freqs(
        sequences,
        freq_regul=freq_regul,
        seq_weights=seq_weights)

    bkgd_freqs = _compute_background_freqs(
        aa_freqs,
        sequences,
        seq_weights=seq_weights,
        freq_regul=__freq_regularization_ref)

    return aa_freqs, bkgd_freqs


[docs] def compute_entropy(aa_freq): """Computes Shannon's entropy for each position in the alignment .. math:: H(a) = -\\sum_i f_{ia} \\log f_{ia} where *H(a)* is the relative entropy of amino acid *a*, *fia* is the frequency of amino acid *a* at position *i* Parameters ---------- aa_freq : np.ndarray, amino acid frequencies per position Returns ------- s: array of shape (N_pos) """ s = -np.sum(aa_freq * np.log(aa_freq), axis=1) return s
[docs] def compute_conservation(sequences, seq_weights=None, freq_regul=__freq_regularization_ref): """ Compute the conservation of amino acid at each position. The conservation is computed as the relative entropy (e.g., the Kullback-Leibler divergence) .. math:: D_i^a = f_i^a \\ln \\frac{f_i^a}{q^a} + (1 - f_i^a) \\ln \ \\frac{1 - f_i^a}{1 - q^a} where :math:`f_i^a` is the observed frequency of amino acid `a` at position i`, :math:`q^a` is the background expectation :math:`D_i^a` indicates how unlikely the observed frequencies of amino acid `a` at position `i` would be if `a` occurred randomly with probability :math:`q^a`. Parameters ---------- sequences : list of sequences seq_weights : ndarray (nseq), optional, default: None if None, will compute sequence weights freq_regul : regularization parameter (default=__freq_regularization_ref) Returns ------- Di : np.ndarray (npos,) where each entry corresponds to the conservation at this position in the sequences. """ aa_freqs, bkgd_freqs = _compute_first_order_freqs(sequences, seq_weights, freq_regul) _, Di = _compute_rel_entropy(aa_freqs, bkgd_freqs) return Di
def _compute_rel_entropy(aa_freqs, bkgd_freqs): """Compute the relative entropy Also know as the Kullback-Leibler divergence .. math:: D_i^a = f_i^a \\ln \\frac{f_i^a}{q^a} + (1 - f_i^a) \\ln \ \\frac{1 - f_i^a}{1 - q^a} where f_i^a is the observed frequency of amino acid *a* at position *i*, q^a is the background expectation D_i^a is known as the Kullback-Leibler relative entropy (Cover and Thomas, 2012) and indicates how unlikely the observed frequencies of amino acid *a* at position *i* would be if *a* occurred randomly with probability q^a. Parameters ---------- aa_freqs: np.ndarray, amino acid frequencies per position bck_freq: np.ndarray, background frequenvies of amino acids returns ------- Dia: np.ndarray, relative entropy of aa_freq given the background distribution of amino acids. Indicates how unlikely the observed frequency of amino acid *a* at position *i* would be if a occurred randomly with probability background_freq Di: np.ndarray, overall conservation of position *i* taking all amino acids into account """ Dia = aa_freqs * np.log(aa_freqs / bkgd_freqs) + \ (1 - aa_freqs) * np.log((1 - aa_freqs) / (1 - bkgd_freqs)) # sum on all amino acid at each position Di = np.sum(aa_freqs * np.log(aa_freqs / bkgd_freqs), axis=1) return Dia, Di