# Source code for dandelion.utilities._utilities

#!/usr/bin/env python
# @Author: kt16
# @Date:   2020-05-12 14:01:32
# @Last Modified by:   Kelvin
# @Last Modified time: 2021-02-20 09:53:35

import os
from collections import defaultdict
from collections.abc import Iterable, Iterator
from subprocess import run
from typing import Sequence, Tuple, Dict

import numpy as np
import pandas as pd
# Resolve `Literal` from the best available source: the standard library
# first, then the `typing_extensions` backport, and finally a minimal local
# stand-in that only supports subscription (``Literal["a", "b"]``).
try:
    from typing import Literal
except ImportError:
    try:
        from typing_extensions import Literal
    except ImportError:
        class LiteralMeta(type):
            """Metaclass that makes the stand-in `Literal` subscriptable."""

            def __getitem__(cls, item):
                # A single subscript arrives bare; several arrive as a tuple.
                args = item if isinstance(item, tuple) else (item,)
                # Build a throwaway subclass carrying the allowed values in
                # `__args__`.
                return type("Literal_", (Literal,), {"__args__": args})

        class Literal(metaclass=LiteralMeta):
            """Stand-in used when no real `Literal` can be imported."""


class Tree(defaultdict):
    '''
    Recursive defaultdict: looking up a missing key creates and stores a new,
    empty Tree under that key, so nested levels can be assigned directly.

    Every node also carries a `value` attribute (None unless one is given).
    '''

    def __init__(self, value=None):
        self.value = value
        # Tree itself is the default factory; this is what makes it recursive.
        super().__init__(Tree)


def dict_from_table(meta: pd.DataFrame, columns: Tuple[str, str]) -> Dict:
    """
    Generates a dictionary from two columns of a dataframe

    Parameters
    ----------
    meta
        pandas dataframe, or path to a tab-delimited file. A file is read
        with every column as `object` dtype.
    columns
        exactly two column names in the dataframe: the first supplies the
        keys, the second supplies the values.

    Returns
    -------
        dictionary mapping entries of the first column to entries of the
        second column, after being passed through `clean_nan_dict`. When a
        key occurs more than once, the last row wins.

    Raises
    ------
    ValueError
        if `columns` is None or does not hold exactly two names, or if
        `meta` is neither a dataframe nor a path to an existing file.
    """
    # Validate up front: previously these cases fell through both branches
    # and surfaced as an UnboundLocalError on `sample_dict`.
    if columns is None or len(columns) != 2:
        raise ValueError(
            "columns must contain exactly two column names, got "
            "{!r}".format(columns))

    if isinstance(meta, pd.DataFrame):
        meta_ = meta
    elif os.path.isfile(str(meta)):
        meta_ = pd.read_csv(meta, sep='\t', dtype='object')
    else:
        raise ValueError(
            "meta must be a pandas DataFrame or a path to an existing "
            "tab-delimited file, got {!r}".format(meta))

    sample_dict = dict(zip(meta_[columns[0]], meta_[columns[1]]))
    return clean_nan_dict(sample_dict)


def clean_nan_dict(d: Dict) -> Dict:
    """
    Drops every entry whose value is a floating-point NaN.

    Parameters
    ----------
    d : Dict
        dictionary

    Returns
    -------
        new dictionary with no NaN values. Non-float values (including
        None) are kept as they are; `d` itself is not modified.
    """
    # NaN has to be detected by value: an identity test against `np.nan`
    # misses NaNs that are distinct objects, e.g. float('nan') or the
    # np.float64 scalars produced when iterating a float column.
    return {
        k: v
        for k, v in d.items()
        if not (isinstance(v, (float, np.floating)) and np.isnan(v))
    }


def flatten(l: Iterable) -> Iterator:
    """
    Lazily flattens an arbitrarily nested iterable.

    Parameters
    ----------
    l : Iterable
        a list-in-list list (any nesting depth)

    Yields
    ------
        the non-iterable leaves in depth-first, left-to-right order.
        Strings and bytes are treated as leaves and are not split into
        characters.

    Notes
    -----
    This is a generator; wrap the call in `list(...)` to obtain a
    flattened list.
    """
    for el in l:
        # str/bytes are iterable too, but must be yielded whole (descending
        # into them would also recurse forever on one-character strings).
        if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el


def makeblastdb(ref: str) -> None:
    """
    Runs makeblastdb on constant region fasta file

    Parameters
    ----------
    ref : str
        constant region fasta file

    Returns
    -------
        None. The `makeblastdb` executable is invoked as a subprocess; its
        exit status is not checked.
    """
    # Argument list (no shell) so `ref` is passed through verbatim.
    cmd = ['makeblastdb', '-dbtype', 'nucl', '-parse_seqids', '-in', ref]
    run(cmd)


def bh(pvalues: np.ndarray) -> np.ndarray:
    """
    Computes the Benjamini-Hochberg FDR correction.

    Parameters
    ----------
    pvalues : np.ndarray
        one-dimensional array of p-values to correct

    Returns
    -------
        np.ndarray of corrected p-values, in the same order as `pvalues`

    Raises
    ------
    ValueError
        if `pvalues` is not one-dimensional.
    """
    pvalues = np.asarray(pvalues, dtype=float)
    if pvalues.ndim != 1:
        raise ValueError("pvalues must be one-dimensional")
    n = pvalues.shape[0]

    # Walk the p-values from largest to smallest; the largest has rank n.
    order = np.argsort(pvalues)[::-1]
    ranks = np.arange(n, 0, -1)
    scaled = (n / ranks) * pvalues[order]

    # A running minimum from the largest p-value downwards keeps the
    # corrected values monotone: a smaller raw p-value never ends up with a
    # larger corrected one.
    adjusted = np.minimum.accumulate(scaled)

    # Scatter back to the original positions.
    new_pvalues = np.empty(n)
    new_pvalues[order] = adjusted
    return new_pvalues


def is_categorical(array_like) -> bool:
    """
    Returns True when the dtype of `array_like` is named 'category'.
    """
    return array_like.dtype.name == 'category'


def type_check(dataframe, key) -> bool:
    """
    Returns True when column `key` of `dataframe` has a dtype that compares
    equal to str, object or bool, or is categorical.
    """
    column = dataframe[key]
    dtype = column.dtype
    return (
        dtype == str
        or dtype == object
        or is_categorical(column)
        or dtype == bool
    )


def isGZIP(filename: str) -> bool:
    """
    Returns True when the text after the last '.' in `filename` is 'gz'.
    """
    return filename.split('.')[-1] == 'gz'


def isBZIP(filename: str) -> bool:
    """
    Returns True when the text after the last '.' in `filename` is 'pbz2'.
    """
    return filename.split('.')[-1] == 'pbz2'