#!/usr/bin/env python
# @Author: kt16
# @Date: 2020-05-12 14:01:32
# @Last Modified by: Kelvin
# @Last Modified time: 2021-02-20 09:53:35
import os
from collections import defaultdict
from collections.abc import Iterable
from subprocess import run
from typing import Dict, Iterator, Sequence, Tuple

import numpy as np
import pandas as pd
# Resolve ``Literal``: prefer the standard library, then the
# ``typing_extensions`` backport, and only as a last resort define a
# minimal stand-in that supports ``Literal[...]`` subscription.
try:
    from typing import Literal
except ImportError:
    try:
        from typing_extensions import Literal
    except ImportError:

        class LiteralMeta(type):
            # Metaclass hook so that ``Literal[a, b]`` works on the class.
            def __getitem__(cls, item):
                # A single subscript arrives bare; normalise it to a tuple.
                args = item if isinstance(item, tuple) else (item,)
                return type("Literal_", (Literal,), {"__args__": args})

        class Literal(metaclass=LiteralMeta):
            # Placeholder base; subscripted forms are subclasses of it.
            pass
class Tree(defaultdict):
    """
    Recursive defaultdict.

    Looking up a missing key creates a new, empty ``Tree`` for it, so nested
    keys can be assigned without creating the intermediate levels first.
    Every node also carries an optional payload in ``value``.
    """

    def __init__(self, value=None):
        # The class itself is the default factory, which makes it recursive.
        super().__init__(Tree)
        self.value = value
def dict_from_table(meta: pd.DataFrame, columns: Tuple[str, str]) -> Dict:
    """
    Generates a dictionary from a dataframe

    Parameters
    ----------
    meta
        pandas dataframe, or path to a tab-separated file that is read with
        every column as `object` dtype
    columns
        two column names in the dataframe; the first supplies the keys and
        the second supplies the values

    Returns
    -------
    dictionary mapping the first column to the second, after being passed
    through `clean_nan_dict`.

    Raises
    ------
    ValueError
        if `columns` is not a pair of names, or if `meta` is neither a
        dataframe nor a path to an existing file.
    """
    # Validate up front: previously these cases fell through both branches
    # and surfaced as an UnboundLocalError on `sample_dict`.
    if columns is None or len(columns) != 2:
        raise ValueError(
            'columns must contain exactly two column names, got {!r}'.format(columns))

    if isinstance(meta, pd.DataFrame):
        meta_ = meta
    elif os.path.isfile(str(meta)):
        meta_ = pd.read_csv(meta, sep='\t', dtype='object')
    else:
        raise ValueError(
            'meta must be a pandas DataFrame or a path to an existing file, '
            'got {!r}'.format(meta))

    key_column, value_column = columns
    sample_dict = dict(zip(meta_[key_column], meta_[value_column]))
    return clean_nan_dict(sample_dict)
def clean_nan_dict(d: Dict) -> Dict:
    """
    Drops every entry whose value is a floating-point NaN.

    Parameters
    ----------
    d : Dict
        dictionary

    Returns
    -------
    dictionary with no NAs.
    """
    def _is_nan(value) -> bool:
        # Test the value rather than its identity: `value is np.nan` misses
        # NaNs that are distinct objects, e.g. float('nan') or the floats
        # produced when iterating a float64 column.
        return isinstance(value, (float, np.floating)) and bool(np.isnan(value))

    return {k: v for k, v in d.items() if not _is_nan(v)}
def flatten(l: Sequence) -> Iterator:
    """
    Lazily flattens an arbitrarily nested iterable.

    Parameters
    ----------
    l : Sequence
        a list-in-list list

    Yields
    ------
    the leaf elements of `l` in order. Strings and bytes are treated as
    leaves and are not split into characters. Wrap the call in `list(...)`
    to obtain a flattened list.
    """
    for el in l:
        # `Iterable` must be the ABC from `collections.abc`; the alias in
        # `collections` was removed in Python 3.10.
        if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el
def makeblastdb(ref: str):
    """
    Runs makeblastdb on constant region fasta file

    Parameters
    ----------
    ref : str
        constant region fasta file

    Returns
    -------
    None. The `makeblastdb` executable is run as a subprocess and its exit
    status is not inspected.
    """
    # Argument list (no shell): nucleotide database, with `-parse_seqids`.
    cmd = ['makeblastdb',
           '-dbtype', 'nucl',
           '-parse_seqids',
           '-in', ref]
    run(cmd)
def bh(pvalues: np.ndarray) -> np.ndarray:
    """
    Computes the Benjamini-Hochberg FDR correction.

    Parameters
    ----------
    pvalues : np.ndarray
        p-values to correct; any array-like is accepted and is flattened to
        one dimension

    Returns
    -------
    np.ndarray of corrected p-values, in the same order as the input
    """
    pvals = np.asarray(pvalues, dtype=float).ravel()
    n = pvals.shape[0]
    if n == 0:
        return np.empty(0)

    # Visit the p-values from largest to smallest; the largest has rank n.
    descending = np.argsort(pvals)[::-1]
    ranks = np.arange(n, 0, -1)
    scaled = (n / ranks) * pvals[descending]

    # A running minimum down the ranks keeps the corrected values monotone:
    # a smaller p-value never gets a larger corrected value than a bigger one.
    adjusted = np.minimum.accumulate(scaled)

    # Scatter the results back to the original positions.
    new_pvalues = np.empty(n)
    new_pvalues[descending] = adjusted
    return new_pvalues
def is_categorical(array_like) -> bool:
    """
    Reports whether the dtype of `array_like` is named ``'category'``.

    Parameters
    ----------
    array_like
        any object exposing a ``dtype`` attribute that has a ``name``

    Returns
    -------
    True if the dtype name is ``'category'``, otherwise False.
    """
    dtype_name = array_like.dtype.name
    return dtype_name == 'category'
def type_check(dataframe, key) -> bool:
    """
    Reports whether column `key` of `dataframe` has a str, object,
    categorical or bool dtype.

    Parameters
    ----------
    dataframe
        object indexable by `key` whose items expose a ``dtype``
    key
        column name to inspect

    Returns
    -------
    True if the column dtype is str, object, categorical or bool, otherwise
    False.
    """
    column = dataframe[key]
    kind = column.dtype
    # The checks run in the same order as before: str, object, category, bool.
    if kind == str or kind == object:
        return True
    if is_categorical(column):
        return True
    return kind == bool
def isGZIP(filename: str) -> bool:
    """
    Reports whether the text after the last ``.`` in `filename` is ``gz``.

    Parameters
    ----------
    filename : str
        file name or path

    Returns
    -------
    True if the final dot-separated component is exactly ``gz``.
    """
    # With no dot at all, the whole name counts as the final component.
    suffix = filename.rsplit('.', 1)[-1]
    return suffix == 'gz'
def isBZIP(filename: str) -> bool:
    """
    Reports whether the text after the last ``.`` in `filename` is ``pbz2``.

    Parameters
    ----------
    filename : str
        file name or path

    Returns
    -------
    True if the final dot-separated component is exactly ``pbz2``.
    """
    # With no dot at all, the whole name counts as the final component.
    suffix = filename.rsplit('.', 1)[-1]
    return suffix == 'pbz2'