import sys
import numpy
import scipy.stats
import scipy.optimize
import warnings
class NormMethod:
    name = "undefined"

    @staticmethod
    def normalize():
        raise NotImplementedError


class NZMeanNorm(NormMethod):
    name = "nzmean"

    @staticmethod
    def normalize(data, wig_list=[], annotation_path=""):
"""Returns the normalization factors for the data, using the NZMean method.
Arguments:
data (numpy array): (K,N) numpy array defining read-counts at N sites
for K datasets.
Returns:
numpy array: Array with the normalization factors for the nzmean method.
:Example:
>>> import pytransit._tools.norm_tools as norm_tools
>>> from pytransit.specific_tools import tnseq_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> factors = norm_tools.nzmean_factors(data)
>>> print(factors)
array([[ 1.14836149],
[ 0.88558737]])
.. seealso:: :class:`normalize_data`
"""
(K, N) = data.shape
total_hits = numpy.sum(data, 1)
TAs_hit = numpy.sum(data > 0, 1)
mean_hits = total_hits / TAs_hit
grand_total = numpy.sum(mean_hits)
grand_mean = grand_total / float(K)
factors = numpy.zeros((K, 1))
factors[:, 0] = grand_mean / mean_hits
data = factors * data
return (data, factors)
class TotReadsNorm(NormMethod):
    name = "totreads"

    @staticmethod
    def normalize(data, wig_list=[], annotation_path=""):
"""Returns the normalization factors for the data, using the total reads
method.
Arguments:
data (numpy array): (K,N) numpy array defining read-counts at N sites
for K datasets.
Returns:
numpy array: Array with the normalization factors for the totreads method.
:Example:
>>> from pytransit.specific_tools import norm_tools
>>> from pytransit.specific_tools import tnseq_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> factors = norm_tools.totreads_factors(data)
>>> print(factors)
array([[ 1.2988762],
[ 0.8129396]])
.. seealso:: :class:`normalize_data`
"""
(K, N) = data.shape
total_hits = numpy.sum(data, 1)
TAs = float(N)
mean_hits = total_hits / TAs
grand_total = numpy.sum(mean_hits)
grand_mean = grand_total / float(K)
factors = numpy.zeros((K, 1))
factors[:, 0] = grand_mean / mean_hits
data = factors * data
return (data, factors)
class TTRNorm(NormMethod):
    name = "TTR"

    def empirical_theta(X):
"""Calculates the observed density of the data.
This is used as an estimate insertion density by some normalization methods.
May be improved by more sophisticated ways later on.
Arguments:
data (numpy array): (N) numpy array defining read-counts at N sites.
Returns:
float: Density of the given dataset.
:Example:
>>> from pytransit.specific_tools import tnseq_tools
>>> from pytransit.specific_tools import norm_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> theta = norm_tools.empirical_theta(data)
>>> print(theta)
0.467133570136
.. seealso:: :class:`TTR_factors`
"""
return numpy.mean(X > 0)
    def trimmed_empirical_mu(X, t=0.05):
        """Estimates the trimmed mean of the non-zero read-counts.

        This is used as an estimate of mean count by some normalization
        methods, and may be replaced by more sophisticated estimators later on.

        Arguments:
            X (numpy array): (N) numpy array defining read-counts at N sites.
            t (float): Fraction to trim from each end of the sorted counts.

        Returns:
            float: (Trimmed) mean of the given dataset.

        :Example:
            >>> from pytransit.specific_tools import tnseq_tools
            >>> from pytransit.specific_tools import norm_tools
            >>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
            >>> print(data)
            array([[ 0., 0., 0., ..., 0., 0., 0.],
                   [ 0., 0., 0., ..., 0., 0., 0.]])
            >>> mu = norm_tools.trimmed_empirical_mu(data)
            >>> print(mu)
            120.73077107

        .. seealso:: :meth:`TTRNorm.normalize`
        """
return scipy.stats.trim_mean(X[X > 0], t)
    @staticmethod
def normalize(
data,
wig_list=[],
annotation_path="",
theta_est=empirical_theta,
mu_est=trimmed_empirical_mu,
target=100.0,
):
"""Returns the normalization factors for the data, using the TTR method.
Arguments:
data (numpy array): (K,N) numpy array defining read-counts at N sites
for K datasets.
theta_est (function): Function used to estimate density. Should take a list
of counts as input.
mu_est (function): Function used to estimate mean count. Should take a list
of counts as input.
Returns:
numpy array: Array with the normalization factors for the TTR method.
:Example:
>>> from pytransit.specific_tools import norm_tools
>>> from pytransit.specific_tools import tnseq_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> factors = norm_tools.TTR_factors(data)
>>> print(factors)
array([[ 1. ],
[ 0.62862886]])
.. seealso:: :class:`normalize_data`
"""
K = len(data)
N = len(data[0])
factors = numpy.zeros((K, 1))
for j in range(K):
factors[j] = float(target) / (theta_est(data[j]) * mu_est(data[j]))
data = factors * data
return (data, factors)
class EmpHistNorm(NormMethod):
    name = "emphist"

    @staticmethod
    def zero_inflated_nb_objective_function(params, args):
"""Objective function for the zero-inflated NB method."""
pi, mu, r = params
Fdata = args
temp0 = numpy.nan_to_num(
numpy.log(pi + scipy.stats.nbinom.pmf(Fdata[Fdata == 0], mu, r))
)
tempnz = numpy.nan_to_num(
numpy.log(1.0 - pi) + scipy.stats.nbinom.logpmf(Fdata[Fdata > 0], mu, r)
)
negLL = -(numpy.sum(temp0) + numpy.sum(tempnz))
return negLL
    @staticmethod
    def normalize(data, wig_list=[], annotation_path=""):
"""Returns the normalized data, using the empirical hist method.
Arguments:
wig_list (list): List of paths to wig formatted datasets.
annotation_path (str): Path to annotation in .prot_table or GFF3 format.
Returns:
numpy array: Array with the normalization factors for the emphist method.
:Example:
>>> from pytransit.specific_tools import norm_tools
>>> from pytransit.specific_tools import tnseq_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> factors = norm_tools.emphist_factors(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"], "transit/data/genomes/H37Rv.prot_table")
>>> print(factors)
array([[ 1. ],
[ 0.63464722]])
.. seealso:: :class:`normalize_data`
"""
from pytransit.specific_tools import tnseq_tools
G = tnseq_tools.Genes(wig_list, annotation_path)
K = len(wig_list)
temp = []
for j in range(K):
reads_per_gene = []
for gene in G:
tempdata = numpy.array(gene.reads)
if len(tempdata[0]) > 0:
reads_per_gene.append(numpy.sum(tempdata[j, :]))
temp.append(reads_per_gene)
temp = numpy.array(temp)
factors = numpy.ones((K, 1))
for j in range(1, K):
ii_good = numpy.logical_and(temp[0, :] > 0, temp[j, :] > 0)
logFC = numpy.log(temp[j, ii_good] / temp[0, ii_good])
mean = numpy.mean(logFC)
std = numpy.sqrt(numpy.var(logFC))
X = numpy.linspace(mean - (5 * std), mean + (std * 5), 50000)
R = scipy.stats.gaussian_kde(logFC)
Y = R(X)
peakLogFC = X[Y.argmax()]
if peakLogFC < 0:
factors[j, 0] = numpy.exp(abs(peakLogFC))
else:
factors[j, 0] = 1.0 / numpy.exp(abs(peakLogFC))
data = factors * data
return (data, factors)
class AdaptiveBGCNorm(NormMethod):
    name = "aBGC"

    def ecdf(S, x):
"""Calculates an empirical CDF of the given data."""
return numpy.sum(S <= x) / float(len(S))
    def clean_from_geometric(x, rho):
"""Returns a 'clean' output from the geometric distribution."""
if x == float("inf"):
return scipy.stats.geom.ppf(0.9999999999999999, rho)
else:
return x
    @staticmethod
def normalize(
data, wig_list=[], annotation_path="", doTotReads=True, bgsamples=200000
):
"""Returns the normalized data using the aBGC method.
Arguments:
data (numpy array): (K,N) numpy array defining read-counts at N sites
for K datasets.
doTotReads (bool): Boolean specifying whether to do TTR normalization as well.
bgsamples (int): Integeer specifying how many samples to take.
Returns:
numpy array: Array with the normalized data.
:Example:
>>> from pytransit.specific_tools import norm_tools
>>> from pytransit.specific_tools import tnseq_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> normdata = norm_tools.aBGC_norm(data)
>>> print(normdata)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
.. seealso:: :class:`normalize_data`
"""
K, N = data.shape
norm_data = numpy.zeros(data.shape)
S = bgsamples
F = [i / 100.0 for i in range(0, 31) if i % 2 == 0]
BGC = []
param_list = []
bgc_factors = []
for j in range(K):
nzdata = data[j][data[j] > 0]
nzdata.sort()
Nall = len(data[j])
Nnz = len(nzdata)
GOF_list = []
for frac in F:
tQ = numpy.arange(0, Nnz) / float(Nnz)
rho = 1.0 / (scipy.stats.trim_mean(nzdata, frac))
rho_to_fit = rho
try:
A = (numpy.sum(numpy.power(numpy.log(1.0 - tQ), 2))) / (
numpy.sum(nzdata * numpy.log(1.0 - tQ))
)
Kp = (2.0 * numpy.exp(A) - 1) / (numpy.exp(A) + rho - 1)
temp = scipy.stats.geom.rvs(
scipy.stats.beta.rvs(Kp * rho, Kp * (1 - rho), size=S), size=S
)
bgc_factors.append((rho, Kp))
                except Exception as e:
                    print("aBGC Error:", str(e))
                    print("rho=%s\tKp=%s\tA=%s" % (rho, Kp, A))
temp = scipy.stats.geom.rvs(0.01, size=S)
corrected_nzdata = [
clean_from_geometric(
scipy.stats.geom.ppf(ecdf(temp, x), rho_to_fit), rho_to_fit
)
for x in nzdata
]
corrected_nzmean = numpy.mean(corrected_nzdata)
Fp = scipy.stats.geom.ppf(
numpy.arange(1, Nnz + 1) / float(Nnz), 1.0 / corrected_nzmean
)
ii_inf = Fp == float("inf")
Fp[ii_inf] = max(Fp[~ii_inf]) + 100
ch2_indiv = numpy.power(corrected_nzdata - Fp, 2) / Fp
GOF = max(ch2_indiv)
GOF_list.append((GOF, frac, rho_to_fit, Kp))
gof, frac, best_rho, best_Kp = sorted(GOF_list)[0]
BGsample = scipy.stats.geom.rvs(
scipy.stats.beta.rvs(
best_Kp * best_rho, best_Kp * (1 - best_rho), size=S
),
size=S,
)
# BGC.append(dict([(x, removeinf(scipy.stats.geom.ppf(ecdf(temp, x), best_rho), best_rho)) for x in data[j]]))
for i in range(N):
norm_data[j, i] = clean_from_geometric(
scipy.stats.geom.ppf(ecdf(BGsample, data[j, i]), best_rho), best_rho
)
if doTotReads:
(norm_data, factors) = TTRNorm.normalize(norm_data)
return (norm_data, bgc_factors)
class ZeroInflatedNBNorm(NormMethod):
    name = "zinfnb"

    @staticmethod
    def normalize(data, wig_list=[], annotation_path=""):
"""Returns the normalization factors for the data using the zero-inflated
negative binomial method.
Arguments:
data (numpy array): (K,N) numpy array defining read-counts at N sites
for K datasets.
Returns:
numpy array: Array with the normalization factors for the zinfnb method.
:Example:
>>> from pytransit.specific_tools import norm_tools
>>> from pytransit.specific_tools import tnseq_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> factors = norm_tools.zinfnb_factors(data)
>>> print(factors)
[[ 0.0121883 ]
[ 0.00747111]]
.. seealso:: :class:`normalize_data`
"""
        K = len(data)
        N = len(data[0])
        factors = numpy.zeros((K, 1))
        for j in range(K):
initParams = [0.3, 10, 0.5]
M = "L-BFGS-B"
Fdata = numpy.array(data[j])
results = scipy.optimize.minimize(
zero_inflated_nb_objective_function,
initParams,
args=(Fdata,),
method=M,
bounds=[(0.0001, 0.9999), (0.0001, None), (0.0001, 0.9999)],
)
pi, n, p = results.x
mu = n * (1 - p) / p
factors[j, 0] = 1.0 / mu
data = factors * data
return (data, factors)
class QuantileNorm(NormMethod):
    name = "quantile"

    @staticmethod
    def normalize(data, wig_list=[], annotation_path=""):
"""Performs Quantile Normalization as described by Bolstad et al. 2003
Arguments:
data (numpy array): (K,N) numpy array defining read-counts at N sites
for K datasets.
Returns:
numpy array: Array with the data normalized by the quantile normalization method.
:Example:
>>> from pytransit.specific_tools import norm_tools
>>> from pytransit.specific_tools import tnseq_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> normdata = norm_tools.quantile_norm(data)
>>> print(normdata)
.. seealso:: :class:`normalize_data`
"""
        K = len(data)
        N = len(data[0])
        # Sort each dataset's counts
        s_data = numpy.array([sorted(row) for row in data])
        # Get ranks of original data
        ranks = numpy.zeros(data.shape, dtype=int)
        for j in range(K):
            ranks[j, :] = scipy.stats.rankdata(data[j], method="dense")
# Get empirical distribution
ranked_means = numpy.mean(s_data, 0)
# Create dictionary of rank to new empirical values
        rank2count = dict(
            zip(scipy.stats.rankdata(ranked_means, method="dense"), ranked_means)
        )
# Assign values
norm_data = numpy.zeros(data.shape)
        for i in range(N):
            norm_data[:, i] = [rank2count[ranks[j, i]] for j in range(K)]
return (norm_data, numpy.ones(1))
class BetaGeomNorm(NormMethod):
    name = "betageom"

    def ecdf(S, x):
"""Calculates an empirical CDF of the given data."""
return numpy.sum(S <= x) / float(len(S))
    def clean_from_geometric(x, rho):
"""Returns a 'clean' output from the geometric distribution."""
if x == float("inf"):
return scipy.stats.geom.ppf(0.9999999999999999, rho)
else:
return x
    @staticmethod
    def normalize(data, wig_list=[], annotation_path="", doTTR=True, bgsamples=200000):
"""Returns normalized data according to the BGC method.
Arguments:
data (numpy array): (K,N) numpy array defining read-counts at N sites
for K datasets.
doTTR (bool): Boolean specifying whether to do TTR norm as well.
bgsamples (int): Integer specifying how many samples to take.
Returns:
numpy array: Array with the data normalized using the betageom method.
:Example:
>>> from pytransit.specific_tools import norm_tools
>>> from pytransit.specific_tools import tnseq_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> normdata = norm_tools.betageom_norm(data)
>>> print(normdata)
[[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]]
.. seealso:: :class:`normalize_data`
"""
(K, N) = data.shape
total_hits = numpy.sum(data, 1)
TAs_hit = numpy.sum(data > 0, 1)
mean_hits = total_hits / TAs_hit
grand_total = numpy.sum(mean_hits)
grand_mean = grand_total / float(K)
norm_data = numpy.zeros(data.shape)
bgc_factors = []
for j in range(K):
tQ = numpy.arange(0, N) / float(N)
eX = numpy.array([rd for rd in data[j]])
eX.sort()
rho = max(1.0 / scipy.stats.trim_mean(eX + 1, 0.001), 0.0001)
A = (numpy.sum(numpy.power(numpy.log(1.0 - tQ), 2))) / (
numpy.sum(eX * numpy.log(1.0 - tQ))
)
Kp = max((2.0 * numpy.exp(A) - 1) / (numpy.exp(A) + rho - 1), 10)
bgc_factors.append((rho, Kp))
try:
BGsample = scipy.stats.geom.rvs(
scipy.stats.beta.rvs(Kp * rho, Kp * (1 - rho), size=bgsamples),
size=bgsamples,
)
except Exception as e:
print("BGC ERROR with rho=%f, Kp=%f, A=%s" % (rho, Kp, A))
print(str(e))
BGsample = scipy.stats.geom.rvs(rho, size=bgsamples)
for i in range(N):
norm_data[j, i] = clean_from_geometric(
scipy.stats.geom.ppf(ecdf(BGsample, data[j, i]), 1.0 / grand_mean),
1.0 / grand_mean,
)
if doTTR:
(norm_data, factors) = TTRNorm.normalize(norm_data)
return (norm_data, bgc_factors)
class NoNorm(NormMethod):
    name = "nonorm"

    @staticmethod
    def normalize(data, wig_list=[], annotation_path=""):
return (data, numpy.ones(1))
methods = {}
methods["nonorm"] = NoNorm
methods["TTR"] = TTRNorm
methods["nzmean"] = NZMeanNorm
methods["totreads"] = TotReadsNorm
methods["betageom"] = BetaGeomNorm
methods["zinfnb"] = ZeroInflatedNBNorm
methods["quantile"] = QuantileNorm
methods["aBGC"] = AdaptiveBGCNorm
methods["emphist"] = EmpHistNorm
#########################
def normalize_data(data, method="nonorm", wig_list=[], annotation_path=""):
"""Normalizes the numpy array by the given normalization method.
Arguments:
data (numpy array): (K,N) numpy array defining read-counts at N sites
for K datasets.
method (str): Name of the desired normalization method.
wig_list (list): List of paths for the desired wig-formatted datasets.
annotation_path (str): Path to the prot_table annotation file.
Returns:
numpy array: Array with the normalized data.
list: List containing the normalization factors. Empty if not used.
:Example:
>>> from pytransit.specific_tools import norm_tools
>>> from pytransit.specific_tools import tnseq_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
(normdata, normfactors) = norm_tools.normalize_data(data, "TTR") # Some methods require annotation and path to wig files.
>>> print(normfactors)
array([[ 1. ],
[ 0.62862886]])
>> print(normdata)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
.. note:: Some normalization methods require the wig_list and annotation_path arguments.
"""
    if method in methods:
return methods[method].normalize(data, wig_list, annotation_path)
else:
warnstr = (
"Normalization method '%s' is unknown. Read-counts were not normalized."
% (method)
)
warnings.warn(warnstr)
return methods["nonorm"].normalize(data, wig_list, annotation_path)
def empirical_theta(X):
    """Calculates the observed insertion density of the data.

    This is used as an estimate of insertion density by some normalization
    methods, and may be replaced by more sophisticated estimators later on.
Arguments:
        X (numpy array): (N) numpy array defining read-counts at N sites.
Returns:
float: Density of the given dataset.
:Example:
>>> from pytransit.specific_tools import tnseq_tools
>>> from pytransit.specific_tools import norm_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> theta = norm_tools.empirical_theta(data)
>>> print(theta)
0.467133570136
    .. seealso:: :meth:`TTRNorm.normalize`
"""
return numpy.mean(X > 0)
def trimmed_empirical_mu(X, t=0.05):
    """Estimates the trimmed mean of the non-zero read-counts.

    This is used as an estimate of mean count by some normalization methods,
    and may be replaced by more sophisticated estimators later on.
Arguments:
        X (numpy array): (N) numpy array defining read-counts at N sites.
        t (float): Fraction to trim from each end of the sorted counts.
Returns:
float: (Trimmed) Mean of the given dataset.
:Example:
>>> from pytransit.specific_tools import tnseq_tools
>>> from pytransit.specific_tools import norm_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> mu = norm_tools.trimmed_empirical_mu(data)
>>> print(mu)
120.73077107
    .. seealso:: :meth:`TTRNorm.normalize`
"""
return scipy.stats.trim_mean(X[X > 0], t)
def zero_inflated_nb_objective_function(params, args):
    """Objective function (negative log-likelihood) for the zero-inflated NB method."""
    # params are the zero-inflation probability pi and the negative binomial
    # parameters n (number of successes) and p (success probability).
    pi, n, p = params
    Fdata = args
    temp0 = numpy.nan_to_num(
        numpy.log(pi + scipy.stats.nbinom.pmf(Fdata[Fdata == 0], n, p))
    )
    tempnz = numpy.nan_to_num(
        numpy.log(1.0 - pi) + scipy.stats.nbinom.logpmf(Fdata[Fdata > 0], n, p)
    )
    negLL = -(numpy.sum(temp0) + numpy.sum(tempnz))
    return negLL
def zinfnb_factors(data):
"""Returns the normalization factors for the data using the zero-inflated
negative binomial method.
Arguments:
data (numpy array): (K,N) numpy array defining read-counts at N sites
for K datasets.
Returns:
numpy array: Array with the normalization factors for the zinfnb method.
:Example:
>>> from pytransit.specific_tools import norm_tools
>>> from pytransit.specific_tools import tnseq_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> factors = norm_tools.zinfnb_factors(data)
>>> print(factors)
[[ 0.0121883 ]
[ 0.00747111]]
    .. seealso:: :func:`normalize_data`
"""
    K = len(data)
    N = len(data[0])
    factors = numpy.zeros((K, 1))
    for j in range(K):
initParams = [0.3, 10, 0.5]
M = "L-BFGS-B"
Fdata = numpy.array(data[j])
results = scipy.optimize.minimize(
zero_inflated_nb_objective_function,
initParams,
args=(Fdata,),
method=M,
bounds=[(0.0001, 0.9999), (0.0001, None), (0.0001, 0.9999)],
)
pi, n, p = results.x
mu = n * (1 - p) / p
factors[j, 0] = 1.0 / mu
return numpy.array(factors)
def ecdf(S, x):
"""Calculates an empirical CDF of the given data."""
return numpy.sum(S <= x) / float(len(S))
def clean_from_geometric(x, rho):
"""Returns a 'clean' output from the geometric distribution."""
if x == float("inf"):
return scipy.stats.geom.ppf(0.9999999999999999, rho)
else:
return x
def norm_to_target(data, target):
"""Returns factors to normalize the data to the given target value.
Arguments:
data (numpy array): (K,N) numpy array defining read-counts at N sites
for K datasets.
        target (float): Target value for the mean of the data.
Returns:
numpy array: Array with the factors necessary to normalize mean to target.
:Example:
>>> from pytransit.specific_tools import norm_tools
>>> from pytransit.specific_tools import tnseq_tools
>>> (data, position) = tnseq_tools.CombinedWig.gather_wig_data(["transit/data/cholesterol_glycerol.transit/glycerol_rep1.wig", "transit/data/cholesterol_glycerol.transit/glycerol_rep2.wig"])
>>> print(data)
array([[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 0., 0.]])
>>> factors = norm_tools.norm_to_target(data, 100)
>>> print(factors)
[[ 1.8548104 ]
[ 1.16088726]]
    .. seealso:: :func:`normalize_data`
"""
(K, N) = data.shape
factors = numpy.zeros((K, 1))
factors[:, 0] = float(target) / numpy.mean(data, 1)
return factors
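
# Note: the (K,1) factors returned by norm_to_target broadcast against the
# (K,N) data array, so the normalized counts can be obtained with, e.g.:
#
#     normdata = norm_to_target(data, 100) * data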