import sys
import math
from functools import reduce  # fact() below relies on reduce, a builtin only in Python 2

import numpy
import scipy.stats

def sample_trunc_norm_post(data, S, mu0, s20, k0, nu0):
    """Draws S samples of (mu, s2) from the conjugate normal posterior of the
    mean and variance, with mu truncated at zero since counts cannot be
    negative."""
n = len(data)
s2 = numpy.var(data, ddof=1)
ybar = numpy.mean(data)
kn = k0 + n
nun = nu0 + n
mun = (k0 * mu0 + n * ybar) / float(kn)
s2n = (1.0 / nun) * (
nu0 * s20 + (n - 1) * s2 + (k0 * n / float(kn)) * numpy.power(ybar - mu0, 2)
)
s2_post = 1.0 / scipy.stats.gamma.rvs(nun / 2.0, scale=2.0 / (s2n * nun), size=S)
# Truncated Normal since counts can't be negative
min_mu = 0
max_mu = 1000000
trunc_a = (min_mu - mun) / numpy.sqrt(s2_post / float(kn))
trunc_b = (max_mu - mun) / numpy.sqrt(s2_post / float(kn))
mu_post = scipy.stats.truncnorm.rvs(
a=trunc_a, b=trunc_b, loc=mun, scale=numpy.sqrt(s2_post / float(kn)), size=S
)
return (mu_post, s2_post)
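
# Hedged usage sketch (illustrative, not part of the original module): draws
# posterior samples for a single gene; the counts and prior settings below
# are made-up values.
def _demo_sample_trunc_norm_post():
    counts = [12.0, 30.0, 25.0, 18.0, 40.0]
    mu_post, s2_post = sample_trunc_norm_post(
        counts, S=1000, mu0=25.0, s20=100.0, k0=1.0, nu0=2.0
    )
    return numpy.mean(mu_post), numpy.mean(s2_post)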

def fwer_bayes(X):
    """Family-wise adjustment of posterior null probabilities: element i of
    the result is the probability that at least one of the i smallest values
    in X is a true null (assuming independence), returned in the original
    order of X."""
ii = numpy.argsort(numpy.argsort(X))
P_NULL = numpy.sort(X)
W = 1 - P_NULL
N = len(P_NULL)
P_ALT = numpy.zeros(N)
for i in range(N):
P_ALT[i] = 1.0 - numpy.prod(W[: i + 1])
return P_ALT[ii]

def b_fdr(X):
    """Bayesian FDR: element i of the result is the mean of the i smallest
    posterior null probabilities, returned in the original order of X."""
N = len(X)
ii = numpy.argsort(numpy.argsort(X))
P_NULL = numpy.sort(X)
P_ALT = numpy.zeros(N)
for i in range(N):
P_ALT[i] = numpy.mean(P_NULL[: i + 1])
return P_ALT[ii]
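
# Hedged sketch (illustrative): both adjustments take posterior probabilities
# of the null hypothesis and return adjusted values in the original order;
# the probabilities below are made up.
def _demo_posterior_adjustments():
    post_null = numpy.array([0.01, 0.20, 0.03, 0.90])
    return fwer_bayes(post_null), b_fdr(post_null)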

def hdi_from_mcmc(posterior_samples, credible_mass=0.95):
    """Computes the highest density interval (HDI) from a sample of
    representative posterior values, estimated as the shortest interval
    containing the given credible mass (normally 0.95).

    Credit to 'user72564':
    https://stackoverflow.com/questions/22284502/highest-posterior-density-region-and-central-credible-region
    """
sorted_points = sorted(posterior_samples)
ciIdxInc = numpy.ceil(credible_mass * len(sorted_points)).astype("int")
nCIs = len(sorted_points) - ciIdxInc
ciWidth = [0] * nCIs
for i in range(0, nCIs):
ciWidth[i] = sorted_points[i + ciIdxInc] - sorted_points[i]
HDImin = sorted_points[ciWidth.index(min(ciWidth))]
HDImax = sorted_points[ciWidth.index(min(ciWidth)) + ciIdxInc]
return (HDImin, HDImax)
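
# Hedged sketch (illustrative): the 95% HDI of a standard-normal sample
# should come out close to (-1.96, 1.96).
def _demo_hdi_from_mcmc():
    samples = scipy.stats.norm.rvs(size=10000)
    return hdi_from_mcmc(samples, credible_mass=0.95)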

def fact(n):
    """Factorial of n (uses functools.reduce under Python 3)."""
    if n == 0:
        return 1
    return reduce(lambda x, y: x * y, range(1, n + 1))

def comb(n, k):
    """Binomial coefficient C(n, k), computed iteratively."""
if k < 0 or k > n:
return 0
if k > n - k: # take advantage of symmetry
k = n - k
c = 1
for i in range(k):
c = c * (n - (k - (i + 1)))
c = c // (i + 1)
return c

def norm(x, mu, sigma):
    """Normal (Gaussian) probability density evaluated at x."""
sigma = float(sigma)
return (
1
/ (sigma * (math.sqrt(2 * math.pi)))
* math.exp(-0.5 * math.pow((x - mu) / sigma, 2))
)

def binom(k, n, p):
    """Binomial probability mass; uses the normal approximation for n >= 100."""
if n >= 100:
return norm(k, n * p, math.sqrt(n * p * (1 - p)))
else:
return comb(n, k) * math.pow(p, k) * math.pow(1 - p, n - k)

def binom_cdf(k, n, p):
"""CDF of the binomial distribution"""
return sum([binom(i, n, p) for i in range(0, k + 1)])

def binom_test(k, n, p, type="two-sided"):
    """Binomial test given k successes out of n trials with success
    probability p; 'type' is one of "less", "greater", or "two-sided"
    (mirroring R's binom.test)."""
if type == "less":
return binom_cdf(k, n, p)
elif type == "greater":
return 1 - binom_cdf(k - 1, n, p)
else:
if p == 0:
return 1 # return(k == 0)
elif p == 1:
return 1 # return(k == n)
else:
relErr = 1 + 1e-7
d = binom(k, n, p)
m = n * p
if k == m:
return 1
elif k < m:
ri = range(int(math.ceil(m)), n + 1)
y = sum([1 for j in ri if binom(j, n, p) <= d * relErr])
return binom_cdf(k, n, p) + (1 - binom_cdf(int(n - y), n, p))
else:
ri = range(0, int(math.floor(m)))
y = sum([1 for j in ri if binom(j, n, p) <= d * relErr])
return binom_cdf(y - 1, n, p) + (1 - binom_cdf(k - 1, n, p))
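
# Hedged sketch (illustrative): a two-sided test of 14 successes in 20
# fair-coin tosses; the p-value should agree closely with
# scipy.stats.binomtest(14, 20, 0.5).pvalue.
def _demo_binom_test():
    return binom_test(14, 20, 0.5, type="two-sided")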

##############################
# Bernoulli Diff Distribution

def bernoulli_diff_distribution(d, peq, p01, p10):
    """PMF of the difference of two Bernoulli outcomes: d = -1 with
    probability p01, d = 0 with probability peq, and d = +1 with
    probability p10."""
N = numpy.size(d)
if N == 0:
return 0.0
    if N == 1:
        if isinstance(d, tuple):
            d = d[0]
        if d == -1:
            return p01
        if d == 0:
            return peq
        if d == 1:
            return p10
        return 0.0
else:
d = numpy.array(d)
result = numpy.zeros(N)
result[d == -1] = p01
result[d == 0] = peq
result[d == 1] = p10
return result
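
# Hedged sketch (illustrative): PMF values for all three possible per-site
# differences; peq + p01 + p10 must sum to 1 for a proper distribution.
def _demo_bernoulli_diff_distribution():
    return bernoulli_diff_distribution(numpy.array([-1, 0, 1]), peq=0.5, p01=0.2, p10=0.3)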

def q_bernoulli_diff_distribution(d, peq, p01, p10):
    """CDF of the Bernoulli difference distribution: P(D <= d)."""
return numpy.sum([bernoulli_diff_distribution(x, peq, p01, p10) for x in range(-1, d + 1)])

#############################
# Binomial Diff Distribution

def d_binomial_diff_distribution(d, n, P):
    """PMF of the difference of two binomial counts over n paired trials;
    P = (p01, peq, p10), ordered to match the triples from my_perm()."""
    S = numpy.array(my_perm(d, n))
    return numpy.sum(multinomial(S, P))

def q_binomial_diff_distribution(d, n, peq, p01, p10):
    """CDF of the binomial difference distribution: P(D <= d)."""
    # Pack the three outcome probabilities in the (p01, peq, p10) order
    # expected by d_binomial_diff_distribution(); the original call passed
    # them as separate arguments, which did not match that signature.
    P = numpy.array([p01, peq, p10])
    return numpy.sum([d_binomial_diff_distribution(x, n, P) for x in range(-n, d + 1)])

def my_perm(d, n):
    """Enumerates the (n01, neq, n10) count triples over n paired trials
    whose difference n10 - n01 equals d."""
    S = []
    if d == 0:
        for i in range(n + 1):
            r = n - i
            if is_even(r):
                S.append((int(r / 2), i, int(r / 2)))
    if d > 0:
        for i in range(d, n + 1):
            if i == d:
                S.append((0, n - d, d))
            else:
                r = n - (i + (i - d))
                if 0 <= r <= n:
                    S.append((i - d, r, i))
    if d < 0:
        for i in range(abs(d), n + 1):
            if i == abs(d):
                S.append((-d, n + d, 0))
            else:
                r = n - (i + (i + d))
                if 0 <= r <= n:
                    S.append((i, r, i + d))
    return S
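
# Hedged sketch (illustrative): all (n01, neq, n10) triples over n = 3 paired
# trials with difference d = 1; the expected output is [(0, 2, 1), (1, 0, 2)].
def _demo_my_perm():
    return my_perm(1, 3)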

def multinomial(K, P):
    """Multinomial probability for each row of the count matrix K; P is
    either a matching matrix of per-row probabilities or a single
    probability vector shared by all rows."""
N = numpy.sum(K, 1)
if K.shape == P.shape:
return tri_coeff(N, K) * numpy.prod(
[numpy.power(P[i], K[i]) for i in range(len(K))], 1
)
else:
return tri_coeff(N, K) * numpy.prod(
[numpy.power(P, K[i]) for i in range(len(K))], 1
)

def log_fac(n):
    """log(n!) computed as a sum of logs."""
return numpy.sum(numpy.log(numpy.arange(2, n + 1)))

def tri_coeff(N, S):
    """Trinomial coefficients N! / (s0! * s1! * s2!) for each row of S,
    computed from cached log-factorials."""
    # The original try/except NameError caching never persisted between calls
    # (LOG_FAC was a local variable); cache the table on the function object
    # instead so it is only rebuilt when a larger N is requested.
    max_n = int(numpy.max(N))
    cache = getattr(tri_coeff, "_log_fac", None)
    if cache is None or len(cache) <= max_n:
        tri_coeff._log_fac = numpy.array([log_fac(i) for i in range(max_n + 1)])
    LOG_FAC = tri_coeff._log_fac
    return numpy.exp(
        LOG_FAC[N] - (LOG_FAC[S[:, 0]] + LOG_FAC[S[:, 1]] + LOG_FAC[S[:, 2]])
    )

def is_even(x):
return x % 2 == 0

def regress(X, Y):
    """Simple linear regression of Y on X; returns the slope B, intercept A0,
    and residual standard deviation."""
N = len(X)
xbar = numpy.average(X)
ybar = numpy.average(Y)
xybar = numpy.average([X[i] * Y[i] for i in range(N)])
x2bar = numpy.average([X[i] * X[i] for i in range(N)])
B = (xybar - xbar * ybar) / (x2bar - xbar * xbar)
A0 = ybar - B * xbar
yfit = [A0 + B * X[i] for i in range(N)]
yres = [Y[i] - (A0 + B * X[i]) for i in range(N)]
var = sum([math.pow(yres[i], 2) for i in range(N)]) / (N - 2)
std = math.sqrt(var)
return (B, A0, std)
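
# Hedged sketch (illustrative): on noiseless data the fit should recover a
# slope of ~2, an intercept of ~1, and a residual std of ~0.
def _demo_regress():
    X = list(range(10))
    Y = [2 * x + 1 for x in X]
    return regress(X, Y)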

def boxcox_transform(x, lambdax):
    """Box-Cox transform of a single value (log when lambda is ~0).
    Reconstructed from the orphaned return-statement comment that preceded
    log_likelihood(), which calls this function."""
    return math.log(x) if abs(lambdax) < 1.0e-5 else (x ** lambdax - 1.0) / lambdax

def log_likelihood(X, lambdax):
    """Computes the Box-Cox log-likelihood of X for the given lambda,
    evaluated on the transformed values."""
n = len(X)
Xtrans = [boxcox_transform(x, lambdax) for x in X]
meanX = sum(Xtrans) / float(n)
S2 = (lambdax - 1.0) * sum([math.log(x) for x in X])
S = sum([(x - meanX) ** 2 for x in Xtrans])
S1 = (-n / 2.0) * math.log(S / n)
return S2 + S1

def boxcox_table(X, minlambda, maxlambda, dellambda):
    """Returns a table of (log-likelihood, lambda) pairs for the data,
    scanning lambda from minlambda to maxlambda in steps of dellambda."""
out = []
vallambda = minlambda
while vallambda <= maxlambda + 1.0e-5:
llik = log_likelihood(X, vallambda)
out.append((llik, vallambda))
vallambda += dellambda
return out
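
# Hedged sketch (illustrative): scan lambda over [-1, 1] and pick the value
# maximizing the Box-Cox log-likelihood; the data below are made up and must
# be strictly positive.
def _demo_boxcox_table():
    X = [1.0, 4.0, 9.0, 16.0, 25.0]
    table = boxcox_table(X, minlambda=-1.0, maxlambda=1.0, dellambda=0.1)
    best_llik, best_lambda = max(table)  # entries are (llik, lambda) tuples
    return best_lambda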

def phi_coefficient(X, Y):
    """Calculates the phi coefficient (association) for two boolean arrays of
    equal length."""
N = len(X)
assert len(X) == len(Y), "Length of arrays must be equal"
x1y1 = sum([int(X[j]) == int(Y[j]) == 1 for j in range(N)])
x1y0 = sum([int(X[j]) == 1 and int(Y[j]) == 0 for j in range(N)])
x0y1 = sum([int(X[j]) == 0 and int(Y[j]) == 1 for j in range(N)])
x0y0 = sum([int(X[j]) == int(Y[j]) == 0 for j in range(N)])
x1 = x1y1 + x1y0
x0 = x0y1 + x0y0
y1 = x1y1 + x0y1
y0 = x1y0 + x0y0
phi_coeff = (x1y1 * x0y0 - x1y0 * x0y1) / math.sqrt(x1 * x0 * y1 * y0)
return phi_coeff

def bh_fdr_correction(X):
    """Adjusts p-values using the Benjamini-Hochberg procedure."""
n = len(X)
qvalues = numpy.zeros(n)
pvalues = numpy.array(X)
pvalues.sort()
pvalues = pvalues[::-1]
for i in range(n):
rank = n - i
qvalues[i] = n / float(rank) * pvalues[i]
for i in range(0, n - 1):
if qvalues[i] < qvalues[i + 1]:
qvalues[i + 1] = qvalues[i]
p2qval = dict([(p, q) for (p, q) in zip(pvalues, qvalues)])
return numpy.array([p2qval[p] for p in X])
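
# Hedged sketch (illustrative): BH-adjusted q-values for a few made-up
# p-values, returned in the original order.
def _demo_bh_fdr_correction():
    return bh_fdr_correction([0.01, 0.04, 0.03, 0.20])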

def bayesian_essentiality_thresholds(Z_raw, ALPHA=0.05):
    """Returns (essential, non-essential) thresholds on the posterior
    probabilities using a BH-like step-up procedure."""
Z = numpy.sort(Z_raw)[::-1]
W = 1 - Z
N = len(Z)
ess_threshold = 1.00
INDEX = list(range(3, N + 1))
count = 0
for i in INDEX:
count += 1
wi = 1 - Z[i - 1]
ai_n = (ALPHA * i) / N
mean_wi = numpy.average(W[0 : i - 2])
delta_w = wi - mean_wi
if delta_w > ai_n:
ess_threshold = Z[i - 1]
# print("i", i)
break
noness_threshold = 0.00
count = 0
INDEX = list(range(0, N + 1))
INDEX.sort(reverse=True)
for i in INDEX:
wi = Z[N - i + 1]
ai_n = (ALPHA * i) / N
mean_wi = numpy.average(Z[N - i + 1 :])
delta_w = Z[N - i + 1] - mean_wi
count += 1
if ai_n > delta_w:
break
noness_threshold = Z[N - i]
return (ess_threshold, noness_threshold)

def tri_cube(X):
    """Tricube kernel: (1 - |x|^3)^3 for x in [-1, 1], zero elsewhere."""
result = numpy.zeros(len(X))
ii = numpy.logical_and(X >= -1, X <= 1)
result[ii] = numpy.power(1 - numpy.power(numpy.abs(X[ii]), 3), 3)
return result

def loess(X, Y, h=10000):
    """LOESS smoothing of Y over X with tricube weights and bandwidth h."""
    smoothed = numpy.zeros(len(Y))
    for i, x in enumerate(X):
        W = tri_cube((X - x) / float(h))
        sW = numpy.sum(W)
        wsX = numpy.sum(W * X)
        wsY = numpy.sum(W * Y)
        wsXY = numpy.sum(W * X * Y)
        wsXX = numpy.sum(W * X * X)  # weighted, to match the other sums in the normal equations
        B = (sW * wsXY - wsX * wsY) / (sW * wsXX - numpy.power(wsX, 2))
        A = (wsY - B * wsX) / sW
        smoothed[i] = B * x + A
    return smoothed
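
# Hedged sketch (illustrative): smooth a noisy linear trend; with a generous
# bandwidth the smoothed values should track the underlying line 0.5 * x.
def _demo_loess():
    X = numpy.arange(0.0, 1000.0, 10.0)
    Y = 0.5 * X + scipy.stats.norm.rvs(0, 5, size=len(X))
    return loess(X, Y, h=1000)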

def loess_correction(X, Y, h=10000, window=100):
    """Applies a LOESS-based positional correction to counts Y at positions
    X: counts in each window are scaled by the ratio of the smoothed windowed
    trend to its mean."""
    Y = numpy.array(Y)
    size = int(len(X) / window) + 1
    x_w = numpy.zeros(size)
    y_w = numpy.zeros(size)
    for i in range(size):
        x_w[i] = window * i
        y_w[i] = sum(Y[window * i : window * (i + 1)])
    ysmooth = loess(x_w, y_w, h)
    mline = numpy.mean(y_w)
    normalized_Y = numpy.zeros(len(Y))
    for i in range(size):
        normalized_Y[window * i : window * (i + 1)] = Y[
            window * i : window * (i + 1)
        ] * (ysmooth[i] / mline)
    return normalized_Y

def f_mean_diff_flat(*args, **kwargs):
    """Test statistic: difference in means, mean(B) - mean(A)."""
    A = args[0]
    B = args[1]
    if len(B) == 0 or len(A) == 0:
        # avoid numpy's mean-of-empty-slice warning # TODO: check this is the desired behavior
        return math.nan
    return numpy.mean(B) - numpy.mean(A)

def f_sum_diff_flat(*args, **kwargs):
    """Test statistic: difference in sums, sum(B) - sum(A)."""
A = args[0]
B = args[1]
return numpy.sum(B) - numpy.sum(A)

def f_mean_diff_dict(*args, **kwargs):
    """Test statistic for library dicts: difference in pooled means across
    all libraries."""
D = args[0]
data1_total = 0
data2_total = 0
data1_size = 0
data2_size = 0
for L in D:
data1_total += numpy.sum(D[L][0])
data1_size += len(D[L][0])
data2_total += numpy.sum(D[L][1])
data2_size += len(D[L][1])
return (data2_total / float(data2_size)) - (data1_total / float(data1_size))

def f_sum_diff_dict(*args, **kwargs):
    """Test statistic for library dicts: difference in pooled sums across all
    libraries."""
D = args[0]
data1_total = 0
data2_total = 0
for L in D:
data1_total += numpy.sum(D[L][0])
data2_total += numpy.sum(D[L][1])
return data2_total - data1_total

def f_shuffle_flat(*args, **kwargs):
    """Permutation function: random shuffle of a flat array."""
X = args[0]
return numpy.random.permutation(X)

def f_shuffle_dict_libraries(*args, **kwargs):
    """Permutation function for library dicts: shuffles the combined counts
    within each library, preserving the per-condition sample sizes."""
D = args[0]
E = {}
for L in D:
n1 = len(D[L][0])
combined = numpy.append(D[L][0], D[L][1])
perm = numpy.random.permutation(combined)
E[L] = numpy.array([perm[:n1], perm[n1:]], dtype=object)
return E

def site_restricted_permutation(data):
    """Shuffles the counts at each TA site (column) independently across samples.

    NOTE: destructive; the caller must pass a copy of `data` to preserve the
    original. `data` is a 2D array of counts (samples x TA sites) including
    all replicates of all libraries in both conditions (as rows).
    """
if len(data) == 0:
return data
number_of_samples, number_of_ta_sites = data.shape[0], data.shape[1]
for index in range(number_of_ta_sites):
numpy.random.shuffle(data[:,index])
return data
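
# Hedged sketch (illustrative): shuffling within columns preserves each TA
# site's column sum; a copy is passed because the shuffle is in place.
def _demo_site_restricted_permutation():
    data = numpy.arange(12.0).reshape(4, 3)  # 4 samples x 3 TA sites
    perm = site_restricted_permutation(data.copy())
    return numpy.allclose(perm.sum(axis=0), data.sum(axis=0))  # True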

def resampling(
data1,
data2,
S=10000,
test_func=f_mean_diff_flat,
perm_func=f_shuffle_flat,
adaptive=False,
lib_str1="",
lib_str2="",
pseudocount=1,
site_restricted=False,
):
"""Does a permutation test on two sets of data.
Performs the resampling / permutation test given two sets of data using a
function defining the test statistic and a function defining how to permute
the data.
Args:
data1: List or numpy array with the first set of observations.
data2: List or numpy array with the second set of observations.
S: Number of permutation tests (or samples) to obtain.
test_func: Function defining the desired test statistic. Should accept
two lists as arguments. Default is difference in means between
the observations.
perm_func: Function defining the way to permute the data. Should accept
one argument, the combined set of data. Default is random
shuffle.
adaptive: Cuts-off resampling early depending on significance.
Data arrays: (data1 and data2)
Regular resampling used to take 1D arrays of counts pooled (flattened) over replicates.
Now 2D arrays are passed in and flatten them.
Uses F_shuffle_flat() and F_sum_diff_flat().
If using library strings, then inputs are 2D arrays of counts for each sample.
Character in lib_str indicates which lib it is in. Make a dict out of these to pass to perm_func.
Uses F_shuffle_dict_libraries() and F_sum_diff_dict_libraries().
If site_restricted, keep input arrays as 2D and pass to site_restricted_permutation() and F_sum_diff_flat().
Returns:
Tuple with described values
- test_obs -- Test statistic of observation.
- mean1 -- Arithmetic mean of first set of data.
- mean2 -- Arithmetic mean of second set of data.
- log2FC -- Normalized log2FC the means.
- pval_ltail -- Lower tail p-value.
- pval_utail -- Upper tail p-value.
- pval_2tail -- Two-tailed p-value.
- test_sample -- List of samples of the test statistic.
:Example:
>>> from pytransit.specific_tools import stat_tools
>>> import numpy
>>> X = numpy.random.random(100)
>>> Y = numpy.random.random(100)
>>> (test_obs, mean1, mean2, log2fc, pval_ltail, pval_utail, pval_2tail, test_sample) = stat_tools.resampling(X,Y)
>>> pval_2tail
0.2167
>>> test_sample[:3]
[0.076213992904990535, -0.0052513291091412784, -0.0038425140184765172]
"""
# Do basic sanity checks:
# - Check library strings match in some way
lib_diff = set(lib_str1) ^ set(lib_str2)
if lib_diff:
raise ValueError("At least one library string has a letter not used by the other: " + ", ".join(lib_diff))
if lib_str1 and site_restricted:
raise Exception("Cannot do site_restricted resampling with library strings at same time")
# - Check input has some data
assert len(data1) > 0, "Data1 cannot be empty"
assert len(data2) > 0, "Data2 cannot be empty"
if isinstance(data1,list): data1 = numpy.array(data1)
if isinstance(data2,list): data2 = numpy.array(data2)
#TRI note - now I am switching resampling() so caller passes in NON-flattened arrays of counts
if not site_restricted and not lib_str1:
data1 = data1.flatten()
data2 = data2.flatten()
# Calculate basic statistics for the input data:
# flattened (pooled) if not lib_str, else multiple samples (rows) for each lib
number_of_samples1 = len(data1) # number of samples (i.e. rows) for site_restricted or lib_str, or pooled counts if flattened
number_of_samples2 = len(data2)
mean1 = 0
mean2 = 0
if number_of_samples1 > 0: mean1 = numpy.mean(data1) # over all counts pooled across reps and libs for cond1
if number_of_samples2 > 0: mean2 = numpy.mean(data2)
if pseudocount > 0:
log2FC = math.log((mean2+pseudocount)/(mean1+pseudocount),2) # as of 3/5/20
else:
# Only adjust log2FC if one of the means is zero
if mean1 > 0 and mean2 > 0: log2FC = math.log((mean2)/(mean1),2)
else: log2FC = math.log((mean2+1.0)/(mean1+1.0),2)
# Get stats and info based on whether working with libraries or not:
number_of_ta_sites = 0
if lib_str1:
# Get number of TA sites implied
number_of_ta_sites = len(data1.flatten())//len(lib_str1)
assert len(data2.flatten())//len(lib_str2) == number_of_ta_sites, "Datasets do not have matching sites; check input data and library strings."
# Get data
# for lib_str, perm is a dict mapping letters to pairs of numpy arrays (1 for each cond)
perm = get_lib_data_dict(data1.flatten(), lib_str1, data2.flatten(), lib_str2, number_of_ta_sites)
test_obs = test_func(perm)
else:
try:
            # site_restricted mode uses f_sum_diff_flat() as test_func too
test_obs = test_func(data1, data2) # first call, actual value from observed counts
        except Exception as error:
            from pytransit.globals import logging
            logging.error(f"""
                the resampling function could not apply test function to input data!
                data1: {data1}
                data2: {data2}
                {error}
            """)
            raise  # test_obs would be undefined below; do not continue
if site_restricted:
data = numpy.concatenate((data1,data2),axis=0) # keep it as a 2D array
perm = data.copy() # this array will get modified with each permutation
else: # pool all counts (across conditions) into 1 big array
perm = numpy.zeros(number_of_samples1+number_of_samples2)
perm[:number_of_samples1] = data1
perm[number_of_samples1:] = data2
count_ltail = 0
count_utail = 0
count_2tail = 0
test_list = []
s_performed = 0
for _ in range(S):
if mean1+mean2 > 0:
if site_restricted: perm = site_restricted_permutation(perm) #TRI - I could have passed this in as perm_func, but I don't want to require the caller to know this
else: perm = perm_func(perm) #TRI
if not lib_str1:
test_sample = test_func(perm[:number_of_samples1], perm[number_of_samples1:])
else: # case for lib strings
test_sample = test_func(perm) # how do I know how many counts are in cond1 or cond2? perm is a dict over lib strings (and conds?)
else:
test_sample = 0
test_list.append(test_sample)
if test_sample <= test_obs: count_ltail+=1
if test_sample >= test_obs: count_utail+=1
if abs(test_sample) >= abs(test_obs): count_2tail+=1
s_performed+=1
            if adaptive:
                # check at 1%, 10%, and 100% of S; stop early once enough
                # permuted statistics already exceed the observed one
                # (i.e., the result is clearly not significant)
                if (
                    s_performed == round(S*0.01) or
                    s_performed == round(S*0.1) or
                    s_performed == round(S*1)
                ):
                    if count_2tail >= round(S*0.01*0.10):
                        break
pval_ltail = count_ltail/float(s_performed)
pval_utail = count_utail/float(s_performed)
pval_2tail = count_2tail/float(s_performed)
return test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail, pval_2tail, test_list
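
# Hedged sketch (illustrative): flat resampling on two made-up data vectors;
# with adaptive=True a clearly non-significant comparison stops early.
def _demo_resampling():
    data1 = scipy.stats.norm.rvs(100, 10, size=50)
    data2 = scipy.stats.norm.rvs(102, 10, size=50)
    result = resampling(data1, data2, S=1000, adaptive=True)
    return result[6]  # two-tailed p-value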

def cumulative_average(new_x, n, prev_avg):
    """Updates a running average with new_x, given the previous average over
    n values; returns (new_average, n + 1)."""
    return ((new_x + (n * prev_avg)) / (n + 1.0), n + 1)

def text_histogram(X, nBins=20, resolution=200, obs=None):
    """Prints a text histogram of X; if obs is given, its bin is flagged."""
    MIN = numpy.min(X)
    MAX = numpy.max(X)
    bin_list = numpy.linspace(MIN, MAX, nBins)
    hit_flag = "->"
    empty_flag = "  "
    # iterate over all pairs of adjacent bin edges (the original sliced with
    # [:-2], which skipped the last interior bin)
    for b_l, b_u in zip(bin_list[:-1], bin_list[1:]):
        Z = numpy.logical_and(b_l <= X, X < b_u)
        density = numpy.mean(Z)
        if obs is not None and (b_l <= obs < b_u):
            flag = hit_flag
        else:
            flag = empty_flag
        print("%-12f\t%s|%s" % (b_l, flag, "#" * int(resolution * density)))
    Z = numpy.logical_and(bin_list[-1] <= X, X < float("inf"))
    density = numpy.mean(Z)
    if obs is not None and (bin_list[-1] <= obs < float("inf")):
        flag = hit_flag
    else:
        flag = empty_flag
    print("%-12f\t%s|%s" % (bin_list[-1], flag, "#" * int(resolution * density)))

def parse_lib_index(n_data, libstr, number_of_ta_sites):
    """Maps each library letter in libstr to the (flattened) indices of its
    counts, assuming consecutive blocks of number_of_ta_sites per sample."""
full_index = numpy.arange(n_data)
lib_to_index = {}
for k, L in enumerate(libstr):
if L not in lib_to_index:
lib_to_index[L] = []
lib_to_index[L] += list(full_index[k * number_of_ta_sites : ((k + 1) * number_of_ta_sites)])
for L, index in lib_to_index.items():
lib_to_index[L] = numpy.array(index)
return lib_to_index

def combine_lib_dicts(L1, L2):
    """Merges two per-library data dicts into one mapping each library letter
    to a pair (cond1 counts, cond2 counts)."""
KEYS = L1.keys()
DATA = {}
for K in KEYS:
DATA[K] = numpy.array([L1[K], L2[K]], dtype=object)
return DATA

def get_lib_data_dict(data1, ctrl_lib_str, data2, exp_lib_str, number_of_ta_sites):
    """Builds a dict mapping each library letter to its pair of per-condition
    count arrays.

    Arguments:
        data1:
            apparently expected to be pre-flattened (see parse_lib_index())
    """
lib1_index_dict = parse_lib_index(len(data1), ctrl_lib_str, number_of_ta_sites)
lib2_index_dict = parse_lib_index(len(data2), exp_lib_str, number_of_ta_sites)
lib1_data_dict = dict(
[(L, data1[lib1_index_dict[L]]) for L in sorted(lib1_index_dict)]
)
lib2_data_dict = dict(
[(L, data2[lib2_index_dict[L]]) for L in sorted(lib2_index_dict)]
)
data_dict = combine_lib_dicts(lib1_data_dict, lib2_data_dict)
return data_dict
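
# Hedged sketch (illustrative): two libraries ("A" and "B"), one replicate
# per library per condition, three TA sites per sample; returns a dict
# mapping each library letter to its pair of per-condition count arrays.
def _demo_get_lib_data_dict():
    data1 = numpy.arange(6.0)        # ctrl: libs "AB", 3 sites per sample, flattened
    data2 = numpy.arange(6.0, 12.0)  # exp:  libs "AB", 3 sites per sample, flattened
    return get_lib_data_dict(data1, "AB", data2, "AB", number_of_ta_sites=3)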
# TEST-CASES
if __name__ == "__main__":
"""
n = 20
p = 0.5
k = 14
print("")
print("#########################################")
print("############ BINOM TEST #################")
print("#########################################")
print("Coin Tosses: %d" % n)
print("Success Prob: %3.2f" % p)
print("Observed: %d" % k)
print("")
print("Left-Tail Test:")
print("%d tosses, p-value = %f" % (k, binom_test(k,n,p,"less")))
print("")
print("Right-Tail Test:")
print("%d tosses, p-value = %f" % (k, binom_test(k,n,p,"greater")))
print("")
print("Two-Sided Test:")
print("%d tosses, p-value = %f" % (k, binom_test(k,n,p,"two-sided")))
print("")
print("")
print("#########################################")
print("############ RESAMPLING #################")
print("#########################################")
data1 = scipy.stats.norm.rvs(100,10, size=1000)
data2 = scipy.stats.norm.rvs(105,10, size=1000)
(test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail, pval_2tail, test_list) = resampling(data1, data2, S=10000)
print("Data1:")
text_histogram(data1, nBins = 20)
print("")
print("Data2:")
text_histogram(data2, nBins = 20)
print("")
print("Results:", (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail, pval_2tail))
print("")
print("Resampling Histogram:" )
text_histogram(test_list, nBins = 20, obs=test_obs)
"""
## TEST
import pytransit.specific_tools.transit_tools as transit_tools
import pytransit.specific_tools.tnseq_tools as tnseq_tools
import pytransit.specific_tools.norm_tools as norm_tools
ctrldata = [
"/pacific/home/mdejesus/transit/tests/GI/H37Rv_day0_rep1.wig",
"/pacific/home/mdejesus/transit/tests/GI/Rv2680_day0_rep1.wig",
"/pacific/home/mdejesus/transit/tests/GI/H37Rv_day0_rep2.wig",
"/pacific/home/mdejesus/transit/tests/GI/Rv2680_day0_rep2.wig",
]
expdata = [
"/pacific/home/mdejesus/transit/tests/GI/H37Rv_day32_rep1.wig",
"/pacific/home/mdejesus/transit/tests/GI/H37Rv_day32_rep2.wig",
"/pacific/home/mdejesus/transit/tests/GI/H37Rv_day32_rep3.wig",
"/pacific/home/mdejesus/transit/tests/GI/Rv2680_day32_rep1.wig",
"/pacific/home/mdejesus/transit/tests/GI/Rv2680_day32_rep2.wig",
"/pacific/home/mdejesus/transit/tests/GI/Rv2680_day32_rep3.wig",
]
annotation = "/pacific/home/mdejesus/transit/tests/GI/H37Rv.prot_table"
i = 202
if len(sys.argv) > 1:
i = int(sys.argv[1])
DO_LIB = True
if len(sys.argv) > 2:
DO_LIB = bool(int(sys.argv[2]))
if DO_LIB:
ctrl_lib_str = "ABAB"
exp_lib_str = "AAABBB"
else:
ctrl_lib_str = ""
exp_lib_str = ""
Kctrl = len(ctrldata)
Kexp = len(expdata)
(data, position) = transit_tools.get_validated_data(ctrldata + expdata)
(K, N) = data.shape
(data, factors) = norm_tools.normalize_data(
data, "TTR", ctrldata + expdata, annotation
)
G = tnseq_tools.Genes(ctrldata + expdata, annotation, data=data, position=position)
gene = G[i]
print("\n\n")
print("#" * 100)
print("# (%s) NEW TEST: %s" % (DO_LIB, gene))
print("#" * 100)
print("")
ii = numpy.ones(gene.n) == 1
data1 = gene.reads[:Kctrl, ii].flatten()
data2 = gene.reads[Kctrl:, ii].flatten()
data_dict = get_lib_data_dict(data1, ctrl_lib_str, data2, exp_lib_str, gene.n)
if DO_LIB:
(
test_obs,
mean1,
mean2,
log2FC,
pval_ltail,
pval_utail,
pval_2tail,
testlist,
) = resampling(
data1,
data2,
S=10000,
test_func=f_mean_diff_dict,
perm_func=f_shuffle_dict_libraries,
adaptive=False,
lib_str1=ctrl_lib_str,
lib_str2=exp_lib_str,
)
else:
(
test_obs,
mean1,
mean2,
log2FC,
pval_ltail,
pval_utail,
pval_2tail,
testlist,
) = resampling(
data1,
data2,
S=10000,
test_func=f_mean_diff_flat,
perm_func=f_shuffle_flat,
adaptive=False,
lib_str1=ctrl_lib_str,
lib_str2=exp_lib_str,
)
print("Resampling Histogram:")
text_histogram(testlist, nBins=20, obs=test_obs)