#!/usr/bin/env python3
import os, math, re
from multiprocessing.pool import ThreadPool
import numpy as np
from scipy.stats import entropy
from numba import njit, float64, int64

from pastml.tree import read_tree, name_tree, annotate_dates, DATE, read_forest, DATE_CI, resolve_trees, IS_POLYTOMY, \
    unresolve_trees
from pastml.acr import acr, _serialize_acr
from pastml.annotation import preannotate_forest
from pastml import col_name2cat
from collections import defaultdict, Counter

from treeprofiler.src.utils import add_suffix

# Default MCMC settings for the delta statistic:
# lambda0 = 0.1    # rate parameter of the exponential prior on alpha and beta
# se = 0.5         # standard deviation of the log-normal proposal
# sim = 10000      # total number of MCMC iterations
# thin = 10        # keep only every 10th iterate
# burn = 100       # number of burn-in iterates to discard
# ent_type = 'SE'  # uncertainty measure: Linear Shannon Entropy [LSE], Shannon Entropy [SE], or Gini impurity [GI]
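
# A minimal sketch (hypothetical values) of how these settings are packed into
# the `params` tuple consumed by emcmc() below, where `uncert` stands for the
# output of entropy_type():
#
#   params = (np.random.exponential(),   # initial alpha
#             np.random.exponential(),   # initial beta
#             uncert, 0.1, 0.5,          # x, lambda0, se
#             10000, 10, 100)            # sim, thin, burn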

''' ADDITIONAL INFORMATION

[1] Borges, R. et al. (2019). Measuring phylogenetic signal between categorical traits and phylogenies. Bioinformatics, 35, 1862-1869.

[2] Ribeiro, D. et al. (2023). Testing phylogenetic signal with categorical traits and tree uncertainty. Bioinformatics, 39(7).

[3] Ishikawa, S.A. et al. (2019). A fast likelihood method to reconstruct and visualize ancestral scenarios. Molecular Biology and Evolution, msz131.
'''

# Source script of the delta method, from:
# https://github.com/diogo-s-ribeiro/delta-statistic/blob/master/Delta-Python/delta_functs.py

# Metropolis-Hastings step for the alpha parameter
@njit(float64(float64, float64, float64[::1], float64, float64))
def mhalpha(a, b, x, l0, se):
    '''a  = The current value of the alpha parameter.
    b  = The current value of the beta parameter.
    x  = An array of data points used in the acceptance-ratio computations, after uncertainty is calculated.
    l0 = A constant value used in the acceptance-ratio computations.
    se = The standard deviation used for the random walk in the Metropolis-Hastings algorithm.'''
    # Log-normal random-walk proposal for alpha. The acceptance ratio compares the
    # Beta(a, b) log-likelihood of x, with an Exponential(l0) prior on alpha, at the
    # proposed and current values.
    a1 = np.exp(np.random.normal(np.log(a), se, 1))[0]
    lp_a = np.exp((len(x)*(math.lgamma(a1+b)-math.lgamma(a1)) - a1*(l0-np.sum(np.log(x)))) - (len(x)*(math.lgamma(a+b)-math.lgamma(a)) - a*(l0-np.sum(np.log(x)))))
    r = min(1, lp_a)

    # Redraw until a valid (non-NaN) acceptance ratio is obtained
    while np.isnan(lp_a):
        a1 = np.exp(np.random.normal(np.log(a), se, 1))[0]
        lp_a = np.exp((len(x)*(math.lgamma(a1+b)-math.lgamma(a1)) - a1*(l0-np.sum(np.log(x)))) - (len(x)*(math.lgamma(a+b)-math.lgamma(a)) - a*(l0-np.sum(np.log(x)))))
        r = min(1, lp_a)

    # Accept or reject based on the acceptance ratio
    if np.random.uniform(0, 1) < r:
        return a1
    else:
        return a
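
# Example (illustrative only): a single MH update of alpha on toy uncertainty
# values; the call returns either the accepted proposal or the current value.
#
#   x = np.random.uniform(0.01, 0.99, 50)
#   a_new = mhalpha(1.0, 1.0, x, 0.1, 0.5)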

# Metropolis-Hastings step for the beta parameter
@njit(float64(float64, float64, float64[::1], float64, float64))
def mhbeta(a, b, x, l0, se):
    '''a  = The current value of the alpha parameter.
    b  = The current value of the beta parameter.
    x  = An array of data points used in the acceptance-ratio computations, after uncertainty is calculated.
    l0 = A constant value used in the acceptance-ratio computations.
    se = The standard deviation used for the random walk in the Metropolis-Hastings algorithm.'''
    # Same scheme as mhalpha, with the roles of x and 1 - x exchanged.
    b1 = np.exp(np.random.normal(np.log(b), se, 1))[0]
    lp_b = np.exp((len(x)*(math.lgamma(a+b1)-math.lgamma(b1)) - b1*(l0-np.sum(np.log(1-x)))) - (len(x)*(math.lgamma(a+b)-math.lgamma(b)) - b*(l0-np.sum(np.log(1-x)))))
    r = min(1, lp_b)

    # Redraw until a valid (non-NaN) acceptance ratio is obtained
    while np.isnan(lp_b):
        b1 = np.exp(np.random.normal(np.log(b), se, 1))[0]
        lp_b = np.exp((len(x)*(math.lgamma(a+b1)-math.lgamma(b1)) - b1*(l0-np.sum(np.log(1-x)))) - (len(x)*(math.lgamma(a+b)-math.lgamma(b)) - b*(l0-np.sum(np.log(1-x)))))
        r = min(1, lp_b)

    # Accept or reject based on the acceptance ratio
    if np.random.uniform(0, 1) < r:
        return b1
    else:
        return b

# Metropolis-Hastings algorithm alternating updates of alpha and beta
# @njit(float64[:, ::1](float64, float64, float64[::1], float64, float64, int64, int64, int64))
def emcmc(params):
    '''params = (alpha, beta, x, l0, se, sim, thin, burn), where:
    alpha = The initial value of the alpha parameter.
    beta = The initial value of the beta parameter.
    x = An array of data points used in the acceptance-ratio computations, after uncertainty is calculated.
    l0 = A constant value used in the acceptance-ratio computations.
    se = The standard deviation used for the random walk in the Metropolis-Hastings algorithm.
    sim = The number of total iterations in the Markov Chain Monte Carlo (MCMC) simulation.
    thin = The thinning parameter, i.e., the number of iterations to discard between saved samples.
    burn = The number of burn-in iterations to discard at the beginning of the simulation.'''
    alpha, beta, x, l0, se, sim, thin, burn = params
    # Evenly spaced iteration indices (burn, burn + thin, ..., sim) at which samples are kept
    n_size = np.linspace(burn, sim, int((sim - burn) / thin + 1))
    usim = np.round(n_size, 0, np.empty_like(n_size))
    gibbs = []
    p = 0

    for i in range(sim + 1):
        alpha = mhalpha(alpha, beta, x, l0, se)
        beta = mhbeta(alpha, beta, x, l0, se)

        if i == usim[p]:
            gibbs.append((alpha, beta))
            p += 1

    gibbs = np.asarray(gibbs)
    return gibbs
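
# Example (illustrative only): one chain on synthetic uncertainty values.
#
#   x = np.random.uniform(0.01, 0.99, 50)    # stand-in for entropy_type() output
#   params = (np.random.exponential(), np.random.exponential(),
#             x, 0.1, 0.5, 10000, 10, 100)   # alpha, beta, x, l0, se, sim, thin, burn
#   chain = emcmc(params)                    # array of thinned (alpha, beta) samples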

# def parallel_emcmc(threads, alpha, beta, x, l0, se, sim, thin, burn):
#     params = [(alpha, beta, x, l0, se, sim, thin, burn) for _ in range(threads)]
#     with ThreadPool(processes=threads-1) as pool:
#         results = pool.map(func=emcmc, iterable=params)
#     return results

# Calculate uncertainty using different entropy types
def entropy_type(prob, ent_type):
    '''prob = A matrix of ancestral probabilities.
    ent_type = A string indicating the type of entropy calculation (options: 'LSE', 'SE', or any other value for Gini impurity).'''
    # Linear Shannon Entropy
    if ent_type == 'LSE':
        k = np.shape(prob)[1]
        prob = np.asarray(np.where(prob <= (1/k), prob, prob/(1-k) - 1/(1-k)))
        tent = np.sum(prob, 1)

        # Nudge values off exact 0 or 1 so downstream log terms stay finite
        tent = np.asarray(np.where(tent != 0, tent, tent + np.random.uniform(0, 1, 1)/10000))
        tent = np.asarray(np.where(tent != 1, tent, tent - np.random.uniform(0, 1, 1)/10000))

        return tent

    # Shannon Entropy
    elif ent_type == 'SE':
        k = np.shape(prob)[1]
        tent = entropy(prob, base=k, axis=1)

        # Nudge values off exact 0 or 1 so downstream log terms stay finite
        tent = np.asarray(np.where(tent != 0, tent, tent + np.random.uniform(0, 1, 1)/10000))
        tent = np.asarray(np.where(tent != 1, tent, tent - np.random.uniform(0, 1, 1)/10000))

        return tent

    # Gini impurity
    else:
        k = np.shape(prob)[1]
        tent = ((1 - np.sum(prob**2, axis=1))*k) / (k - 1)

        # Nudge values off exact 0 or 1 so downstream log terms stay finite
        tent = np.asarray(np.where(tent != 0, tent, tent + np.random.uniform(0, 1, 1)/10000))
        tent = np.asarray(np.where(tent != 1, tent, tent - np.random.uniform(0, 1, 1)/10000))

        return tent
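
# Example (illustrative only): per-node uncertainty for a 3-node, 2-state
# matrix of ancestral probabilities (rows sum to 1).
#
#   prob = np.array([[0.5, 0.5],   # maximally uncertain -> value near 1
#                    [0.9, 0.1],   # fairly certain      -> low value
#                    [1.0, 0.0]])  # certain             -> 0, nudged off the boundary
#   entropy_type(prob, 'SE')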

# Calculate the delta statistic after an MCMC run
def delta(x, lambda0, se, sim, thin, burn, ent_type, threads=1):
    '''x = A matrix of ancestral probabilities.
    lambda0 = A constant value used in the acceptance-ratio computations.
    se = The standard deviation used for the random walk in the Metropolis-Hastings algorithm.
    sim = The number of total iterations in the Markov Chain Monte Carlo (MCMC) simulation.
    thin = The thinning parameter, i.e., the number of iterations to discard between saved samples.
    burn = The number of burn-in iterations to discard at the beginning of the simulation.
    ent_type = A string specifying the type of entropy calculation (options: 'LSE', 'SE', or any other value for Gini impurity).'''
    params = (np.random.exponential(), np.random.exponential(), entropy_type(x, ent_type), lambda0, se, sim, thin, burn)
    # if threads > 1:
    #     mc1 = parallel_emcmc(threads, np.random.exponential(), np.random.exponential(), entropy_type(x, ent_type), lambda0, se, sim, thin, burn)
    #     mc2 = parallel_emcmc(threads, np.random.exponential(), np.random.exponential(), entropy_type(x, ent_type), lambda0, se, sim, thin, burn)
    # else:
    mc1 = emcmc(params)
    mc2 = emcmc(params)
    mchain = np.concatenate((mc1, mc2), axis=0)

    # delta is the ratio of the posterior means of beta and alpha
    deltaA = np.mean(mchain[:, 1]) / np.mean(mchain[:, 0])

    return deltaA
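
# Example (illustrative only): delta on a random probability matrix, using the
# default settings from the top of the file. Larger values indicate stronger
# phylogenetic signal (Borges et al. 2019).
#
#   prob = np.random.dirichlet(np.ones(3), size=20)   # 20 nodes, 3 states
#   d = delta(prob, 0.1, 0.5, 10000, 10, 100, 'SE')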

# Calculate the marginal probabilities for each discrete trait
def run_acr_discrete(tree, columns, prediction_method="MPPA", model="F81", threads=1, outdir="./"):
    prop2acr = {}
    column2states = {c: np.array(sorted(set(states))) for c, states in columns.items()}
    features = list(column2states.keys())
    forest = [tree]

    for key in column2states.keys():
        single_column = {key: columns[key]}
        single_column2states = {key: column2states[key]}
        # Run ACR for this trait only
        acr_result = acr(forest=forest, columns=single_column.keys(), column2states=single_column2states, prediction_method=prediction_method, model=model, threads=threads)
        prop2acr[key] = acr_result
        if outdir:
            _serialize_acr((acr_result[0], outdir))
    return prop2acr, forest[0]
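
# Example (illustrative only; trait names and values are hypothetical). Assumes
# the tree nodes already carry the trait annotations (e.g. via
# preannotate_forest) and that `columns` maps each trait to its observed states:
#
#   columns = {'habitat': ['marine', 'terrestrial', 'marine', 'marine']}
#   prop2acr, tree = run_acr_discrete(tree, columns, prediction_method='MPPA',
#                                     model='F81', outdir=None)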

# Calculate the marginal probabilities for each continuous trait (not yet implemented)
def run_acr_continuous(tree, columns):
    return

# Calculate the delta statistic of the marginal probabilities for each discrete trait
def run_delta(acr_results, tree, run_whole_tree=False, ent_type='LSE', lambda0=0.1, se=0.5, sim=10000, burn=100, thin=10, threads=1):
    prop2delta = {}
    prop2marginals = {}
    leafnames = tree.leaf_names()
    node2leaves = tree.get_cached_content(leaves_only=False)
    # Extract the marginal probabilities for each discrete trait
    if run_whole_tree:
        for node in tree.traverse():
            if node.is_leaf:
                continue

            for prop, acr_result in acr_results.items():
                # Get the marginal probabilities of the internal nodes under this node
                children_data = acr_result[0]['marginal_probabilities'].loc[[child.name for child in node2leaves[node] if not child.is_leaf]]

                #internal_nodes_data[node.name] = children_data
                if node.is_root:
                    marginal_probs = np.asarray(acr_result[0]['marginal_probabilities'].drop(leafnames))
                else:
                    marginal_probs = np.asarray(children_data)
                # Run delta for this trait; delta() expects (sim, thin, burn) in that order
                delta_result = delta(marginal_probs, lambda0, se, sim, thin, burn, ent_type, threads)
                node.add_prop(add_suffix(prop, "delta"), delta_result)
    else:
        # Only calculate delta at the root
        for prop, acr_result in acr_results.items():
            # Get the marginal probabilities of all internal nodes
            marginal_probs = np.asarray(acr_result[0]['marginal_probabilities'].drop(leafnames))
            # Run delta for this trait; delta() expects (sim, thin, burn) in that order
            delta_result = delta(marginal_probs, lambda0, se, sim, thin, burn, ent_type, threads)
            #tree.add_prop(add_suffix(prop, "delta"), delta_result)
            prop2delta[prop] = delta_result
    return prop2delta
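
# Example (illustrative only), continuing the run_acr_discrete sketch above.
# With run_whole_tree=False a dict mapping each trait to its root-level delta
# is returned; with run_whole_tree=True per-node deltas are also stored as
# node properties:
#
#   prop2delta = run_delta(prop2acr, tree, run_whole_tree=False, ent_type='SE')
#   print(prop2delta['habitat'])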

# Calculate Pagel's lambda statistic for each continuous trait (not yet implemented)
def run_lambda():
    return

# Calculate Blomberg's K statistic for each continuous trait (not yet implemented)
def run_kappa():
    return