Coverage for /home/deng/Projects/metatree_drawer/metatreedrawer/treeprofiler/src/phylosignal.py: 20%

115 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-08-07 10:33 +0200

1#!/usr/bin/env python3 

2import os, math, re 

3from multiprocessing.pool import ThreadPool 

4import numpy as np 

5from scipy.stats import entropy 

6from numba import njit, float64, int64 

7 

8from pastml.tree import read_tree, name_tree, annotate_dates, DATE, read_forest, DATE_CI, resolve_trees, IS_POLYTOMY, \ 

9 unresolve_trees 

10from pastml.acr import acr, _serialize_acr 

11from pastml.annotation import preannotate_forest 

12from pastml import col_name2cat 

13from collections import defaultdict, Counter 

14 

15from treeprofiler.src.utils import add_suffix 

16 

17# lambda0 = 0.1 # rate parameter of the proposal 

18# se = 0.5 # standard deviation of the proposal 

19# sim = 10000 # number of iterations 

20# thin = 10 # Keep only each xth iterate 

21# burn = 100 # Burned-in iterates 

22 

23# ent_type = 'SE' # Linear Shannon Entropy LSE or Shannon Entropy [SE] or Gini impurity [GI] 

24 

25''' ADDITIONAL INFORMATION 

26 

27[1] Borges, R. et al. (2019). Measuring phylogenetic signal between categorical traits and phylogenies. Bioinformatics, 35, 1862-1869. 

28 

29[2] Ribeiro, D. et al. (2023). Testing phylogenetic signal with categorical traits and tree uncertainty, Bioinformatics, Volume 39, Issue 7, July 2023 

30 

31[3] Ishikawa SA, Zhukova A, Iwasaki W, Gascuel O. (2019). A Fast Likelihood Method to Reconstruct and Visualize Ancestral Scenarios. Molecular Biology and Evolution, msz131. 

32''' 

33 

34# Source Script of delta method from https://github.com/diogo-s-ribeiro/delta-statistic/blob/master/Delta-Python/delta_functs.py 

35# Metropolis-Hastings step for alpha parameter 

36@njit(float64(float64, float64, float64[::1], float64, float64)) 

37def mhalpha(a,b,x,l0,se): 

38 '''a = The current value of the alpha parameter. 

39 b = The current value of the beta parameter. 

40 x = An array of data points used in the acceptance ratio computations, after uncertainty is calculated. 

41 l0 = A constant value used in the acceptance ratio computations. 

42 se = The standard deviation used for the random walk in the Metropolis-Hastings algorithm.''' 

43 

44 a1 = np.exp(np.random.normal(np.log(a),se, 1))[0] 

45 lp_a = np.exp( (len(x)*(math.lgamma(a1+b)-math.lgamma(a1)) - a1*(l0-np.sum(np.log(x)))) - (len(x)*(math.lgamma(a+b)-math.lgamma(a)) - a*(l0-np.sum(np.log(x)))) ) 

46 r = min( 1, lp_a ) 

47 

48 # Repeat until a valid value is obtained 

49 while (np.isnan(lp_a) == True): 

50 a1 = np.exp(np.random.normal(np.log(a),se, 1))[0] 

51 lp_a = np.exp( (len(x)*(math.lgamma(a1+b)-math.lgamma(a1)) - a1*(l0-np.sum(np.log(x)))) - (len(x)*(math.lgamma(a+b)-math.lgamma(a)) - a*(l0-np.sum(np.log(x)))) ) 

52 r = min( 1, lp_a ) 

53 

54 # Accept or reject based on the acceptance ratio 

55 if np.random.uniform(0,1) < r: 

56 return a1 

57 else: 

58 return a 

59 

60 

61# Metropolis-Hastings step for beta parameter 

62@njit(float64(float64, float64, float64[::1], float64, float64)) 

63def mhbeta(a,b,x,l0,se): 

64 '''a = The current value of the alpha parameter. 

65 b = The current value of the beta parameter. 

66 x = An array of data points used in the acceptance ratio computations, after uncertainty is calculated. 

67 l0 = A constant value used in the acceptance ratio computations. 

68 se = The standard deviation used for the random walk in the Metropolis-Hastings algorithm.''' 

69 

70 b1 = np.exp(np.random.normal(np.log(b),se,1))[0] 

71 lp_b = np.exp( (len(x)*(math.lgamma(a+b1)-math.lgamma(b1)) - b1*(l0-np.sum(np.log(1-x)))) - (len(x)*(math.lgamma(a+b)-math.lgamma(b)) - b*(l0-np.sum(np.log(1-x)))) ) 

72 r = min(1, lp_b ) 

73 

74 # Repeat until a valid value is obtained 

75 while (np.isnan(lp_b) == True): 

76 b1 = np.exp(np.random.normal(np.log(b),se,1))[0] 

77 lp_b = np.exp( (len(x)*(math.lgamma(a+b1)-math.lgamma(b1)) - b1*(l0-np.sum(np.log(1-x)))) - (len(x)*(math.lgamma(a+b)-math.lgamma(b)) - b*(l0-np.sum(np.log(1-x)))) ) 

78 r = min(1, lp_b ) 

79 

80 # Accept or reject based on the acceptance ratio 

81 if np.random.uniform(0,1) < r: 

82 return b1 

83 else: 

84 return b 

85 

86 

87# Metropolis-Hastings algorithm using alpha and beta 

88# @njit(float64[:, ::1]((float64, float64, float64[::1], float64, float64, int64, int64, int64)) 

89def emcmc(params): 

90 '''alpha = The initial value of the alpha parameter. 

91 beta = The initial value of the beta parameter. 

92 x = An array of data points used in the acceptance ratio computations, after uncertainty is calculated. 

93 l0 = A constant value used in the acceptance ratio computations. 

94 se = The standard deviation used for the random walk in the Metropolis-Hastings algorithm. 

95 sim = The number of total iterations in the Markov Chain Monte Carlo (MCMC) simulation. 

96 thin = The thinning parameter, i.e., the number of iterations to discard between saved samples. 

97 burn = The number of burn-in iterations to discard at the beginning of the simulation.''' 

98 

99 alpha, beta, x, l0, se, sim, thin, burn = params 

100 n_size = np.linspace(burn, sim, int((sim - burn) / thin + 1)) 

101 usim = np.round(n_size, 0, np.empty_like(n_size)) 

102 gibbs = [] 

103 p = 0 

104 

105 for i in range(sim + 1): 

106 alpha = mhalpha(alpha,beta,x,l0,se) 

107 beta = mhbeta(alpha,beta,x,l0,se) 

108 

109 if i == usim[p]: 

110 gibbs.append((alpha, beta)) 

111 p += 1 

112 

113 gibbs = np.asarray(gibbs) 

114 return gibbs 

115 

116# def parallel_emcmc(threads, alpha, beta, x, l0, se, sim, thin, burn): 

117# params = [(alpha, beta, x, l0, se, sim, thin, burn) for _ in range(threads)] 

118# with ThreadPool(processes=threads-1) as pool: 

119# results = pool.map(func=emcmc, iterable=params) 

120# return results 

121 

122# Calculate uncertainty using different types 

123def entropy_type(prob, ent_type): 

124 '''prob = A matrix of ancestral probabilities. 

125 ent_type = A string indicating the type of entropy calculation. (options: 'LSE', 'SE', or any other value for Gini impurity).''' 

126 

127 # Linear Shannon Entropy 

128 if ent_type == 'LSE': 

129 k = np.shape(prob)[1] 

130 prob = np.asarray(np.where(prob<=(1/k), prob, prob/(1-k) - 1/(1-k))) 

131 tent = np.sum(prob, 1) 

132 

133 # Ensure absolutes 

134 tent = np.asarray(np.where(tent != 0, tent, tent + np.random.uniform(0,1,1)/10000)) 

135 tent = np.asarray(np.where(tent != 1, tent, tent - np.random.uniform(0,1,1)/10000)) 

136 

137 return tent 

138 

139 # Shannon Entropy 

140 elif ent_type == 'SE': 

141 k = np.shape(prob)[1] 

142 tent = entropy(prob, base=k, axis=1) 

143 

144 # Ensure absolutes 

145 tent = np.asarray(np.where(tent != 0, tent, tent + np.random.uniform(0,1,1)/10000)) 

146 tent = np.asarray(np.where(tent != 1, tent, tent - np.random.uniform(0,1,1)/10000)) 

147 

148 return tent 

149 

150 # Ginni Impurity 

151 else: 

152 k = np.shape(prob)[1] 

153 tent = ((1 - np.sum(prob**2, axis=1))*k)/ (k - 1) 

154 

155 # Ensure absolutes 

156 tent = np.asarray(np.where(tent != 0, tent, tent + np.random.uniform(0,1,1)/10000)) 

157 tent = np.asarray(np.where(tent != 1, tent, tent - np.random.uniform(0,1,1)/10000)) 

158 

159 return tent 

160 

161 

162# Calculate delta-statistic after an MCMC step 

163def delta(x,lambda0,se,sim,thin,burn,ent_type, threads=1): 

164 '''x = A matrix of ancestral probabilities. 

165 lambda0 = A constant value used in the acceptance ratio computations. 

166 se = The standard deviation used for the random walk in the Metropolis-Hastings algorithm. 

167 sim = The number of total iterations in the Markov Chain Monte Carlo (MCMC) simulation. 

168 thin = The thinning parameter, i.e., the number of iterations to discard between saved samples. 

169 burn = The number of burn-in iterations to discard at the beginning of the simulation. 

170 ent_type = A string specifying the type of entropy calculation (options: 'LSE', 'SE', or any other value for Gini impurity).''' 

171 params = (np.random.exponential(),np.random.exponential(),entropy_type(x, ent_type),lambda0,se,sim,thin,burn) 

172 # if threads > 1: 

173 # mc1 = parallel_emcmc(threads,np.random.exponential(),np.random.exponential(),entropy_type(x, ent_type),lambda0,se,sim,thin,burn) 

174 # mc2 = parallel_emcmc(threads,np.random.exponential(),np.random.exponential(),entropy_type(x, ent_type),lambda0,se,sim,thin,burn) 

175 # else: 

176 mc1 = emcmc(params) 

177 mc2 = emcmc(params) 

178 mchain = np.concatenate((mc1,mc2), axis=0) 

179 

180 deltaA = (np.mean(mchain[:,1]))/(np.mean(mchain[:,0])) 

181 

182 return deltaA 

183 

184# Calculate the marginal probabilities for each discrete trait 

185def run_acr_discrete(tree, columns, prediction_method="MPPA", model="F81", threads=1, outdir="./"): 

186 prop2acr = {} 

187 column2states = {c: np.array(sorted(list(set(states)))) for c, states in columns.items()} 

188 features = list(column2states.keys()) 

189 forest = [tree] 

190 

191 for key in column2states.keys(): 

192 single_column = {key:columns[key]} 

193 single_column2states = {key:column2states[key]} 

194 # Run ACR 

195 acr_result = acr(forest=forest, columns=single_column.keys(), column2states=single_column2states, prediction_method=prediction_method, model=model, threads=threads) 

196 prop2acr[key] = acr_result 

197 if outdir: 

198 _serialize_acr((acr_result[0], outdir)) 

199 return prop2acr, forest[0] 

200 

201# Calculate the marginal probabilities for each continuous trait 

202def run_acr_continuous(tree, columns): 

203 return 

204 

205# Calculate delta-statistic of marginal probabilities each discrete trait 

206def run_delta(acr_results, tree, run_whole_tree=False, ent_type='LSE', lambda0=0.1, se=0.5, sim=10000, burn=100, thin=10, threads=1): 

207 prop2delta = {} 

208 prop2marginals = {} 

209 leafnames = tree.leaf_names() 

210 node2leaves = tree.get_cached_content(leaves_only=False) 

211 # extract the marginal probabilities for each discrete trait 

212 if run_whole_tree: 

213 for node in tree.traverse(): 

214 if node.is_leaf: 

215 continue 

216 

217 for prop, acr_result in acr_results.items(): 

218 # Get the marginal probabilities for each node 

219 children_data = acr_result[0]['marginal_probabilities'].loc[[child.name for child in node2leaves[node] if not child.is_leaf]] 

220 

221 #internal_nodes_data[node.name] = children_data 

222 if node.is_root: 

223 marginal_probs = np.asarray(acr_result[0]['marginal_probabilities'].drop(leafnames)) 

224 else: 

225 marginal_probs = np.asarray(children_data) 

226 # run delta for each discrete trait 

227 # load annotations to leaves 

228 delta_result = delta(marginal_probs, lambda0, se, sim, burn, thin, ent_type, threads) 

229 node.add_prop(add_suffix(prop, "delta"), delta_result) 

230 else: 

231 # this is the case when we only want to calculate delta for the root 

232 for prop, acr_result in acr_results.items(): 

233 # Get the marginal probabilities for each node 

234 marginal_probs = np.asarray(acr_result[0]['marginal_probabilities'].drop(leafnames)) 

235 # run delta for each discrete trait 

236 # load annotations to leaves 

237 delta_result = delta(marginal_probs, lambda0, se, sim, burn, thin, ent_type, threads) 

238 #tree.add_prop(add_suffix(prop, "delta"), delta_result) 

239 prop2delta[prop] = delta_result 

240 return prop2delta 

241 

242# Calculate Pagel's lambda statistic for each continuous trait 

243def run_lambda(): 

244 return 

245 

246# Calculate Blomberg's kappa statistic for each continuous trait 

247def run_kappa(): 

248 return