Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/clustering/clustvalidation.py: 11%
107 statements
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
1import numpy
2from math import sqrt
def safe_mean(values):
    """Return the mean and standard deviation of the finite items in *values*.

    Items for which ``numpy.isfinite`` is false (NaN, +inf, -inf) are
    discarded before the statistics are computed.

    Returns:
        A ``(mean, std)`` tuple. When no finite value is left (which
        includes empty input) both elements are NaN.
    """
    valid_values = [v for v in values if numpy.isfinite(v)]
    if not valid_values:
        # numpy.mean/numpy.std of an empty list also evaluate to NaN, but
        # they emit RuntimeWarnings while doing so; return the same result
        # explicitly instead.
        return numpy.float64("nan"), numpy.float64("nan")
    return numpy.mean(valid_values), numpy.std(valid_values)
def safe_mean_vector(vectors):
    """Compute the per-position mean and standard deviation of *vectors*.

    Non-finite items are left out of the statistics of the position they
    occupy. The number of positions is taken from the first vector.

    When only one vector is given it is returned unchanged as the mean,
    together with an all-zero array as the standard deviation.

    Returns:
        A ``(mean, std)`` tuple.
    """
    # A single vector is its own average.
    if len(vectors) == 1:
        only = vectors[0]
        return only, numpy.zeros(len(only))

    n_positions = len(vectors[0])
    means = []
    stds = []
    for pos in range(n_positions):
        finite_items = [vec[pos] for vec in vectors if numpy.isfinite(vec[pos])]
        means.append(numpy.mean(finite_items))
        stds.append(numpy.std(finite_items))
    return numpy.array(means), numpy.array(stds)
def get_silhouette_width(fdist, cluster):
    """Compute silhouette statistics of *cluster* against its sister nodes.

    For every sister that has a profile, and every leaf of *cluster* that
    has one, two distances are measured with *fdist*:

    * ``a``: twice the distance between the leaf profile and the cluster
      profile (centroid diameter).
    * ``b``: the distance between the leaf profile and the sister profile
      (centroid linkage).

    The silhouette of that pair is ``(b - a) / max(a, b)``, or 0.0 when
    ``b - a`` is zero.

    Returns:
        A ``(silhouette, intracluster_dist, intercluster_dist)`` tuple with
        the ``safe_mean`` average of each collected series.
    """
    scores = []
    within = []
    between = []
    for sister in cluster.get_sisters():
        if sister.profile is None:
            continue
        for leaf in cluster.leaves():
            # Leaves without a profile do not contribute.
            if leaf._profile is None:
                continue
            a = fdist(leaf.profile, cluster.profile) * 2
            b = fdist(leaf.profile, sister.profile)

            gap = b - a
            score = 0.0 if gap == 0.0 else gap / max(a, b)

            within.append(a)
            between.append(b)
            scores.append(score)

    # safe_mean returns (mean, std); only the mean is reported.
    silhouette = safe_mean(scores)[0]
    intracluster_dist = safe_mean(within)[0]
    intercluster_dist = safe_mean(between)[0]
    return silhouette, intracluster_dist, intercluster_dist
def get_avg_profile(node):
    """Update and return the profile and std profile associated to *node*.

    Internal nodes get the ``safe_mean_vector`` result of the profiles of
    their leaves (leaves whose ``_profile`` is None are ignored); when no
    leaf has a profile both attributes are set to None.

    Leaf nodes keep their own profile and get an all-zero std profile. A
    leaf without a profile gets a None std profile instead of failing on
    ``len(None)``.

    Returns:
        A ``(profile, std_profile)`` tuple; both are None when no profile
        is available.
    """
    if node.is_leaf:
        if node._profile is None:
            # Mirror the internal-node "no data" result.
            node._std_profile = None
            return None, None
        node._std_profile = [0.0] * len(node._profile)
        # A separate list is returned so callers cannot alter the stored one.
        return node._profile, [0.0] * len(node._profile)

    leaf_vectors = [n._profile for n in node.leaves()
                    if n._profile is not None]
    if leaf_vectors:
        node._profile, node._std_profile = safe_mean_vector(leaf_vectors)
    else:
        node._profile, node._std_profile = None, None
    return node._profile, node._std_profile
def get_dunn_index(fdist, *clusters):
    """
    Return the Dunn index for the given selection of nodes.

    The index is the smallest inter-cluster distance divided by the largest
    intra-cluster distance, both measured with *fdist*:

    * intra-cluster: twice the distance between a leaf profile and the
      profile of its cluster (centroid diameter). Leaves without a profile
      are skipped.
    * inter-cluster: distance between the profiles of every pair of
      clusters (centroid linkage).

    J.C. Dunn. Well separated clusters and optimal fuzzy
    partitions. 1974. J.Cybern. 4. 95-104.

    Raises:
        ValueError: if fewer than 2 clusters are given, or if no leaf with
            a profile is found in them.

    Returns:
        The Dunn index, or 0.0 when the largest intra-cluster distance is 0.
    """
    if len(clusters) < 2:
        raise ValueError("At least 2 clusters are required")

    intra_dist = []
    for c in clusters:
        for leaf in c.leaves():
            # A leaf with no profile cannot be measured.
            if leaf is None or leaf.profile is None:
                continue
            # item intracluster dist -> Centroid Diameter
            intra_dist.append(fdist(leaf.profile, c.profile) * 2)
    if not intra_dist:
        raise ValueError("No leaf with a profile found in the given clusters")
    max_a = numpy.max(intra_dist)

    inter_dist = []
    for i, ci in enumerate(clusters):
        for cj in clusters[i + 1:]:
            # intercluster dist -> Centroid Linkage
            inter_dist.append(fdist(ci.profile, cj.profile))
    min_b = numpy.min(inter_dist)

    if max_a == 0.0:
        return 0.0
    return min_b / max_a
118# ####################
119# distance functions
120# ####################
def pearson_dist(v1, v2):
    """Return ``1 - r`` where *r* is the Pearson correlation of *v1* and *v2*.

    Identical inputs yield 0.0 without calling scipy. The inputs may be
    numpy arrays or plain sequences.

    Raises:
        RuntimeError: if scipy cannot be imported.
    """
    try:
        from scipy.stats import pearsonr
    except ImportError as err:
        raise RuntimeError("scipy is required to execute this function. Please install it an try again") from err

    # array_equal works for lists as well as arrays and never broadcasts,
    # unlike ``(v1 == v2).all()``.
    if numpy.array_equal(v1, v2):
        return 0.0
    return 1.0 - pearsonr(list(v1), list(v2))[0]
def spearman_dist(v1, v2):
    """Return ``1 - rho`` where *rho* is the Spearman correlation of *v1* and *v2*.

    Identical inputs yield 0.0 without calling scipy. The inputs may be
    numpy arrays or plain sequences.

    Raises:
        RuntimeError: if scipy cannot be imported.
    """
    try:
        from scipy.stats import spearmanr
    except ImportError as err:
        raise RuntimeError("scipy is required to execute this function. Please install it an try again") from err

    # array_equal works for lists as well as arrays and never broadcasts,
    # unlike ``(v1 == v2).all()``.
    if numpy.array_equal(v1, v2):
        return 0.0
    return 1.0 - spearmanr(list(v1), list(v2))[0]
def euclidean_dist(v1, v2):
    """Return the square root of ``square_euclidean_dist(v1, v2)``.

    Identical inputs yield 0.0 directly. The inputs may be numpy arrays or
    plain sequences.
    """
    # array_equal works for lists as well as arrays and never broadcasts,
    # unlike ``(v1 == v2).all()``.
    if numpy.array_equal(v1, v2):
        return 0.0
    return sqrt(square_euclidean_dist(v1, v2))
def square_euclidean_dist(v1, v2):
    """Return the mean squared difference between *v1* and *v2*.

    Only positions where both items are finite are used; the sum of their
    squared differences is divided by the number of such positions.
    Identical inputs yield 0.0 directly. The inputs may be numpy arrays or
    plain sequences.

    Raises:
        ValueError: if no position holds a finite value in both vectors.
    """
    # array_equal works for lists as well as arrays and never broadcasts,
    # unlike ``(v1 == v2).all()``.
    if numpy.array_equal(v1, v2):
        return 0.0

    valids = 0
    distance = 0.0
    for i, x in enumerate(v1):
        y = v2[i]
        if numpy.isfinite(x) and numpy.isfinite(y):
            valids += 1
            d = x - y
            distance += d * d
    if valids == 0:
        raise ValueError("Cannot calculate values")
    return distance / valids
# Module-level alias naming spearman_dist as the default distance function.
default_dist = spearman_dist