Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/clustering/clustvalidation.py: 11%

107 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-03-21 09:19 +0100

1import numpy 

2from math import sqrt 

3 

def safe_mean(values):
    """Return the mean and standard deviation of the finite items in *values*.

    Items for which ``numpy.isfinite`` is false (NaN, +/-inf) are discarded
    before the statistics are computed.

    Returns a ``(mean, std)`` tuple. When no finite value is left, both
    elements are NaN; they are returned directly so that numpy is not asked
    to reduce an empty list.
    """
    finite = [v for v in values if numpy.isfinite(v)]
    if not finite:
        return numpy.nan, numpy.nan
    return numpy.mean(finite), numpy.std(finite)

11 

def safe_mean_vector(vectors):
    """Return the mean and std profiles of *vectors*, ignoring non-finite items.

    The statistics are computed position by position; at each position only
    the values for which ``numpy.isfinite`` is true take part.

    Returns a ``(mean_profile, std_profile)`` tuple of numpy arrays. A
    position with no finite value at all yields NaN in both arrays. When
    *vectors* holds a single vector, that vector is returned unchanged
    together with an all-zero std profile of the same length.
    """
    # If only one vector, avg = itself
    if len(vectors) == 1:
        return vectors[0], numpy.zeros(len(vectors[0]))

    # Takes the vector length from the first item
    length = len(vectors[0])

    means = []
    stds = []
    for pos in range(length):
        column = [v[pos] for v in vectors if numpy.isfinite(v[pos])]
        if column:
            means.append(numpy.mean(column))
            stds.append(numpy.std(column))
        else:
            # Nothing usable at this position
            means.append(numpy.nan)
            stds.append(numpy.nan)
    return numpy.array(means), numpy.array(stds)

32 

def get_silhouette_width(fdist, cluster):
    """Compute silhouette statistics of *cluster* against its sister nodes.

    For every sister returned by ``cluster.get_sisters()`` that has a
    profile, and every leaf of *cluster* that has a profile, two distances
    are computed with *fdist*:

    - ``a``: twice the distance between the leaf profile and the cluster
      profile (intra-cluster, centroid diameter).
    - ``b``: the distance between the leaf profile and the sister profile
      (inter-cluster, centroid linkage).

    The silhouette of that pair is ``(b - a) / max(a, b)``, or 0.0 when
    ``a`` equals ``b``.

    Returns ``(silhouette, intracluster_dist, intercluster_dist)``, each
    being the mean returned by ``safe_mean`` over the collected values.
    """
    silhouette = []
    intra_dist = []
    inter_dist = []

    for sister in cluster.get_sisters():
        if sister.profile is None:
            continue
        for leaf in cluster.leaves():
            # Skip nodes without profile
            if leaf._profile is None:
                continue
            # Item intra-cluster distance -> Centroid Diameter
            a = fdist(leaf.profile, cluster.profile) * 2
            # Inter-cluster distance -> Centroid Linkage
            b = fdist(leaf.profile, sister.profile)

            s = 0.0 if (b - a) == 0.0 else (b - a) / max(a, b)

            intra_dist.append(a)
            inter_dist.append(b)
            silhouette.append(s)

    # safe_mean returns (mean, std); only the means are reported.
    silhouette_mean = safe_mean(silhouette)[0]
    intracluster_dist = safe_mean(intra_dist)[0]
    intercluster_dist = safe_mean(inter_dist)[0]
    return silhouette_mean, intracluster_dist, intercluster_dist

64 

def get_avg_profile(node):
    """Update and return the mean profile associated to *node*.

    Internal node: ``node._profile`` and ``node._std_profile`` are set to
    the result of ``safe_mean_vector`` over the profiles of its leaves
    (leaves whose ``_profile`` is None are ignored), or to None when no
    leaf has a profile.

    Leaf: the profile is left untouched and ``node._std_profile`` is set to
    a list of zeros of the same length. A leaf without profile gets a None
    std profile instead of failing on ``len(None)``.

    Returns the ``(profile, std_profile)`` pair.
    """
    if node.is_leaf:
        if node._profile is None:
            node._std_profile = None
            return None, None
        node._std_profile = [0.0] * len(node._profile)
        # A separate list is returned so the caller does not share the
        # object stored on the node.
        return node._profile, [0.0] * len(node._profile)

    leaf_vectors = [n._profile for n in node.leaves()
                    if n._profile is not None]
    if leaf_vectors:
        node._profile, node._std_profile = safe_mean_vector(leaf_vectors)
    else:
        node._profile, node._std_profile = None, None
    return node._profile, node._std_profile

80 

81 

def get_dunn_index(fdist, *clusters):
    """
    Returns the Dunn index for the given selection of nodes.

    The index is the smallest inter-cluster distance (distance between the
    profiles of every pair of clusters) divided by the largest
    intra-cluster distance (twice the distance between a leaf profile and
    the profile of its cluster). Leaves without a profile are skipped.
    When the largest intra-cluster distance is 0.0 the index is 0.0.

    Raises ValueError if fewer than two clusters are given, or if no leaf
    with a profile is found.

    J.C. Dunn. Well separated clusters and optimal fuzzy
    partitions. 1974. J.Cybern. 4. 95-104.

    """
    if len(clusters) < 2:
        raise ValueError("At least 2 clusters are required")

    # Item intra-cluster distance -> Centroid Diameter
    intra_dist = [
        fdist(leaf.profile, c.profile) * 2
        for c in clusters
        for leaf in c.leaves()
        if leaf is not None and leaf.profile is not None
    ]
    if not intra_dist:
        raise ValueError("No leaf with a profile found in the given clusters")
    max_a = numpy.max(intra_dist)

    # Inter-cluster distance -> Centroid Linkage, over every unordered pair
    inter_dist = [
        fdist(ci.profile, cj.profile)
        for i, ci in enumerate(clusters)
        for cj in clusters[i + 1:]
    ]
    min_b = numpy.min(inter_dist)

    if max_a == 0.0:
        return 0.0
    return min_b / max_a

115 

116 

117 

118# #################### 

119# distance functions 

120# #################### 

121 

def pearson_dist(v1, v2):
    """Return ``1 - r`` where ``r`` is the Pearson correlation of *v1* and *v2*.

    Identical inputs short-circuit to 0.0 without calling scipy's
    ``pearsonr``. The identity test uses ``numpy.array_equal`` so that plain
    sequences are accepted as well as numpy arrays.

    Raises RuntimeError if scipy cannot be imported.
    """
    try:
        from scipy.stats import pearsonr
    except ImportError as exc:
        raise RuntimeError("scipy is required to execute this function. Please install it an try again") from exc

    if numpy.array_equal(v1, v2):
        return 0.0
    return 1.0 - pearsonr(list(v1), list(v2))[0]

132 

133 

def spearman_dist(v1, v2):
    """Return ``1 - rho`` where ``rho`` is the Spearman correlation of *v1* and *v2*.

    Identical inputs short-circuit to 0.0 without calling scipy's
    ``spearmanr``. The identity test uses ``numpy.array_equal`` so that
    plain sequences are accepted as well as numpy arrays.

    Raises RuntimeError if scipy cannot be imported.
    """
    try:
        from scipy.stats import spearmanr
    except ImportError as exc:
        raise RuntimeError("scipy is required to execute this function. Please install it an try again") from exc

    if numpy.array_equal(v1, v2):
        return 0.0
    return 1.0 - spearmanr(list(v1), list(v2))[0]

144 

145 

def euclidean_dist(v1, v2):
    """Return the square root of ``square_euclidean_dist(v1, v2)``.

    Because ``square_euclidean_dist`` divides the summed squared differences
    by the number of positions used, the result is the root of the mean
    squared difference rather than of the plain sum.

    Identical inputs short-circuit to 0.0. The identity test uses
    ``numpy.array_equal`` so that plain sequences are accepted as well as
    numpy arrays.
    """
    if numpy.array_equal(v1, v2):
        return 0.0
    return sqrt(square_euclidean_dist(v1, v2))

151 

def square_euclidean_dist(v1, v2):
    """Return the mean squared difference between *v1* and *v2*.

    Only positions where both values are finite (``numpy.isfinite``) are
    used; the sum of their squared differences is divided by the number of
    such positions.

    Identical inputs short-circuit to 0.0. The identity test uses
    ``numpy.array_equal`` so that plain sequences are accepted as well as
    numpy arrays.

    Raises ValueError when no position has two finite values.
    """
    if numpy.array_equal(v1, v2):
        return 0.0

    valids = 0
    distance = 0.0
    for i, a in enumerate(v1):
        # v2 is indexed (not zipped) so a shorter v2 still fails loudly.
        b = v2[i]
        if numpy.isfinite(a) and numpy.isfinite(b):
            valids += 1
            d = a - b
            distance += d * d
    if valids == 0:
        raise ValueError("Cannot calculate values")
    return distance / valids

165 

# Module-level alias that names spearman_dist as the default distance function.
default_dist = spearman_dist