Coverage for src / tracekit / visualization / histogram.py: 86%

78 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-11 23:04 +0000

1"""Histogram utilities with automatic bin optimization. 

2 

3This module provides intelligent histogram bin calculation using 

4established statistical rules. 

5 

6 

7Example: 

8 >>> from tracekit.visualization.histogram import calculate_optimal_bins 

9 >>> bins = calculate_optimal_bins(data, method="freedman-diaconis") 

10 

11References: 

12 Sturges' rule (1926) 

13 Freedman-Diaconis rule (1981) 

14 Scott's rule (1979) 

15""" 

16 

17from __future__ import annotations 

18 

19from typing import TYPE_CHECKING, Literal 

20 

21import numpy as np 

22 

23if TYPE_CHECKING: 

24 from numpy.typing import NDArray 

25 

26 

def calculate_optimal_bins(
    data: NDArray[np.float64],
    *,
    method: Literal["auto", "sturges", "freedman-diaconis", "scott"] = "auto",
    min_bins: int = 5,
    max_bins: int = 200,
) -> int:
    """Calculate optimal histogram bin count using statistical rules.

    Args:
        data: Input data array.
        method: Binning method to use:
            - "auto": Auto-select based on data characteristics
            - "sturges": Sturges' rule (good for normal distributions)
            - "freedman-diaconis": Freedman-Diaconis rule (robust to outliers)
            - "scott": Scott's rule (good for smooth distributions)
        min_bins: Minimum number of bins (default: 5).
        max_bins: Maximum number of bins (default: 200).

    Returns:
        Optimal number of bins (clamped to [min_bins, max_bins]).

    Raises:
        ValueError: If data is empty, ``min_bins < 1``,
            ``max_bins < min_bins``, or ``method`` is unknown.

    Example:
        >>> data = np.random.randn(1000)
        >>> bins = calculate_optimal_bins(data, method="freedman-diaconis")
        >>> hist, edges = np.histogram(data, bins=bins)

        >>> # Auto-select method
        >>> bins = calculate_optimal_bins(data, method="auto")

    References:
        VIS-025: Histogram Bin Optimization
        Sturges (1926): k = ceil(log2(n) + 1)
        Freedman-Diaconis (1981): h = 2 * IQR * n^(-1/3)
        Scott (1979): h = 3.5 * std * n^(-1/3)
    """
    if len(data) == 0:
        raise ValueError("Data array cannot be empty")
    if min_bins < 1:
        raise ValueError("min_bins must be >= 1")
    if max_bins < min_bins:
        raise ValueError("max_bins must be >= min_bins")

    # Drop NaN *and* +/-inf values. Filtering only NaN (as before) let
    # infinite values reach the width-based rules, where np.ptp() becomes
    # inf and int(np.ceil(inf)) raises OverflowError.
    clean_data = data[np.isfinite(data)]

    # Too few usable points to estimate any bin width; return the floor.
    if len(clean_data) < 2:
        return min_bins

    # Auto-select method based on data characteristics
    if method == "auto":
        method = _auto_select_method(clean_data)

    # Calculate bins using selected method
    if method == "sturges":
        bins = _sturges_bins(clean_data)
    elif method == "freedman-diaconis":
        bins = _freedman_diaconis_bins(clean_data)
    elif method == "scott":
        bins = _scott_bins(clean_data)
    else:
        raise ValueError(f"Unknown method: {method}")

    # Clamp to the caller-specified valid range.
    return max(min_bins, min(max_bins, bins))

100 

101 

def calculate_bin_edges(
    data: NDArray[np.float64],
    n_bins: int,
) -> NDArray[np.float64]:
    """Compute evenly spaced histogram bin edges over the data range.

    Args:
        data: Input data array.
        n_bins: Number of bins.

    Returns:
        Array of edges, normally of length ``n_bins + 1``. Degenerate
        inputs (all-NaN or constant data) yield a single unit-wide bin.

    Raises:
        ValueError: If data is empty or n_bins < 1.

    Example:
        >>> data = np.random.randn(1000)
        >>> n_bins = calculate_optimal_bins(data)
        >>> edges = calculate_bin_edges(data, n_bins)
    """
    if len(data) == 0:
        raise ValueError("Data array cannot be empty")
    if n_bins < 1:
        raise ValueError("n_bins must be >= 1")

    # Ignore NaN values when determining the range.
    finite = data[~np.isnan(data)]
    if len(finite) == 0:
        # All-NaN input: fall back to a unit interval.
        return np.array([0.0, 1.0])

    lo = np.min(finite)
    hi = np.max(finite)

    # Constant data: center a single unit-wide bin on the value.
    if lo == hi:
        return np.array([lo - 0.5, hi + 0.5])

    result: NDArray[np.float64] = np.linspace(lo, hi, n_bins + 1)
    return result

144 

145 

146def _sturges_bins(data: NDArray[np.float64]) -> int: 

147 """Calculate bins using Sturges' rule. 

148 

149 Sturges' rule: k = ceil(log2(n) + 1) 

150 

151 Good for: Normal distributions, small to moderate sample sizes 

152 

153 Args: 

154 data: Input data 

155 

156 Returns: 

157 Number of bins 

158 """ 

159 n = len(data) 

160 bins = int(np.ceil(np.log2(n) + 1)) 

161 return bins 

162 

163 

164def _freedman_diaconis_bins(data: NDArray[np.float64]) -> int: 

165 """Calculate bins using Freedman-Diaconis rule. 

166 

167 Freedman-Diaconis rule: h = 2 * IQR(x) / n^(1/3) 

168 where h is bin width and IQR is interquartile range 

169 

170 Good for: Robust estimation, data with outliers 

171 

172 Args: 

173 data: Input data 

174 

175 Returns: 

176 Number of bins 

177 """ 

178 n = len(data) 

179 

180 # Calculate IQR 

181 q75, q25 = np.percentile(data, [75, 25]) 

182 iqr = q75 - q25 

183 

184 if iqr == 0: 

185 # Fall back to Sturges if IQR is zero 

186 return _sturges_bins(data) 

187 

188 # Calculate bin width 

189 bin_width = 2.0 * iqr / (n ** (1.0 / 3.0)) 

190 

191 # Calculate number of bins 

192 data_range = np.ptp(data) # peak-to-peak (max - min) 

193 

194 if bin_width == 0: 194 ↛ 195line 194 didn't jump to line 195 because the condition on line 194 was never true

195 return _sturges_bins(data) 

196 

197 bins = int(np.ceil(data_range / bin_width)) 

198 

199 return max(1, bins) 

200 

201 

202def _scott_bins(data: NDArray[np.float64]) -> int: 

203 """Calculate bins using Scott's rule. 

204 

205 Scott's rule: h = 3.5 * std(x) / n^(1/3) 

206 where h is bin width 

207 

208 Good for: Smooth distributions, normally distributed data 

209 

210 Args: 

211 data: Input data 

212 

213 Returns: 

214 Number of bins 

215 """ 

216 n = len(data) 

217 

218 # Calculate standard deviation 

219 std = np.std(data) 

220 

221 if std == 0: 221 ↛ 223line 221 didn't jump to line 223 because the condition on line 221 was never true

222 # Fall back to Sturges if std is zero 

223 return _sturges_bins(data) 

224 

225 # Calculate bin width 

226 bin_width = 3.5 * std / (n ** (1.0 / 3.0)) 

227 

228 # Calculate number of bins 

229 data_range = np.ptp(data) 

230 

231 if bin_width == 0: 231 ↛ 232line 231 didn't jump to line 232 because the condition on line 231 was never true

232 return _sturges_bins(data) 

233 

234 bins = int(np.ceil(data_range / bin_width)) 

235 

236 return max(1, bins) 

237 

238 

239def _auto_select_method( 

240 data: NDArray[np.float64], 

241) -> Literal["sturges", "freedman-diaconis", "scott"]: 

242 """Auto-select binning method based on data characteristics. 

243 

244 Selection criteria: 

245 - Use Sturges for small samples (n < 100) 

246 - Use Freedman-Diaconis for data with outliers (high skewness) 

247 - Use Scott for smooth, normal-like distributions 

248 

249 Args: 

250 data: Input data 

251 

252 Returns: 

253 Selected method name 

254 """ 

255 n = len(data) 

256 

257 # Small samples: use Sturges 

258 if n < 100: 

259 return "sturges" 

260 

261 # Calculate skewness to detect outliers 

262 mean = np.mean(data) 

263 std = np.std(data) 

264 

265 if std == 0: 265 ↛ 266line 265 didn't jump to line 266 because the condition on line 265 was never true

266 return "sturges" 

267 

268 skewness = np.mean(((data - mean) / std) ** 3) 

269 

270 # High skewness indicates outliers: use Freedman-Diaconis (robust) 

271 if abs(skewness) > 0.5: 

272 return "freedman-diaconis" 

273 

274 # Normal-like distribution: use Scott 

275 return "scott" 

276 

277 

# Public API of this module; the underscore-prefixed rule helpers are
# intentionally excluded.
__all__ = [
    "calculate_bin_edges",
    "calculate_optimal_bins",
]