Coverage for src / tracekit / visualization / histogram.py: 86%
78 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-11 23:04 +0000
1"""Histogram utilities with automatic bin optimization.
3This module provides intelligent histogram bin calculation using
4established statistical rules.
7Example:
8 >>> from tracekit.visualization.histogram import calculate_optimal_bins
9 >>> bins = calculate_optimal_bins(data, method="freedman-diaconis")
11References:
12 Sturges' rule (1926)
13 Freedman-Diaconis rule (1981)
14 Scott's rule (1979)
15"""
17from __future__ import annotations
19from typing import TYPE_CHECKING, Literal
21import numpy as np
23if TYPE_CHECKING:
24 from numpy.typing import NDArray
def calculate_optimal_bins(
    data: NDArray[np.float64],
    *,
    method: Literal["auto", "sturges", "freedman-diaconis", "scott"] = "auto",
    min_bins: int = 5,
    max_bins: int = 200,
) -> int:
    """Calculate optimal histogram bin count using statistical rules.

    Automatically calculate the optimal histogram bin count using
    Sturges', Freedman-Diaconis, or Scott's rule.

    Args:
        data: Input data array (any array-like convertible to float64).
            Non-finite entries (NaN, +inf, -inf) are ignored.
        method: Binning method to use
            - "auto": Auto-select based on data characteristics
            - "sturges": Sturges' rule (good for normal distributions)
            - "freedman-diaconis": Freedman-Diaconis rule (robust to outliers)
            - "scott": Scott's rule (good for smooth distributions)
        min_bins: Minimum number of bins (default: 5)
        max_bins: Maximum number of bins (default: 200)

    Returns:
        Optimal number of bins (clamped to [min_bins, max_bins]).  When
        fewer than two finite samples remain, ``min_bins`` is returned.

    Raises:
        ValueError: If data is empty, ``min_bins < 1``,
            ``max_bins < min_bins``, or ``method`` is not recognized.

    Example:
        >>> data = np.random.randn(1000)
        >>> bins = calculate_optimal_bins(data, method="freedman-diaconis")
        >>> hist, edges = np.histogram(data, bins=bins)

        >>> # Auto-select method
        >>> bins = calculate_optimal_bins(data, method="auto")

    References:
        VIS-025: Histogram Bin Optimization
        Sturges (1926): k = ceil(log2(n) + 1)
        Freedman-Diaconis (1981): h = 2 * IQR * n^(-1/3)
        Scott (1979): h = 3.5 * std * n^(-1/3)
    """
    # Accept any array-like and measure emptiness by element count so that
    # lists and N-D inputs behave the same as 1-D arrays.
    values = np.asarray(data, dtype=np.float64)

    if values.size == 0:
        raise ValueError("Data array cannot be empty")
    if min_bins < 1:
        raise ValueError("min_bins must be >= 1")
    if max_bins < min_bins:
        raise ValueError("max_bins must be >= min_bins")

    # Validate the method before any early return so a misspelled name is
    # reported regardless of how much usable data there is.
    valid_methods = ("auto", "sturges", "freedman-diaconis", "scott")
    if method not in valid_methods:
        raise ValueError(f"Unknown method: {method}")

    # Drop NaN *and* infinities: an infinite sample makes the range, IQR or
    # standard deviation non-finite, which would otherwise surface later as
    # an opaque "cannot convert float NaN to integer" error.
    clean_data = values[np.isfinite(values)]

    # The width-based rules need at least two samples to be meaningful.
    if clean_data.size < 2:
        return min_bins

    # Auto-select method based on data characteristics
    if method == "auto":
        method = _auto_select_method(clean_data)

    # Calculate bins using selected method
    if method == "sturges":
        bins = _sturges_bins(clean_data)
    elif method == "freedman-diaconis":
        bins = _freedman_diaconis_bins(clean_data)
    else:
        # "scott" is the only remaining validated option.
        bins = _scott_bins(clean_data)

    # Clamp to valid range
    return max(min_bins, min(max_bins, bins))
def calculate_bin_edges(
    data: NDArray[np.float64],
    n_bins: int,
) -> NDArray[np.float64]:
    """Calculate evenly spaced histogram bin edges for a given bin count.

    Non-finite samples (NaN, +inf, -inf) are ignored when determining the
    range.  Degenerate inputs still yield ``n_bins`` bins:

    - no finite samples: edges span ``[0.0, 1.0]``
    - all finite samples equal to ``v``: edges span ``[v - 0.5, v + 0.5]``

    Args:
        data: Input data array (any array-like convertible to float64)
        n_bins: Number of bins

    Returns:
        Array of bin edges (length n_bins + 1)

    Raises:
        ValueError: If data is empty or n_bins < 1.

    Example:
        >>> data = np.random.randn(1000)
        >>> n_bins = calculate_optimal_bins(data)
        >>> edges = calculate_bin_edges(data, n_bins)
    """
    values = np.asarray(data, dtype=np.float64)

    if values.size == 0:
        raise ValueError("Data array cannot be empty")
    if n_bins < 1:
        raise ValueError("n_bins must be >= 1")

    # Infinite samples would make np.linspace return NaN edges, so they are
    # removed together with NaN.
    finite_values = values[np.isfinite(values)]

    if finite_values.size == 0:
        # Nothing usable: fall back to the unit interval.
        lower, upper = 0.0, 1.0
    else:
        lower = float(np.min(finite_values))
        upper = float(np.max(finite_values))
        if lower == upper:
            # Single-value data: widen to a unit-width interval centred on
            # the value so the bins have non-zero width.
            lower -= 0.5
            upper += 0.5

    edges: NDArray[np.float64] = np.linspace(lower, upper, n_bins + 1)
    return edges
def _sturges_bins(data: NDArray[np.float64]) -> int:
    """Return the bin count given by Sturges' rule.

    Sturges' rule uses only the sample size n: k = ceil(log2(n) + 1).

    Good for: Normal distributions, small to moderate sample sizes

    Args:
        data: Input data; only its length is used

    Returns:
        Number of bins
    """
    sample_count = len(data)
    unrounded = np.log2(sample_count) + 1
    return int(np.ceil(unrounded))
def _freedman_diaconis_bins(data: NDArray[np.float64]) -> int:
    """Return the bin count given by the Freedman-Diaconis rule.

    The rule sets the bin width to h = 2 * IQR(x) / n^(1/3), where IQR is
    the interquartile range; the bin count is the data range divided by h,
    rounded up.

    Good for: Robust estimation, data with outliers

    Args:
        data: Input data

    Returns:
        Number of bins (at least 1).  Falls back to Sturges' rule when the
        IQR or the resulting bin width is zero.
    """
    sample_count = len(data)

    # Interquartile range from the 25th and 75th percentiles.
    lower_quartile, upper_quartile = np.percentile(data, [25, 75])
    spread = upper_quartile - lower_quartile
    if spread == 0:
        # A zero IQR would give a zero bin width; use Sturges instead.
        return _sturges_bins(data)

    width = 2.0 * spread / (sample_count ** (1.0 / 3.0))
    span = np.ptp(data)  # max - min

    if width == 0:
        return _sturges_bins(data)

    bin_count = int(np.ceil(span / width))
    return max(1, bin_count)
def _scott_bins(data: NDArray[np.float64]) -> int:
    """Return the bin count given by Scott's rule.

    The rule sets the bin width to h = 3.5 * std(x) / n^(1/3); the bin
    count is the data range divided by h, rounded up.

    Good for: Smooth distributions, normally distributed data

    Args:
        data: Input data

    Returns:
        Number of bins (at least 1).  Falls back to Sturges' rule when the
        standard deviation or the resulting bin width is zero.
    """
    sample_count = len(data)

    sigma = np.std(data)
    if sigma == 0:
        # Constant data would give a zero bin width; use Sturges instead.
        return _sturges_bins(data)

    width = 3.5 * sigma / (sample_count ** (1.0 / 3.0))
    span = np.ptp(data)  # max - min

    if width == 0:
        return _sturges_bins(data)

    bin_count = int(np.ceil(span / width))
    return max(1, bin_count)
def _auto_select_method(
    data: NDArray[np.float64],
) -> Literal["sturges", "freedman-diaconis", "scott"]:
    """Pick a binning rule from the sample size and skewness of *data*.

    Selection criteria:
        - Sturges for small samples (n < 100) or zero-variance data
        - Freedman-Diaconis when the absolute sample skewness exceeds 0.5
        - Scott otherwise

    Args:
        data: Input data

    Returns:
        Selected method name
    """
    if len(data) < 100:
        return "sturges"

    center = np.mean(data)
    sigma = np.std(data)
    if sigma == 0:
        # Skewness is undefined for constant data.
        return "sturges"

    # Sample skewness: mean of the cubed standardized deviations.
    standardized = (data - center) / sigma
    skew = np.mean(standardized ** 3)

    # Strong asymmetry is taken as a sign of outliers, for which the
    # IQR-based Freedman-Diaconis rule is the robust choice.
    return "freedman-diaconis" if abs(skew) > 0.5 else "scott"
278__all__ = [
279 "calculate_bin_edges",
280 "calculate_optimal_bins",
281]