Coverage for src / tracekit / batch / aggregate.py: 77%
116 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-11 23:04 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-11 23:04 +0000
1"""Result aggregation for batch analysis.
4This module provides statistical aggregation and reporting for batch
5analysis results, including outlier detection and export capabilities.
6"""
8from pathlib import Path
9from typing import Any
11import numpy as np
12import pandas as pd
15def aggregate_results(
16 results: pd.DataFrame,
17 *,
18 metrics: list[str] | None = None,
19 outlier_threshold: float = 3.0,
20 include_plots: bool = False,
21 output_format: str = "dict",
22 output_file: str | Path | None = None,
23) -> dict[str, Any] | pd.DataFrame:
24 """Aggregate results from batch analysis into summary statistics.
26 : Computes comprehensive statistics (mean, std, min, max,
27 outliers) for each metric in the batch results. Supports export to various
28 formats and optional visualization generation.
30 Args:
31 results: DataFrame from batch_analyze() containing analysis results
32 metrics: List of column names to aggregate (default: all numeric columns)
33 outlier_threshold: Z-score threshold for outlier detection (default: 3.0)
34 include_plots: Generate comparison plots across files (default: False)
35 output_format: Output format - 'dict', 'dataframe', 'csv', 'excel', 'html'
36 output_file: Optional output file path for export formats
38 Returns:
39 Dictionary or DataFrame with summary statistics:
40 - count: Number of valid values
41 - mean: Mean value
42 - std: Standard deviation
43 - min: Minimum value
44 - max: Maximum value
45 - median: Median value
46 - q25: 25th percentile
47 - q75: 75th percentile
48 - outliers: List of outlier values
49 - outlier_files: List of files containing outliers
51 Raises:
52 ValueError: If no numeric metrics are found in results.
54 Examples:
55 >>> results = tk.batch_analyze(files, tk.characterize_buffer)
56 >>> summary = tk.aggregate_results(
57 ... results,
58 ... metrics=['rise_time', 'fall_time'],
59 ... outlier_threshold=2.5
60 ... )
61 >>> print(summary['rise_time']['mean'])
62 >>> print(summary['rise_time']['outlier_files'])
64 Notes:
65 - Outliers detected using IQR method: values outside [Q1 - k*IQR, Q3 + k*IQR]
66 where k = (threshold / 3.0) * 1.5 (more robust than z-score for heavy-tailed data)
67 - Non-numeric columns are automatically skipped
68 - Missing values (NaN) are excluded from statistics
69 - CSV/Excel/HTML export requires output_file parameter
71 References:
72 BATCH-002: Result Aggregation
73 """
74 if results.empty:
75 return {} if output_format == "dict" else pd.DataFrame()
77 # Determine metrics to analyze
78 if metrics is None:
79 # Auto-select all numeric columns except 'file' and 'error'
80 metrics = results.select_dtypes(include=[np.number]).columns.tolist()
81 metrics = [m for m in metrics if m not in ["file", "error"]]
83 if not metrics:
84 raise ValueError("No numeric metrics found in results")
86 # Compute aggregated statistics
87 aggregated: dict[str, dict[str, Any]] = {}
89 for metric in metrics:
90 if metric not in results.columns:
91 continue
93 # Extract valid (non-null) values
94 values = results[metric].dropna()
96 if values.empty:
97 aggregated[metric] = {
98 "count": 0,
99 "mean": np.nan,
100 "std": np.nan,
101 "min": np.nan,
102 "max": np.nan,
103 "median": np.nan,
104 "q25": np.nan,
105 "q75": np.nan,
106 "outliers": [],
107 "outlier_files": [],
108 }
109 continue
111 # Basic statistics
112 stats = {
113 "count": len(values),
114 "mean": float(values.mean()),
115 "std": float(values.std()),
116 "min": float(values.min()),
117 "max": float(values.max()),
118 "median": float(values.median()),
119 "q25": float(values.quantile(0.25)),
120 "q75": float(values.quantile(0.75)),
121 }
123 # Outlier detection using IQR method (more robust than z-score)
124 # IQR method: outliers are values outside [Q1 - k*IQR, Q3 + k*IQR]
125 # where k = outlier_threshold * 1.5 (standard is k=1.5, we scale by threshold)
126 if len(values) > 3: # Need at least 4 values for meaningful IQR
127 q1 = stats["q25"]
128 q3 = stats["q75"]
129 iqr = q3 - q1
131 # Scale IQR multiplier by threshold (default 3.0 -> 2.0 * 1.5 = 3.0)
132 k = (outlier_threshold / 3.0) * 1.5
134 lower_bound = q1 - k * iqr
135 upper_bound = q3 + k * iqr
137 outlier_mask = (values < lower_bound) | (values > upper_bound)
138 outlier_indices = values[outlier_mask].index.tolist()
139 stats["outliers"] = values[outlier_mask].tolist()
141 # Get corresponding filenames if available
142 if "file" in results.columns:
143 stats["outlier_files"] = results.loc[outlier_indices, "file"].tolist()
144 else:
145 stats["outlier_files"] = outlier_indices
146 else:
147 stats["outliers"] = [] # type: ignore[assignment]
148 stats["outlier_files"] = [] # type: ignore[assignment]
150 aggregated[metric] = stats
152 # Generate plots if requested
153 if include_plots: 153 ↛ 155line 153 didn't jump to line 155 because the condition on line 153 was never true
154 # Import here to avoid circular dependency
155 try:
156 import matplotlib.pyplot as plt
158 for metric in metrics:
159 if metric not in aggregated:
160 continue
162 _fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
164 # Histogram
165 results[metric].dropna().hist(ax=ax1, bins=30)
166 ax1.axvline(
167 aggregated[metric]["mean"],
168 color="r",
169 linestyle="--",
170 label="Mean",
171 )
172 ax1.axvline(
173 aggregated[metric]["median"],
174 color="g",
175 linestyle="--",
176 label="Median",
177 )
178 ax1.set_xlabel(metric)
179 ax1.set_ylabel("Count")
180 ax1.legend()
181 ax1.set_title(f"{metric} Distribution")
183 # Box plot
184 ax2.boxplot(results[metric].dropna())
185 ax2.set_ylabel(metric)
186 ax2.set_title(f"{metric} Box Plot")
188 plt.tight_layout()
190 # Save or show based on output_file
191 if output_file:
192 plot_file = Path(output_file).with_suffix("") / f"{metric}_plot.png"
193 plot_file.parent.mkdir(parents=True, exist_ok=True)
194 plt.savefig(plot_file)
195 else:
196 plt.show()
198 plt.close()
200 except ImportError:
201 pass # Silently skip plotting if matplotlib not available
203 # Format output
204 if output_format == "dict":
205 return aggregated
207 elif output_format == "dataframe":
208 # Convert to DataFrame with metrics as rows
209 df = pd.DataFrame(aggregated).T
210 # Drop list columns for DataFrame format
211 df = df.drop(columns=["outliers", "outlier_files"], errors="ignore")
212 return df
214 elif output_format in ["csv", "excel", "html"]:
215 if not output_file:
216 raise ValueError(f"{output_format} format requires output_file parameter")
218 df = pd.DataFrame(aggregated).T
219 df = df.drop(columns=["outliers", "outlier_files"], errors="ignore")
221 if output_format == "csv":
222 df.to_csv(output_file)
223 elif output_format == "excel": 223 ↛ 224line 223 didn't jump to line 224 because the condition on line 223 was never true
224 df.to_excel(output_file)
225 elif output_format == "html": 225 ↛ 230line 225 didn't jump to line 230 because the condition on line 225 was always true
226 # Generate HTML report
227 html = _generate_html_report(results, aggregated, metrics)
228 Path(output_file).write_text(html)
230 return df
232 else:
233 raise ValueError(f"Unknown output_format: {output_format}")
236def _generate_html_report(
237 results: pd.DataFrame,
238 aggregated: dict[str, dict[str, Any]],
239 metrics: list[str],
240) -> str:
241 """Generate HTML report for batch analysis results."""
242 html = """
243 <!DOCTYPE html>
244 <html>
245 <head>
246 <title>Batch Analysis Report</title>
247 <style>
248 body { font-family: Arial, sans-serif; margin: 20px; }
249 h1 { color: #333; }
250 h2 { color: #666; margin-top: 30px; }
251 table { border-collapse: collapse; width: 100%; margin: 20px 0; }
252 th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
253 th { background-color: #4CAF50; color: white; }
254 tr:nth-child(even) { background-color: #f2f2f2; }
255 .outlier { background-color: #ffcccc; }
256 </style>
257 </head>
258 <body>
259 <h1>Batch Analysis Report</h1>
260 """
261 # Summary statistics table
262 html += "<h2>Summary Statistics</h2>\n<table>\n"
263 html += "<tr><th>Metric</th><th>Count</th><th>Mean</th><th>Std</th>"
264 html += "<th>Min</th><th>Median</th><th>Max</th><th>Outliers</th></tr>\n"
266 for metric in metrics:
267 if metric not in aggregated: 267 ↛ 268line 267 didn't jump to line 268 because the condition on line 267 was never true
268 continue
269 stats = aggregated[metric]
270 html += "<tr>"
271 html += f"<td>{metric}</td>"
272 html += f"<td>{stats['count']}</td>"
273 html += f"<td>{stats['mean']:.4g}</td>"
274 html += f"<td>{stats['std']:.4g}</td>"
275 html += f"<td>{stats['min']:.4g}</td>"
276 html += f"<td>{stats['median']:.4g}</td>"
277 html += f"<td>{stats['max']:.4g}</td>"
278 html += f"<td>{len(stats['outliers'])}</td>"
279 html += "</tr>\n"
281 html += "</table>\n"
283 # Outlier details
284 has_outliers = any(len(aggregated[m]["outliers"]) > 0 for m in metrics if m in aggregated)
286 if has_outliers:
287 html += "<h2>Outliers Detected</h2>\n"
288 for metric in metrics:
289 if metric not in aggregated: 289 ↛ 290line 289 didn't jump to line 290 because the condition on line 289 was never true
290 continue
291 stats = aggregated[metric]
292 if stats["outliers"]: 292 ↛ 288line 292 didn't jump to line 288 because the condition on line 292 was always true
293 html += f"<h3>{metric}</h3>\n<table>\n"
294 html += "<tr><th>File</th><th>Value</th></tr>\n"
295 for file, value in zip(stats["outlier_files"], stats["outliers"], strict=False):
296 html += f"<tr class='outlier'><td>{file}</td><td>{value:.4g}</td></tr>\n"
297 html += "</table>\n"
299 html += "</body>\n</html>"
300 return html