Coverage for src / tracekit / batch / aggregate.py: 77%

116 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-11 23:04 +0000

1"""Result aggregation for batch analysis. 

2 

3 

4This module provides statistical aggregation and reporting for batch 

5analysis results, including outlier detection and export capabilities. 

6""" 

7 

8from pathlib import Path 

9from typing import Any 

10 

11import numpy as np 

12import pandas as pd 

13 

14 

15def aggregate_results( 

16 results: pd.DataFrame, 

17 *, 

18 metrics: list[str] | None = None, 

19 outlier_threshold: float = 3.0, 

20 include_plots: bool = False, 

21 output_format: str = "dict", 

22 output_file: str | Path | None = None, 

23) -> dict[str, Any] | pd.DataFrame: 

24 """Aggregate results from batch analysis into summary statistics. 

25 

26 : Computes comprehensive statistics (mean, std, min, max, 

27 outliers) for each metric in the batch results. Supports export to various 

28 formats and optional visualization generation. 

29 

30 Args: 

31 results: DataFrame from batch_analyze() containing analysis results 

32 metrics: List of column names to aggregate (default: all numeric columns) 

33 outlier_threshold: Z-score threshold for outlier detection (default: 3.0) 

34 include_plots: Generate comparison plots across files (default: False) 

35 output_format: Output format - 'dict', 'dataframe', 'csv', 'excel', 'html' 

36 output_file: Optional output file path for export formats 

37 

38 Returns: 

39 Dictionary or DataFrame with summary statistics: 

40 - count: Number of valid values 

41 - mean: Mean value 

42 - std: Standard deviation 

43 - min: Minimum value 

44 - max: Maximum value 

45 - median: Median value 

46 - q25: 25th percentile 

47 - q75: 75th percentile 

48 - outliers: List of outlier values 

49 - outlier_files: List of files containing outliers 

50 

51 Raises: 

52 ValueError: If no numeric metrics are found in results. 

53 

54 Examples: 

55 >>> results = tk.batch_analyze(files, tk.characterize_buffer) 

56 >>> summary = tk.aggregate_results( 

57 ... results, 

58 ... metrics=['rise_time', 'fall_time'], 

59 ... outlier_threshold=2.5 

60 ... ) 

61 >>> print(summary['rise_time']['mean']) 

62 >>> print(summary['rise_time']['outlier_files']) 

63 

64 Notes: 

65 - Outliers detected using IQR method: values outside [Q1 - k*IQR, Q3 + k*IQR] 

66 where k = (threshold / 3.0) * 1.5 (more robust than z-score for heavy-tailed data) 

67 - Non-numeric columns are automatically skipped 

68 - Missing values (NaN) are excluded from statistics 

69 - CSV/Excel/HTML export requires output_file parameter 

70 

71 References: 

72 BATCH-002: Result Aggregation 

73 """ 

74 if results.empty: 

75 return {} if output_format == "dict" else pd.DataFrame() 

76 

77 # Determine metrics to analyze 

78 if metrics is None: 

79 # Auto-select all numeric columns except 'file' and 'error' 

80 metrics = results.select_dtypes(include=[np.number]).columns.tolist() 

81 metrics = [m for m in metrics if m not in ["file", "error"]] 

82 

83 if not metrics: 

84 raise ValueError("No numeric metrics found in results") 

85 

86 # Compute aggregated statistics 

87 aggregated: dict[str, dict[str, Any]] = {} 

88 

89 for metric in metrics: 

90 if metric not in results.columns: 

91 continue 

92 

93 # Extract valid (non-null) values 

94 values = results[metric].dropna() 

95 

96 if values.empty: 

97 aggregated[metric] = { 

98 "count": 0, 

99 "mean": np.nan, 

100 "std": np.nan, 

101 "min": np.nan, 

102 "max": np.nan, 

103 "median": np.nan, 

104 "q25": np.nan, 

105 "q75": np.nan, 

106 "outliers": [], 

107 "outlier_files": [], 

108 } 

109 continue 

110 

111 # Basic statistics 

112 stats = { 

113 "count": len(values), 

114 "mean": float(values.mean()), 

115 "std": float(values.std()), 

116 "min": float(values.min()), 

117 "max": float(values.max()), 

118 "median": float(values.median()), 

119 "q25": float(values.quantile(0.25)), 

120 "q75": float(values.quantile(0.75)), 

121 } 

122 

123 # Outlier detection using IQR method (more robust than z-score) 

124 # IQR method: outliers are values outside [Q1 - k*IQR, Q3 + k*IQR] 

125 # where k = outlier_threshold * 1.5 (standard is k=1.5, we scale by threshold) 

126 if len(values) > 3: # Need at least 4 values for meaningful IQR 

127 q1 = stats["q25"] 

128 q3 = stats["q75"] 

129 iqr = q3 - q1 

130 

131 # Scale IQR multiplier by threshold (default 3.0 -> 2.0 * 1.5 = 3.0) 

132 k = (outlier_threshold / 3.0) * 1.5 

133 

134 lower_bound = q1 - k * iqr 

135 upper_bound = q3 + k * iqr 

136 

137 outlier_mask = (values < lower_bound) | (values > upper_bound) 

138 outlier_indices = values[outlier_mask].index.tolist() 

139 stats["outliers"] = values[outlier_mask].tolist() 

140 

141 # Get corresponding filenames if available 

142 if "file" in results.columns: 

143 stats["outlier_files"] = results.loc[outlier_indices, "file"].tolist() 

144 else: 

145 stats["outlier_files"] = outlier_indices 

146 else: 

147 stats["outliers"] = [] # type: ignore[assignment] 

148 stats["outlier_files"] = [] # type: ignore[assignment] 

149 

150 aggregated[metric] = stats 

151 

152 # Generate plots if requested 

153 if include_plots: 153 ↛ 155line 153 didn't jump to line 155 because the condition on line 153 was never true

154 # Import here to avoid circular dependency 

155 try: 

156 import matplotlib.pyplot as plt 

157 

158 for metric in metrics: 

159 if metric not in aggregated: 

160 continue 

161 

162 _fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) 

163 

164 # Histogram 

165 results[metric].dropna().hist(ax=ax1, bins=30) 

166 ax1.axvline( 

167 aggregated[metric]["mean"], 

168 color="r", 

169 linestyle="--", 

170 label="Mean", 

171 ) 

172 ax1.axvline( 

173 aggregated[metric]["median"], 

174 color="g", 

175 linestyle="--", 

176 label="Median", 

177 ) 

178 ax1.set_xlabel(metric) 

179 ax1.set_ylabel("Count") 

180 ax1.legend() 

181 ax1.set_title(f"{metric} Distribution") 

182 

183 # Box plot 

184 ax2.boxplot(results[metric].dropna()) 

185 ax2.set_ylabel(metric) 

186 ax2.set_title(f"{metric} Box Plot") 

187 

188 plt.tight_layout() 

189 

190 # Save or show based on output_file 

191 if output_file: 

192 plot_file = Path(output_file).with_suffix("") / f"{metric}_plot.png" 

193 plot_file.parent.mkdir(parents=True, exist_ok=True) 

194 plt.savefig(plot_file) 

195 else: 

196 plt.show() 

197 

198 plt.close() 

199 

200 except ImportError: 

201 pass # Silently skip plotting if matplotlib not available 

202 

203 # Format output 

204 if output_format == "dict": 

205 return aggregated 

206 

207 elif output_format == "dataframe": 

208 # Convert to DataFrame with metrics as rows 

209 df = pd.DataFrame(aggregated).T 

210 # Drop list columns for DataFrame format 

211 df = df.drop(columns=["outliers", "outlier_files"], errors="ignore") 

212 return df 

213 

214 elif output_format in ["csv", "excel", "html"]: 

215 if not output_file: 

216 raise ValueError(f"{output_format} format requires output_file parameter") 

217 

218 df = pd.DataFrame(aggregated).T 

219 df = df.drop(columns=["outliers", "outlier_files"], errors="ignore") 

220 

221 if output_format == "csv": 

222 df.to_csv(output_file) 

223 elif output_format == "excel": 223 ↛ 224line 223 didn't jump to line 224 because the condition on line 223 was never true

224 df.to_excel(output_file) 

225 elif output_format == "html": 225 ↛ 230line 225 didn't jump to line 230 because the condition on line 225 was always true

226 # Generate HTML report 

227 html = _generate_html_report(results, aggregated, metrics) 

228 Path(output_file).write_text(html) 

229 

230 return df 

231 

232 else: 

233 raise ValueError(f"Unknown output_format: {output_format}") 

234 

235 

236def _generate_html_report( 

237 results: pd.DataFrame, 

238 aggregated: dict[str, dict[str, Any]], 

239 metrics: list[str], 

240) -> str: 

241 """Generate HTML report for batch analysis results.""" 

242 html = """ 

243 <!DOCTYPE html> 

244 <html> 

245 <head> 

246 <title>Batch Analysis Report</title> 

247 <style> 

248 body { font-family: Arial, sans-serif; margin: 20px; } 

249 h1 { color: #333; } 

250 h2 { color: #666; margin-top: 30px; } 

251 table { border-collapse: collapse; width: 100%; margin: 20px 0; } 

252 th, td { border: 1px solid #ddd; padding: 8px; text-align: left; } 

253 th { background-color: #4CAF50; color: white; } 

254 tr:nth-child(even) { background-color: #f2f2f2; } 

255 .outlier { background-color: #ffcccc; } 

256 </style> 

257 </head> 

258 <body> 

259 <h1>Batch Analysis Report</h1> 

260 """ 

261 # Summary statistics table 

262 html += "<h2>Summary Statistics</h2>\n<table>\n" 

263 html += "<tr><th>Metric</th><th>Count</th><th>Mean</th><th>Std</th>" 

264 html += "<th>Min</th><th>Median</th><th>Max</th><th>Outliers</th></tr>\n" 

265 

266 for metric in metrics: 

267 if metric not in aggregated: 267 ↛ 268line 267 didn't jump to line 268 because the condition on line 267 was never true

268 continue 

269 stats = aggregated[metric] 

270 html += "<tr>" 

271 html += f"<td>{metric}</td>" 

272 html += f"<td>{stats['count']}</td>" 

273 html += f"<td>{stats['mean']:.4g}</td>" 

274 html += f"<td>{stats['std']:.4g}</td>" 

275 html += f"<td>{stats['min']:.4g}</td>" 

276 html += f"<td>{stats['median']:.4g}</td>" 

277 html += f"<td>{stats['max']:.4g}</td>" 

278 html += f"<td>{len(stats['outliers'])}</td>" 

279 html += "</tr>\n" 

280 

281 html += "</table>\n" 

282 

283 # Outlier details 

284 has_outliers = any(len(aggregated[m]["outliers"]) > 0 for m in metrics if m in aggregated) 

285 

286 if has_outliers: 

287 html += "<h2>Outliers Detected</h2>\n" 

288 for metric in metrics: 

289 if metric not in aggregated: 289 ↛ 290line 289 didn't jump to line 290 because the condition on line 289 was never true

290 continue 

291 stats = aggregated[metric] 

292 if stats["outliers"]: 292 ↛ 288line 292 didn't jump to line 288 because the condition on line 292 was always true

293 html += f"<h3>{metric}</h3>\n<table>\n" 

294 html += "<tr><th>File</th><th>Value</th></tr>\n" 

295 for file, value in zip(stats["outlier_files"], stats["outliers"], strict=False): 

296 html += f"<tr class='outlier'><td>{file}</td><td>{value:.4g}</td></tr>\n" 

297 html += "</table>\n" 

298 

299 html += "</body>\n</html>" 

300 return html