Coverage for agentos/evaluation/regression.py: 36%

152 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-07-02 09:59 +0800

1"""Evaluation regression testing for AgentOS. 

2 

3Compare evaluation runs, detect regressions, generate CI artifacts. 

4Builds on top of agentos.evaluation core (GoldenDataset, Evaluator, EvalReport). 

5""" 

6 

7from __future__ import annotations 

8 

9from dataclasses import dataclass, field 

10from typing import Any, Callable, Dict, List, Optional, Tuple 

11import json 

12import math 

13import time 

14import xml.etree.ElementTree as ET 

15 

16from agentos.evaluation import GoldenDataset, Evaluator, EvalConfig, EvalReport, ScoreDetail 

17 

18 

@dataclass
class RegressionCheck:
    """Outcome of comparing a single evaluation case across two runs."""

    # Identifier of the evaluated case.
    case_id: str
    # Score recorded for the case in the baseline run.
    baseline_score: float
    # Score recorded for the case in the current run.
    current_score: float
    # Score difference stored by whoever builds the check.
    delta: float
    # Whether the case is flagged as a regression.
    regression: bool = False
    # One of: none | minor | moderate | severe
    severity: str = "none"
    # Free-form note attached to the check.
    details: str = ""

29 

30 

@dataclass
class RegressionReport:
    """Comparison report between a baseline and a current evaluation run.

    ``baseline`` and ``current`` default to ``None``; ``to_markdown`` renders
    ``n/a`` for whichever side is missing instead of raising.
    """

    baseline_label: str = "baseline"
    current_label: str = "current"
    baseline: Optional[EvalReport] = None
    current: Optional[EvalReport] = None
    checks: List[RegressionCheck] = field(default_factory=list)
    total_regressions: int = 0
    total_improvements: int = 0
    pass_delta: float = 0.0
    score_delta: float = 0.0
    verdict: str = "OK"  # OK | WARN | FAIL

    @staticmethod
    def _metric(report: Optional[EvalReport], attr: str, spec: str) -> str:
        """Format ``report.<attr>`` with *spec*, or ``n/a`` when *report* is None."""
        if report is None:
            return "n/a"
        return format(getattr(report, attr), spec)

    def to_markdown(self) -> str:
        """Render the report as Markdown.

        Returns:
            A summary header (verdict, pass rate, average score, counters)
            followed by a per-case table when ``checks`` is non-empty.
        """
        base_pass = self._metric(self.baseline, "pass_rate", ".1%")
        cur_pass = self._metric(self.current, "pass_rate", ".1%")
        base_score = self._metric(self.baseline, "avg_score", ".1f")
        cur_score = self._metric(self.current, "avg_score", ".1f")

        lines = [
            f"# Regression Report: {self.baseline_label} → {self.current_label}",
            "",
            f"**Verdict**: `{self.verdict}`",
            f"**Pass Rate**: {base_pass} → {cur_pass} (Δ={self.pass_delta:+.1%})",
            f"**Avg Score**: {base_score} → {cur_score} (Δ={self.score_delta:+.1f})",
            f"**Regressions**: {self.total_regressions} | **Improvements**: {self.total_improvements}",
            "",
        ]

        if self.checks:
            lines.append("## Detail")
            lines.append("| Case ID | Baseline | Current | Δ | Verdict |")
            lines.append("|---------|----------|---------|---|---------|")
            for check in self.checks:
                # The regression flag wins over the sign of the delta.
                if check.regression:
                    label = "REGRESSION"
                elif check.delta > 0:
                    label = "IMPROVED"
                else:
                    label = "SAME"
                lines.append(
                    f"| {check.case_id} | {check.baseline_score:.1f} | "
                    f"{check.current_score:.1f} | {check.delta:+.1f} | {label} |"
                )

        return "\n".join(lines)

68 

69 

class RegressionRunner:
    """Detect regressions by comparing baseline and current evaluation runs.

    A case regresses when its score drops by more than ``threshold`` points
    and improves when it rises by more than ``threshold`` points. Only cases
    present in the current run are checked; baseline cases that are absent
    from the current run are not reported.

    Usage:
        runner = RegressionRunner(evaluator, baseline=report)
        report = await runner.check(current_report)
        # or sync:
        report = runner.check_sync(current_report)
    """

    def __init__(
        self,
        evaluator: Evaluator,
        baseline: Optional[EvalReport] = None,
        threshold: float = 5.0,
        severe_threshold: float = 20.0,
    ):
        self.evaluator = evaluator
        self.baseline = baseline
        self.threshold = threshold
        self.severe_threshold = severe_threshold

    async def run_baseline(self) -> EvalReport:
        """Run the evaluator once and store the result as the baseline."""
        self.baseline = await self.evaluator.run()
        return self.baseline

    async def check(self, current: Optional[EvalReport] = None) -> RegressionReport:
        """Compare *current* against the baseline, running it when omitted.

        Raises:
            ValueError: if no baseline is set. This is checked before the
                evaluator runs, so a missing baseline does not cost a run.
        """
        self._require_baseline()
        if current is None:
            current = await self.evaluator.run()
        return self._compare(current)

    def check_sync(self, current: EvalReport) -> RegressionReport:
        """Synchronous version for testing."""
        return self._compare(current)

    def _require_baseline(self) -> EvalReport:
        """Return the stored baseline or raise ``ValueError`` when unset."""
        if self.baseline is None:
            raise ValueError("No baseline set. Call run_baseline() first.")
        return self.baseline

    def _severity(self, delta: float) -> str:
        """Classify a score delta as none | minor | moderate | severe."""
        if delta < -self.severe_threshold:
            return "severe"
        if delta < -self.threshold:
            return "moderate"
        if delta < 0:
            return "minor"
        return "none"

    def _compare(self, current: EvalReport) -> RegressionReport:
        """Build a ``RegressionReport`` for *current* versus the baseline.

        Raises:
            ValueError: if no baseline is set.
        """
        baseline = self._require_baseline()

        report = RegressionReport(
            baseline=baseline,
            current=current,
            pass_delta=current.pass_rate - baseline.pass_rate,
            score_delta=current.avg_score - baseline.avg_score,
        )

        # Lookup of baseline results by case id (a later duplicate id wins).
        baseline_map: Dict[str, ScoreDetail] = {
            r.case_id: r for r in baseline.results
        }

        for current_result in current.results:
            cid = current_result.case_id
            baseline_result = baseline_map.get(cid)

            if baseline_result is None:
                # New case: nothing to compare against, recorded as neutral.
                report.checks.append(RegressionCheck(
                    case_id=cid,
                    baseline_score=0,
                    current_score=current_result.total_score,
                    delta=0,
                    details="new case",
                ))
                continue

            delta = current_result.total_score - baseline_result.total_score
            regression = delta < -self.threshold

            if regression:
                report.total_regressions += 1
            elif delta > self.threshold:
                report.total_improvements += 1

            report.checks.append(RegressionCheck(
                case_id=cid,
                baseline_score=baseline_result.total_score,
                current_score=current_result.total_score,
                # Decisions above use the exact delta; only the stored
                # value is rounded.
                delta=round(delta, 1),
                regression=regression,
                severity=self._severity(delta),
            ))

        # Verdict: any regression warns; a severe check escalates to FAIL.
        if report.total_regressions > 0:
            has_severe = any(c.severity == "severe" for c in report.checks)
            report.verdict = "FAIL" if has_severe else "WARN"

        return report

170 

171 

172# --- Statistical Runner --- 

173 

174 

@dataclass
class StatResult:
    """Aggregate statistics collected over repeated evaluation runs."""

    trials: int = 0
    # Raw per-run samples.
    pass_rates: List[float] = field(default_factory=list)
    avg_scores: List[float] = field(default_factory=list)
    # Summary of the pass-rate samples.
    mean_pass_rate: float = 0.0
    std_pass_rate: float = 0.0
    # Summary of the average-score samples.
    mean_score: float = 0.0
    std_score: float = 0.0
    # (lower, upper) bounds of the 95% confidence intervals.
    ci95_pass_rate: Tuple[float, float] = (0.0, 0.0)
    ci95_score: Tuple[float, float] = (0.0, 0.0)

    def to_dict(self) -> dict:
        """Return the summary statistics as a plain dict.

        The raw sample lists are left out. Pass-rate figures are rounded to
        four decimals and score figures to two; intervals become lists.
        """
        summary: dict = {"trials": self.trials}
        for metric, digits in (("pass_rate", 4), ("score", 2)):
            summary[f"mean_{metric}"] = round(getattr(self, f"mean_{metric}"), digits)
            summary[f"std_{metric}"] = round(getattr(self, f"std_{metric}"), digits)
            bounds = getattr(self, f"ci95_{metric}")
            summary[f"ci95_{metric}"] = [round(bound, digits) for bound in bounds]
        return summary

198 

199 

class StatisticalRunner:
    """Run evaluation N times and compute statistics.

    Usage:
        srunner = StatisticalRunner(evaluator, trials=10)
        stats = await srunner.run()
    """

    def __init__(self, evaluator: Evaluator, trials: int = 5):
        self.evaluator = evaluator
        # A sample standard deviation needs at least two runs.
        self.trials = max(trials, 2)

    async def run(self) -> StatResult:
        """Run the evaluator ``self.trials`` times and summarise the results."""
        pass_rates: List[float] = []
        avg_scores: List[float] = []

        for _ in range(self.trials):
            report = await self.evaluator.run()
            pass_rates.append(report.pass_rate)
            avg_scores.append(report.avg_score)

        return self._compute(pass_rates, avg_scores)

    @staticmethod
    def _mean_std(values: List[float]) -> Tuple[float, float]:
        """Return (mean, sample standard deviation) of a non-empty list.

        A single sample has no spread estimate, so its deviation is 0.0.
        """
        n = len(values)
        mean = sum(values) / n
        if n < 2:
            return mean, 0.0
        variance = sum((x - mean) ** 2 for x in values) / (n - 1)
        return mean, (math.sqrt(variance) if variance > 0 else 0.0)

    def _compute(self, pass_rates: List[float], avg_scores: List[float]) -> StatResult:
        """Summarise per-run pass rates and average scores.

        Args:
            pass_rates: One pass rate per run.
            avg_scores: One average score per run, same length as *pass_rates*.

        Returns:
            A ``StatResult``; an all-default one (``trials == 0``) when no
            samples were given.

        Raises:
            ValueError: if the two lists differ in length.
        """
        if len(pass_rates) != len(avg_scores):
            raise ValueError(
                "pass_rates and avg_scores must have the same length "
                f"(got {len(pass_rates)} and {len(avg_scores)})"
            )

        n = len(pass_rates)
        if n == 0:
            return StatResult()

        mean_pr, std_pr = self._mean_std(pass_rates)
        mean_s, std_s = self._mean_std(avg_scores)

        # 95% CI using the normal approximation (z = 1.96).
        # NOTE(review): for small trial counts a Student-t critical value
        # would give wider intervals; kept as z to preserve existing results.
        z = 1.96
        half_pr = z * std_pr / math.sqrt(n)
        half_s = z * std_s / math.sqrt(n)

        # The pass-rate interval is clamped to [0, 1]; scores are not clamped.
        ci_pr = (max(0.0, mean_pr - half_pr), min(1.0, mean_pr + half_pr))
        ci_s = (mean_s - half_s, mean_s + half_s)

        return StatResult(
            trials=n,
            pass_rates=pass_rates,
            avg_scores=avg_scores,
            mean_pass_rate=round(mean_pr, 4),
            std_pass_rate=round(std_pr, 4),
            mean_score=round(mean_s, 2),
            std_score=round(std_s, 2),
            ci95_pass_rate=(round(ci_pr[0], 4), round(ci_pr[1], 4)),
            ci95_score=(round(ci_s[0], 2), round(ci_s[1], 2)),
        )

255 

256 

257# --- CI Exports --- 

258 

259 

# Code points that XML 1.0 does not allow in a document: C0 controls other
# than tab/LF/CR, UTF-16 surrogates, and the two non-characters U+FFFE/U+FFFF.
_XML_ILLEGAL_CHARS = dict.fromkeys([
    *range(0x00, 0x09), 0x0B, 0x0C, *range(0x0E, 0x20),
    *range(0xD800, 0xE000), 0xFFFE, 0xFFFF,
])


def _xml_safe(value: Any) -> str:
    """Return ``str(value)`` with characters that are illegal in XML removed."""
    return str(value).translate(_XML_ILLEGAL_CHARS)


def to_junit_xml(report: EvalReport, suite_name: str = "AgentOS Eval") -> str:
    """Convert evaluation report to JUnit XML for CI integration.

    Each result becomes a ``<testcase>``; results with ``passed`` falsy also
    get a ``<failure>`` child. Score and metrics are attached as per-testcase
    ``<properties>``.

    ElementTree escapes markup characters but writes control characters
    unchanged, which yields XML that parsers reject. All free-form text is
    therefore stripped of XML-illegal characters first.

    Args:
        report: Evaluation report to export.
        suite_name: Value of the ``name`` attribute on ``<testsuite>``.

    Returns:
        The serialized ``<testsuite>`` element as a string (no XML declaration).
    """
    results = report.results
    failed_count = sum(1 for r in results if not r.passed)
    total_time = sum(r.duration_ms for r in results) / 1000

    suite = ET.Element("testsuite", {
        "name": _xml_safe(suite_name),
        "tests": str(len(results)),
        "failures": str(failed_count),
        "errors": "0",
        "skipped": "0",
        "time": f"{total_time:.3f}",
        "timestamp": _xml_safe(report.timestamp or ""),
    })

    classname = _xml_safe(f"AgentOS.{report.dataset_name}")

    for result in results:
        testcase = ET.SubElement(suite, "testcase", {
            "classname": classname,
            "name": _xml_safe(result.case_id),
            "time": f"{result.duration_ms / 1000:.3f}",
        })

        if not result.passed:
            failure = ET.SubElement(testcase, "failure", {
                "type": "ScoreBelowThreshold",
                "message": _xml_safe(
                    f"Score {result.total_score:.1f}: {result.details}"
                ),
            })
            # A failed case may have produced no output at all; treat a
            # missing output as empty instead of failing on the slice.
            actual = str(result.actual_output or "")
            failure.text = _xml_safe(
                f"Actual: {actual[:500]}\nErrors: {result.errors}"
            )

        # Add score and metrics as properties.
        props = ET.SubElement(testcase, "properties")
        ET.SubElement(props, "property", {
            "name": "score", "value": f"{result.total_score:.1f}"
        })
        ET.SubElement(props, "property", {
            # json.dumps escapes control characters itself (ensure_ascii).
            "name": "metrics", "value": json.dumps(result.metrics)
        })

    return ET.tostring(suite, encoding="unicode")

300 

301 

def to_json(report: EvalReport, indent: int = 2) -> str:
    """Serialize EvalReport to JSON string.

    Dataclass instances anywhere in *report* are converted with
    ``dataclasses.asdict``; non-ASCII text is written as-is.

    Args:
        report: Object to serialize (normally a dataclass instance).
        indent: Indentation passed through to ``json.dumps``.

    Raises:
        TypeError: if a value is neither JSON-serializable nor a dataclass
            instance.
    """
    import dataclasses

    class ReportEncoder(json.JSONEncoder):
        def default(self, obj):
            # is_dataclass() is also true for dataclass *classes*, which
            # asdict() rejects; only instances can be converted.
            if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
                return dataclasses.asdict(obj)
            return super().default(obj)

    return json.dumps(report, cls=ReportEncoder, indent=indent, ensure_ascii=False)

313 

314 

def save_report(report: EvalReport, path: str, format: str = "markdown"):
    """Render *report* in the requested format and write it to *path*.

    Supported formats are ``markdown``, ``json`` and ``junit``. The content
    is rendered before the file is opened, and the file is written as UTF-8.

    Raises:
        ValueError: if *format* is not one of the supported names.
    """
    renderers = (
        ("markdown", lambda: report.to_markdown()),
        ("json", lambda: to_json(report)),
        ("junit", lambda: to_junit_xml(report)),
    )

    for name, render in renderers:
        if format == name:
            content = render()
            break
    else:
        raise ValueError(f"Unknown format: {format}")

    with open(path, "w", encoding="utf-8") as handle:
        handle.write(content)