Coverage for agentos/evaluation/regression.py: 36%
152 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-07-02 09:59 +0800
« prev ^ index » next coverage.py v7.14.3, created at 2026-07-02 09:59 +0800
1"""Evaluation regression testing for AgentOS.
3Compare evaluation runs, detect regressions, generate CI artifacts.
4Builds on top of agentos.evaluation core (GoldenDataset, Evaluator, EvalReport).
5"""
7from __future__ import annotations
9from dataclasses import dataclass, field
10from typing import Any, Callable, Dict, List, Optional, Tuple
11import json
12import math
13import time
14import xml.etree.ElementTree as ET
16from agentos.evaluation import GoldenDataset, Evaluator, EvalConfig, EvalReport, ScoreDetail
@dataclass
class RegressionCheck:
    """Outcome of comparing one case's score between two evaluation runs."""

    # Identifier of the evaluated case.
    case_id: str
    # Score recorded for this case in the baseline run.
    baseline_score: float
    # Score recorded for this case in the current run.
    current_score: float
    # Score change between the two runs.
    delta: float
    # Whether the change was flagged as a regression.
    regression: bool = False
    # One of: none | minor | moderate | severe.
    severity: str = "none"
    # Free-form note attached to the check.
    details: str = ""
@dataclass
class RegressionReport:
    """Comparison report between a baseline and a current evaluation run."""

    baseline_label: str = "baseline"
    current_label: str = "current"
    # Both reports default to None, so they are Optional; to_markdown()
    # renders "n/a" for whichever side is missing.
    baseline: Optional[EvalReport] = None
    current: Optional[EvalReport] = None
    checks: List[RegressionCheck] = field(default_factory=list)
    total_regressions: int = 0
    total_improvements: int = 0
    pass_delta: float = 0.0
    score_delta: float = 0.0
    verdict: str = "OK"  # OK | WARN | FAIL

    @staticmethod
    def _metric(report: Optional[EvalReport], attr: str, spec: str) -> str:
        """Format ``report.<attr>`` with ``spec``, or ``"n/a"`` if no report."""
        if report is None:
            return "n/a"
        return format(getattr(report, attr), spec)

    def to_markdown(self) -> str:
        """Render the report as Markdown.

        The output starts with a header block (verdict, pass rate, average
        score, regression/improvement counts). When ``checks`` is non-empty
        a per-case table follows, labelling each row ``REGRESSION``,
        ``IMPROVED`` (positive delta) or ``SAME``.

        Returns:
            The Markdown text, lines joined with ``"\\n"``.
        """
        base_rate = self._metric(self.baseline, "pass_rate", ".1%")
        cur_rate = self._metric(self.current, "pass_rate", ".1%")
        base_score = self._metric(self.baseline, "avg_score", ".1f")
        cur_score = self._metric(self.current, "avg_score", ".1f")

        lines = [
            f"# Regression Report: {self.baseline_label} → {self.current_label}",
            "",
            f"**Verdict**: `{self.verdict}`",
            f"**Pass Rate**: {base_rate} → {cur_rate} (Δ={self.pass_delta:+.1%})",
            f"**Avg Score**: {base_score} → {cur_score} (Δ={self.score_delta:+.1f})",
            f"**Regressions**: {self.total_regressions} | **Improvements**: {self.total_improvements}",
            "",
        ]

        if self.checks:
            lines.append("## Detail")
            lines.append("| Case ID | Baseline | Current | Δ | Verdict |")
            lines.append("|---------|----------|---------|---|---------|")
            for c in self.checks:
                # The regression flag wins over the sign of the delta.
                if c.regression:
                    label = "REGRESSION"
                elif c.delta > 0:
                    label = "IMPROVED"
                else:
                    label = "SAME"
                lines.append(
                    f"| {c.case_id} | {c.baseline_score:.1f} | {c.current_score:.1f} | "
                    f"{c.delta:+.1f} | {label} |"
                )

        return "\n".join(lines)
class RegressionRunner:
    """Compare a current evaluation run against a stored baseline.

    A case counts as a regression when its score drops by more than
    ``threshold``; the drop is "severe" when it exceeds ``severe_threshold``.

    Usage:
        runner = RegressionRunner(evaluator, baseline=report)
        report = await runner.check(current_report)
        # or sync:
        report = runner.check_sync(current_report)
    """

    def __init__(
        self,
        evaluator: Evaluator,
        baseline: Optional[EvalReport] = None,
        threshold: float = 5.0,
        severe_threshold: float = 20.0,
    ):
        """Store the evaluator, the optional baseline and both thresholds."""
        self.evaluator = evaluator
        self.baseline = baseline
        self.threshold = threshold
        self.severe_threshold = severe_threshold

    async def run_baseline(self) -> EvalReport:
        """Run the evaluator once, keep the result as baseline, and return it."""
        fresh = await self.evaluator.run()
        self.baseline = fresh
        return fresh

    async def check(self, current: Optional[EvalReport] = None) -> RegressionReport:
        """Compare ``current`` with the baseline, running the evaluator if omitted."""
        candidate = current if current is not None else await self.evaluator.run()
        return self._compare(candidate)

    def check_sync(self, current: EvalReport) -> RegressionReport:
        """Synchronous comparison of an already-computed report."""
        return self._compare(current)

    def _severity_for(self, delta: float) -> str:
        """Map a score delta onto none | minor | moderate | severe."""
        if delta < -self.severe_threshold:
            return "severe"
        if delta < -self.threshold:
            return "moderate"
        if delta < 0:
            return "minor"
        return "none"

    def _compare(self, current: EvalReport) -> RegressionReport:
        """Build the per-case comparison between the baseline and ``current``.

        Raises:
            ValueError: if no baseline has been set.
        """
        reference = self.baseline
        if reference is None:
            raise ValueError("No baseline set. Call run_baseline() first.")

        report = RegressionReport(
            baseline=reference,
            current=current,
            pass_delta=current.pass_rate - reference.pass_rate,
            score_delta=current.avg_score - reference.avg_score,
        )

        # Index baseline results by case id for direct lookup.
        previous_by_id: Dict[str, ScoreDetail] = {}
        for entry in reference.results:
            previous_by_id[entry.case_id] = entry

        for outcome in current.results:
            case_id = outcome.case_id
            previous = previous_by_id.get(case_id)

            if previous is None:
                # Case absent from the baseline: recorded, but not compared.
                report.checks.append(RegressionCheck(
                    case_id=case_id,
                    baseline_score=0,
                    current_score=outcome.total_score,
                    delta=0,
                    details="new case",
                ))
                continue

            change = outcome.total_score - previous.total_score
            regressed = change < -self.threshold

            if regressed:
                report.total_regressions += 1
            elif change > self.threshold:
                report.total_improvements += 1

            report.checks.append(RegressionCheck(
                case_id=case_id,
                baseline_score=previous.total_score,
                current_score=outcome.total_score,
                delta=round(change, 1),
                regression=regressed,
                severity=self._severity_for(change),
            ))

        # Any regression downgrades the verdict; a severe one fails it.
        if report.total_regressions > 0:
            if "severe" in (chk.severity for chk in report.checks):
                report.verdict = "FAIL"
            else:
                report.verdict = "WARN"

        return report
# --- Statistical Runner ---
@dataclass
class StatResult:
    """Statistical summary of N evaluation runs."""

    trials: int = 0
    # Raw per-trial values the summary was computed from.
    pass_rates: List[float] = field(default_factory=list)
    avg_scores: List[float] = field(default_factory=list)
    mean_pass_rate: float = 0.0
    std_pass_rate: float = 0.0
    mean_score: float = 0.0
    std_score: float = 0.0
    # (lower, upper) bounds of the 95% confidence intervals.
    ci95_pass_rate: Tuple[float, float] = (0.0, 0.0)
    ci95_score: Tuple[float, float] = (0.0, 0.0)

    def to_dict(self) -> dict:
        """Return the summary (without the raw per-trial lists) as a dict.

        Pass-rate figures are rounded to 4 decimals, score figures to 2.
        """
        return {
            "trials": self.trials,
            "mean_pass_rate": round(self.mean_pass_rate, 4),
            "std_pass_rate": round(self.std_pass_rate, 4),
            "ci95_pass_rate": [round(x, 4) for x in self.ci95_pass_rate],
            "mean_score": round(self.mean_score, 2),
            "std_score": round(self.std_score, 2),
            "ci95_score": [round(x, 2) for x in self.ci95_score],
        }


class StatisticalRunner:
    """Run evaluation N times and compute statistics.

    Usage:
        srunner = StatisticalRunner(evaluator, trials=10)
        stats = await srunner.run()
    """

    # Two-sided 95% critical value of the standard normal distribution.
    # NOTE: this is a normal approximation, not a t-distribution; for small
    # trial counts the resulting interval is narrower than a t-based one.
    _Z_95 = 1.96

    def __init__(self, evaluator: Evaluator, trials: int = 5):
        """Store the evaluator; ``trials`` is clamped to at least 2."""
        self.evaluator = evaluator
        # The sample standard deviation needs at least two observations.
        self.trials = max(trials, 2)

    async def run(self) -> StatResult:
        """Run the evaluator ``self.trials`` times and summarise the results."""
        pass_rates: List[float] = []
        avg_scores: List[float] = []

        for _ in range(self.trials):
            report = await self.evaluator.run()
            pass_rates.append(report.pass_rate)
            avg_scores.append(report.avg_score)

        return self._compute(pass_rates, avg_scores)

    @staticmethod
    def _mean_std(values: List[float]) -> Tuple[float, float]:
        """Return (mean, sample standard deviation) of a non-empty list.

        With a single value the deviation is reported as 0.0 instead of
        dividing by ``n - 1 == 0``.
        """
        n = len(values)
        mean = sum(values) / n
        if n < 2:
            return mean, 0.0
        variance = sum((x - mean) ** 2 for x in values) / (n - 1)
        return mean, (math.sqrt(variance) if variance > 0 else 0.0)

    def _compute(self, pass_rates: List[float], avg_scores: List[float]) -> StatResult:
        """Summarise per-trial pass rates and average scores.

        Args:
            pass_rates: One pass rate per trial.
            avg_scores: One average score per trial; same length as
                ``pass_rates``.

        Returns:
            A ``StatResult`` with means, sample standard deviations and 95%
            confidence intervals (normal approximation). The pass-rate
            interval is clamped to [0, 1]. Empty input yields a default
            ``StatResult``.

        Raises:
            ValueError: if the two lists differ in length.
        """
        if len(pass_rates) != len(avg_scores):
            raise ValueError(
                "pass_rates and avg_scores must have the same length "
                f"({len(pass_rates)} != {len(avg_scores)})"
            )

        n = len(pass_rates)
        if n == 0:
            # Nothing to summarise; avoid dividing by zero.
            return StatResult()

        mean_pr, std_pr = self._mean_std(pass_rates)
        mean_s, std_s = self._mean_std(avg_scores)

        half_pr = self._Z_95 * std_pr / math.sqrt(n)
        half_s = self._Z_95 * std_s / math.sqrt(n)

        # A pass rate is a proportion, so its interval is clamped to [0, 1].
        ci_pr = (max(0.0, mean_pr - half_pr), min(1.0, mean_pr + half_pr))
        ci_s = (mean_s - half_s, mean_s + half_s)

        return StatResult(
            trials=n,
            pass_rates=pass_rates,
            avg_scores=avg_scores,
            mean_pass_rate=round(mean_pr, 4),
            std_pass_rate=round(std_pr, 4),
            mean_score=round(mean_s, 2),
            std_score=round(std_s, 2),
            ci95_pass_rate=(round(ci_pr[0], 4), round(ci_pr[1], 4)),
            ci95_score=(round(ci_s[0], 2), round(ci_s[1], 2)),
        )
# --- CI Exports ---
def to_junit_xml(report: EvalReport, suite_name: str = "AgentOS Eval") -> str:
    """Render an evaluation report as a JUnit-style XML string for CI tools.

    The result is a single ``<testsuite>`` element with one ``<testcase>``
    per entry in ``report.results``. A case that did not pass gets a
    ``<failure>`` child. Every case carries a ``<properties>`` child holding
    its score and its JSON-encoded metrics.

    Args:
        report: Evaluation report to convert.
        suite_name: Value of the ``name`` attribute on ``<testsuite>``.

    Returns:
        The XML document as a unicode string (no XML declaration).
    """
    results = report.results
    failing = [r for r in results if not r.passed]
    elapsed_ms = sum(r.duration_ms for r in results)

    suite_attrs = {
        "name": suite_name,
        "tests": str(len(results)),
        "failures": str(len(failing)),
        "errors": "0",
        "skipped": "0",
        # Durations are stored in milliseconds; the XML reports seconds.
        "time": f"{elapsed_ms / 1000:.3f}",
        "timestamp": report.timestamp or "",
    }
    suite = ET.Element("testsuite", suite_attrs)

    for case in results:
        case_attrs = {
            "classname": f"AgentOS.{report.dataset_name}",
            "name": case.case_id,
            "time": f"{case.duration_ms / 1000:.3f}",
        }
        node = ET.SubElement(suite, "testcase", case_attrs)

        if not case.passed:
            failure = ET.SubElement(node, "failure", {
                "type": "ScoreBelowThreshold",
                "message": f"Score {case.total_score:.1f}: {case.details}",
            })
            # Only the first 500 characters of the output are embedded.
            failure.text = f"Actual: {case.actual_output[:500]}\nErrors: {case.errors}"

        # Score and metrics are attached to every case, passing or not.
        props = ET.SubElement(node, "properties")
        score_prop = ET.SubElement(props, "property")
        score_prop.set("name", "score")
        score_prop.set("value", f"{case.total_score:.1f}")
        metrics_prop = ET.SubElement(props, "property")
        metrics_prop.set("name", "metrics")
        metrics_prop.set("value", json.dumps(case.metrics))

    return ET.tostring(suite, encoding="unicode")
def to_json(report: EvalReport, indent: int = 2) -> str:
    """Serialize an EvalReport to a JSON string.

    Dataclass instances found anywhere in the object graph are expanded with
    ``dataclasses.asdict``. Non-ASCII characters are written as-is.

    Args:
        report: Object to serialize.
        indent: Indentation passed through to ``json.dumps``.

    Returns:
        The JSON text.

    Raises:
        TypeError: if a value is neither JSON-serializable nor a dataclass
            instance.
    """
    import dataclasses

    class ReportEncoder(json.JSONEncoder):
        """JSON encoder that expands dataclass instances into plain dicts."""

        def default(self, obj):
            # is_dataclass() is also true for dataclass *classes*, which
            # asdict() rejects; only instances are expanded here.
            if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
                return dataclasses.asdict(obj)
            return super().default(obj)

    return json.dumps(report, cls=ReportEncoder, indent=indent, ensure_ascii=False)
def save_report(report: EvalReport, path: str, format: str = "markdown"):
    """Write an evaluation report to ``path`` in the requested format.

    Args:
        report: Report to save.
        path: Destination file; written as UTF-8 text, replacing any
            existing content.
        format: One of "markdown", "json" or "junit".

    Raises:
        ValueError: if ``format`` is not a known format. The file is not
            opened in that case.
    """
    # Renderers are wrapped in lambdas so only the selected one runs.
    renderers = (
        ("markdown", lambda: report.to_markdown()),
        ("json", lambda: to_json(report)),
        ("junit", lambda: to_junit_xml(report)),
    )

    for name, render in renderers:
        if format == name:
            content = render()
            break
    else:
        raise ValueError(f"Unknown format: {format}")

    with open(path, "w", encoding="utf-8") as out:
        out.write(content)