Coverage for agentos/eval/benchmark.py: 44%
242 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-07-02 09:59 +0800
« prev ^ index » next coverage.py v7.14.3, created at 2026-07-02 09:59 +0800
1"""
2v1.10.0: External Evaluation Harness — SWE-bench & GAIA benchmark runner.
4Supports:
5- SWE-bench: software engineering task resolution
6- GAIA: multi-step reasoning benchmark
7- Custom eval suites via registry
8- Scoring: pass@k, F1, exact match, semantic similarity
9"""
11from __future__ import annotations
13import json
14import time
15import statistics
16from dataclasses import dataclass, field
17from enum import Enum
18from pathlib import Path
19from typing import Any, Callable, Optional
22# ── Enums & Data Classes ──────────────────────────────────────────
class EvalMetric(str, Enum):
    """Scoring metrics the evaluation harness can be asked to report.

    Members also subclass ``str``, so each one compares equal to its
    string value (for example ``EvalMetric.F1 == "f1"``).
    """

    # Fraction of correct answers among k generations.
    PASS_AT_K = "pass@k"
    # Plain string equality.
    EXACT_MATCH = "exact_match"
    # F1 score based on token overlap.
    F1 = "f1"
    # ROUGE-L.
    ROUGE_L = "rouge_l"
    # Cosine similarity between embeddings.
    SEMANTIC_SIM = "semantic_sim"
    # Graded by an LLM.
    LLM_AS_JUDGE = "llm_as_judge"
class EvalSuite(str, Enum):
    """Identifiers of the benchmark suites a case or report can belong to.

    Members also subclass ``str`` and compare equal to their string value.
    """

    # SWE-bench and its "lite" subset.
    SWE_BENCH = "swe-bench"
    SWE_BENCH_LITE = "swe-bench-lite"

    # GAIA and its validation split.
    GAIA = "gaia"
    GAIA_VAL = "gaia-validation"

    # Anything that is not one of the built-in suites.
    CUSTOM = "custom"
@dataclass
class EvalCase:
    """One benchmark item: a prompt plus the reference data used to grade it.

    ``repo``, ``base_commit`` and ``test_patch`` are SWE-bench specific and
    default to empty strings for every other suite.
    """

    id: str
    suite: EvalSuite
    prompt: str
    expected: str
    # SWE-bench: git repository of the task.
    repo: str = ""
    # SWE-bench: hash of the commit the task starts from.
    base_commit: str = ""
    # SWE-bench: patch containing the tests.
    test_patch: str = ""
    # Free-form extra information; each instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
@dataclass
class EvalSample:
    """A single generation produced for one case, together with its grade."""

    case_id: str
    # Position of this sample among the k generations of a case (0..k-1).
    sample_index: int
    generated: str
    score: float = 0.0
    passed: bool = False
    # Time spent producing the generation, in milliseconds.
    latency_ms: float = 0.0
    # Free-form extra information; each instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
@dataclass
class EvalResult:
    """Outcome of one evaluation case, aggregated over all of its samples."""

    case_id: str
    suite: EvalSuite
    metric: EvalMetric
    # Either the pass@k value or the score of a single sample.
    score: float
    # Individual generations that were graded for this case.
    samples: list[EvalSample] = field(default_factory=list)
    # Error text recorded for the case; empty when nothing went wrong.
    error: str = ""
@dataclass
class EvalReport:
    """Aggregate outcome of one evaluation run over a list of cases."""

    suite: EvalSuite
    total_cases: int
    passed_cases: int
    avg_score: float
    # Per-case scores backing the median and standard deviation.
    scores: list[float] = field(default_factory=list)
    metric: EvalMetric = EvalMetric.EXACT_MATCH
    results: list[EvalResult] = field(default_factory=list)
    # Duration of the whole run, in seconds.
    duration_s: float = 0.0
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def success_rate(self) -> float:
        """Return ``passed_cases`` divided by ``total_cases``.

        The divisor is floored at 1, so a report with zero cases yields 0.0
        instead of dividing by zero.
        """
        denominator = max(self.total_cases, 1)
        return self.passed_cases / denominator

    @property
    def median_score(self) -> float:
        """Return the median of ``scores``, or 0.0 when there are none."""
        if not self.scores:
            return 0.0
        return statistics.median(self.scores)

    @property
    def std_dev(self) -> float:
        """Return the sample standard deviation of ``scores``.

        At least two scores are needed; with fewer the result is 0.0.
        """
        if len(self.scores) <= 1:
            return 0.0
        return statistics.stdev(self.scores)
105# ── Scorers ────────────────────────────────────────────────────────
class Scorer:
    """Interface shared by all scorers.

    Subclasses override :meth:`score` and :attr:`metric`; the versions
    defined here only raise ``NotImplementedError``.
    """

    @property
    def metric(self) -> EvalMetric:
        """Return the metric this scorer implements."""
        raise NotImplementedError

    def score(self, generated: str, expected: str) -> float:
        """Return the score of ``generated`` measured against ``expected``."""
        raise NotImplementedError
class ExactMatchScorer(Scorer):
    """Score 1.0 when both texts are equal after trimming, otherwise 0.0."""

    @property
    def metric(self) -> EvalMetric:
        return EvalMetric.EXACT_MATCH

    def score(self, generated: str, expected: str) -> float:
        """Return 1.0 for a match and 0.0 for a mismatch.

        Args:
            generated: Text produced by the system under test.
            expected: Reference text.
        """
        if expected:
            # Surrounding whitespace is ignored on both sides.
            matched = generated.strip() == expected.strip()
        else:
            # Without a reference only an empty generation counts as a
            # match; this check is done on the untrimmed text.
            matched = not generated
        return 1.0 if matched else 0.0
class F1Scorer(Scorer):
    """Token-level F1 scorer.

    Texts are lower-cased and split on whitespace. Overlap is counted on
    token multisets, so a token that occurs several times in one text is
    only matched as many times as it occurs in the other.
    """

    @property
    def metric(self) -> EvalMetric:
        return EvalMetric.F1

    def score(self, generated: str, expected: str) -> float:
        """Return the harmonic mean of token precision and recall.

        Args:
            generated: Text produced by the system under test.
            expected: Reference text.

        Returns:
            A value in ``[0.0, 1.0]``. When ``expected`` is empty the result
            is 1.0 for an empty ``generated`` and 0.0 otherwise. When the
            texts share no token (including when either has no tokens at
            all) the result is 0.0.
        """
        # Local import keeps this block self-contained.
        from collections import Counter

        if not expected:
            return 1.0 if not generated else 0.0

        gen_counts = Counter(generated.lower().split())
        exp_counts = Counter(expected.lower().split())

        # Multiset intersection keeps min(count) per token. A plain set would
        # let "yes yes yes no" score a perfect 1.0 against "yes no".
        overlap = sum((gen_counts & exp_counts).values())
        if overlap == 0:
            return 0.0

        precision = overlap / sum(gen_counts.values())
        recall = overlap / sum(exp_counts.values())
        return 2 * precision * recall / (precision + recall)
class ROUGELScorer(Scorer):
    """ROUGE-L scorer (longest common subsequence).

    Texts are lower-cased and split on whitespace; the score is the F-measure
    of LCS-based precision and recall.
    """

    @property
    def metric(self) -> EvalMetric:
        return EvalMetric.ROUGE_L

    @staticmethod
    def _lcs_len(a: list[str], b: list[str]) -> int:
        """Return the length of the longest common subsequence of ``a`` and ``b``.

        Only two rows of the dynamic-programming table are alive at any time,
        so memory grows with the shorter input instead of with ``len(a) *
        len(b)``.
        """
        # LCS length is symmetric; keep the shorter sequence on the row axis.
        if len(b) > len(a):
            a, b = b, a

        prev_row = [0] * (len(b) + 1)
        for tok_a in a:
            row = [0]
            for j, tok_b in enumerate(b, start=1):
                if tok_a == tok_b:
                    row.append(prev_row[j - 1] + 1)
                else:
                    row.append(max(prev_row[j], row[j - 1]))
            prev_row = row
        return prev_row[-1]

    def score(self, generated: str, expected: str) -> float:
        """Return the ROUGE-L F-measure of ``generated`` against ``expected``.

        Args:
            generated: Text produced by the system under test.
            expected: Reference text.

        Returns:
            A value in ``[0.0, 1.0]``. When ``expected`` is empty the result
            is 1.0 for an empty ``generated`` and 0.0 otherwise. When either
            text has no tokens, or they share no token, the result is 0.0.
        """
        if not expected:
            return 1.0 if not generated else 0.0

        gen_tokens = generated.lower().split()
        exp_tokens = expected.lower().split()
        if not gen_tokens or not exp_tokens:
            return 0.0

        lcs = self._lcs_len(gen_tokens, exp_tokens)
        if lcs == 0:
            # Precision and recall would both be zero.
            return 0.0

        precision = lcs / len(gen_tokens)
        recall = lcs / len(exp_tokens)
        return 2 * precision * recall / (precision + recall)
def get_scorer(metric: EvalMetric) -> Scorer:
    """Return a new scorer instance for ``metric``.

    Exact match, F1 and ROUGE-L have dedicated scorers; any other metric
    falls back to :class:`ExactMatchScorer`.
    """
    scorer_types = {
        EvalMetric.EXACT_MATCH: ExactMatchScorer,
        EvalMetric.F1: F1Scorer,
        EvalMetric.ROUGE_L: ROUGELScorer,
    }
    scorer_cls = scorer_types.get(metric, ExactMatchScorer)
    return scorer_cls()
204# ── SWE-bench Loader ──────────────────────────────────────────────
class SWEBenchLoader:
    """Load and parse SWE-bench dataset.

    SWE-bench format: each instance is a GitHub issue with a known fix.
    Agent must produce a patch that passes the test suite.
    """

    @staticmethod
    def load(path: str | Path, subset: str = "lite") -> list[EvalCase]:
        """Load SWE-bench instances from a JSON/JSONL file.

        Args:
            path: Dataset file. ``.jsonl`` is read as one JSON object per
                non-blank line. ``.json`` is read as either a list of
                instances or an object holding them under ``"instances"``.
            subset: Subset label; when it contains ``"lite"`` the cases are
                tagged as SWE-bench Lite, otherwise as full SWE-bench.

        Returns:
            The parsed cases. A file with any other suffix yields an empty
            list.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        path = Path(path)
        cases: list[EvalCase] = []

        if not path.exists():
            raise FileNotFoundError(f"SWE-bench dataset not found: {path}")

        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        if path.suffix == ".jsonl":
            with open(path, encoding="utf-8") as f:
                for line in f:
                    if line.strip():
                        cases.append(SWEBenchLoader._parse(json.loads(line), subset))
        elif path.suffix == ".json":
            data = json.loads(path.read_text(encoding="utf-8"))
            instances = data if isinstance(data, list) else data.get("instances", [])
            cases.extend(SWEBenchLoader._parse(inst, subset) for inst in instances)

        return cases

    @staticmethod
    def _parse(instance: dict, subset: str) -> EvalCase:
        """Convert one raw SWE-bench record into an :class:`EvalCase`.

        Missing keys become empty strings. The prompt is taken from
        ``"problem_statement"`` with ``"issue"`` as fallback, and the test
        patch from ``"test_patch"`` with ``"patch"`` as fallback.
        """
        # NOTE(review): ``expected`` is always empty here. The text scorers in
        # this file score an empty reference as 1.0 only for an empty
        # generation, so grading these cases presumably needs a patch-aware
        # scorer — confirm against the intended usage.
        return EvalCase(
            id=instance.get("instance_id", ""),
            suite=EvalSuite.SWE_BENCH_LITE if "lite" in subset else EvalSuite.SWE_BENCH,
            prompt=instance.get("problem_statement", instance.get("issue", "")),
            expected="",  # SWE-bench doesn't have expected text; it has a test patch
            repo=instance.get("repo", ""),
            base_commit=instance.get("base_commit", ""),
            test_patch=instance.get("test_patch", instance.get("patch", "")),
            metadata={
                "hints_text": instance.get("hints_text", ""),
                "version": instance.get("version", ""),
            },
        )
253# ── GAIA Loader ────────────────────────────────────────────────────
class GAIALoader:
    """Load and parse GAIA benchmark dataset.

    GAIA: multi-step reasoning benchmark with 466 questions.
    Levels: L1 (simple), L2 (medium), L3 (complex).
    """

    @staticmethod
    def load(path: str | Path, level: str | None = None) -> list[EvalCase]:
        """Load GAIA questions from JSON/JSONL.

        Args:
            path: Dataset file. ``.jsonl`` is read as one JSON object per
                non-blank line. ``.json`` is read as either a list of
                questions or an object holding them under ``"questions"``.
                Cases are tagged as the validation split when the file name
                contains ``"validation"``.
            level: When given (and non-empty), keep only questions whose
                ``"Level"`` field has the same string form.

        Returns:
            The parsed cases. A file with any other suffix yields an empty
            list.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        path = Path(path)
        cases: list[EvalCase] = []

        if not path.exists():
            raise FileNotFoundError(f"GAIA dataset not found: {path}")

        is_val = "validation" in path.name

        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        if path.suffix == ".jsonl":
            with open(path, encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        continue
                    q = json.loads(line)
                    if GAIALoader._matches_level(q, level):
                        cases.append(GAIALoader._parse(q, is_val))
        elif path.suffix == ".json":
            data = json.loads(path.read_text(encoding="utf-8"))
            questions = data if isinstance(data, list) else data.get("questions", [])
            cases.extend(
                GAIALoader._parse(q, is_val)
                for q in questions
                if GAIALoader._matches_level(q, level)
            )

        return cases

    @staticmethod
    def _matches_level(q: dict, level: str | None) -> bool:
        """Tell whether question ``q`` passes the optional level filter."""
        if not level:
            return True
        # Compare string forms so a numeric "Level" in the data (1) still
        # matches a level passed as text ("1").
        return str(q.get("Level", "")) == str(level)

    @staticmethod
    def _parse(q: dict, is_val: bool) -> EvalCase:
        """Convert one raw GAIA record into an :class:`EvalCase`.

        Missing keys become empty strings. Each field is read from its
        capitalised key first and falls back to a lower-case alternative:
        ``"task_id"``/``"id"``, ``"Question"``/``"question"`` and
        ``"Final answer"``/``"answer"``.
        """
        return EvalCase(
            id=q.get("task_id", q.get("id", "")),
            suite=EvalSuite.GAIA_VAL if is_val else EvalSuite.GAIA,
            prompt=q.get("Question", q.get("question", "")),
            expected=q.get("Final answer", q.get("answer", "")),
            metadata={
                "level": q.get("Level", ""),
                "annotator_metadata": q.get("Annotator Metadata", ""),
            },
        )
303# ── Evaluation Runner ──────────────────────────────────────────────
class EvalRunner:
    """Run evaluations over multiple cases with pass@k support.

    Usage:
        runner = EvalRunner(generate_fn=my_agent.generate)
        report = runner.run(cases, k=3, metric=EvalMetric.EXACT_MATCH)
    """

    # A sample or a case counts as passed when its score reaches this value.
    PASS_THRESHOLD = 0.5
    # print_report lists individual cases only up to this many results.
    MAX_LISTED_CASES = 20

    def __init__(
        self,
        generate_fn: Callable[[str], str],
        scorer: Scorer | None = None,
    ) -> None:
        """
        Args:
            generate_fn: Function (prompt) -> generated_text
            scorer: Optional scorer override; when omitted, ``run`` picks the
                scorer from the requested metric.
        """
        self.generate = generate_fn
        self.scorer = scorer

    def run(
        self,
        cases: list[EvalCase],
        k: int = 1,
        metric: EvalMetric = EvalMetric.EXACT_MATCH,
        on_case_start: Callable[[EvalCase], None] | None = None,
        on_case_end: Callable[[EvalResult], None] | None = None,
    ) -> EvalReport:
        """Run evaluation on a list of cases.

        Each case is generated and scored ``k`` times. The case score is the
        best sample score, so a case passes when at least one sample passes.

        Args:
            cases: Evaluation cases
            k: Number of samples per case (for pass@k). With ``k < 1`` no
                sample is taken and every case scores 0.0.
            metric: Scoring metric
            on_case_start: Callback before each case
            on_case_end: Callback after each case

        Returns:
            EvalReport with aggregated results. Its suite is taken from the
            first case, or ``EvalSuite.CUSTOM`` when ``cases`` is empty.
        """
        # perf_counter is monotonic, so durations cannot go negative or jump
        # when the system clock is adjusted mid-run.
        started = time.perf_counter()
        scorer = self.scorer or get_scorer(metric)
        results: list[EvalResult] = []

        for case in cases:
            if on_case_start:
                on_case_start(case)

            samples, error = self._sample_case(case, k, scorer)
            # pass@k: the best of the k samples decides the case.
            case_score = max((s.score for s in samples), default=0.0)

            result = EvalResult(
                case_id=case.id, suite=case.suite,
                metric=metric, score=case_score,
                samples=samples, error=error,
            )
            results.append(result)

            if on_case_end:
                on_case_end(result)

        scores_list = [r.score for r in results]
        passed = sum(1 for s in scores_list if s >= self.PASS_THRESHOLD)

        return EvalReport(
            suite=cases[0].suite if cases else EvalSuite.CUSTOM,
            total_cases=len(cases),
            passed_cases=passed,
            avg_score=sum(scores_list) / max(len(scores_list), 1),
            scores=scores_list,
            metric=metric,
            results=results,
            duration_s=time.perf_counter() - started,
        )

    def _sample_case(
        self, case: EvalCase, k: int, scorer: Scorer,
    ) -> tuple[list[EvalSample], str]:
        """Generate and score ``k`` samples for one case.

        Returns the samples plus the message of the last failure ("" if none).
        """
        samples: list[EvalSample] = []
        error = ""

        for i in range(k):
            try:
                t0 = time.perf_counter()
                generated = self.generate(case.prompt)
                latency = (time.perf_counter() - t0) * 1000

                s = scorer.score(generated, case.expected)
                samples.append(EvalSample(
                    case_id=case.id, sample_index=i,
                    generated=generated, score=s,
                    passed=s >= self.PASS_THRESHOLD, latency_ms=latency,
                ))
            except Exception as e:
                # A failing generation or scoring call must not abort the
                # whole run: record a zero-score sample and carry on.
                error = str(e)
                samples.append(EvalSample(
                    case_id=case.id, sample_index=i,
                    generated="", score=0.0, passed=False,
                    latency_ms=0.0,
                ))

        return samples, error

    def run_pass_at_k(
        self,
        cases: list[EvalCase],
        k: int = 5,
        metric: EvalMetric = EvalMetric.EXACT_MATCH,
    ) -> EvalReport:
        """Run pass@k evaluation (shorthand)."""
        return self.run(cases, k=k, metric=metric)

    def print_report(self, report: EvalReport) -> str:
        """Generate a human-readable report string.

        Despite the name nothing is printed; the text is returned. It is a
        boxed summary, followed by one line per case when the report holds at
        most ``MAX_LISTED_CASES`` results.
        """
        # The text of each row is kept as it was; only the padding and the
        # borders are normalised so every row is closed and equally wide.
        rows = [
            f"Suite: {report.suite.value}",
            f"Metric: {report.metric.value}",
            f"Cases: {report.total_cases}",
            f"Passed: {report.passed_cases} ({report.success_rate:.1%})",
            f"Avg Score:{report.avg_score:.4f}",
            f"Median: {report.median_score:.4f}",
            f"Std Dev: {report.std_dev:.4f}",
            f"Time: {report.duration_s:.1f}s",
        ]
        title = " Evaluation Report "
        inner = max(len(title) + 4, *(len(row) for row in rows))

        lines = ["╔" + title.center(inner + 2, "═") + "╗"]
        lines.extend(f"║ {row.ljust(inner)} ║" for row in rows)
        lines.append("╚" + "═" * (inner + 2) + "╝")

        if report.results and len(report.results) <= self.MAX_LISTED_CASES:
            lines.append("\nPer-case scores:")
            for r in report.results:
                icon = "✓" if r.score >= self.PASS_THRESHOLD else "✗"
                lines.append(f" {icon} {r.case_id[:40]:<42} {r.score:.3f}")

        return "\n".join(lines)
442# ── Eval Registry ──────────────────────────────────────────────────
class EvalRegistry:
    """Registry for custom evaluation suites and scorers."""

    def __init__(self) -> None:
        self._suites: dict[str, list[EvalCase]] = {}
        self._scorers: dict[str, Scorer] = {}

    def register_suite(self, name: str, cases: list[EvalCase]) -> None:
        """Store ``cases`` under ``name``, replacing any earlier entry."""
        self._suites[name] = cases

    def register_scorer(self, name: str, scorer: Scorer) -> None:
        """Store ``scorer`` under ``name``, replacing any earlier entry."""
        self._scorers[name] = scorer

    def get_suite(self, name: str) -> list[EvalCase]:
        """Return the cases registered under ``name``.

        Raises:
            KeyError: If no suite was registered under ``name``.
        """
        if name not in self._suites:
            raise KeyError(f"Unknown eval suite: {name}")
        return self._suites[name]

    def get_scorer(self, name: str) -> Scorer:
        """Return the scorer registered under ``name``.

        Unknown names fall back to the module-level exact-match scorer.
        """
        try:
            return self._scorers[name]
        except KeyError:
            # Build the fallback only on a miss; the module-level factory
            # would otherwise run on every lookup, hits included.
            return get_scorer(EvalMetric.EXACT_MATCH)

    def list_suites(self) -> list[str]:
        """Return the registered suite names in registration order."""
        return list(self._suites)
469# ── Quick Eval Helpers ─────────────────────────────────────────────
def evaluate_quick(
    generate_fn: Callable[[str], str],
    cases: list[dict[str, str]],
    metric: EvalMetric = EvalMetric.EXACT_MATCH,
    k: int = 1,
) -> EvalReport:
    """Evaluate ``generate_fn`` on plain ``{"prompt", "expected"}`` dicts.

    Each dict becomes a custom-suite case whose id is its position in
    ``cases`` (as a string); the cases are then run with a fresh runner.

    Args:
        generate_fn: Function (prompt) -> generated_text
        cases: Dicts that each provide ``"prompt"`` and ``"expected"``
        metric: Scoring metric
        k: Number of samples per case

    Returns:
        EvalReport with aggregated results
    """
    eval_cases: list[EvalCase] = []
    for index, spec in enumerate(cases):
        eval_cases.append(
            EvalCase(
                id=str(index),
                suite=EvalSuite.CUSTOM,
                prompt=spec["prompt"],
                expected=spec["expected"],
            )
        )
    return EvalRunner(generate_fn).run(eval_cases, k=k, metric=metric)