Coverage for agentos/eval/benchmark.py: 44%

242 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-07-02 09:59 +0800

1""" 

2v1.10.0: External Evaluation Harness — SWE-bench & GAIA benchmark runner. 

3 

4Supports: 

5- SWE-bench: software engineering task resolution 

6- GAIA: multi-step reasoning benchmark 

7- Custom eval suites via registry 

8- Scoring: pass@k, exact match, F1, ROUGE-L (semantic similarity and LLM-as-judge metrics are declared but have no scorer in this module) 

9""" 

10 

11from __future__ import annotations 

12 

13import json 

14import time 

15import statistics 

16from dataclasses import dataclass, field 

17from enum import Enum 

18from pathlib import Path 

19from typing import Any, Callable, Optional 

20 

21 

22# ── Enums & Data Classes ────────────────────────────────────────── 

23 

class EvalMetric(str, Enum):
    """Identifiers of the scoring metrics an evaluation run can request.

    Members are also ``str`` instances, so they compare equal to their
    plain string value.
    """

    # Fraction of correct answers among k generations.
    PASS_AT_K = "pass@k"
    # Plain string equality.
    EXACT_MATCH = "exact_match"
    # F1 score based on token overlap.
    F1 = "f1"
    # ROUGE-L.
    ROUGE_L = "rouge_l"
    # Cosine similarity between embeddings.
    SEMANTIC_SIM = "semantic_sim"
    # Graded by an LLM.
    LLM_AS_JUDGE = "llm_as_judge"

32 

33 

class EvalSuite(str, Enum):
    """Identifiers of the benchmark suites known to the harness.

    Members are also ``str`` instances, so they compare equal to their
    plain string value.
    """

    # SWE-bench family.
    SWE_BENCH = "swe-bench"
    SWE_BENCH_LITE = "swe-bench-lite"

    # GAIA family.
    GAIA = "gaia"
    GAIA_VAL = "gaia-validation"

    # Anything user-defined.
    CUSTOM = "custom"

41 

42 

@dataclass
class EvalCase:
    """One benchmark item: the prompt plus the reference data used to judge it."""

    # Required for every suite.
    id: str
    suite: EvalSuite
    prompt: str
    expected: str

    # SWE-bench specific fields; left empty for other suites.
    repo: str = ""  # git repository
    base_commit: str = ""  # base commit hash
    test_patch: str = ""  # test patch

    # Free-form extras; each instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)

54 

55 

@dataclass
class EvalSample:
    """A single generation attempt for one evaluation case."""

    case_id: str
    sample_index: int  # position within the k attempts: 0..k-1
    generated: str

    # Scoring outcome and timing; defaults describe an unscored sample.
    score: float = 0.0
    passed: bool = False
    latency_ms: float = 0.0

    # Free-form extras; each instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)

66 

67 

@dataclass
class EvalResult:
    """Outcome of one evaluation case, aggregated over all of its samples."""

    case_id: str
    suite: EvalSuite
    metric: EvalMetric
    score: float  # pass@k or single-sample score

    # Individual attempts behind ``score``; each instance gets its own list.
    samples: list[EvalSample] = field(default_factory=list)

    # Error text, empty when nothing was recorded.
    error: str = ""

77 

78 

@dataclass
class EvalReport:
    """Aggregate outcome of an evaluation run over a list of cases."""

    suite: EvalSuite
    total_cases: int
    passed_cases: int
    avg_score: float
    scores: list[float] = field(default_factory=list)
    metric: EvalMetric = EvalMetric.EXACT_MATCH
    results: list[EvalResult] = field(default_factory=list)
    duration_s: float = 0.0
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def success_rate(self) -> float:
        """Passed cases divided by total cases (denominator is at least 1)."""
        # Clamp so an empty report yields 0.0 rather than dividing by zero.
        denominator = max(1, self.total_cases)
        return self.passed_cases / denominator

    @property
    def median_score(self) -> float:
        """Median of ``scores``; 0.0 when there are no scores."""
        if not self.scores:
            return 0.0
        return statistics.median(self.scores)

    @property
    def std_dev(self) -> float:
        """Sample standard deviation of ``scores``; 0.0 with fewer than two."""
        if len(self.scores) < 2:
            return 0.0
        return statistics.stdev(self.scores)

103 

104 

105# ── Scorers ──────────────────────────────────────────────────────── 

106 

class Scorer:
    """Interface that concrete scorers implement.

    Both members raise ``NotImplementedError`` here; subclasses are expected
    to override them.
    """

    @property
    def metric(self) -> EvalMetric:
        """The metric this scorer computes."""
        raise NotImplementedError

    def score(self, generated: str, expected: str) -> float:
        """Score ``generated`` against ``expected``."""
        raise NotImplementedError

116 

117 

class ExactMatchScorer(Scorer):
    """Scorer that awards 1.0 for an exact string match and 0.0 otherwise."""

    @property
    def metric(self) -> EvalMetric:
        """Always ``EvalMetric.EXACT_MATCH``."""
        return EvalMetric.EXACT_MATCH

    def score(self, generated: str, expected: str) -> float:
        """Return 1.0 when the texts match, else 0.0.

        With an empty ``expected`` the result is 1.0 only if ``generated`` is
        empty too. Otherwise both strings are compared after stripping
        leading and trailing whitespace.
        """
        if expected:
            matched = generated.strip() == expected.strip()
        else:
            matched = not generated
        return 1.0 if matched else 0.0

129 

130 

class F1Scorer(Scorer):
    """Token-level F1 scorer.

    Texts are lower-cased and split on whitespace. Overlap is counted as a
    multiset intersection, so a token repeated in one text only matches as
    many times as it occurs in the other.
    """

    @property
    def metric(self) -> EvalMetric:
        """Always ``EvalMetric.F1``."""
        return EvalMetric.F1

    def score(self, generated: str, expected: str) -> float:
        """Return the token-overlap F1 of ``generated`` against ``expected``.

        Args:
            generated: Text produced by the system under evaluation.
            expected: Reference text.

        Returns:
            A value in [0.0, 1.0]. With an empty ``expected`` the result is
            1.0 only if ``generated`` is empty too. If either side has no
            tokens, or no token is shared, the result is 0.0.
        """
        # Imported locally so this block does not depend on the module's
        # import section.
        from collections import Counter

        if not expected:
            return 1.0 if not generated else 0.0

        gen_counts = Counter(generated.lower().split())
        exp_counts = Counter(expected.lower().split())
        if not gen_counts or not exp_counts:
            return 0.0

        # Counter & Counter keeps the minimum count per token, i.e. the
        # number of token occurrences the two texts really share.
        overlap = sum((gen_counts & exp_counts).values())
        if overlap == 0:
            return 0.0

        precision = overlap / sum(gen_counts.values())
        recall = overlap / sum(exp_counts.values())
        return 2 * precision * recall / (precision + recall)

155 

156 

class ROUGELScorer(Scorer):
    """ROUGE-L scorer based on the longest common subsequence of tokens."""

    @property
    def metric(self) -> EvalMetric:
        """Always ``EvalMetric.ROUGE_L``."""
        return EvalMetric.ROUGE_L

    @staticmethod
    def _lcs_len(a: list[str], b: list[str]) -> int:
        """Return the length of the longest common subsequence of ``a`` and ``b``.

        Only two DP rows are kept, so memory is proportional to the shorter
        sequence instead of to ``len(a) * len(b)``.
        """
        # LCS length is symmetric; put the shorter sequence on the row axis.
        if len(b) > len(a):
            a, b = b, a

        prev = [0] * (len(b) + 1)
        for tok_a in a:
            curr = [0]
            for j, tok_b in enumerate(b, start=1):
                if tok_a == tok_b:
                    curr.append(prev[j - 1] + 1)
                else:
                    curr.append(max(prev[j], curr[j - 1]))
            prev = curr
        return prev[-1]

    def score(self, generated: str, expected: str) -> float:
        """Return the LCS-based F-measure of ``generated`` against ``expected``.

        Texts are lower-cased and split on whitespace before comparison.

        Args:
            generated: Text produced by the system under evaluation.
            expected: Reference text.

        Returns:
            A value in [0.0, 1.0]. With an empty ``expected`` the result is
            1.0 only if ``generated`` is empty too. If either side has no
            tokens, or the LCS is empty, the result is 0.0.
        """
        if not expected:
            return 1.0 if not generated else 0.0

        gen_tokens = generated.lower().split()
        exp_tokens = expected.lower().split()
        if not gen_tokens or not exp_tokens:
            return 0.0

        lcs = self._lcs_len(gen_tokens, exp_tokens)
        if lcs == 0:
            return 0.0

        # Both token lists are non-empty here, so the divisions are safe.
        precision = lcs / len(gen_tokens)
        recall = lcs / len(exp_tokens)
        return 2 * precision * recall / (precision + recall)

193 

194 

def get_scorer(metric: EvalMetric) -> Scorer:
    """Build the scorer that implements ``metric``.

    Only exact match, F1 and ROUGE-L have dedicated scorers; any other
    metric yields an ``ExactMatchScorer``. A new instance is returned on
    every call.
    """
    scorer_types = {
        EvalMetric.EXACT_MATCH: ExactMatchScorer,
        EvalMetric.F1: F1Scorer,
        EvalMetric.ROUGE_L: ROUGELScorer,
    }
    scorer_cls = scorer_types.get(metric, ExactMatchScorer)
    return scorer_cls()

202 

203 

204# ── SWE-bench Loader ────────────────────────────────────────────── 

205 

class SWEBenchLoader:
    """Load and parse SWE-bench dataset.

    SWE-bench format: each instance is a GitHub issue with a known fix.
    Agent must produce a patch that passes the test suite.
    """

    @staticmethod
    def load(path: str | Path, subset: str = "lite") -> list[EvalCase]:
        """Load SWE-bench instances from a JSON/JSONL file.

        Files are read as UTF-8 regardless of the platform default encoding.

        Args:
            path: Location of a ``.jsonl`` file (one JSON object per
                non-blank line) or a ``.json`` file (either a list of
                instances or an object with an ``"instances"`` list).
            subset: Passed to ``_parse``; selects the suite tag.

        Returns:
            One ``EvalCase`` per instance. Files with any other suffix
            yield an empty list.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"SWE-bench dataset not found: {path}")

        if path.suffix == ".jsonl":
            with path.open(encoding="utf-8") as f:
                instances = [json.loads(line) for line in f if line.strip()]
        elif path.suffix == ".json":
            data = json.loads(path.read_text(encoding="utf-8"))
            instances = data if isinstance(data, list) else data.get("instances", [])
        else:
            # Unsupported suffix: nothing is loaded.
            instances = []

        return [SWEBenchLoader._parse(instance, subset) for instance in instances]

    @staticmethod
    def _parse(instance: dict, subset: str) -> EvalCase:
        """Convert one raw SWE-bench record into an ``EvalCase``.

        Missing keys fall back to empty strings. The prompt comes from
        ``"problem_statement"`` (or ``"issue"``), the test patch from
        ``"test_patch"`` (or ``"patch"``). The suite is
        ``SWE_BENCH_LITE`` when ``subset`` contains ``"lite"``, else
        ``SWE_BENCH``.
        """
        return EvalCase(
            id=instance.get("instance_id", ""),
            suite=EvalSuite.SWE_BENCH_LITE if "lite" in subset else EvalSuite.SWE_BENCH,
            prompt=instance.get("problem_statement", instance.get("issue", "")),
            expected="",  # SWE-bench doesn't have expected text; it has a test patch
            repo=instance.get("repo", ""),
            base_commit=instance.get("base_commit", ""),
            test_patch=instance.get("test_patch", instance.get("patch", "")),
            metadata={
                "hints_text": instance.get("hints_text", ""),
                "version": instance.get("version", ""),
            },
        )

251 

252 

253# ── GAIA Loader ──────────────────────────────────────────────────── 

254 

class GAIALoader:
    """Load and parse GAIA benchmark dataset.

    GAIA: multi-step reasoning benchmark with 466 questions.
    Levels: L1 (simple), L2 (medium), L3 (complex).
    """

    @staticmethod
    def load(path: str | Path, level: str | None = None) -> list[EvalCase]:
        """Load GAIA questions from JSON/JSONL.

        Files are read as UTF-8 regardless of the platform default encoding.

        Args:
            path: Location of a ``.jsonl`` file (one JSON object per
                non-blank line) or a ``.json`` file (either a list of
                questions or an object with a ``"questions"`` list).
            level: When truthy, keep only questions whose ``"Level"`` field
                matches. Both sides are compared as strings, so a record
                with ``"Level": 1`` matches ``level="1"``.

        Returns:
            One ``EvalCase`` per retained question. Cases are tagged as the
            validation suite when the file name contains ``"validation"``.
            Files with any other suffix yield an empty list.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"GAIA dataset not found: {path}")

        if path.suffix == ".jsonl":
            with path.open(encoding="utf-8") as f:
                questions = [json.loads(line) for line in f if line.strip()]
        elif path.suffix == ".json":
            data = json.loads(path.read_text(encoding="utf-8"))
            questions = data if isinstance(data, list) else data.get("questions", [])
        else:
            # Unsupported suffix: nothing is loaded.
            questions = []

        # Normalise to str so integer levels in the data match a str filter.
        wanted = str(level) if level else None
        is_val = "validation" in path.name
        return [
            GAIALoader._parse(q, is_val)
            for q in questions
            if wanted is None or str(q.get("Level", "")) == wanted
        ]

    @staticmethod
    def _parse(q: dict, is_val: bool) -> EvalCase:
        """Convert one raw GAIA record into an ``EvalCase``.

        Missing keys fall back to empty strings. The id comes from
        ``"task_id"`` (or ``"id"``), the prompt from ``"Question"`` (or
        ``"question"``) and the reference from ``"Final answer"`` (or
        ``"answer"``).
        """
        return EvalCase(
            id=q.get("task_id", q.get("id", "")),
            suite=EvalSuite.GAIA_VAL if is_val else EvalSuite.GAIA,
            prompt=q.get("Question", q.get("question", "")),
            expected=q.get("Final answer", q.get("answer", "")),
            metadata={
                "level": q.get("Level", ""),
                "annotator_metadata": q.get("Annotator Metadata", ""),
            },
        )

301 

302 

303# ── Evaluation Runner ────────────────────────────────────────────── 

304 

class EvalRunner:
    """Run evaluations over multiple cases with pass@k support.

    Usage:
        runner = EvalRunner(generate_fn=my_agent.generate)
        report = runner.run(cases, k=3, metric=EvalMetric.EXACT_MATCH)
    """

    # A sample or case counts as passed when its score reaches this value.
    PASS_THRESHOLD: float = 0.5

    def __init__(
        self,
        generate_fn: Callable[[str], str],
        scorer: Scorer | None = None,
    ):
        """
        Args:
            generate_fn: Function (prompt) -> generated_text
            scorer: Optional scorer override
        """
        self.generate = generate_fn
        self.scorer = scorer

    def run(
        self,
        cases: list[EvalCase],
        k: int = 1,
        metric: EvalMetric = EvalMetric.EXACT_MATCH,
        on_case_start: Callable[[EvalCase], None] | None = None,
        on_case_end: Callable[[EvalResult], None] | None = None,
    ) -> EvalReport:
        """Run evaluation on a list of cases.

        Args:
            cases: Evaluation cases
            k: Number of samples per case (for pass@k)
            metric: Scoring metric
            on_case_start: Callback before each case
            on_case_end: Callback after each case

        Returns:
            EvalReport with aggregated results. The report's suite is taken
            from the first case (``EvalSuite.CUSTOM`` when ``cases`` is
            empty). A case counts as passed when its best sample score
            reaches ``PASS_THRESHOLD``.
        """
        # perf_counter is monotonic, so durations are immune to clock changes.
        start_time = time.perf_counter()
        scorer = self.scorer if self.scorer is not None else get_scorer(metric)
        results: list[EvalResult] = []

        for case in cases:
            if on_case_start:
                on_case_start(case)

            result = self._run_case(case, k, metric, scorer)
            results.append(result)

            if on_case_end:
                on_case_end(result)

        scores_list = [r.score for r in results]
        passed = sum(1 for s in scores_list if s >= self.PASS_THRESHOLD)

        return EvalReport(
            suite=cases[0].suite if cases else EvalSuite.CUSTOM,
            total_cases=len(cases),
            passed_cases=passed,
            avg_score=sum(scores_list) / max(len(scores_list), 1),
            scores=scores_list,
            metric=metric,
            results=results,
            duration_s=time.perf_counter() - start_time,
        )

    def _run_case(
        self,
        case: EvalCase,
        k: int,
        metric: EvalMetric,
        scorer: Scorer,
    ) -> EvalResult:
        """Generate and score ``k`` samples for one case.

        A sample whose generation or scoring raises is recorded with score
        0.0 and the exception text under ``metadata["error"]``. The text of
        the last such exception is also stored on the result. The case
        score is the best sample score (0.0 when there are no samples).
        """
        samples: list[EvalSample] = []
        error = ""

        for i in range(k):
            try:
                t0 = time.perf_counter()
                generated = self.generate(case.prompt)
                latency = (time.perf_counter() - t0) * 1000
                s = scorer.score(generated, case.expected)
            except Exception as e:
                # Best effort: one failing sample must not abort the run.
                error = str(e)
                samples.append(EvalSample(
                    case_id=case.id, sample_index=i,
                    generated="", score=0.0, passed=False,
                    latency_ms=0.0, metadata={"error": error},
                ))
            else:
                samples.append(EvalSample(
                    case_id=case.id, sample_index=i,
                    generated=generated, score=s,
                    passed=s >= self.PASS_THRESHOLD, latency_ms=latency,
                ))

        # Use max score for the case score.
        case_score = max((sample.score for sample in samples), default=0.0)

        return EvalResult(
            case_id=case.id, suite=case.suite,
            metric=metric, score=case_score,
            samples=samples, error=error,
        )

    def run_pass_at_k(
        self,
        cases: list[EvalCase],
        k: int = 5,
        metric: EvalMetric = EvalMetric.EXACT_MATCH,
    ) -> EvalReport:
        """Run pass@k evaluation (shorthand)."""
        return self.run(cases, k=k, metric=metric)

    def print_report(self, report: EvalReport) -> str:
        """Generate a human-readable report string.

        Despite the name, nothing is printed; the text is returned. Per-case
        lines are appended only when the report has between 1 and 20 results.
        """
        lines = [
            "╔══ Evaluation Report ══╗",
            f"║ Suite: {report.suite.value:<20} ║",
            f"║ Metric: {report.metric.value:<20} ║",
            f"║ Cases: {report.total_cases:<20} ║",
            f"║ Passed: {report.passed_cases} ({report.success_rate:.1%})",
            f"║ Avg Score:{report.avg_score:.4f}",
            f"║ Median: {report.median_score:.4f} ║",
            f"║ Std Dev: {report.std_dev:.4f} ║",
            f"║ Time: {report.duration_s:.1f}s",
            "╚════════════════════════╝",
        ]
        if report.results and len(report.results) <= 20:
            lines.append("\nPer-case scores:")
            for r in report.results:
                icon = "✓" if r.score >= self.PASS_THRESHOLD else "✗"
                lines.append(f" {icon} {r.case_id[:40]:<42} {r.score:.3f}")

        return "\n".join(lines)

440 

441 

442# ── Eval Registry ────────────────────────────────────────────────── 

443 

class EvalRegistry:
    """In-memory lookup of named evaluation suites and named scorers."""

    def __init__(self):
        """Start with no suites and no scorers registered."""
        self._suites: dict[str, list[EvalCase]] = {}
        self._scorers: dict[str, Scorer] = {}

    def register_suite(self, name: str, cases: list[EvalCase]) -> None:
        """Store ``cases`` under ``name``, replacing any previous entry."""
        self._suites[name] = cases

    def register_scorer(self, name: str, scorer: Scorer) -> None:
        """Store ``scorer`` under ``name``, replacing any previous entry."""
        self._scorers[name] = scorer

    def get_suite(self, name: str) -> list[EvalCase]:
        """Return the cases registered as ``name``.

        Raises:
            KeyError: If no suite with that name was registered.
        """
        if name in self._suites:
            return self._suites[name]
        raise KeyError(f"Unknown eval suite: {name}")

    def get_scorer(self, name: str) -> Scorer:
        """Return the scorer registered as ``name``.

        Unknown names do not raise; they yield the module-level default
        scorer for ``EvalMetric.EXACT_MATCH``.
        """
        if name in self._scorers:
            return self._scorers[name]
        return get_scorer(EvalMetric.EXACT_MATCH)

    def list_suites(self) -> list[str]:
        """Return the registered suite names in insertion order."""
        return [*self._suites]

467 

468 

469# ── Quick Eval Helpers ───────────────────────────────────────────── 

470 

def evaluate_quick(
    generate_fn: Callable[[str], str],
    cases: list[dict[str, str]],
    metric: EvalMetric = EvalMetric.EXACT_MATCH,
    k: int = 1,
) -> EvalReport:
    """Evaluate ``generate_fn`` on plain ``{"prompt", "expected"}`` dicts.

    Each dict becomes a custom-suite case whose id is its position in
    ``cases`` as a string. The cases are then run through a fresh
    ``EvalRunner`` with the given ``k`` and ``metric``.
    """
    eval_cases = []
    for index, spec in enumerate(cases):
        eval_cases.append(
            EvalCase(
                id=str(index),
                suite=EvalSuite.CUSTOM,
                prompt=spec["prompt"],
                expected=spec["expected"],
            )
        )
    return EvalRunner(generate_fn).run(eval_cases, k=k, metric=metric)