Coverage for agentos/evaluation/benchmark.py: 0%
68 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-07-02 09:59 +0800
« prev ^ index » next coverage.py v7.14.3, created at 2026-07-02 09:59 +0800
1"""
2AgentOS v0.20 评测框架。
3支持 SWE-bench、Tool-use 等基准测试。
4"""
6from __future__ import annotations
8from dataclasses import dataclass, field
9from typing import Any, Callable
12@dataclass
13class BenchmarkCase:
14 """A single benchmark evaluation case."""
15 id: str
16 task: str
17 expected_output: str | None = None
18 expected_tools: list[str] | None = None
19 ground_truth: dict[str, Any] = field(default_factory=dict)
20 metrics: list[str] = field(default_factory=lambda: ["accuracy"])
23@dataclass
24class EvalResult:
25 """Result of a benchmark evaluation run."""
26 case_id: str
27 passed: bool
28 score: float
29 output: str
30 expected: str | None = None
31 metrics: dict[str, float] = field(default_factory=dict)
32 duration_ms: float = 0.0
class Evaluator:
    """Run benchmark cases against an agent and collect per-case results.

    The only part of the agent this class touches is an awaitable
    ``run(task)`` whose return value exposes an ``output`` attribute.

    Attributes:
        agent: The agent object the cases are run against.
        results: Results of the most recent :meth:`evaluate` call.
    """

    # Maximum number of output characters kept on each EvalResult.
    MAX_OUTPUT_CHARS = 2000

    def __init__(self, agent_loop: Any):
        """Store the agent and start with an empty result list."""
        self.agent = agent_loop
        self.results: list[EvalResult] = []

    async def evaluate(self, benchmark: list[BenchmarkCase]) -> list[EvalResult]:
        """Run every case in *benchmark* sequentially and return the results.

        Any ``Exception`` raised while running or scoring a case is caught:
        the case is recorded as failed with the exception text as its output,
        and the run continues with the next case.

        Args:
            benchmark: Cases to run, in order. The cases are not modified.

        Returns:
            One ``EvalResult`` per case, in input order. The same list is
            stored on ``self.results``, replacing any earlier results.
        """
        import time

        self.results = []
        for case in benchmark:
            metrics: dict[str, float] = {}
            # perf_counter is monotonic, so a system clock adjustment during
            # the run cannot yield a negative or inflated duration.
            start = time.perf_counter()
            try:
                result = await self.agent.run(case.task)
                # Coerce before slicing below: a None or non-string output
                # must not raise outside this try and abort the whole run.
                output = self._as_text(result.output)
                passed, metrics = self._score(output, case)
            except Exception as e:
                # Best effort: one failing case must not stop the others.
                output = str(e)
                passed = False

            duration_ms = (time.perf_counter() - start) * 1000
            self.results.append(EvalResult(
                case_id=case.id,
                passed=passed,
                # The score is binary; the per-metric values are in `metrics`.
                score=1.0 if passed else 0.0,
                output=output[:self.MAX_OUTPUT_CHARS],
                expected=case.expected_output,
                metrics=metrics,
                duration_ms=duration_ms,
            ))

        return self.results

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return *value* as ``str``; ``None`` becomes the empty string."""
        if isinstance(value, str):
            return value
        return "" if value is None else str(value)

    def _score(self, output: str, case: BenchmarkCase) -> tuple[bool, dict[str, float]]:
        """Score *output* against *case* without modifying the case.

        Returns:
            ``(passed, scores)`` where ``scores`` is a copy of the scorer
            result's ``scores`` mapping, or ``{}`` when there is none.
        """
        # NOTE(review): a case without expected_output passes automatically,
        # including cases that only set expected_tools; tool usage is never
        # inspected here.
        if not case.expected_output:
            return True, {}

        # Imported lazily, as in the original code.
        from agentos.evaluation.scorers import (
            CompositeScorer, ScoringStrategy, STRATEGY_QA, STRATEGY_CODE_GEN,
            STRATEGY_SUMMARY, STRATEGY_TRANSLATION,
        )

        # Select the strategy from the case category ("qa" when unset).
        category = case.ground_truth.get("category", "qa")
        strategy_map = {
            "qa": STRATEGY_QA,
            "code": STRATEGY_CODE_GEN,
            "summary": STRATEGY_SUMMARY,
            "translation": STRATEGY_TRANSLATION,
        }
        strategy = strategy_map.get(category)
        if strategy is None:
            # Unknown category: build the generic fallback only when needed.
            strategy = ScoringStrategy(
                weights={"rouge_l": 0.3, "bleu": 0.1, "contains": 0.4, "exact": 0.2},
                pass_threshold=0.5,
            )

        result = CompositeScorer(strategy).score(case.expected_output, output)

        # Copy the detailed scores so the caller can attach them to the
        # EvalResult instead of appending names to the shared case.metrics.
        details = getattr(result, "scores", None)
        return result.passed, (dict(details) if details else {})

    def _check(self, output: str, case: BenchmarkCase) -> bool:
        """Return whether *output* passes *case* (verdict only, no side effects)."""
        passed, _ = self._score(output, case)
        return passed

    @property
    def pass_rate(self) -> float:
        """Fraction of stored results that passed; ``0.0`` with no results."""
        if not self.results:
            return 0.0
        return sum(1 for r in self.results if r.passed) / len(self.results)

    @property
    def avg_score(self) -> float:
        """Mean score over the stored results; ``0.0`` with no results."""
        if not self.results:
            return 0.0
        return sum(r.score for r in self.results) / len(self.results)

    def summary(self) -> str:
        """Return a four-line report: total, passed, pass rate, average score."""
        return (
            f"总用例: {len(self.results)}\n"
            f"通过: {sum(1 for r in self.results if r.passed)}\n"
            f"通过率: {self.pass_rate:.1%}\n"
            f"平均分: {self.avg_score:.2f}"
        )
125# ── 内置基准 ────────────────────────────────────
def builtin_benchmarks() -> list[BenchmarkCase]:
    """Return the built-in benchmark suite as a new list.

    The suite holds fourteen cases in a fixed order: five QA, three code,
    two summary, two translation and two tool-use cases. Every case stores
    its scoring category under ``ground_truth["category"]``.
    """
    # (case id, task prompt, expected output, scoring category)
    text_cases = (
        # QA
        ("qa_math_1", "1+1等于几?只回答数字。", "2", "qa"),
        ("qa_fact_1", "法国的首都是哪里?", "Paris", "qa"),
        ("qa_fact_2", "水的沸点是多少度?", "100", "qa"),
        ("qa_fact_3", "太阳系最大的行星是?", "木星", "qa"),
        ("qa_define_1", "什么是人工智能?", "人工智能", "qa"),
        # Code
        ("code_fib", "写一个Python函数计算斐波那契数列第n项。", "def fibonacci", "code"),
        ("code_sort", "用Python写一个列表排序函数。", "def sort", "code"),
        ("code_read", "如何用Python读取文件?", "open", "code"),
        # Summary
        ("sum_short", "用一句话总结:地球是太阳系第三颗行星,拥有液态水和大气层。", "地球", "summary"),
        ("sum_tech", "总结Python的主要特点。", "Python", "summary"),
        # Translation
        ("trans_en2zh", "把Hello翻译成中文。", "你好", "translation"),
        ("trans_zh2en", "把谢谢翻译成英文。", "thank you", "translation"),
    )
    suite = [
        BenchmarkCase(
            id=case_id,
            task=prompt,
            expected_output=expected,
            ground_truth={"category": category},
        )
        for case_id, prompt, expected, category in text_cases
    ]

    # Tool-use cases define no expected output; both use the "qa" category.
    suite.append(BenchmarkCase(
        id="tool_shell",
        task="列出当前目录的文件。使用shell工具。",
        expected_tools=["shell"],
        ground_truth={"category": "qa"},
    ))
    suite.append(BenchmarkCase(
        id="multi_step",
        task="先创建目录test_dir,再创建hello.txt并写入内容。",
        ground_truth={"category": "qa"},
    ))
    return suite