Coverage for agentos/evaluation/benchmark.py: 0%

68 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-07-02 09:59 +0800

1""" 

2AgentOS v0.20 评测框架。 

3支持 SWE-bench、Tool-use 等基准测试。 

4""" 

5 

6from __future__ import annotations 

7 

8from dataclasses import dataclass, field 

9from typing import Any, Callable 

10 

11 

def _default_metrics() -> list[str]:
    """Return a fresh default metric list for a new ``BenchmarkCase``."""
    return ["accuracy"]


@dataclass
class BenchmarkCase:
    """One task in a benchmark suite, plus what a correct answer looks like.

    Attributes:
        id: Identifier of the case; copied into ``EvalResult.case_id``.
        task: Prompt text handed to the agent.
        expected_output: Reference answer to score against, or ``None`` when
            there is no textual reference.
        expected_tools: Names of tools the agent is expected to use, or
            ``None`` when not applicable.
        ground_truth: Free-form metadata. ``Evaluator._check`` reads its
            ``"category"`` key to choose a scoring strategy.
        metrics: Metric names attached to the case; defaults to
            ``["accuracy"]``. Each instance gets its own list.
    """

    # Required fields (positional order is part of the constructor signature).
    id: str
    task: str

    # Optional expectations.
    expected_output: str | None = None
    expected_tools: list[str] | None = None

    # Mutable defaults are built per instance through factories.
    ground_truth: dict[str, Any] = field(default_factory=dict)
    metrics: list[str] = field(default_factory=_default_metrics)

21 

22 

@dataclass
class EvalResult:
    """Outcome of running a single benchmark case.

    Attributes:
        case_id: Identifier of the ``BenchmarkCase`` this result belongs to.
        passed: Whether the case was judged successful.
        score: Numeric score for the case.
        output: Text produced by the agent (or the error text on failure).
        expected: Reference answer the output was compared with, if any.
        metrics: Per-metric numeric scores; empty by default, and each
            instance gets its own dict.
        duration_ms: Elapsed time of the run in milliseconds.
    """

    # Required fields (positional order is part of the constructor signature).
    case_id: str
    passed: bool
    score: float
    output: str

    # Optional details.
    expected: str | None = None
    metrics: dict[str, float] = field(default_factory=dict)
    duration_ms: float = 0.0

33 

34 

class Evaluator:
    """Evaluation runner: executes benchmark cases against an agent and scores them.

    Attributes:
        agent: The agent loop under test. ``evaluate`` awaits
            ``agent.run(task)`` and reads ``.output`` from what it returns.
        results: Results of the most recent ``evaluate`` call.
    """

    def __init__(self, agent_loop: Any):
        self.agent = agent_loop
        self.results: list[EvalResult] = []

    async def evaluate(self, benchmark: list[BenchmarkCase]) -> list[EvalResult]:
        """Run all benchmark cases sequentially and return their results.

        Any ``Exception`` raised while running or scoring a case is recorded
        as a failed result (score ``0.0``, error text as output) so that one
        broken case does not abort the suite.

        Args:
            benchmark: Cases to run, in order.

        Returns:
            One ``EvalResult`` per case; the same list is kept in
            ``self.results`` (previous results are discarded).
        """
        import time

        self.results = []
        for case in benchmark:
            # perf_counter is monotonic, so durations are immune to
            # wall-clock adjustments during a run.
            started = time.perf_counter()
            try:
                result = await self.agent.run(case.task)
                raw_output = result.output
                # Normalise to text: a None / non-str output would otherwise
                # raise at the truncating slice below, outside this handler.
                output = "" if raw_output is None else str(raw_output)
                passed = self._check(output, case)
                # NOTE(review): the score is binary, so avg_score currently
                # always equals pass_rate.
                score = 1.0 if passed else 0.0
            except Exception as e:
                output = str(e)
                passed = False
                score = 0.0

            duration_ms = (time.perf_counter() - started) * 1000
            self.results.append(EvalResult(
                case_id=case.id,
                passed=passed,
                score=score,
                output=output[:2000],  # cap the stored output size
                expected=case.expected_output,
                duration_ms=duration_ms,
            ))

        return self.results

    def _check(self, output: str, case: BenchmarkCase) -> bool:
        """Decide whether ``output`` passes ``case``.

        Cases without an expected output pass unconditionally. Otherwise the
        output is scored by a ``CompositeScorer`` whose strategy is chosen
        from ``case.ground_truth["category"]`` (default ``"qa"``).

        Side effect: names of the detailed scores reported by the scorer are
        added to ``case.metrics`` (each name at most once).

        Args:
            output: Text produced by the agent.
            case: The benchmark case being checked.

        Returns:
            The scorer's pass verdict, or ``True`` when there is nothing to
            compare against.
        """
        # NOTE(review): ``expected_tools`` is not inspected here, so tool-use
        # cases that only set ``expected_tools`` always pass.
        if not case.expected_output:
            return True

        from agentos.evaluation.scorers import (
            CompositeScorer, ScoringStrategy, STRATEGY_QA, STRATEGY_CODE_GEN,
            STRATEGY_SUMMARY, STRATEGY_TRANSLATION,
        )

        # Select strategy based on case category.
        category = case.ground_truth.get("category", "qa")
        strategy_map = {
            "qa": STRATEGY_QA,
            "code": STRATEGY_CODE_GEN,
            "summary": STRATEGY_SUMMARY,
            "translation": STRATEGY_TRANSLATION,
        }
        strategy = strategy_map.get(category)
        if strategy is None:
            # Unknown category: build the generic fallback only when needed.
            strategy = ScoringStrategy(
                weights={"rouge_l": 0.3, "bleu": 0.1, "contains": 0.4, "exact": 0.2},
                pass_threshold=0.5,
            )

        scorer = CompositeScorer(strategy)
        result = scorer.score(case.expected_output, output)

        # Record which detailed scores were produced. Skip names already
        # present so that re-running the same case does not grow the list.
        detailed_scores = getattr(result, "scores", None)
        if detailed_scores:
            for name in detailed_scores:
                if name not in case.metrics:
                    case.metrics.append(name)

        return result.passed

    @property
    def pass_rate(self) -> float:
        """Fraction of results that passed; ``0.0`` when there are none."""
        if not self.results:
            return 0.0
        return sum(1 for r in self.results if r.passed) / len(self.results)

    @property
    def avg_score(self) -> float:
        """Mean score over all results; ``0.0`` when there are none."""
        if not self.results:
            return 0.0
        return sum(r.score for r in self.results) / len(self.results)

    def summary(self) -> str:
        """Return a four-line report: total cases, passed, pass rate, average score."""
        return (
            f"总用例: {len(self.results)}\n"
            f"通过: {sum(1 for r in self.results if r.passed)}\n"
            f"通过率: {self.pass_rate:.1%}\n"
            f"平均分: {self.avg_score:.2f}"
        )

123 

124 

125# ── 内置基准 ──────────────────────────────────── 

126 

def builtin_benchmarks() -> list[BenchmarkCase]:
    """Return a freshly built list of the built-in benchmark cases.

    The suite holds text-answer cases in four scoring categories (qa, code,
    summary, translation) followed by two tool-use cases, which are tagged
    with the ``"qa"`` category and carry no expected output.
    """
    # category -> rows of (case id, task prompt, expected output)
    text_cases = {
        "qa": [
            ("qa_math_1", "1+1等于几?只回答数字。", "2"),
            ("qa_fact_1", "法国的首都是哪里?", "Paris"),
            ("qa_fact_2", "水的沸点是多少度?", "100"),
            ("qa_fact_3", "太阳系最大的行星是?", "木星"),
            ("qa_define_1", "什么是人工智能?", "人工智能"),
        ],
        "code": [
            ("code_fib", "写一个Python函数计算斐波那契数列第n项。", "def fibonacci"),
            ("code_sort", "用Python写一个列表排序函数。", "def sort"),
            ("code_read", "如何用Python读取文件?", "open"),
        ],
        "summary": [
            ("sum_short", "用一句话总结:地球是太阳系第三颗行星,拥有液态水和大气层。", "地球"),
            ("sum_tech", "总结Python的主要特点。", "Python"),
        ],
        "translation": [
            ("trans_en2zh", "把Hello翻译成中文。", "你好"),
            ("trans_zh2en", "把谢谢翻译成英文。", "thank you"),
        ],
    }

    suite: list[BenchmarkCase] = []
    for category, rows in text_cases.items():
        for case_id, prompt, answer in rows:
            suite.append(BenchmarkCase(
                id=case_id,
                task=prompt,
                expected_output=answer,
                ground_truth={"category": category},
            ))

    # Tool-use cases: no reference text to compare against.
    suite.append(BenchmarkCase(
        id="tool_shell",
        task="列出当前目录的文件。使用shell工具。",
        expected_tools=["shell"],
        ground_truth={"category": "qa"},
    ))
    suite.append(BenchmarkCase(
        id="multi_step",
        task="先创建目录test_dir,再创建hello.txt并写入内容。",
        ground_truth={"category": "qa"},
    ))
    return suite