Coverage for little_loops / fsm / evaluators.py: 96%

272 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-05-22 16:19 -0500

1"""FSM Evaluators for loop execution. 

2 

3This module provides evaluators that interpret action output and produce 

4verdicts for state transitions. 

5 

6Supported evaluator types: 

7 

8Tier 1 (Deterministic - no API calls): 

9 exit_code: Map Unix exit codes to verdicts (0=success, 1=failure, 2+=error) 

10 output_numeric: Compare numeric output to target value 

11 output_json: Extract and compare JSON path values 

12 output_contains: Pattern matching on stdout 

13 convergence: Track progress toward a target value 

14 diff_stall: Detect stalled iterations via git diff comparison 

15 harbor_scorer: Interpret Harbor-format benchmark scorer exit code and float stdout 

16 

17Tier 2 (LLM-based): 

18 llm_structured: Use LLM with structured output for natural language evaluation 

19 

20Tier 3 (External process): 

21 mcp_result: Parse MCP tool call response envelope 

22""" 

23 

24from __future__ import annotations 

25 

26import hashlib 

27import json 

28import re 

29import subprocess 

30import time 

31from collections.abc import Callable 

32from dataclasses import dataclass 

33from pathlib import Path 

34from typing import Any 

35 

36from little_loops.fsm.interpolation import ( 

37 InterpolationContext, 

38 InterpolationError, 

39 interpolate, 

40) 

41from little_loops.fsm.schema import DEFAULT_LLM_MODEL, EvaluateConfig 

42from little_loops.host_runner import resolve_host 

43 

44 

@dataclass
class EvaluationResult:
    """Outcome produced by a single evaluator run.

    Attributes:
        verdict: Routing key used to pick the next state transition.
        details: Evaluator-specific metadata kept for debugging and logging.
    """

    verdict: str
    details: dict[str, Any]

56 

57 

# JSON schema used for LLM structured evaluation when the caller supplies none.
DEFAULT_LLM_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "verdict": {
            "type": "string",
            "enum": ["yes", "no", "blocked", "partial"],
            # One bullet per allowed verdict, newline separated.
            "description": "\n".join(
                [
                    "- yes: The condition/check evaluated to true",
                    "- no: The condition/check evaluated to false",
                    "- blocked: Cannot proceed without external help",
                    "- partial: Made progress but not complete",
                ]
            ),
        },
        "confidence": {
            "type": "number",
            "minimum": 0,
            "maximum": 1,
            "description": "Confidence in this verdict (0-1)",
        },
        "reason": {
            "type": "string",
            "description": "Brief explanation",
        },
    },
    "required": ["verdict", "confidence", "reason"],
}

# Evaluation prompt used when the caller supplies none.
DEFAULT_LLM_PROMPT = "Evaluate whether this action succeeded based on its output."

87 

# Comparison predicates keyed by operator name; each is called as (value, target).
_NUMERIC_OPERATORS: dict[str, Callable[[float, float], bool]] = {
    "eq": lambda value, target: value == target,
    "ne": lambda value, target: value != target,
    "lt": lambda value, target: value < target,
    "le": lambda value, target: value <= target,
    "gt": lambda value, target: value > target,
    "ge": lambda value, target: value >= target,
}

96 

97 

def evaluate_exit_code(exit_code: int) -> EvaluationResult:
    """Translate a process exit status into a routing verdict.

    Args:
        exit_code: Exit status reported by the action's process.

    Returns:
        EvaluationResult whose verdict is:
            - ``yes`` when the exit code is 0
            - ``no`` when the exit code is 1
            - ``error`` for every other exit code
    """
    if exit_code == 0:
        outcome = "yes"
    else:
        outcome = "no" if exit_code == 1 else "error"
    return EvaluationResult(verdict=outcome, details={"exit_code": exit_code})

118 

119 

def evaluate_output_numeric(
    output: str,
    operator: str,
    target: float,
) -> EvaluationResult:
    """Interpret stdout as a number and test it against ``target``.

    Args:
        output: Action stdout; surrounding whitespace is stripped before parsing.
        operator: Name of the comparison (eq, ne, lt, le, gt, ge).
        target: Value the parsed number is compared with.

    Returns:
        EvaluationResult whose verdict is:
            - ``yes`` when the comparison holds
            - ``no`` when it does not
            - ``error`` when the output is not a number or the operator is unknown
    """
    try:
        parsed = float(output.strip())
    except ValueError:
        # Only the first 100 characters are echoed back to keep details small.
        message = f"Cannot parse as number: {output[:100]}"
        return EvaluationResult(verdict="error", details={"error": message})

    # The operator is validated only after the output parsed successfully.
    comparator = _NUMERIC_OPERATORS.get(operator)
    if comparator is None:
        message = f"Unknown operator: {operator}"
        return EvaluationResult(verdict="error", details={"error": message})

    outcome = "yes" if comparator(parsed, target) else "no"
    return EvaluationResult(
        verdict=outcome,
        details={"value": parsed, "target": target, "operator": operator},
    )

157 

158 

def _extract_json_path(data: Any, path: str) -> Any:
    """Extract a value from parsed JSON using a jq-style path like '.summary.failed'.

    Each dot-separated segment is looked up as a dict key; when the current
    node is a list, a segment made of decimal digits is used as a zero-based
    index.

    Args:
        data: The parsed JSON data (dict or list)
        path: Dot-separated path, optionally starting with '.'

    Returns:
        The value at the specified path

    Raises:
        KeyError: If any segment cannot be resolved (missing key, index out of
            range, or a non-index segment applied to a list). The exception
            argument is the path without its leading '.'.
    """
    if path.startswith("."):
        path = path[1:]

    current = data
    for part in path.split("."):
        if isinstance(current, dict) and part in current:
            current = current[part]
            continue
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits that int() rejects, which would let a
        # ValueError escape instead of the documented KeyError.
        if isinstance(current, list) and part.isdecimal():
            idx = int(part)
            if 0 <= idx < len(current):
                current = current[idx]
                continue
        raise KeyError(path)
    return current

188 

189 

def _compare_values(
    value: int | float, operator: str, target: int | float, path: str
) -> EvaluationResult:
    """Apply a named numeric comparison to a value extracted from JSON.

    Args:
        value: Number found at ``path``.
        operator: Name of the comparison to apply.
        target: Number to compare against.
        path: JSON path the value came from; only echoed back in the details.

    Returns:
        EvaluationResult with ``yes``/``no`` for the comparison outcome, or
        ``error`` when the operator name is not recognised.
    """
    comparator = _NUMERIC_OPERATORS.get(operator)
    if comparator is None:
        message = f"Unknown operator: {operator}"
        return EvaluationResult(verdict="error", details={"error": message})

    outcome = "yes" if comparator(value, target) else "no"
    return EvaluationResult(
        verdict=outcome,
        details={"value": value, "path": path, "target": target, "operator": operator},
    )

215 

216 

def evaluate_output_json(
    output: str,
    path: str,
    operator: str,
    target: Any,
) -> EvaluationResult:
    """Decode stdout as JSON, pull out the value at ``path`` and compare it.

    Args:
        output: Action stdout holding a JSON document.
        path: jq-style dot path into the document (e.g., '.summary.failed').
        operator: Name of the comparison (eq, ne, lt, le, gt, ge).
        target: Value the extracted one is compared with.

    Returns:
        EvaluationResult whose verdict is:
            - ``yes`` when the comparison holds
            - ``no`` when it does not
            - ``error`` for invalid JSON, an unresolvable path, or an operator
              that is not usable with the operand types
    """
    try:
        document = json.loads(output)
    except json.JSONDecodeError as exc:
        return EvaluationResult(verdict="error", details={"error": f"Invalid JSON: {exc}"})

    try:
        extracted = _extract_json_path(document, path)
    except KeyError:
        return EvaluationResult(verdict="error", details={"error": f"Path not found: {path}"})

    # When both operands are numbers the full operator set is available.
    numeric_types = (int, float)
    if isinstance(extracted, numeric_types) and isinstance(target, numeric_types):
        return _compare_values(extracted, operator, target, path)

    # Anything else can only be tested for (in)equality.
    if operator == "eq":
        holds = extracted == target
    elif operator == "ne":
        holds = extracted != target
    else:
        message = f"Operator {operator} not supported for non-numeric values"
        return EvaluationResult(verdict="error", details={"error": message})

    return EvaluationResult(
        verdict="yes" if holds else "no",
        details={"value": extracted, "path": path, "target": target, "operator": operator},
    )

272 

273 

def evaluate_output_contains(
    output: str,
    pattern: str,
    negate: bool = False,
) -> EvaluationResult:
    """Report whether ``pattern`` occurs in the output.

    The pattern is first treated as a regular expression; if it does not
    compile, a plain substring test is used instead.

    Args:
        output: Action stdout to search.
        pattern: Regular expression or literal substring.
        negate: When true, the verdict is inverted.

    Returns:
        EvaluationResult whose verdict is:
            - ``yes`` when found and not negated, or not found and negated
            - ``no`` when found and negated, or not found and not negated
    """
    try:
        found = re.search(pattern, output) is not None
    except re.error:
        # Not a valid regex: fall back to literal containment.
        found = pattern in output

    # The verdict is "yes" exactly when the match result and negate disagree.
    outcome = "yes" if found != bool(negate) else "no"
    return EvaluationResult(
        verdict=outcome,
        details={"matched": found, "pattern": pattern, "negate": negate},
    )

311 

312 

def evaluate_convergence(
    current: float,
    previous: float | None,
    target: float,
    tolerance: float = 0,
    direction: str = "minimize",
) -> EvaluationResult:
    """Classify the current metric as on target, progressing, or stalled.

    Args:
        current: Metric value from this iteration.
        previous: Metric value from the prior iteration (None on the first one).
        target: Value the metric should reach.
        tolerance: Largest absolute distance from ``target`` still counted as a hit.
        direction: 'minimize' treats a decrease as progress; any other value
            treats an increase as progress.

    Returns:
        EvaluationResult whose verdict is:
            - ``target`` when the value is within tolerance of the target
            - ``progress`` when the value moved in the wanted direction, or
              when there is no previous value to compare with
            - ``stall`` when the value is unchanged or moved the wrong way
    """
    # Reaching the target is checked before any progress comparison.
    if abs(current - target) <= tolerance:
        return EvaluationResult(
            verdict="target",
            details={"current": current, "target": target, "delta": 0},
        )

    # Without a previous value there is nothing to compare against.
    if previous is None:
        return EvaluationResult(
            verdict="progress",
            details={"current": current, "previous": None, "target": target, "delta": None},
        )

    change = current - previous
    improving = change < 0 if direction == "minimize" else change > 0

    return EvaluationResult(
        verdict="progress" if improving else "stall",
        details={
            "current": current,
            "previous": previous,
            "target": target,
            "delta": change,
            "direction": direction,
        },
    )

376 

377 

def evaluate_diff_stall(
    scope: list[str] | None = None,
    max_stall: int = 1,
) -> EvaluationResult:
    """Detect stalled iterations by comparing git diff --stat between runs.

    On first call, snapshots the current diff and returns 'yes'.
    On subsequent calls, compares current diff to the previous snapshot.
    If the diff is identical for max_stall consecutive iterations, returns
    'no' (stalled). If different, resets the stall counter and returns
    'yes' (progress).

    State is persisted in ``.loops/tmp`` under the current working directory
    (created on demand), in files whose names contain a key derived from the
    scope argument, so different loops with different scopes maintain
    independent stall counters.

    Args:
        scope: Optional list of paths to limit the git diff to. Defaults to
            the entire working tree.
        max_stall: Number of consecutive no-change iterations before stall
            verdict. Defaults to 1.

    Returns:
        EvaluationResult with verdict:
            - yes: diff changed since last iteration (progress made), or
              unchanged but still below the max_stall threshold
            - no: diff unchanged for max_stall iterations (stalled)
            - error: git command failed, timed out, or git is not installed
    """
    cmd = ["git", "diff", "--stat"]
    if scope:
        cmd += ["--", *scope]

    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        return EvaluationResult(verdict="error", details={"error": "git diff timed out"})
    except FileNotFoundError:
        return EvaluationResult(verdict="error", details={"error": "git not found in PATH"})

    if proc.returncode != 0:
        return EvaluationResult(
            verdict="error",
            details={"error": f"git diff failed: {proc.stderr[:200]}"},
        )

    current_diff = proc.stdout

    # Derive a stable cache key from the scope so independent loops don't collide.
    # The digest is only a file-name key, so it is flagged as non-security use;
    # this keeps it working on builds that restrict MD5 (e.g. FIPS mode).
    scope_str = "|".join(sorted(scope)) if scope else "_root_"
    cache_key = hashlib.md5(scope_str.encode(), usedforsecurity=False).hexdigest()[:12]
    loops_tmp = Path.cwd() / ".loops" / "tmp"
    loops_tmp.mkdir(parents=True, exist_ok=True)
    state_file = loops_tmp / f"ll-diff-stall-{cache_key}.txt"
    count_file = loops_tmp / f"ll-diff-stall-{cache_key}.count"

    # Read previous snapshot and stall count. A missing or unparsable count
    # file leaves the counter at 0 while still using the snapshot if present.
    previous_diff: str | None = None
    stall_count = 0
    try:
        previous_diff = state_file.read_text()
        stall_count = int(count_file.read_text().strip())
    except (FileNotFoundError, ValueError):
        pass

    # First iteration: save snapshot and report progress
    if previous_diff is None:
        state_file.write_text(current_diff)
        count_file.write_text("0")
        return EvaluationResult(
            verdict="yes",
            details={"stall_count": 0, "max_stall": max_stall, "diff_changed": True},
        )

    if current_diff != previous_diff:
        # Progress: update snapshot and reset counter
        state_file.write_text(current_diff)
        count_file.write_text("0")
        return EvaluationResult(
            verdict="yes",
            details={"stall_count": 0, "max_stall": max_stall, "diff_changed": True},
        )

    stall_count += 1
    count_file.write_text(str(stall_count))
    # Below the max_stall threshold the verdict stays "yes" so the loop continues.
    verdict = "no" if stall_count >= max_stall else "yes"
    return EvaluationResult(
        verdict=verdict,
        details={"stall_count": stall_count, "max_stall": max_stall, "diff_changed": False},
    )

471 

472 

def evaluate_mcp_result(output: str, exit_code: int) -> EvaluationResult:
    """Evaluate an MCP tool call result from the mcp-call subprocess.

    Maps exit codes and MCP response envelope fields to routing verdicts.

    Exit code conventions (set by mcp-call):
        0   → parse isError from JSON envelope
        1   → tool_error (tool ran but isError: true)
        124 → timeout (transport-level timeout)
        127 → not_found (server or tool missing from .mcp.json)

    When the envelope carries no ``isError`` field, any non-zero exit code is
    treated as an error. Empty stdout is treated as an empty envelope.

    Args:
        output: stdout from mcp-call (MCP response envelope JSON)
        exit_code: Exit code from mcp-call subprocess

    Returns:
        EvaluationResult with verdict:
            - success    → isError: false
            - tool_error → isError: true, or stdout that is not a JSON object
            - not_found  → server/tool not in .mcp.json (exit 127)
            - timeout    → transport-level timeout (exit 124)
    """
    if exit_code == 127:
        return EvaluationResult(
            verdict="not_found",
            details={"exit_code": exit_code, "error": "Server or tool not found in .mcp.json"},
        )

    if exit_code == 124:
        return EvaluationResult(
            verdict="timeout",
            details={"exit_code": exit_code, "error": "MCP tool call timed out"},
        )

    # Parse MCP envelope JSON from stdout
    stripped = output.strip()
    try:
        envelope = json.loads(stripped) if stripped else {}
    except json.JSONDecodeError:
        return EvaluationResult(
            verdict="tool_error",
            details={
                "exit_code": exit_code,
                "error": f"Invalid JSON from mcp-call: {output[:200]}",
            },
        )

    # Valid JSON that is not an object (list, number, string, ...) has no
    # isError field to inspect; report it instead of failing on .get().
    if not isinstance(envelope, dict):
        return EvaluationResult(
            verdict="tool_error",
            details={
                "exit_code": exit_code,
                "error": f"Unexpected non-object JSON from mcp-call: {output[:200]}",
            },
        )

    is_error = envelope.get("isError", exit_code != 0)
    return EvaluationResult(
        verdict="tool_error" if is_error else "success",
        details={"exit_code": exit_code, "envelope": envelope},
    )

531 

532 

def evaluate_harbor_scorer(output: str, exit_code: int) -> EvaluationResult:
    """Evaluate a Harbor-format benchmark scorer result.

    The scorer is a shell command that prints a float score (0.0–1.0) to stdout
    and exits 0 on success or non-zero on failure. The score's range is not
    checked here; any value accepted by ``float()`` counts as a score.

    Args:
        output: stdout from the scorer subprocess (expected: a bare float)
        exit_code: Exit code from the scorer subprocess

    Returns:
        EvaluationResult with verdict:
            - yes   → exit 0 and stdout parses as a float
            - no    → exit non-zero (scorer determined failure)
            - error → exit 0 but stdout is not parseable as a float
    """
    if exit_code != 0:
        return EvaluationResult(
            verdict="no",
            details={"exit_code": exit_code},
        )

    try:
        score = float(output.strip())
    except (ValueError, AttributeError):
        # AttributeError means output is not a string (e.g. None); such values
        # cannot be sliced safely, so the preview is built from repr() instead.
        preview = output[:200] if isinstance(output, str) else repr(output)[:200]
        return EvaluationResult(
            verdict="error",
            details={
                "exit_code": exit_code,
                "error": f"Scorer stdout is not a float: {preview}",
            },
        )

    return EvaluationResult(
        verdict="yes",
        details={"score": score, "exit_code": 0},
    )

570 

571 

def evaluate_llm_structured(
    output: str,
    prompt: str | None = None,
    schema: dict[str, Any] | None = None,
    min_confidence: float = 0.5,
    uncertain_suffix: bool = False,
    model: str = DEFAULT_LLM_MODEL,
    max_tokens: int = 256,
    timeout: int = 1800,
) -> EvaluationResult:
    """Evaluate action output using LLM with structured output via Claude CLI.

    This is the ONLY place in the FSM system that uses LLM structured output.
    Requires the ``claude`` CLI to be installed and authenticated. The binary
    and base arguments come from ``resolve_host().build_blocking_json``; the
    ``--json-schema`` and ``--no-session-persistence`` flags are appended here.

    Every failure mode (timeout, missing binary, non-zero exit, empty or
    malformed response, unusable confidence value) is reported as an
    ``error`` verdict rather than raised.

    Args:
        output: Action stdout to evaluate (only the last 4000 chars are sent)
        prompt: Custom evaluation prompt (defaults to basic success check)
        schema: Custom JSON schema for structured response
        min_confidence: Minimum confidence threshold (0-1)
        uncertain_suffix: If True, append _uncertain to low-confidence verdicts
        model: Model identifier (CLI aliases like "sonnet" or full names)
        max_tokens: Unused; kept only for signature compatibility
        timeout: Timeout in seconds

    Returns:
        EvaluationResult with verdict from LLM and confidence/reason in details
    """
    effective_schema = schema or DEFAULT_LLM_SCHEMA
    effective_prompt = prompt or DEFAULT_LLM_PROMPT

    # Truncate output to avoid context limits (keep last 4000 chars)
    truncated = output[-4000:]

    user_prompt = f"{effective_prompt}\n\n<action_output>\n{truncated}\n</action_output>"

    invocation = resolve_host().build_blocking_json(prompt=user_prompt, model=model)
    # Builder drops json_schema (Protocol surface only) and omits the
    # claude-CLI-specific --no-session-persistence flag; augment at call site.
    args = list(invocation.args) + [
        "--json-schema",
        json.dumps(effective_schema),
        "--no-session-persistence",
    ]

    t0 = time.monotonic()
    try:
        proc = subprocess.run(
            [invocation.binary, *args], capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return EvaluationResult(
            verdict="error",
            details={"error": "LLM evaluation timeout", "timeout": True},
        )
    except FileNotFoundError:
        return EvaluationResult(
            verdict="error",
            details={
                "error": f"{invocation.binary} CLI not found. Install the active host CLI (see LL_HOST_CLI).",
                "missing_dependency": True,
            },
        )
    llm_latency_ms = int((time.monotonic() - t0) * 1000)

    if proc.returncode != 0:
        return EvaluationResult(
            verdict="error",
            details={
                "error": f"{invocation.binary} CLI error: {proc.stderr.strip()}",
                "api_error": True,
            },
        )

    # Guard: empty stdout with exit 0 (API error not reflected in exit code)
    if not proc.stdout.strip():
        stderr_info = proc.stderr.strip()[:200] if proc.stderr else ""
        error_msg = f"{invocation.binary} CLI returned empty output"
        if stderr_info:
            error_msg += f" (stderr: {stderr_info})"
        return EvaluationResult(
            verdict="error",
            details={"error": error_msg, "empty_output": True},
        )

    # Parse the CLI JSON envelope and extract structured result.
    # With --json-schema the envelope is:
    #   success: {"type":"result","subtype":"success","structured_output":{...},...}
    #   failure: {"type":"result","subtype":"error_max_structured_output_retries",...}
    # If stdout is JSONL (multiple JSON objects), use the last non-empty line.
    try:
        stdout = proc.stdout.strip()
        try:
            envelope = json.loads(stdout)
        except json.JSONDecodeError:
            # Try JSONL: take the last non-empty line
            lines = [line for line in stdout.split("\n") if line.strip()]
            if not lines:
                raise
            envelope = json.loads(lines[-1])

        # Valid JSON that is not an object cannot be inspected with .get();
        # route it through the parse-failure handler below.
        if not isinstance(envelope, dict):
            raise ValueError("CLI envelope is not a JSON object")

        # Check structured-output retry exhaustion (--json-schema failure mode)
        if envelope.get("subtype") == "error_max_structured_output_retries":
            return EvaluationResult(
                verdict="error",
                details={
                    "error": "Claude CLI could not produce valid structured output after retries",
                    "api_error": True,
                },
            )

        # Check legacy is_error flag (some CLI versions exit 0 but report error in envelope)
        if envelope.get("is_error", False):
            err_text = str(envelope.get("result", "") or "")[:200]
            return EvaluationResult(
                verdict="error",
                details={"error": f"Claude CLI reported error: {err_text}", "api_error": True},
            )

        # --json-schema mode returns validated dict in "structured_output"
        llm_result: Any
        if isinstance(envelope.get("structured_output"), dict):
            llm_result = envelope["structured_output"]
        else:
            raw_result = envelope.get("result", "")
            if isinstance(raw_result, dict):
                llm_result = raw_result
            elif raw_result:
                llm_result = json.loads(raw_result)
            elif "verdict" in envelope:
                llm_result = envelope
            else:
                raw_preview = proc.stdout[:300]
                return EvaluationResult(
                    verdict="error",
                    details={
                        "error": "Empty result field in Claude CLI response",
                        "raw_preview": raw_preview,
                    },
                )

        # A "result" string may decode to a list or scalar; only an object
        # can carry verdict/confidence/reason.
        if not isinstance(llm_result, dict):
            raise ValueError("structured result is not a JSON object")
    except (json.JSONDecodeError, TypeError, ValueError) as e:
        raw_preview = proc.stdout[:300] if proc.stdout else "(empty)"
        return EvaluationResult(
            verdict="error",
            details={"error": f"Failed to parse LLM response: {e}", "raw_preview": raw_preview},
        )

    # Build result with confidence handling
    verdict = str(llm_result.get("verdict", "error"))
    raw_confidence = llm_result.get("confidence", 1.0)
    try:
        confidence = float(raw_confidence)
    except (TypeError, ValueError):
        # Custom schemas (or a null field) can yield a non-numeric confidence;
        # report it instead of letting the conversion error propagate.
        return EvaluationResult(
            verdict="error",
            details={
                "error": f"Invalid confidence value in LLM response: {raw_confidence!r}",
                "raw": llm_result,
            },
        )
    confident = confidence >= min_confidence

    # Optionally modify verdict for low confidence
    if uncertain_suffix and not confident:
        verdict = f"{verdict}_uncertain"

    return EvaluationResult(
        verdict=verdict,
        details={
            "confidence": confidence,
            "confident": confident,
            "reason": llm_result.get("reason", ""),
            "raw": llm_result,
            "llm_model": model,
            "llm_latency_ms": llm_latency_ms,
            "llm_prompt": user_prompt[:500],
            "llm_raw_output": proc.stdout[:500] if proc.stdout else "",
        },
    )

741 

742 

def evaluate(
    config: EvaluateConfig,
    output: str,
    exit_code: int,
    context: InterpolationContext,
) -> EvaluationResult:
    """Route an evaluation request to the evaluator named by ``config.type``.

    String-valued numeric settings (target, tolerance, previous) are passed
    through ``interpolate`` before being converted to floats.

    Args:
        config: Evaluator configuration holding the type and its parameters.
        output: Action stdout.
        exit_code: Action exit code.
        context: Runtime context used for variable interpolation.

    Returns:
        EvaluationResult produced by the selected evaluator.

    Raises:
        ValueError: If the evaluator type is unknown, if output_numeric or
            convergence has no target, or if the output_numeric target
            cannot be resolved to a number.
    """
    kind = config.type

    if kind == "exit_code":
        return evaluate_exit_code(exit_code)

    if kind == "output_numeric":
        raw_target = config.target
        if raw_target is None:
            raise ValueError("output_numeric evaluator requires 'target' to be set")
        if isinstance(raw_target, str):
            try:
                # Interpolation is skipped when the context is falsy.
                resolved = interpolate(raw_target, context) if context else raw_target
                numeric_target = float(resolved)
            except (InterpolationError, ValueError) as exc:
                raise ValueError(
                    f"output_numeric target must be numeric, got: {raw_target!r}"
                ) from exc
        else:
            numeric_target = float(raw_target)
        return evaluate_output_numeric(
            output=output,
            operator=config.operator or "eq",
            target=numeric_target,
        )

    if kind == "output_json":
        return evaluate_output_json(
            output=output,
            path=config.path or "",
            operator=config.operator or "eq",
            target=config.target,
        )

    if kind == "output_contains":
        return evaluate_output_contains(
            output=output,
            pattern=config.pattern or "",
            negate=config.negate,
        )

    if kind == "convergence":
        # Previous value: stays None when it is not configured or cannot be
        # resolved (e.g. on the first iteration).
        previous: float | None = None
        if config.previous:
            try:
                previous = float(interpolate(config.previous, context))
            except (InterpolationError, ValueError):
                pass

        # Current value comes straight from the action output.
        try:
            current = float(output.strip())
        except ValueError:
            return EvaluationResult(
                verdict="error",
                details={"error": f"Cannot parse output as number: {output[:100]}"},
            )

        # Target: a string is interpolated first (e.g. "${context.target}").
        convergence_target: float
        if isinstance(config.target, str):
            try:
                convergence_target = float(interpolate(config.target, context))
            except (InterpolationError, ValueError) as exc:
                return EvaluationResult(
                    verdict="error",
                    details={"error": f"Cannot resolve target: {exc}"},
                )
        elif config.target is None:
            raise ValueError("convergence evaluator requires 'target' to be set")
        else:
            convergence_target = float(config.target)

        # Tolerance: an unresolvable interpolated string falls back to 0.0.
        tolerance: float = 0.0
        if config.tolerance is not None:
            if isinstance(config.tolerance, str):
                try:
                    tolerance = float(interpolate(config.tolerance, context))
                except (InterpolationError, ValueError):
                    tolerance = 0.0
            else:
                tolerance = float(config.tolerance)

        return evaluate_convergence(
            current=current,
            previous=previous,
            target=convergence_target,
            tolerance=tolerance,
            direction=config.direction,
        )

    if kind == "diff_stall":
        return evaluate_diff_stall(
            scope=config.scope,
            max_stall=config.max_stall,
        )

    if kind == "llm_structured":
        llm_prompt = config.prompt
        if llm_prompt and context:
            try:
                llm_prompt = interpolate(llm_prompt, context)
            except InterpolationError:
                # Keep the raw prompt when interpolation fails.
                pass
        return evaluate_llm_structured(
            output=output,
            prompt=llm_prompt,
            schema=config.schema,
            min_confidence=config.min_confidence,
            uncertain_suffix=config.uncertain_suffix,
        )

    if kind == "mcp_result":
        return evaluate_mcp_result(output=output, exit_code=exit_code)

    if kind == "harbor_scorer":
        return evaluate_harbor_scorer(output=output, exit_code=exit_code)

    raise ValueError(f"Unknown evaluator type: {kind}")