Coverage for little_loops / fsm / evaluators.py: 96%
272 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-05-22 16:19 -0500
« prev ^ index » next coverage.py v7.12.0, created at 2026-05-22 16:19 -0500
1"""FSM Evaluators for loop execution.
3This module provides evaluators that interpret action output and produce
4verdicts for state transitions.
6Supported evaluator types:
8Tier 1 (Deterministic - no API calls):
9 exit_code: Map Unix exit codes to verdicts (0=success, 1=failure, 2+=error)
10 output_numeric: Compare numeric output to target value
11 output_json: Extract and compare JSON path values
12 output_contains: Pattern matching on stdout
13 convergence: Track progress toward a target value
14 diff_stall: Detect stalled iterations via git diff comparison
15 harbor_scorer: Interpret Harbor-format benchmark scorer exit code and float stdout
17Tier 2 (LLM-based):
18 llm_structured: Use LLM with structured output for natural language evaluation
20Tier 3 (External process):
21 mcp_result: Parse MCP tool call response envelope
22"""
24from __future__ import annotations
26import hashlib
27import json
28import re
29import subprocess
30import time
31from collections.abc import Callable
32from dataclasses import dataclass
33from pathlib import Path
34from typing import Any
36from little_loops.fsm.interpolation import (
37 InterpolationContext,
38 InterpolationError,
39 interpolate,
40)
41from little_loops.fsm.schema import DEFAULT_LLM_MODEL, EvaluateConfig
42from little_loops.host_runner import resolve_host
@dataclass
class EvaluationResult:
    """Outcome produced by a single evaluator run.

    Attributes:
        verdict: Routing key used to select the next state transition.
        details: Evaluator-specific metadata kept for debugging and logging.
    """

    verdict: str
    details: dict[str, Any]
# JSON schema applied to LLM structured evaluation when the caller gives none.
DEFAULT_LLM_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "verdict": {
            "type": "string",
            "enum": ["yes", "no", "blocked", "partial"],
            # One bullet per allowed verdict, newline-separated.
            "description": "\n".join(
                [
                    "- yes: The condition/check evaluated to true",
                    "- no: The condition/check evaluated to false",
                    "- blocked: Cannot proceed without external help",
                    "- partial: Made progress but not complete",
                ]
            ),
        },
        "confidence": {
            "type": "number",
            "minimum": 0,
            "maximum": 1,
            "description": "Confidence in this verdict (0-1)",
        },
        "reason": {
            "type": "string",
            "description": "Brief explanation",
        },
    },
    "required": ["verdict", "confidence", "reason"],
}

# Evaluation prompt used when the caller gives none.
DEFAULT_LLM_PROMPT = "Evaluate whether this action succeeded based on its output."
88_NUMERIC_OPERATORS: dict[str, Callable[[float, float], bool]] = {
89 "eq": lambda v, t: v == t,
90 "ne": lambda v, t: v != t,
91 "lt": lambda v, t: v < t,
92 "le": lambda v, t: v <= t,
93 "gt": lambda v, t: v > t,
94 "ge": lambda v, t: v >= t,
95}
def evaluate_exit_code(exit_code: int) -> EvaluationResult:
    """Translate a process exit code into a routing verdict.

    Args:
        exit_code: The process exit code

    Returns:
        EvaluationResult whose verdict is:
            - yes for exit code 0
            - no for exit code 1
            - error for every other code
    """
    # Only 0 and 1 carry a dedicated meaning; everything else is an error.
    verdict_by_code = {0: "yes", 1: "no"}
    return EvaluationResult(
        verdict=verdict_by_code.get(exit_code, "error"),
        details={"exit_code": exit_code},
    )
def evaluate_output_numeric(
    output: str,
    operator: str,
    target: float,
) -> EvaluationResult:
    """Interpret stdout as a single number and compare it with a target.

    Args:
        output: The action stdout to parse as a number
        operator: Comparison operator (eq, ne, lt, le, gt, ge)
        target: Target value to compare against

    Returns:
        EvaluationResult whose verdict is:
            - yes when the comparison holds
            - no when the comparison does not hold
            - error when the output is not a number or the operator is unknown
    """
    # The output is parsed before the operator is validated, so a parse
    # failure takes precedence over an unknown operator.
    try:
        parsed = float(output.strip())
    except ValueError:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Cannot parse as number: {output[:100]}"},
        )

    compare = _NUMERIC_OPERATORS.get(operator)
    if compare is None:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Unknown operator: {operator}"},
        )

    verdict = "yes" if compare(parsed, target) else "no"
    return EvaluationResult(
        verdict=verdict,
        details={"value": parsed, "target": target, "operator": operator},
    )
159def _extract_json_path(data: Any, path: str) -> Any:
160 """Extract value from dict using jq-style path like '.summary.failed'.
162 Args:
163 data: The parsed JSON data (dict or list)
164 path: Dot-separated path, optionally starting with '.'
166 Returns:
167 The value at the specified path
169 Raises:
170 KeyError: If path not found in data
171 """
172 if path.startswith("."):
173 path = path[1:]
174 parts = path.split(".")
175 current = data
176 for part in parts:
177 if isinstance(current, dict) and part in current:
178 current = current[part]
179 elif isinstance(current, list) and part.isdigit():
180 idx = int(part)
181 if 0 <= idx < len(current):
182 current = current[idx]
183 else:
184 raise KeyError(path)
185 else:
186 raise KeyError(path)
187 return current
def _compare_values(
    value: int | float, operator: str, target: int | float, path: str
) -> EvaluationResult:
    """Apply a named numeric comparison to a value extracted from JSON.

    Args:
        value: The extracted value to compare
        operator: Comparison operator
        target: Target value
        path: JSON path, echoed back in the result details

    Returns:
        EvaluationResult with verdict yes/no for the comparison outcome, or
        error when the operator name is not recognised
    """
    compare = _NUMERIC_OPERATORS.get(operator)
    if compare is None:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Unknown operator: {operator}"},
        )

    verdict = "yes" if compare(value, target) else "no"
    return EvaluationResult(
        verdict=verdict,
        details={"value": value, "path": path, "target": target, "operator": operator},
    )
def evaluate_output_json(
    output: str,
    path: str,
    operator: str,
    target: Any,
) -> EvaluationResult:
    """Decode stdout as JSON, pull out the value at a path, and compare it.

    Args:
        output: The action stdout containing JSON
        path: jq-style dot notation path (e.g., '.summary.failed')
        operator: Comparison operator (eq, ne, lt, le, gt, ge)
        target: Target value for comparison

    Returns:
        EvaluationResult whose verdict is:
            - yes when the comparison holds
            - no when the comparison does not hold
            - error for invalid JSON, a missing path, or an operator that is
              unknown or unsupported for the value type
    """
    try:
        document = json.loads(output)
    except json.JSONDecodeError as exc:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Invalid JSON: {exc}"},
        )

    try:
        extracted = _extract_json_path(document, path)
    except KeyError:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Path not found: {path}"},
        )

    # When both sides are numbers the full operator set is available.
    both_numeric = isinstance(extracted, (int, float)) and isinstance(target, (int, float))
    if both_numeric:
        return _compare_values(extracted, operator, target, path)

    # Anything else can only be tested for (in)equality.
    if operator == "eq":
        holds = extracted == target
    elif operator == "ne":
        holds = extracted != target
    else:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Operator {operator} not supported for non-numeric values"},
        )

    return EvaluationResult(
        verdict="yes" if holds else "no",
        details={"value": extracted, "path": path, "target": target, "operator": operator},
    )
def evaluate_output_contains(
    output: str,
    pattern: str,
    negate: bool = False,
) -> EvaluationResult:
    """Report whether a pattern occurs in the output.

    The pattern is first treated as a regular expression; if it does not
    compile, a plain substring test is used instead.

    Args:
        output: The action stdout to search
        pattern: Regex pattern or substring
        negate: If True, invert the match result

    Returns:
        EvaluationResult whose verdict is:
            - yes when found and negate is False, or not found and negate is True
            - no when found and negate is True, or not found and negate is False
    """
    try:
        found = re.search(pattern, output) is not None
    except re.error:
        # Not a valid regex: fall back to literal substring matching.
        found = pattern in output

    # The verdict is "yes" exactly when the match result and negate disagree.
    verdict = "yes" if found != bool(negate) else "no"

    return EvaluationResult(
        verdict=verdict,
        details={"matched": found, "pattern": pattern, "negate": negate},
    )
def evaluate_convergence(
    current: float,
    previous: float | None,
    target: float,
    tolerance: float = 0,
    direction: str = "minimize",
) -> EvaluationResult:
    """Classify a metric reading as on-target, progressing, or stalled.

    Args:
        current: Current metric value
        previous: Previous metric value (None if first iteration)
        target: Target value to reach
        tolerance: Acceptable distance from target
        direction: 'minimize' or 'maximize'; any value other than 'minimize'
            is handled as 'maximize'

    Returns:
        EvaluationResult whose verdict is:
            - target when the value is within tolerance of the target
            - progress when there is no previous value, or the value moved in
              the requested direction since the previous reading
            - stall when the value is unchanged or moved the other way
    """
    # Reaching the target wins over everything else.
    on_target = abs(current - target) <= tolerance
    if on_target:
        return EvaluationResult(
            verdict="target",
            details={"current": current, "target": target, "delta": 0},
        )

    # Nothing to compare against yet: count the first reading as progress.
    if previous is None:
        return EvaluationResult(
            verdict="progress",
            details={
                "current": current,
                "previous": None,
                "target": target,
                "delta": None,
            },
        )

    delta = current - previous
    # Minimizing wants the value to fall; otherwise it should rise.
    improved = delta < 0 if direction == "minimize" else delta > 0

    return EvaluationResult(
        verdict="progress" if improved else "stall",
        details={
            "current": current,
            "previous": previous,
            "target": target,
            "delta": delta,
            "direction": direction,
        },
    )
def evaluate_diff_stall(
    scope: list[str] | None = None,
    max_stall: int = 1,
) -> EvaluationResult:
    """Detect stalled iterations by comparing git diff --stat between runs.

    On first call, snapshots the current diff and returns 'yes'.
    On subsequent calls, compares current diff to the previous snapshot.
    If the diff is identical for max_stall consecutive iterations, returns
    'no' (stalled). If different, resets the stall counter and returns
    'yes' (progress).

    State is persisted in ``.loops/tmp`` under the current working directory
    (created on demand), in files named from a hash of the scope argument, so
    loops with different scopes maintain independent stall counters.

    Args:
        scope: Optional list of paths to limit the git diff to. Defaults to
            the entire working tree.
        max_stall: Number of consecutive no-change iterations before stall
            verdict. Defaults to 1.

    Returns:
        EvaluationResult with verdict:
            - yes: diff changed since last iteration (progress made), or the
              diff is unchanged but the stall threshold is not reached yet
            - no: diff unchanged for max_stall iterations (stalled)
            - error: git command failed, timed out, or git is not installed
    """
    # NOTE(review): --stat output is a per-file change summary, so two
    # different edits with identical summaries would compare equal — confirm
    # that is acceptable for stall detection.
    cmd = ["git", "diff", "--stat"]
    if scope:
        cmd += ["--"] + scope

    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        return EvaluationResult(verdict="error", details={"error": "git diff timed out"})
    except FileNotFoundError:
        return EvaluationResult(verdict="error", details={"error": "git not found in PATH"})

    if proc.returncode != 0:
        return EvaluationResult(
            verdict="error",
            details={"error": f"git diff failed: {proc.stderr[:200]}"},
        )

    current_diff = proc.stdout

    # Derive a stable cache key from the scope so independent loops don't collide.
    # Sorting makes the key independent of the order the paths were listed in.
    scope_str = "|".join(sorted(scope)) if scope else "_root_"
    cache_key = hashlib.md5(scope_str.encode()).hexdigest()[:12]
    loops_tmp = Path.cwd() / ".loops" / "tmp"
    loops_tmp.mkdir(parents=True, exist_ok=True)
    state_file = loops_tmp / f"ll-diff-stall-{cache_key}.txt"
    count_file = loops_tmp / f"ll-diff-stall-{cache_key}.count"

    # Read previous snapshot and stall count. A missing snapshot leaves
    # previous_diff as None; a missing or unparseable counter leaves the
    # count at 0 while keeping whatever snapshot was read.
    previous_diff: str | None = None
    stall_count = 0
    try:
        previous_diff = state_file.read_text()
        stall_count = int(count_file.read_text().strip())
    except (FileNotFoundError, ValueError):
        pass

    # First iteration or a changed diff: store the new snapshot, reset the
    # counter, and report progress.
    if previous_diff is None or current_diff != previous_diff:
        state_file.write_text(current_diff)
        count_file.write_text("0")
        return EvaluationResult(
            verdict="yes",
            details={"stall_count": 0, "max_stall": max_stall, "diff_changed": True},
        )

    # Diff unchanged: bump the counter and stall only once the threshold is hit,
    # otherwise still report yes so the loop continues.
    stall_count += 1
    count_file.write_text(str(stall_count))
    return EvaluationResult(
        verdict="no" if stall_count >= max_stall else "yes",
        details={"stall_count": stall_count, "max_stall": max_stall, "diff_changed": False},
    )
def evaluate_mcp_result(output: str, exit_code: int) -> EvaluationResult:
    """Evaluate an MCP tool call result from the mcp-call subprocess.

    Maps exit codes and MCP response envelope fields to routing verdicts.

    Exit code handling:
        127   → not_found (server or tool missing from .mcp.json)
        124   → timeout (transport-level timeout)
        other → the JSON envelope's ``isError`` field decides; when the field
                is absent, any non-zero exit code counts as an error

    Args:
        output: stdout from mcp-call (MCP response envelope JSON)
        exit_code: Exit code from mcp-call subprocess

    Returns:
        EvaluationResult with verdict:
            - success → isError is false (or absent with exit code 0)
            - tool_error → isError is true, isError is absent with a non-zero
              exit code, or stdout is not a JSON object
            - not_found → server/tool not in .mcp.json (exit 127)
            - timeout → transport-level timeout (exit 124)
    """
    if exit_code == 127:
        return EvaluationResult(
            verdict="not_found",
            details={"exit_code": exit_code, "error": "Server or tool not found in .mcp.json"},
        )

    if exit_code == 124:
        return EvaluationResult(
            verdict="timeout",
            details={"exit_code": exit_code, "error": "MCP tool call timed out"},
        )

    # Parse MCP envelope JSON from stdout; blank stdout is an empty envelope.
    payload = output.strip()
    try:
        envelope = json.loads(payload) if payload else {}
    except json.JSONDecodeError:
        return EvaluationResult(
            verdict="tool_error",
            details={
                "exit_code": exit_code,
                "error": f"Invalid JSON from mcp-call: {output[:200]}",
            },
        )

    # Valid JSON that is not an object (list, string, number, ...) has no
    # isError field to read; report it instead of failing on .get().
    if not isinstance(envelope, dict):
        return EvaluationResult(
            verdict="tool_error",
            details={
                "exit_code": exit_code,
                "error": f"MCP envelope is not a JSON object: {output[:200]}",
            },
        )

    is_error = envelope.get("isError", exit_code != 0)
    return EvaluationResult(
        verdict="tool_error" if is_error else "success",
        details={"exit_code": exit_code, "envelope": envelope},
    )
def evaluate_harbor_scorer(output: str, exit_code: int) -> EvaluationResult:
    """Evaluate a Harbor-format benchmark scorer result.

    The scorer is a shell command that prints a float score (0.0–1.0) to stdout
    and exits 0 on success or non-zero on failure. The score's range is not
    checked here; any value ``float()`` accepts is passed through.

    Args:
        output: stdout from the scorer subprocess (expected: a bare float)
        exit_code: Exit code from the scorer subprocess

    Returns:
        EvaluationResult with verdict:
            - yes → exit 0 and stdout parses as a float
            - no → exit non-zero (scorer determined failure)
            - error → exit 0 but stdout is not parseable as a float
    """
    if exit_code != 0:
        return EvaluationResult(
            verdict="no",
            details={"exit_code": exit_code},
        )

    try:
        score = float(output.strip())
    except (ValueError, AttributeError):
        # AttributeError means output is not a string (e.g. None), so it must
        # be stringified before slicing or the error path itself would raise.
        preview = str(output)[:200]
        return EvaluationResult(
            verdict="error",
            details={
                "exit_code": exit_code,
                "error": f"Scorer stdout is not a float: {preview}",
            },
        )

    return EvaluationResult(
        verdict="yes",
        details={"score": score, "exit_code": 0},
    )
def evaluate_llm_structured(
    output: str,
    prompt: str | None = None,
    schema: dict[str, Any] | None = None,
    min_confidence: float = 0.5,
    uncertain_suffix: bool = False,
    model: str = DEFAULT_LLM_MODEL,
    max_tokens: int = 256,
    timeout: int = 1800,
) -> EvaluationResult:
    """Evaluate action output using LLM with structured output via the host CLI.

    This is the ONLY place in the FSM system that uses LLM structured output.
    Requires the host CLI (resolved through ``resolve_host()``) to be
    installed and authenticated.

    Args:
        output: Action stdout to evaluate; only the last 4000 characters are
            sent to the model
        prompt: Custom evaluation prompt (defaults to basic success check)
        schema: Custom JSON schema for structured response
        min_confidence: Minimum confidence threshold (0-1)
        uncertain_suffix: If True, append _uncertain to low-confidence verdicts
        model: Model identifier (CLI aliases like "sonnet" or full names)
        max_tokens: Not used by this function; kept for signature
            compatibility
        timeout: Timeout in seconds for the CLI subprocess

    Returns:
        EvaluationResult with verdict from LLM and confidence/reason in
        details. The verdict is "error" when the CLI is missing, times out,
        exits non-zero, prints nothing, reports an error in its envelope, or
        returns a response that cannot be parsed into a result object with a
        numeric confidence.
    """
    effective_schema = schema or DEFAULT_LLM_SCHEMA
    effective_prompt = prompt or DEFAULT_LLM_PROMPT

    # Truncate output to avoid context limits (keep last 4000 chars)
    truncated = output[-4000:] if len(output) > 4000 else output

    user_prompt = f"{effective_prompt}\n\n<action_output>\n{truncated}\n</action_output>"

    invocation = resolve_host().build_blocking_json(prompt=user_prompt, model=model)
    # Builder drops json_schema (Protocol surface only) and omits the
    # claude-CLI-specific --no-session-persistence flag; augment at call site.
    args = list(invocation.args) + [
        "--json-schema",
        json.dumps(effective_schema),
        "--no-session-persistence",
    ]

    t0 = time.monotonic()
    try:
        proc = subprocess.run(
            [invocation.binary, *args], capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return EvaluationResult(
            verdict="error",
            details={"error": "LLM evaluation timeout", "timeout": True},
        )
    except FileNotFoundError:
        return EvaluationResult(
            verdict="error",
            details={
                "error": f"{invocation.binary} CLI not found. Install the active host CLI (see LL_HOST_CLI).",
                "missing_dependency": True,
            },
        )
    llm_latency_ms = int((time.monotonic() - t0) * 1000)

    if proc.returncode != 0:
        return EvaluationResult(
            verdict="error",
            details={
                "error": f"{invocation.binary} CLI error: {proc.stderr.strip()}",
                "api_error": True,
            },
        )

    # Guard: empty stdout with exit 0 (API error not reflected in exit code)
    if not proc.stdout.strip():
        stderr_info = proc.stderr.strip()[:200] if proc.stderr else ""
        error_msg = f"{invocation.binary} CLI returned empty output"
        if stderr_info:
            error_msg += f" (stderr: {stderr_info})"
        return EvaluationResult(
            verdict="error",
            details={"error": error_msg, "empty_output": True},
        )

    # Parse the CLI JSON envelope and extract structured result.
    # With --json-schema the envelope is:
    #   success: {"type":"result","subtype":"success","structured_output":{...},...}
    #   failure: {"type":"result","subtype":"error_max_structured_output_retries",...}
    # If stdout is JSONL (multiple JSON objects), use the last non-empty line.
    try:
        stdout = proc.stdout.strip()
        try:
            envelope = json.loads(stdout)
        except json.JSONDecodeError:
            # Try JSONL: take the last non-empty line
            lines = [line for line in stdout.split("\n") if line.strip()]
            if not lines:
                raise
            envelope = json.loads(lines[-1])

        # Valid JSON that is not an object (list, string, ...) cannot carry
        # the envelope fields; route it to the parse-failure handler below.
        if not isinstance(envelope, dict):
            raise TypeError(
                f"expected a JSON object envelope, got {type(envelope).__name__}"
            )

        # Check structured-output retry exhaustion (--json-schema failure mode)
        if envelope.get("subtype") == "error_max_structured_output_retries":
            return EvaluationResult(
                verdict="error",
                details={
                    "error": "Claude CLI could not produce valid structured output after retries",
                    "api_error": True,
                },
            )

        # Check legacy is_error flag (some CLI versions exit 0 but report error in envelope)
        if envelope.get("is_error", False):
            err_text = str(envelope.get("result", "") or "")[:200]
            return EvaluationResult(
                verdict="error",
                details={"error": f"Claude CLI reported error: {err_text}", "api_error": True},
            )

        # --json-schema mode returns validated dict in "structured_output"
        llm_result: Any
        if isinstance(envelope.get("structured_output"), dict):
            llm_result = envelope["structured_output"]
        else:
            raw_result = envelope.get("result", "")
            if isinstance(raw_result, dict):
                llm_result = raw_result
            elif raw_result:
                llm_result = json.loads(raw_result)
            elif "verdict" in envelope:
                # The envelope itself is the result object.
                llm_result = envelope
            else:
                raw_preview = proc.stdout[:300]
                return EvaluationResult(
                    verdict="error",
                    details={
                        "error": "Empty result field in Claude CLI response",
                        "raw_preview": raw_preview,
                    },
                )

        # A "result" string may decode to a non-object (e.g. a bare string or
        # list); that has no verdict/confidence fields to read.
        if not isinstance(llm_result, dict):
            raise TypeError(
                f"expected a JSON object result, got {type(llm_result).__name__}"
            )

        # Converted inside the guard so a null or non-numeric confidence
        # (possible with a custom schema) yields an error verdict, not a crash.
        confidence = float(llm_result.get("confidence", 1.0))
    except (json.JSONDecodeError, TypeError, ValueError) as e:
        raw_preview = proc.stdout[:300] if proc.stdout else "(empty)"
        return EvaluationResult(
            verdict="error",
            details={"error": f"Failed to parse LLM response: {e}", "raw_preview": raw_preview},
        )

    # Build result with confidence handling
    verdict = str(llm_result.get("verdict", "error"))
    confident = confidence >= min_confidence

    # Optionally modify verdict for low confidence
    if uncertain_suffix and not confident:
        verdict = f"{verdict}_uncertain"

    return EvaluationResult(
        verdict=verdict,
        details={
            "confidence": confidence,
            "confident": confident,
            "reason": llm_result.get("reason", ""),
            "raw": llm_result,
            "llm_model": model,
            "llm_latency_ms": llm_latency_ms,
            "llm_prompt": user_prompt[:500],
            "llm_raw_output": proc.stdout[:500] if proc.stdout else "",
        },
    )
def evaluate(
    config: EvaluateConfig,
    output: str,
    exit_code: int,
    context: InterpolationContext,
) -> EvaluationResult:
    """Run the evaluator selected by ``config.type``.

    String-valued settings that may contain interpolation expressions
    (targets, tolerance, previous value, LLM prompt) are resolved against
    ``context`` before the evaluator is called.

    Args:
        config: Evaluator configuration with type and parameters
        output: Action stdout
        exit_code: Action exit code
        context: Runtime context for variable interpolation

    Returns:
        EvaluationResult from the appropriate evaluator

    Raises:
        ValueError: If evaluator type is unknown, if output_numeric or
            convergence has no target, or if the output_numeric target
            cannot be resolved to a number
    """
    kind = config.type

    if kind == "exit_code":
        return evaluate_exit_code(exit_code)

    if kind == "output_numeric":
        raw_target = config.target
        if raw_target is None:
            raise ValueError("output_numeric evaluator requires 'target' to be set")
        if isinstance(raw_target, str):
            try:
                # Without a context the string is used verbatim.
                resolved = interpolate(raw_target, context) if context else raw_target
                numeric_target = float(resolved)
            except (InterpolationError, ValueError) as e:
                raise ValueError(
                    f"output_numeric target must be numeric, got: {raw_target!r}"
                ) from e
        else:
            numeric_target = float(raw_target)
        return evaluate_output_numeric(
            output=output,
            operator=config.operator or "eq",
            target=numeric_target,
        )

    if kind == "output_json":
        return evaluate_output_json(
            output=output,
            path=config.path or "",
            operator=config.operator or "eq",
            target=config.target,
        )

    if kind == "output_contains":
        return evaluate_output_contains(
            output=output,
            pattern=config.pattern or "",
            negate=config.negate,
        )

    if kind == "convergence":
        # Step 1: previous value, resolved through interpolation when configured.
        previous: float | None = None
        if config.previous:
            try:
                previous = float(interpolate(config.previous, context))
            except (InterpolationError, ValueError):
                # Not available yet (first iteration): keep None.
                pass

        # Step 2: current value comes straight from the action output.
        try:
            current = float(output.strip())
        except ValueError:
            return EvaluationResult(
                verdict="error",
                details={"error": f"Cannot parse output as number: {output[:100]}"},
            )

        # Step 3: target, which may be an interpolated string like "${context.target}".
        raw_goal = config.target
        if raw_goal is None:
            raise ValueError("convergence evaluator requires 'target' to be set")
        convergence_target: float
        if isinstance(raw_goal, str):
            try:
                convergence_target = float(interpolate(raw_goal, context))
            except (InterpolationError, ValueError) as e:
                return EvaluationResult(
                    verdict="error",
                    details={"error": f"Cannot resolve target: {e}"},
                )
        else:
            convergence_target = float(raw_goal)

        # Step 4: tolerance; an unresolvable string falls back to 0.0.
        tolerance: float = 0.0
        raw_tolerance = config.tolerance
        if isinstance(raw_tolerance, str):
            try:
                tolerance = float(interpolate(raw_tolerance, context))
            except (InterpolationError, ValueError):
                tolerance = 0.0
        elif raw_tolerance is not None:
            tolerance = float(raw_tolerance)

        return evaluate_convergence(
            current=current,
            previous=previous,
            target=convergence_target,
            tolerance=tolerance,
            direction=config.direction,
        )

    if kind == "diff_stall":
        return evaluate_diff_stall(
            scope=config.scope,
            max_stall=config.max_stall,
        )

    if kind == "llm_structured":
        prompt = config.prompt
        if prompt and context:
            try:
                prompt = interpolate(prompt, context)
            except InterpolationError:
                # Resolution failed: send the raw prompt unchanged.
                pass
        return evaluate_llm_structured(
            output=output,
            prompt=prompt,
            schema=config.schema,
            min_confidence=config.min_confidence,
            uncertain_suffix=config.uncertain_suffix,
        )

    if kind == "mcp_result":
        return evaluate_mcp_result(output=output, exit_code=exit_code)

    if kind == "harbor_scorer":
        return evaluate_harbor_scorer(output=output, exit_code=exit_code)

    raise ValueError(f"Unknown evaluator type: {kind}")