{% endif %}
{# Tool test summary: shown only when tool results exist. t_cls is "pass" when every tool test passed, "fail" otherwise. -#}
{% if tool_results %}
{% set t_cls = "fail" if t_passed != t_total else "pass" %}
Tool Tests
{{ t_passed }}/{{ t_total }}
{% endif %}
{# Callback test summary: shown only when callback results exist. c_cls is "pass" when every callback test passed, "fail" otherwise. -#}
{% if callback_results %}
{% set c_cls = "fail" if c_passed != c_total else "pass" %}
Callback Tests
{{ c_passed }}/{{ c_total }}
{% endif %}
All Evals
Result
Type
Eval
Detail
{# One result row per entry in `unified`: status, type, name, then detail. -#}
{% for u in unified %}
{% set type_cls = u["type"] %}
{# Backslash-escape single quotes in the eval name. -#}
{% set safe_name = u["name"] | replace("'", "\\'") %}
{% set passed_str = "true" if u["passed"] else "false" %}
{# Status value. Sim entries that carry per-run results are summarised from the runs: p = runs whose `passed` is truthy, t = total runs. -#}
{% if u["type"] == "sim" and "run_results" in u %}
{% set p = u["run_results"] | selectattr('passed') | list | length %}
{% set t = u["run_results"] | length %}
{# NOTE(review): an empty run_results list gives p == t == 0 and therefore renders PASSED; confirm that is acceptable. -#}
{% if p == t %}
PASSED
{% elif p == 0 %}
FAILED
{% else %}
MIXED{{ p }}/{{ t }}
{% endif %}
{% elif u["passed"] %}
PASSED
{% else %}
FAILED
{% endif %}
{{ u["type"] }}
{{ u["name"] | escape }}
{# Detail value: sim entries walk their runs (dot_cls is "p" for a passing run, "f" otherwise); every other entry prints its escaped detail text. -#}
{% if u["type"] == "sim" and "run_results" in u %}
{% for r in u["run_results"] %}
{% set dot_cls = "p" if r.get("passed") else "f" %}
{% endfor %}
{% else %}
{{ u["detail"] | escape }}
{% endif %}
{% endfor %}
{# Failure Patterns section: each failure reason followed by the evals grouped under it; skipped when failure_groups is empty. -#}
{% if failure_groups %}
Failure Patterns
{# NOTE(review): attribute='1' sorts the (reason, evals) pairs by the evals value itself, in descending order, not by how many evals a reason has. Confirm this is the intended ordering. -#}
{% for reason, evals in failure_groups.items() | sort(attribute='1', reverse=true) %}
{{ reason | escape }}
{{ evals | length }} eval(s):
{# Each entry of evals unpacks to a (type, name) pair; entries are listed in sorted order. -#}
{% for type, name in evals | sort %}
{{ name | escape }} ({{ type }})
{% endfor %}
{% endfor %}
{% endif %}
{# Golden evaluation details; rendered only when golden_results is non-empty. -#}
{% if golden_results %}
{# Pass rate as a whole-number percentage; falls back to "0%" when g_total is falsy, which also avoids dividing by zero. -#}
{% set g_pct_str = "%.0f%%" | format(100 * g_passed / g_total) if g_total else "0%" %}
{# Sorted by name, then by `passed` ascending. Presumably `passed` is a boolean, in which case failing results are listed first; verify against the caller. -#}
{% for r in golden_results | sort(attribute='name') | sort(attribute='passed') %}
{% set cls = "pass-bg" if r["passed"] else "fail-bg" %}
{% set status = "PASS" if r["passed"] else "FAIL" %}
{# failed_str and passed_str are the string forms of the same flag, inverted and direct. -#}
{% set failed_str = "false" if r["passed"] else "true" %}
{% set passed_str = "true" if r["passed"] else "false" %}
{{ r["name"] | escape }} golden{{ status }}
{# Header line: status, turn count and, when duration_s is truthy, the duration formatted by _fmt_duration (defined outside this view). -#}
Golden Evaluation — {{ status }} | {{ r.get('turns',[]) | length }} turns{% if r.get('duration_s') %} | {{ _fmt_duration(r['duration_s']) }}{% endif %}
{% if r.get('session_id') %}
{% endif %}
Conversation & Comparison
{# Per-turn breakdown. -#}
{% for turn in r.get('turns', []) %}
{% set sem = turn.get('semantic_score') %}
{% set sem_cls = "sem-" ~ sem if sem is not none else "" %}
{# A turn is flagged when any comparison outcome is FAIL, or when a semantic score is present and below 3. -#}
{% set any_fail = (turn.get('comparisons', []) | selectattr('outcome', 'equalto', 'FAIL') | list | length > 0) or (sem is not none and sem < 3) %}
{% set row_cls = "mismatch" if any_fail else "match" %}
Turn {{ turn["index"] }}
{# Semantic score is shown as sem/4, with the explanation in parentheses when one is provided. -#}
{% if sem is not none %}
{{ sem }}/4
{% if turn.get('semantic_explanation') %}
({{ turn['semantic_explanation'] | escape }})
{% endif %}
{% endif %}
{# NOTE(review): the audio/session_id conditionals below have empty bodies in this view. -#}
{% if turn.get('user_input') %}
User: {{ turn['user_input'] | escape }}
{% if golden_modality == "audio" and r.get('session_id') %}
{% endif %}
{% elif golden_modality == "audio" and r.get('session_id') %}
{% endif %}
{% if golden_modality == "audio" and r.get('session_id') %}
{% endif %}
{# One entry per expected-versus-actual comparison recorded for the turn. -#}
{% for comp in turn.get('comparisons', []) %}
{# Display state for one comparison. ctype must be assigned before is_fail, because the failure check reads it to decide whether the turn's semantic score applies. -#}
{% set ctype = comp.get('type', '?') %}
{# Failed when the outcome is FAIL, or when this is a text comparison and the turn's semantic score is present and below 3. -#}
{% set is_fail = comp["outcome"] == "FAIL" or (ctype == "text" and sem is not none and sem < 3) %}
{% set c_cls = "fail-bg" if is_fail else "pass-bg" %}
{# Badge is "fail" on failure, "pass" only for an explicit PASS outcome, and empty otherwise. -#}
{% set badge_cls = "fail" if is_fail else ("pass" if comp["outcome"] == "PASS" else "") %}
{% set icon = {"text": "💬", "tool_call": "🔧", "transfer": "🔀"}.get(ctype, "") %}
{# The outcome label is omitted for a text comparison whose outcome is UNSPECIFIED. -#}
{% if ctype != "text" or comp["outcome"] != "UNSPECIFIED" %}
{{ comp["outcome"] }}
{% endif %}
{{ icon | safe }} {{ ctype }} Expected:{{ comp.get('expected', '') | string | escape }} Actual:{{ comp.get('actual', '') | string | escape }}
{% if ctype == "tool_call" %}
{% if comp.get('tool_invocation_score') is not none %}
{# Group simulation runs by name. grouped_sims maps each name to {"pass": count of runs whose `passed` is truthy, "total": count of runs, "runs": the run records in the order they appear in sim_results}. #}
{% set grouped_sims = {} %}
{# The `set _ = ...` statements exist only to mutate the dict or list in place; the assigned value is discarded. -#}
{% for r in sim_results %}
{% if r.name not in grouped_sims %}
{% set _ = grouped_sims.update({r.name: {"pass": 0, "total": 0, "runs": []}}) %}
{% endif %}
{% set _ = grouped_sims[r.name].update({"total": grouped_sims[r.name]["total"] + 1}) %}
{% if r.get("passed") %}
{% set _ = grouped_sims[r.name].update({"pass": grouped_sims[r.name]["pass"] + 1}) %}
{% endif %}
{% set _ = grouped_sims[r.name]["runs"].append(r) %}
{% endfor %}
{# Render one section per simulation name, ordered by ascending count of passing runs (attribute '1.pass' reads the "pass" field of each group). -#}
{% for name, s in grouped_sims.items() | sort(attribute='1.pass', reverse=false) %}
{% set score = s['pass'] ~ '/' ~ s['total'] %}
{# A group counts as passing only when every one of its runs passed. -#}
{% set cls = "pass-bg" if s['pass'] == s['total'] else "fail-bg" %}
{% set passed_str = "true" if s['pass'] == s['total'] else "false" %}
{{ name | escape }} sim{{ score }}
{# One line per run; missing fields fall back to '?', and the duration is appended only when duration_s is truthy (formatted by _fmt_duration, defined outside this view). -#}
{% for r in s["runs"] %}
{% set run_cls = "pass" if r.get("passed") else "fail" %}
{% set failed_str = "false" if r.get("passed") else "true" %}
Run {{ r.get('run','?') }} — {% if r.get('passed') %}PASS{% else %}FAIL{% endif %} | goals: {{ r.get('goals','?') }} | expectations: {{ r.get('expectations','?') }} | turns: {{ r.get('turns','?') }}{% if r.get('duration_s') %} | {{ _fmt_duration(r['duration_s']) }}{% endif %}
{% if r.get('session_id') %}
{% if ces_base %}
{% for r in tool_results | sort(attribute='passed') %}
{% set cls = "pass" if r["passed"] else "fail" %}
{% set passed_str = "true" if r["passed"] else "false" %}
{% set lat = "%.0fms" | format(r["latency_ms"]) if r.get("latency_ms") else "-" %}
{% for r in callback_results | sort(attribute='passed') %}
{% set cls = "pass" if r["passed"] else "fail" %}
{% set passed_str = "true" if r["passed"] else "false" %}