{# Report header: overall pass percentage, pass count and generation time.
   pct, passed, total and ts are not assigned anywhere in the template as
   shown, so they are presumably supplied by the render context (confirm). -#}
Combined Eval Report

{{ "%.0f" | format(pct) }}%
{{ passed }}/{{ total }} total passed
Generated {{ ts }}
{# Summary entries, one per eval family. Each entry is emitted only when its
   result list is truthy, and its *_cls value is "pass" only when the passed
   count equals the total for that family (otherwise "fail"). -#}
{% if golden_results %} {% set g_cls = "pass" if g_passed == g_total else "fail" %}
Goldens ({{ golden_modality }})
{{ g_passed }}/{{ g_total }}
{# The golden duration is printed only when it is greater than zero. -#}
{% if g_duration > 0 %}
{{ _fmt_duration(g_duration) }} {% endif %}
{# NOTE(review): s_time is set to "" below and is not read again in the
   template as shown; confirm whether it is still needed. -#}
{% endif %} {% if sim_results %} {% set s_cls = "pass" if s_passed == s_total else "fail" %} {% set s_time = "" %}
Sims ({{ sim_modality }})
{{ s_passed }}/{{ s_total }}
{# Prefers sim_wall_clock_s when truthy; otherwise falls back to s_dur (when
   positive) and prefixes it with "~". -#}
{% if sim_wall_clock_s %}
{{ _fmt_duration(sim_wall_clock_s) }} {% elif s_dur > 0 %}
~{{ _fmt_duration(s_dur) }} {% endif %}
{% endif %} {% if tool_results %} {% set t_cls = "pass" if t_passed == t_total else "fail" %}
Tool Tests
{{ t_passed }}/{{ t_total }}
{% endif %} {% if callback_results %} {% set c_cls = "pass" if c_passed == c_total else "fail" %}
Callback Tests
{{ c_passed }}/{{ c_total }}
{% endif %}

{# All Evals: one row per entry in `unified`, preceded by a single header row. -#}
All Evals

ResultTypeEvalDetail
{# The row body has to live inside the loop: `u` and the per-row values set
   on the loop line are scoped to one iteration and are undefined once the
   loop has been closed.
   Status column: a "sim" entry that carries run_results shows PASSED when
   every run passed, FAILED when none did, and MIXED p/t otherwise; every
   other entry shows PASSED or FAILED from its own "passed" flag.
   Detail column: sim entries iterate their runs (dot_cls is "p" or "f" per
   run); other entries print the escaped "detail" text. -#}
{% for u in unified %} {% set type_cls = u["type"] %} {% set safe_name = u["name"] | replace("'", "\\'") %} {% set passed_str = "true" if u["passed"] else "false" %}
{% if u["type"] == "sim" and "run_results" in u %} {% set p = u["run_results"] | selectattr('passed') | list | length %} {% set t = u["run_results"] | length %} {% if p == t %} PASSED {% elif p == 0 %} FAILED {% else %} MIXED {{ p }}/{{ t }} {% endif %} {% elif u["passed"] %} PASSED {% else %} FAILED {% endif %} {{ u["type"] }} {{ u["name"] | escape }} {% if u["type"] == "sim" and "run_results" in u %} {% for r in u["run_results"] %} {% set dot_cls = "p" if r.get("passed") else "f" %} {% endfor %} {% else %} {{ u["detail"] | escape }} {% endif %}
{% endfor %}
{# Failure Patterns: rendered only when failure_groups is truthy.
   failure_groups is iterated with .items(), and each value is unpacked as
   (type, name) pairs, so it maps a failure reason to a collection of such
   pairs. The loop and the if opened in this section are closed at the start
   of the first line of the section that follows.
   NOTE(review): sort(attribute='1', reverse=true) orders the groups by the
   evals collection itself rather than by its length; confirm whether
   "largest group first" was the intent. -#}
{% if failure_groups %}

Failure Patterns

{% for reason, evals in failure_groups.items() | sort(attribute='1', reverse=true) %}

{{ reason | escape }}

{# Lists every eval in the group, sorted, as "name (type)". -#}
{{ evals | length }} eval(s): {% for type, name in evals | sort %} {{ name | escape }} ({{ type }}) {% endfor %}
{# The leading endfor/endif close the Failure Patterns section above; the
   rest of the line opens the Goldens detail section. g_pct_str falls back to
   "0%" when g_total is falsy, which avoids dividing by zero. -#}
{% endfor %} {% endif %} {% if golden_results %} {% set g_pct_str = "%.0f%%" | format(100 * g_passed / g_total) if g_total else "0%" %}

Goldens ({{ g_passed }}/{{ g_total }} — {{ g_pct_str }})

{# Results are sorted by name and then by "passed"; false sorts before true,
   so failing goldens are listed first. The per-result values set here
   (cls, status, failed_str, passed_str) are derived from r["passed"]. -#}
{% for r in golden_results | sort(attribute='name') | sort(attribute='passed') %} {% set cls = "pass-bg" if r["passed"] else "fail-bg" %} {% set status = "PASS" if r["passed"] else "FAIL" %} {% set failed_str = "false" if r["passed"] else "true" %} {% set passed_str = "true" if r["passed"] else "false" %}
{{ r["name"] | escape }} golden {{ status }}
{# Summary line: status, turn count, optional duration. "Full Conversation"
   is emitted only for audio modality when the result has a session_id. -#}
Golden Evaluation — {{ status }} | {{ r.get('turns',[]) | length }} turns{% if r.get('duration_s') %} | {{ _fmt_duration(r['duration_s']) }}{% endif %} {% if r.get('session_id') %} {% if golden_modality == "audio" %}
Full Conversation
{% endif %} {% endif %} {% if r.get('session_parameters') %}
Session Parameters
{# `json` is not defined in the template as shown; presumably the json module
   is exposed through the render context (confirm). -#}
{{ json.dumps(r['session_parameters'], indent=2) | escape }}
{% endif %}
{# Per-turn comparison view for one golden result `r`. A turn is marked as a
   mismatch when any of its comparisons has outcome "FAIL" or when its
   semantic score is present and below 3 (scores are displayed out of 4). -#}
Conversation & Comparison {% for turn in r.get('turns', []) %} {% set sem = turn.get('semantic_score') %} {% set sem_cls = "sem-" ~ sem if sem is not none else "" %} {% set any_fail = (turn.get('comparisons', []) | selectattr('outcome', 'equalto', 'FAIL') | list | length > 0) or (sem is not none and sem < 3) %} {% set row_cls = "mismatch" if any_fail else "match" %}
Turn {{ turn["index"] }} {% if sem is not none %} {{ sem }}/4 {% if turn.get('semantic_explanation') %} ({{ turn['semantic_explanation'] | escape }}) {% endif %} {% endif %}
{% if turn.get('user_input') %}
User: {{ turn['user_input'] | escape }} {% if golden_modality == "audio" and r.get('session_id') %} {% endif %}
{% elif golden_modality == "audio" and r.get('session_id') %}
{# ctype is assigned before is_fail reads it. Names set inside a loop body
   are local to each iteration, so reading ctype ahead of its assignment
   always yields an undefined value; the `ctype == "text"` test would then
   never be true and a low semantic score would never mark a text comparison
   as failed. -#}
{% endif %} {% if golden_modality == "audio" and r.get('session_id') %} {% endif %} {% for comp in turn.get('comparisons', []) %} {% set ctype = comp.get('type', '?') %} {% set is_fail = comp["outcome"] == "FAIL" or (ctype == "text" and sem is not none and sem < 3) %} {% set c_cls = "fail-bg" if is_fail else "pass-bg" %} {% set badge_cls = "fail" if is_fail else ("pass" if comp["outcome"] == "PASS" else "") %} {% set icon = {"text": "💬", "tool_call": "🔧", "transfer": "🔀"}.get(ctype, "") %}
{# The outcome label is omitted only for text comparisons whose outcome is
   "UNSPECIFIED". -#}
{% if ctype != "text" or comp["outcome"] != "UNSPECIFIED" %} {{ comp["outcome"] }} {% endif %} {{ icon | safe }} {{ ctype }}
Expected: {{ comp.get('expected', '') | string | escape }}
Actual: {{ comp.get('actual', '') | string | escape }} {% if ctype == "tool_call" %} {% if comp.get('tool_invocation_score') is not none %}
Tool Invocation Score: {{ comp['tool_invocation_score'] }}
{% endif %} {% if comp.get('tool_invocation_explanation') %}
Explanation: {{ comp['tool_invocation_explanation'] | escape }}
{% endif %} {% endif %} {% if ctype == "tool_call" and (comp.get('expected_args') or comp.get('actual_args')) %}
Tool call args
{% if comp.get('expected_args') %}
Expected args:
{# default=str lets json.dumps serialise values that are not natively JSON
   encodable by falling back to their string form. -#}
{{ json.dumps(comp['expected_args'], indent=2, default=str) | escape }}
{% endif %} {% if comp.get('actual_args') %}
Actual args:
{{ json.dumps(comp['actual_args'], indent=2, default=str) | escape }}
{% endif %}
{% endif %}
{% endfor %}
{% endfor %}
{# Expectations attached to the current golden result. e_cls is "met" only
   when the status is exactly "Met". The expectation text is truncated to 150
   characters and the optional justification to 250 before escaping. -#}
{% for exp in r.get('expectations', []) %} {% set e_cls = "met" if exp["status"] == "Met" else "not-met" %}
{{ exp["status"] | escape }} {{ exp["expectation"] | string | truncate(150) | escape }} {% if exp.get('justification') %}
{{ exp['justification'] | string | truncate(250) | escape }} {% endif %}
{% endfor %}
{# The leading endfor/endif close the Goldens section above; the rest of the
   line opens the Simulations detail section. s_pct_str falls back to "0%"
   when s_total is falsy, which avoids dividing by zero. -#}
{% endfor %} {% endif %} {% if sim_results %} {% set s_pct_str = "%.0f%%" | format(100 * s_passed / s_total) if s_total else "0%" %}

Simulations ({{ s_passed }}/{{ s_total }} — {{ s_pct_str }})

{# Group the flat sim_results list by simulation name so that all runs of one simulation are rendered together. The grouping is built by mutating the grouped_sims dict through update()/append() calls whose return value is discarded into `_`; each group tracks "pass", "total" and its "runs". Groups are then iterated in ascending order of their pass count, and a group counts as passed only when pass equals total. #} {% set grouped_sims = {} %} {% for r in sim_results %} {% if r.name not in grouped_sims %} {% set _ = grouped_sims.update({r.name: {"pass": 0, "total": 0, "runs": []}}) %} {% endif %} {% set _ = grouped_sims[r.name].update({"total": grouped_sims[r.name]["total"] + 1}) %} {% if r.get("passed") %} {% set _ = grouped_sims[r.name].update({"pass": grouped_sims[r.name]["pass"] + 1}) %} {% endif %} {% set _ = grouped_sims[r.name]["runs"].append(r) %} {% endfor %} {% for name, s in grouped_sims.items() | sort(attribute='1.pass', reverse=false) %} {% set score = s['pass'] ~ '/' ~ s['total'] %} {% set cls = "pass-bg" if s['pass'] == s['total'] else "fail-bg" %} {% set passed_str = "true" if s['pass'] == s['total'] else "false" %}
{{ name | escape }} sim {{ score }}
{# One entry per run of the current simulation group `s`. The loops and
   conditionals left open at the end of this section are closed by the lines
   that follow it. -#}
{% for r in s["runs"] %} {% set run_cls = "pass" if r.get("passed") else "fail" %} {% set failed_str = "false" if r.get("passed") else "true" %}
{# Run summary line; missing fields are shown as "?". "Full Conversation" is
   emitted only for audio modality when the run has a session_id. The
   ces_base branches are empty in the template as shown. -#}
Run {{ r.get('run','?') }} — {% if r.get('passed') %}PASS{% else %}FAIL{% endif %} | goals: {{ r.get('goals','?') }} | expectations: {{ r.get('expectations','?') }} | turns: {{ r.get('turns','?') }}{% if r.get('duration_s') %} | {{ _fmt_duration(r['duration_s']) }}{% endif %} {% if r.get('session_id') %} {% if ces_base %} {% else %} {% endif %} {% if sim_modality == "audio" %}
Full Conversation
{% endif %} {% endif %} {% if r.get('session_parameters') %}
Session Parameters
{{ json.dumps(r['session_parameters'], indent=2) | escape }}
{# When the run contains an "error" key only the error text is shown;
   otherwise the goal steps and expectations are listed. -#}
{% endif %} {% if "error" in r %}
Error: {{ r["error"] | escape }}
{# s_cls is "pass" only for steps whose status is exactly "Completed". -#}
{% else %} {% for step in r.get('step_details', []) %} {% set s_cls = "pass" if step["status"] == "Completed" else "fail" %}
Goal: {{ step["goal"] | escape }}
Criteria: {{ step["success_criteria"] | escape }}
Status: {{ step["status"] | escape }} {% if step.get('justification') %}
Justification: {{ step["justification"] | escape }} {% endif %}
{# e_cls is "met" only for expectations whose status is exactly "Met". -#}
{% endfor %} {% for exp in r.get('expectation_details', []) %} {% set e_cls = "met" if exp["status"] == "Met" else "not-met" %}
{{ exp["status"] | escape }} {{ exp["expectation"] | escape }} {% if exp.get('justification') %}
Justification: {{ exp["justification"] | escape }} {% endif %}
{# The leading endfor closes the expectations loop above. The trace is shown
   only when the run has a truthy "_processed_trace". Each trace item is
   indexed positionally: item[0] is the kind, item[1] the text, and for
   "tool_pair" items item[2] is the tool output text. -#}
{% endfor %} {% if r.get('_processed_trace') %}
Conversation Trace ({{ r.get('turns','?') }} turns)
{# A namespace object is used so the counters can be updated from inside the
   loop; the counters are not printed anywhere in the template as shown. -#}
{% set ns = namespace(user_turn=0, agent_turn=0) %} {% for item in r['_processed_trace'] %} {% set kind = item[0] %} {% if kind == "user" %} {% set ns.user_turn = ns.user_turn + 1 %}
User: {{ item[1] | escape }} {% if sim_modality == "audio" and r.get('session_id') %} {% endif %}
{% elif kind == "agent" %} {% set ns.agent_turn = ns.agent_turn + 1 %}
Agent: {{ item[1] | escape }} {% if sim_modality == "audio" and r.get('session_id') %} {% endif %}
{# Tool call text is split at " with args "; the label then drops the
   "Tool Call: " / "Tool Call (Output): " prefixes and keeps only the part
   after the last "/". -#}
{% elif kind in ("tool_call", "tool_pair") %} {% set lbl, _, args = item[1].partition(" with args ") %} {% set lbl = lbl.replace("Tool Call: ", "").replace("Tool Call (Output): ", "").split("/")[-1] %}
🔧 {{ lbl | escape }} {% if args %}
Input:
{{ args | escape }}
{% endif %} {% if kind == "tool_pair" %} {% set _, _, result = item[2].partition(" with result ") %} {% if result %}
Output:
{{ result | escape }}
{% endif %} {% endif %}
{# Standalone tool responses are split at " with result " and labelled the
   same way, after dropping the "Tool Response: " prefix. -#}
{% elif kind == "tool_resp" %} {% set lbl, _, result = item[1].partition(" with result ") %} {% set lbl = lbl.replace("Tool Response: ", "").split("/")[-1] %}
📤 {{ lbl | escape }} {% if result %}
{{ result | escape }}
{% endif %}
{# Any other kind is printed as escaped text. -#}
{% else %}
{{ item[1] | escape }}
{% endif %} {% endfor %}
{% endif %} {% endif %}
{% endfor %}
{# The leading endfor/endif close the Simulations section above; the rest of
   the line opens the Tool Tests section. t_pct falls back to 0 when t_total
   is falsy, which avoids dividing by zero. -#}
{% endfor %} {% endif %} {% if tool_results %} {% set t_pct = 100 * t_passed / t_total if t_total else 0 %}

Tool Tests ({{ t_passed }}/{{ t_total }} — {{ "%.0f" | format(t_pct) }}%)

ResultToolTestLatencyErrors
{# The header row above is emitted once; the data row has to sit inside the
   loop because `r` and `lat` are scoped to one iteration and are undefined
   once the loop has been closed. Results are sorted by "passed", so failing
   tests (false) come first. lat is "-" when latency_ms is missing or falsy.
   The if opened on the first line of this section is closed by the line
   that follows the section. -#}
{% for r in tool_results | sort(attribute='passed') %} {% set cls = "pass" if r["passed"] else "fail" %} {% set passed_str = "true" if r["passed"] else "false" %} {% set lat = "%.0fms" | format(r["latency_ms"]) if r.get("latency_ms") else "-" %}
{{ r["status"] }} {{ r["tool"] | escape }} {{ r["name"] | escape }} {{ lat }} {{ r.get("errors", "") | string | truncate(100) | escape }}
{% endfor %}
{# The leading endif closes the Tool Tests section above; the rest of the
   line opens the Callback Tests section. c_pct falls back to 0 when c_total
   is falsy, which avoids dividing by zero. -#}
{% endif %} {% if callback_results %} {% set c_pct = 100 * c_passed / c_total if c_total else 0 %}

Callback Tests ({{ c_passed }}/{{ c_total }} — {{ "%.0f" | format(c_pct) }}%)

ResultAgentCallbackTestError
{# The header row above is emitted once; the data row has to sit inside the
   loop because `r` is scoped to one iteration and is undefined once the loop
   has been closed. Results are sorted by "passed", so failing tests (false)
   come first. The error text is truncated to 100 characters. -#}
{% for r in callback_results | sort(attribute='passed') %} {% set cls = "pass" if r["passed"] else "fail" %} {% set passed_str = "true" if r["passed"] else "false" %}
{{ r["status"] }} {{ r["agent"] | escape }} {{ r["callback_type"] | escape }} {{ r["name"] | escape }} {{ r.get("error", "") | string | truncate(100) | escape }}
{% endfor %}
{% endif %}