{% if is_comparison %}

Search Evaluation Comparison: {{ config_a }} vs {{ config_b }}

{% if sibling_report %}
This run also evaluated autocomplete. View Autocomplete Report →
{% endif %}

Metrics Comparison

{% for d in comparison_data %} {% if d.a_na and d.b_na %} {% else %} {% endif %} {% endfor %}
Metric {{ config_a }} {{ config_b }} Delta % Change
{{ d.name }} {% if d.a_na %}N/A{% else %}{{ "%.4f"|format(d.value_a) }}{% endif %} {% if d.b_na %}N/A{% else %}{{ "%.4f"|format(d.value_b) }}{% endif %}- - {{ "%+.4f"|format(d.delta) }} {{ "%+.1f"|format(d.pct_change) }}%
{% if win_loss %}

Query-Level Outcome

{{ win_loss.total }} queries compared on NDCG@10
{% if win_loss.win_pct > 0 %}
{% if win_loss.win_pct >= 8 %}{{ win_loss.wins }}{% endif %}
{% endif %} {% if win_loss.tie_pct > 0 %}
{% if win_loss.tie_pct >= 8 %}{{ win_loss.ties }}{% endif %}
{% endif %} {% if win_loss.loss_pct > 0 %}
{% if win_loss.loss_pct >= 8 %}{{ win_loss.losses }}{% endif %}
{% endif %}
{{ config_b }} wins — {{ win_loss.wins }} ({{ win_loss.win_pct }}%) Tied — {{ win_loss.ties }} ({{ win_loss.tie_pct }}%) {{ config_a }} wins — {{ win_loss.losses }} ({{ win_loss.loss_pct }}%)
{% endif %} {% if overlap_summary %}

Result Set Overlap

{{ overlap_summary.mean_jaccard }}
Mean Jaccard Index
{{ overlap_summary.interpretation }}
Based on {{ overlap_summary.query_count }} queries
{% endif %} {% if scatter_plot %}

NDCG@10 Scatter Plot

Each point is a query. Points above the diagonal improved in {{ config_b }}; points below regressed.

{% for g in scatter_plot.gridlines %} {{ g.val }} {{ g.val }} {% endfor %} {% for p in scatter_plot.points %} {{ p.query }} — {{ config_a }}: {{ p.ndcg_a }}, {{ config_b }}: {{ p.ndcg_b }} ({{ p.type }}) {% endfor %} {{ config_a }} NDCG@10 {{ config_b }} NDCG@10 {% if scatter_plot.legend %}
{% for entry in scatter_plot.legend %} {{ entry.type }} {% endfor %}
{% endif %}
{% endif %} {% if score_dist_a or score_dist_b %}

Score Distribution

{% for label, dist in [(config_a, score_dist_a), (config_b, score_dist_b)] %} {% if dist %}
{{ label }}
{{ dist.total }} judgments
{% if dist.pcts[3] > 0 %}
{% if dist.pcts[3] >= 8 %}{{ dist.counts[3] }}{% endif %}
{% endif %} {% if dist.pcts[2] > 0 %}
{% if dist.pcts[2] >= 8 %}{{ dist.counts[2] }}{% endif %}
{% endif %} {% if dist.pcts[1] > 0 %}
{% if dist.pcts[1] >= 8 %}{{ dist.counts[1] }}{% endif %}
{% endif %} {% if dist.pcts[0] > 0 %}
{% if dist.pcts[0] >= 8 %}{{ dist.counts[0] }}{% endif %}
{% endif %}
{% endif %} {% endfor %}
3 2 1 0
{% endif %} {% if position_line_chart %}

Mean Relevance by Position

Average relevance score (0–3) at each result position. Lines should slope downward if ranking places the most relevant results first.

{% for g in position_line_chart.gridlines %} 0 %}stroke-dasharray="4,4"{% endif %} /> {{ g.label }} {% endfor %} {% for t in position_line_chart.x_ticks %} #{{ t.label }} {% endfor %} {% if position_line_chart.polyline_a %} {% for p in position_line_chart.points_a %} Position {{ p.pos }}: {{ p.score }}/3 ({{ config_a }}) {% endfor %} {% endif %} {% if position_line_chart.polyline_b %} {% for p in position_line_chart.points_b %} Position {{ p.pos }}: {{ p.score }}/3 ({{ config_b }}) {% endfor %} {% endif %}
{{ config_a }} {{ config_b }}
{% endif %} {% if check_comparison %}

Deterministic Checks

{% for c in check_comparison %} {% endfor %}
Check Failed ({{ config_a }}) Failed ({{ config_b }}) Delta
{{ c.display_name }} {% if check_descriptions and check_descriptions.get(c.name) %} ? {% endif %} {{ c.failed_a }} {{ c.failed_b }} {{ "%+d"|format(c.delta) }}
{% endif %} {% if correction_data_a or correction_data_b %}

Query Corrections

{% for label, cdata in [(config_a, correction_data_a), (config_b, correction_data_b)] %} {% if cdata %}

{{ label }}

{% for c in cdata %} {% endfor %}
Original Query Corrected Query Verdict Reasoning
{{ c.original_query }} {{ c.corrected_query }} {{ c.verdict }} {{ c.reasoning }}
{% endif %} {% endfor %} {% endif %} {% if winners %}

Biggest Improvements (NDCG@10)

{% for w in winners %} {% if w.has_details %} {% endif %} {% endfor %}
Query {{ config_a }} {{ config_b }} Delta
{{ w.query }} {{ "%.4f"|format(w.value_a) }} {{ "%.4f"|format(w.value_b) }} {{ "%+.4f"|format(w.delta) }}
Show results side-by-side
{% for label, results in [(config_a, w.results_a), (config_b, w.results_b)] %} {% if results %}
{{ label }}
{% for r in results %} {% endfor %}
#ProductScoreAttributes
{{ r.position }} {{ r.title }}
{{ r.product_id }}
{{ r.score }}/3 {{ r.attribute_verdict }}
{% endif %} {% endfor %}
{% endif %} {% if losers %}

Biggest Regressions (NDCG@10)

{% for l in losers %} {% if l.has_details %} {% endif %} {% endfor %}
Query {{ config_a }} {{ config_b }} Delta
{{ l.query }} {{ "%.4f"|format(l.value_a) }} {{ "%.4f"|format(l.value_b) }} {{ "%+.4f"|format(l.delta) }}
Show results side-by-side
{% for label, results in [(config_a, l.results_a), (config_b, l.results_b)] %} {% if results %}
{{ label }}
{% for r in results %} {% endfor %}
#ProductScoreAttributes
{{ r.position }} {{ r.title }}
{{ r.product_id }}
{{ r.score }}/3 {{ r.attribute_verdict }}
{% endif %} {% endfor %}
{% endif %} {% if type_comparison %}

Metrics by Query Type

{% for qt in query_types %}
{% for m in type_comparison %} {% set t = m.types[qt] %} {% if t.value_a is not none or t.value_b is not none %} {% endif %} {% endfor %}
{{ query_type_display_names.get(qt, qt) }} {% if query_type_descriptions and query_type_descriptions.get(qt) %} ? {% endif %}
Metric {{ config_a }} {{ config_b }} Delta
{{ m.name }} {% if t.value_a is not none %}{{ "%.4f"|format(t.value_a) }}{% else %}-{% endif %} {% if t.value_b is not none %}{{ "%.4f"|format(t.value_b) }}{% else %}-{% endif %} {% if t.delta is not none %}{{ "%+.4f"|format(t.delta) }}{% else %}-{% endif %}
{% endfor %} {% endif %} {% if shift_checks %}

Biggest Position Shifts

{% for c in shift_checks %} {% endfor %}
Query Product Detail
{{ c.query }} {{ c.product_id }} {{ c.detail }}
{% endif %} {% else %}

Search Evaluation Report

{% if sibling_report %}
This run also evaluated autocomplete. View Autocomplete Report →
{% endif %}

IR Metrics

{% for m in metrics %} {% endfor %}
Metric Value
{{ m.metric_name }} {% if metric_descriptions and metric_descriptions.get(m.metric_name) %} ? {% endif %} {% if m.query_count is not none and m.query_count == 0 %} N/A (no queries with attribute constraints) {% elif m.query_count is not none and m.total_queries is not none and m.query_count < m.total_queries %} {{ "%.4f"|format(m.value) }} (n={{ m.query_count }} of {{ m.total_queries }} queries) {% else %} {{ "%.4f"|format(m.value) }} {% endif %}
{% if position_chart %}

Average Score by Result Position

Bars should decrease from left to right if your ranking places the most relevant results first.

{% for score in [0, 1, 2, 3] %} 0 %}stroke-dasharray="4,4"{% endif %} /> {{ score }} {% endfor %} {% for bar in position_chart %} Position {{ bar.position }}: avg {{ "%.2f"|format(bar.avg_score) }}/3 ({{ bar.count }} queries) {{ "%.2f"|format(bar.avg_score) }} #{{ bar.position }} {% endfor %}
{% endif %} {% if ndcg_histogram %}

NDCG@10 Distribution

{% if ndcg_stats %}
NDCG@10 spread:  min={{ "%.4f"|format(ndcg_stats.min) }} · median={{ "%.4f"|format(ndcg_stats.median) }} · max={{ "%.4f"|format(ndcg_stats.max) }} · stdev={{ "%.4f"|format(ndcg_stats.stdev) }}
{% endif %}

Distribution of per-query NDCG@10 scores. Red bins indicate poor relevance; green bins indicate strong relevance.

{% for bar in ndcg_histogram %} {{ bar.bin_label }}–{{ "%.1f"|format(bar.bin_label|float + 0.1) }}: {{ bar.count }} queries {% if bar.count > 0 %} {{ bar.count }} {% endif %} {{ bar.bin_label }} {% endfor %}
{% endif %} {% if score_counts %}

Score Distribution

{{ score_total }} judgments across all queries
{% if score_pcts[3] > 0 %}
{% if score_pcts[3] >= 8 %}{{ score_counts[3] }}{% endif %}
{% endif %} {% if score_pcts[2] > 0 %}
{% if score_pcts[2] >= 8 %}{{ score_counts[2] }}{% endif %}
{% endif %} {% if score_pcts[1] > 0 %}
{% if score_pcts[1] >= 8 %}{{ score_counts[1] }}{% endif %}
{% endif %} {% if score_pcts[0] > 0 %}
{% if score_pcts[0] >= 8 %}{{ score_counts[0] }}{% endif %}
{% endif %}
3 — {{ score_counts[3] }} ({{ "%.1f"|format(score_pcts[3]) }}%) 2 — {{ score_counts[2] }} ({{ "%.1f"|format(score_pcts[2]) }}%) 1 — {{ score_counts[1] }} ({{ "%.1f"|format(score_pcts[1]) }}%) 0 — {{ score_counts[0] }} ({{ "%.1f"|format(score_pcts[0]) }}%)
{% if score_by_type %}
By Query Type
{% for dist in score_by_type %}
{{ dist.display_name }} ({{ dist.total }} judgments)
{% if dist.pcts[3] > 0 %}
{% if dist.pcts[3] >= 10 %}{{ dist.counts[3] }}{% endif %}
{% endif %} {% if dist.pcts[2] > 0 %}
{% if dist.pcts[2] >= 10 %}{{ dist.counts[2] }}{% endif %}
{% endif %} {% if dist.pcts[1] > 0 %}
{% if dist.pcts[1] >= 10 %}{{ dist.counts[1] }}{% endif %}
{% endif %} {% if dist.pcts[0] > 0 %}
{% if dist.pcts[0] >= 10 %}{{ dist.counts[0] }}{% endif %}
{% endif %}
{% endfor %}
{% endif %}
{% endif %} {% if query_types %}

Metrics by Query Type

{% for qt in query_types %} {% endfor %} {% for m in type_metrics %} {% for qt in query_types %} {% endfor %} {% endfor %}
Metric {{ query_type_display_names.get(qt, qt) }} {% if query_type_descriptions and query_type_descriptions.get(qt) %} ? {% endif %}
{{ m.metric_name }}{% if m.by_query_type.get(qt) is not none %}{{ "%.4f"|format(m.by_query_type[qt]) }}{% else %}-{% endif %}
{% endif %} {% if check_summary %}

Deterministic Checks

{% for name, counts in check_summary %} {% set has_failures = check_failures and check_failures.get(name) %} {% endfor %}
Check Passed Failed
{% if has_failures %} {% set cf = check_failures[name] %}
{{ counts.display_name }} {% if check_descriptions and check_descriptions.get(name) %} ? {% endif %}
{% for item in cf["entries"] %}
{{ item.query }} {% if item.product_id %}{{ item.product_id }}{% endif %} {{ item.detail }} {{ item.severity }}
{% endfor %} {% if cf["total"] > cf["entries"]|length %}
…and {{ cf["total"] - cf["entries"]|length }} more
{% endif %}
{% else %} {{ counts.display_name }} {% if check_descriptions and check_descriptions.get(name) %} ? {% endif %} {% endif %}
{{ counts.passed_display }} {{ counts.failed }}
{% endif %} {% if correction_summary %}

Query Corrections

{% for c in correction_summary %} {% endfor %}
Original Query Corrected Query Verdict Reasoning
{{ c.original_query }} {{ c.corrected_query }} {{ c.verdict }} {{ c.reasoning }}
{% endif %} {% if worst_queries %}

Worst Performing Queries (NDCG@10)

{% for wq in worst_queries %} {% endfor %}
Query Query Type NDCG@10 Failed Checks
{% if wq.anchor_id %}{{ wq.query }}{% else %}{{ wq.query }}{% endif %} {{ wq.query_type }} {{ "%.4f"|format(wq.ndcg) }} {{ wq.failed_checks }}
{% endif %} {% if judgments_for_template %}

Judgment Details

Click a query to expand individual product scores and LLM reasoning. Sorted by worst average score first.

{% for item in judgments_for_template %}
{{ item.query }}{% if item.correction %} → {{ item.correction.corrected_query }} ({{ item.correction.verdict }}){% endif %} avg score {{ "%.2f"|format(item.avg_score) }}  ·  {{ item.judgments|length }} results {% for j in item.judgments %} {% endfor %}
# Product Score Attributes Reasoning
{{ j.product.position + 1 }} {{ j.product.title }}
{{ j.product.product_id }} {% if j.product.category or j.product.price > 0 or j.product.in_stock is false %}
{% if j.product.category %}{{ j.product.category }}{% endif %} {% if j.product.price > 0 %}${{ "%.2f"|format(j.product.price) }}{% endif %} {% if j.product.in_stock is false %}Out of stock{% endif %}
{% endif %}
{{ j.score }}/3 {{ j.attribute_verdict }} {{ j.reasoning }} {% if j.metadata.get('failed_checks') %}
{% for fc in j.metadata.failed_checks %} {{ fc.check_name }}: {{ fc.detail }}{% if not loop.last %}; {% endif %} {% endfor %}
{% endif %}
{% endfor %}
{% endif %} {% endif %}