Evaluation Dashboard

Generated at: 2025-12-23 12:52:48

Total Scripts

1
Evaluated

Overall Average

2.889
Across all providers

Total Time

46.24
Seconds

Providers

6
OPENAI GROQ GEMINI

Key Metrics

Score Trends

Provider Comparison

Score Distributions

Processing Timeline

Evaluation Summary

OPENAI

  • output
  • metadata

GROQ

  • output
  • metadata

GEMINI

  • output
  • metadata

Detailed Results

Click to view raw JSON data
{
  "started_at": "2025-12-23T12:51:43.722761",
  "finished_at": "2025-12-23T12:52:29.961676",
  "evaluation_summary": {
    "openai": {
      "output": {
        "negative_summary": [],
        "context_retention": {
          "score": 1.0,
          "issues": []
        },
        "gricean_multi_turn": {
          "relation": {
            "violated": false,
            "justification": "The dialogue maintained a consistent focus on addressing the user's queries."
          },
          "quality": {
            "violated": false,
            "justification": "There were no contradictions across turns in the agent's responses."
          },
          "quantity": {
            "violated": false,
            "justification": "The information was provided succinctly without redundancy."
          },
          "manner": {
            "violated": false,
            "justification": "All references were clear and unambiguous throughout the dialogue."
          }
        },
        "goal_progression": 2,
        "memory_coherence": 1.0,
        "diagnostic_summary": "The dialogue effectively met all user needs with clarity and engagement."
      },
      "metadata": {
        "input_tokens": 986,
        "output_tokens": 196
      }
    },
    "groq": {
      "output": {
        "negative_summary": [
          "- No inaccuracies or omissions found",
          "- No misunderstandings detected",
          "- All key points covered in the dialogue",
          "- Exact match with expected outcomes",
          "- Most points covered with minor wording variations"
        ],
        "context_retention": {
          "score": 1.0,
          "issues": []
        },
        "gricean_multi_turn": {
          "relation": {
            "violated": false,
            "justification": "The dialogue stayed goal-coherent and did not drift from the topic."
          },
          "quality": {
            "violated": false,
            "justification": "No cross-turn contradictions were found in the dialogue."
          },
          "quantity": {
            "violated": false,
            "justification": "The information was well-paced and did not contain redundant or missing cumulative information."
          },
          "manner": {
            "violated": false,
            "justification": "References were clear across turns and did not contain ambiguous terms."
          }
        },
        "goal_progression": 2,
        "memory_coherence": 1.0,
        "diagnostic_summary": "The dialogue was successful, with all key points covered, exact matches with expected outcomes, and no inaccuracies or omissions detected."
      },
      "metadata": {
        "input_tokens": 980,
        "output_tokens": 266
      }
    },
    "gemini": {
      "output": {
        "negative_summary": [],
        "context_retention": {
          "score": 1.0,
          "issues": [
            "No user-provided facts were present in the dialogue trace for evaluation."
          ]
        },
        "gricean_multi_turn": {
          "relation": {
            "violated": false,
            "justification": "The agent's turns consistently address distinct but plausible sub-goals (information queries, transaction) without apparent topic drift within the agent's responses."
          },
          "quality": {
            "violated": false,
            "justification": "All agent replies are explicitly stated as accurate and complete in the facts and judge verdicts, with no cross-turn contradictions indicated."
          },
          "quantity": {
            "violated": false,
            "justification": "Each agent turn provides complete information relevant to its task without redundancy or omission, based on the provided facts."
          },
          "manner": {
            "violated": false,
            "justification": "Without explicit text snippets, no ambiguity in references can be identified, and high judge scores suggest clarity."
          }
        },
        "goal_progression": 2,
        "memory_coherence": 1.0,
        "diagnostic_summary": "The dialogue exhibits strong agent performance across all turns, with no identified failures or areas for improvement based on the provided data."
      },
      "metadata": {
        "input_tokens": 1056,
        "output_tokens": 245,
        "total_tokens": 3171,
        "finish_reason": "STOP"
      }
    }
  },
  "average_scores": {
    "openai": 3.0,
    "groq": 2.667,
    "gemini": 3.0,
    "guardrail": 1.0,
    "metadata": 1.0,
    "processing_time": 46.223
  },
  "script_results": [
    {
      "script_id": "f3bb3d04-f7d6-4d17-ae5b-20cb754958e7",
      "attempts": [
        {
          "attempt_nbr": 1,
          "attempt_id": "867839dd-060e-4bbe-88c3-86f5146b5099",
          "script_id": "f3bb3d04-f7d6-4d17-ae5b-20cb754958e7",
          "total_duration": 46.223461866378784,
          "interaction_results": [
            {
              "conversation_id": "867839dd-060e-4bbe-88c3-86f5146b5099",
              "user_message": "Hello, I need information on your medical services.",
              "generated_reply": "Sure! We offer a variety of dental services including routine check-ups, surgical procedures, and restorative treatments. If you have specific questions or need details about a particular service, feel free to ask!",
              "reference_reply": "We offer a variety of dental services including routine check-ups, surgical procedures, and restorative treatments. If you have specific questions or need to book an appointment, feel free to ask!",
              "generated_metadata": {
                "appointment_type": "",
                "appointment_date": "",
                "doctor_name": ""
              },
              "reference_metadata": {},
              "guardrail_details": false,
              "evaluation_results": {
                "judge_evaluations": {
                  "openai": {
                    "provider": "openai",
                    "score": 3,
                    "label": "Excellent",
                    "justification": "The AGENT's reply is accurate, friendly, and invites further questions, fulfilling the user's request.",
                    "evidence": {
                      "covered_points": [
                        "variety of dental services",
                        "routine check-ups",
                        "invitation to ask questions"
                      ],
                      "missing_or_wrong": []
                    },
                    "task_metadata": {
                      "task_type": "Information Query",
                      "task_success": true,
                      "user_sentiment": "neutral"
                    },
                    "gricean": {
                      "quantity": {
                        "violated": false,
                        "justification": "Provided sufficient information without excess."
                      },
                      "quality": {
                        "violated": false,
                        "justification": "No false or unsupported claims made."
                      },
                      "relation": {
                        "violated": false,
                        "justification": "Directly relevant to user's inquiry."
                      },
                      "manner": {
                        "violated": false,
                        "justification": "Clear and orderly presentation."
                      },
                      "violation_count": 0,
                      "score": 1.0
                    },
                    "engagement_score": 0.8
                  },
                  "groq": {
                    "provider": "groq",
                    "score": 3,
                    "label": "Excellent",
                    "justification": "All key points covered",
                    "evidence": {
                      "covered_points": [
                        "dental services",
                        "routine check-ups",
                        "surgical procedures"
                      ],
                      "missing_or_wrong": []
                    },
                    "task_metadata": {
                      "task_type": "Information Query",
                      "task_success": true,
                      "user_sentiment": "neutral"
                    },
                    "gricean": {
                      "quantity": {
                        "violated": false,
                        "justification": "sufficient info"
                      },
                      "quality": {
                        "violated": false,
                        "justification": "accurate"
                      },
                      "relation": {
                        "violated": false,
                        "justification": "relevant"
                      },
                      "manner": {
                        "violated": false,
                        "justification": "clear"
                      },
                      "violation_count": 0,
                      "score": 1.0
                    },
                    "engagement_score": 0.8
                  },
                  "gemini": {
                    "provider": "gemini",
                    "score": 3,
                    "label": "Excellent",
                    "justification": "The agent's reply is nearly identical to the expected response, effectively addressing the user's information query.",
                    "evidence": {
                      "covered_points": [
                        "variety of dental services",
                        "routine check-ups, surgical procedures, restorative treatments",
                        "invite further questions"
                      ],
                      "missing_or_wrong": []
                    },
                    "task_metadata": {
                      "task_type": "Information Query",
                      "task_success": true,
                      "user_sentiment": "neutral"
                    },
                    "gricean": {
                      "quantity": {
                        "violated": false,
                        "justification": "Appropriate overview; not overly verbose for initial broad query."
                      },
                      "quality": {
                        "violated": false,
                        "justification": "No false or unsupported information provided."
                      },
                      "relation": {
                        "violated": false,
                        "justification": "Directly addresses the user's request for service information."
                      },
                      "manner": {
                        "violated": false,
                        "justification": "Reply is clear, concise, and easy to understand."
                      },
                      "violation_count": 0,
                      "score": 1.0
                    },
                    "engagement_score": 0.8
                  }
                },
                "metadata_evaluation": {},
                "guardrail_flag": 1
              },
              "turn_summary": {
                "turn_index": 0,
                "role": "A",
                "task_type": "Information Query",
                "task_success": true,
                "score": 3.0,
                "engagement": 0.8,
                "gricean_violations": 0,
                "sentiment": "neutral",
                "key_facts": [
                  "The AGENT's reply is accurate, friendly, and invites further questions, fulfilling the user's request.",
                  "All key points covered",
                  "The agent's reply is nearly identical to the expected response, effectively addressing the user's information query."
                ],
                "guardrail_triggered": false,
                "compact_repr": "[T0][A][task=Information Query][s=3.0][e=0.80][g=0]] Facts: [The AGENT's reply is accurate, friendly, and invites further questions, fulfilling the user's request., All key points covered, The agent's reply is nearly identical to the expected response, effectively addressing the user's information query.]"
              }
            },
            {
              "conversation_id": "867839dd-060e-4bbe-88c3-86f5146b5099",
              "user_message": "I want to book an appointment next Monday to remove my wisdom tooth",
              "generated_reply": "Your SURGICAL appointment with Dr. Trafalgar D. Water Law is booked for 2023-10-30.",
              "reference_reply": "Your SURGICAL appointment with Dr. Trafalgar D. Water Law is booked for 2023-10-30.",
              "generated_metadata": {
                "appointment_type": "SURGICAL",
                "appointment_date": "2023-10-30",
                "doctor_name": "Dr. Trafalgar D. Water Law"
              },
              "reference_metadata": {
                "appointment_type": "SURGICAL",
                "appointment_date": "2023-10-30",
                "doctor_name": "Dr. Trafalgar D. Water Law"
              },
              "guardrail_details": false,
              "evaluation_results": {
                "judge_evaluations": {
                  "openai": {
                    "provider": "openai",
                    "score": 3,
                    "label": "Excellent",
                    "justification": "The AGENT's reply completely aligns with expected outcomes.",
                    "evidence": {
                      "covered_points": [
                        "appointment confirmation",
                        "specific doctor",
                        "date provided"
                      ],
                      "missing_or_wrong": []
                    },
                    "task_metadata": {
                      "task_type": "Service Transaction",
                      "task_success": true,
                      "user_sentiment": "neutral"
                    },
                    "gricean": {
                      "quantity": {
                        "violated": false,
                        "justification": "Reply provided all necessary information."
                      },
                      "quality": {
                        "violated": false,
                        "justification": "No falsehoods present in response."
                      },
                      "relation": {
                        "violated": false,
                        "justification": "Response directly relevant to user's request."
                      },
                      "manner": {
                        "violated": false,
                        "justification": "Response is clear and concise."
                      },
                      "violation_count": 0,
                      "score": 1.0
                    },
                    "engagement_score": 0.8
                  },
                  "groq": {
                    "provider": "groq",
                    "score": 3,
                    "label": "Excellent",
                    "justification": "Exact match",
                    "evidence": {
                      "covered_points": [
                        "SURGICAL appointment",
                        "Dr. Trafalgar D. Water Law",
                        "2023-10-30"
                      ],
                      "missing_or_wrong": []
                    },
                    "task_metadata": {
                      "task_type": "Service Transaction",
                      "task_success": true,
                      "user_sentiment": "neutral"
                    },
                    "gricean": {
                      "quantity": {
                        "violated": false,
                        "justification": "Enough info"
                      },
                      "quality": {
                        "violated": false,
                        "justification": "No falsehoods"
                      },
                      "relation": {
                        "violated": false,
                        "justification": "Relevant"
                      },
                      "manner": {
                        "violated": false,
                        "justification": "Clear"
                      },
                      "violation_count": 0,
                      "score": 1.0
                    },
                    "engagement_score": 0.8
                  },
                  "gemini": {
                    "provider": "gemini",
                    "score": 3,
                    "label": "Excellent",
                    "justification": "The agent's reply is identical to the expected output, confirming the requested surgical appointment accurately and completely.",
                    "evidence": {
                      "covered_points": [
                        "surgical appointment confirmed",
                        "doctor's name",
                        "date booked"
                      ],
                      "missing_or_wrong": []
                    },
                    "task_metadata": {
                      "task_type": "Service Transaction",
                      "task_success": true,
                      "user_sentiment": "neutral"
                    },
                    "gricean": {
                      "quantity": {
                        "violated": false,
                        "justification": "Provides all required information for booking confirmation."
                      },
                      "quality": {
                        "violated": false,
                        "justification": "Assumed to be truthful given it matches the expected reply."
                      },
                      "relation": {
                        "violated": false,
                        "justification": "Directly responds to the user's request to book an appointment."
                      },
                      "manner": {
                        "violated": false,
                        "justification": "The reply is clear, concise, and well-ordered."
                      },
                      "violation_count": 0,
                      "score": 1.0
                    },
                    "engagement_score": 0.8
                  }
                },
                "metadata_evaluation": {
                  "appointment_type": 1.0,
                  "doctor_name": 1.0,
                  "appointment_date": 1.0
                },
                "guardrail_flag": 1
              },
              "turn_summary": {
                "turn_index": 1,
                "role": "A",
                "task_type": "Service Transaction",
                "task_success": true,
                "score": 3.0,
                "engagement": 0.8,
                "gricean_violations": 0,
                "sentiment": "neutral",
                "key_facts": [
                  "The AGENT's reply completely aligns with expected outcomes.",
                  "Exact match",
                  "The agent's reply is identical to the expected output, confirming the requested surgical appointment accurately and completely."
                ],
                "guardrail_triggered": false,
                "compact_repr": "[T1][A][task=Service Transaction][s=3.0][e=0.80][g=0]] Facts: [The AGENT's reply completely aligns with expected outcomes., Exact match, The agent's reply is identical to the expected output, confirming the requested surgical appointment accurately and completely.]"
              }
            },
            {
              "conversation_id": "867839dd-060e-4bbe-88c3-86f5146b5099",
              "user_message": "Do I need to do any paperwork beforehand?",
              "generated_reply": "Yes, we recommend completing any necessary paperwork before your appointment to ensure a smooth visit. You can fill it out online or arrive a bit early to complete it in the office.",
              "reference_reply": "Yes, we recommend completing any necessary paperwork before your appointment to ensure a smooth visit. You can usually find the forms on our website or arrive a bit early to fill them out at the clinic.",
              "generated_metadata": {
                "appointment_type": "",
                "appointment_date": "",
                "doctor_name": ""
              },
              "reference_metadata": {},
              "guardrail_details": false,
              "evaluation_results": {
                "judge_evaluations": {
                  "openai": {
                    "provider": "openai",
                    "score": 3,
                    "label": "Excellent",
                    "justification": "The AGENT's reply provides complete and accurate information. Minor wording variations do not affect clarity.",
                    "evidence": {
                      "covered_points": [
                        "recommend completing paperwork",
                        "ensure a smooth visit",
                        "fill it out online or in-office"
                      ],
                      "missing_or_wrong": []
                    },
                    "task_metadata": {
                      "task_type": "Information Query",
                      "task_success": true,
                      "user_sentiment": "neutral"
                    },
                    "gricean": {
                      "quantity": {
                        "violated": false,
                        "justification": "Provides all necessary information."
                      },
                      "quality": {
                        "violated": false,
                        "justification": "No false or unsupported statements."
                      },
                      "relation": {
                        "violated": false,
                        "justification": "Directly answers the user's question."
                      },
                      "manner": {
                        "violated": false,
                        "justification": "Clear and orderly response."
                      },
                      "violation_count": 0,
                      "score": 1.0
                    },
                    "engagement_score": 0.8
                  },
                  "groq": {
                    "provider": "groq",
                    "score": 2,
                    "label": "Good",
                    "justification": "Most points covered",
                    "evidence": {
                      "covered_points": [
                        "paperwork recommended",
                        "smooth visit"
                      ],
                      "missing_or_wrong": [
                        "website link missing"
                      ]
                    },
                    "task_metadata": {
                      "task_type": "Information Query",
                      "task_success": false,
                      "user_sentiment": "neutral"
                    },
                    "gricean": {
                      "quantity": {
                        "violated": true,
                        "justification": "lacks website info"
                      },
                      "quality": {
                        "violated": false,
                        "justification": "no falsehoods"
                      },
                      "relation": {
                        "violated": false,
                        "justification": "relevant info"
                      },
                      "manner": {
                        "violated": false,
                        "justification": "clear and brief"
                      },
                      "violation_count": 1,
                      "score": 0.75
                    },
                    "engagement_score": 0.275
                  },
                  "gemini": {
                    "provider": "gemini",
                    "score": 3,
                    "label": "Excellent",
                    "justification": "The agent's reply is virtually identical to the expected reply, providing all necessary information clearly and accurately.",
                    "evidence": {
                      "covered_points": [
                        "recommend completing paperwork",
                        "ensure smooth visit",
                        "online/arrive early options"
                      ],
                      "missing_or_wrong": []
                    },
                    "task_metadata": {
                      "task_type": "Information Query",
                      "task_success": true,
                      "user_sentiment": "neutral"
                    },
                    "gricean": {
                      "quantity": {
                        "violated": false,
                        "justification": "Provided sufficient information without being verbose."
                      },
                      "quality": {
                        "violated": false,
                        "justification": "Reply is truthful and consistent with standard practice."
                      },
                      "relation": {
                        "violated": false,
                        "justification": "Directly answered the user's question about paperwork."
                      },
                      "manner": {
                        "violated": false,
                        "justification": "The reply is clear, concise, and well-structured."
                      },
                      "violation_count": 0,
                      "score": 1.0
                    },
                    "engagement_score": 0.8
                  }
                },
                "metadata_evaluation": {},
                "guardrail_flag": 1
              },
              "turn_summary": {
                "turn_index": 2,
                "role": "A",
                "task_type": "Information Query",
                "task_success": false,
                "score": 2.5,
                "engagement": 0.625,
                "gricean_violations": 0,
                "sentiment": "neutral",
                "key_facts": [
                  "The AGENT's reply provides complete and accurate information. Minor wording variations do not affect clarity.",
                  "Most points covered",
                  "The agent's reply is virtually identical to the expected reply, providing all necessary information clearly and accurately."
                ],
                "guardrail_triggered": false,
                "compact_repr": "[T2][A][task=Information Query][s=2.5][e=0.62][g=0]] Facts: [The AGENT's reply provides complete and accurate information. Minor wording variations do not affect clarity., Most points covered, The agent's reply is virtually identical to the expected reply, providing all necessary information clearly and accurately.]"
              }
            }
          ],
          "evaluation_verdicts": {
            "openai": [
              "The AGENT's reply is accurate, friendly, and invites further questions, fulfilling the user's request.",
              "The AGENT's reply completely aligns with expected outcomes.",
              "The AGENT's reply provides complete and accurate information. Minor wording variations do not affect clarity."
            ],
            "groq": [
              "All key points covered",
              "Exact match",
              "Most points covered"
            ],
            "gemini": [
              "The agent's reply is nearly identical to the expected response, effectively addressing the user's information query.",
              "The agent's reply is identical to the expected output, confirming the requested surgical appointment accurately and completely.",
              "The agent's reply is virtually identical to the expected reply, providing all necessary information clearly and accurately."
            ]
          },
          "average_scores": {
            "openai": 3.0,
            "groq": 2.667,
            "gemini": 3.0,
            "guardrail": 1.0,
            "metadata": 1.0,
            "processing_time": 46.223
          },
          "interaction_summaries": [
            "[T0][A][task=Information Query][s=3.0][e=0.80][g=0]] Facts: [The AGENT's reply is accurate, friendly, and invites further questions, fulfilling the user's request., All key points covered, The agent's reply is nearly identical to the expected response, effectively addressing the user's information query.]",
            "[T1][A][task=Service Transaction][s=3.0][e=0.80][g=0]] Facts: [The AGENT's reply completely aligns with expected outcomes., Exact match, The agent's reply is identical to the expected output, confirming the requested surgical appointment accurately and completely.]",
            "[T2][A][task=Information Query][s=2.5][e=0.62][g=0]] Facts: [The AGENT's reply provides complete and accurate information. Minor wording variations do not affect clarity., Most points covered, The agent's reply is virtually identical to the expected reply, providing all necessary information clearly and accurately.]"
          ]
        }
      ],
      "average_scores": {
        "openai": 3.0,
        "groq": 2.667,
        "gemini": 3.0,
        "guardrail": 1.0,
        "metadata": 1.0,
        "processing_time": 46.223
      }
    }
  ],
  "batch_id": "8658266d-0d23-415f-a23d-eb7d88551a20",
  "elapsed_time": 46.238915
}