Skip to content

meshflow — Eval API Reference

Evaluation harness, baselines, regression gating, and production feedback.

EvalSuite

from meshflow import EvalSuite, EvalScenario, EvalResult, ScenarioResult, run_eval

# From YAML
suite = EvalSuite.from_yaml("evals.yaml")
result: EvalResult = await suite.run(agent)

# Shorthand
result = await run_eval(agent, "evals.yaml")

# EvalResult fields
result.suite_name
result.passed          # int
result.failed          # int
result.pass_rate       # float 0–1
result.scenarios       # list[ScenarioResult]
result.total_cost_usd
result.total_tokens

YAML format:

suite: my-agent-eval
scenarios:
  - name: math
    input: "What is 2 + 2?"
    expected: "4"
    judge: exact_match

  - name: summarization
    input: "Summarize: The sky is blue."
    judge: llm
    criteria: "Response is a concise accurate summary"

  - name: contains-check
    input: "List 3 European capitals"
    expected: "Paris"
    judge: contains

Baselines + Regression

import sys

from meshflow import EvalBaseline, BaselineDiff

# Save baseline
result = await suite.run(agent)
baseline = EvalBaseline.from_result(result)
baseline.save("baseline.json")

# Compare
current = await suite.run(agent)
diff: BaselineDiff = BaselineDiff.compare(baseline, current)
print(diff.pass_rate_delta)
print(diff.cost_delta_usd)
if diff.is_regression:
    sys.exit(1)

CLI:

meshflow eval run evals.yaml --save-baseline baseline.json
meshflow eval run evals.yaml --compare-baseline baseline.json --fail-on-regression

Cost Regression CI Gate

import sys

from meshflow.eval.ci_gate import CIBudgetGate, GateResult

gate = CIBudgetGate(
    max_cost_regression=0.10,    # fail if cost increases > 10%
    max_token_regression=0.15,   # fail if tokens increase > 15%
    min_quality_score=0.80,      # fail if quality drops below 80%
)

result: GateResult = gate.evaluate(baseline, current_result)
if result.failed:
    print(result.summary())
    sys.exit(1)

A GitHub Actions workflow (.github/workflows/cost-regression.yml) is included in the repo.

LLMJudge

from meshflow import LLMJudge, JudgeScore, JudgeSuiteResult

judge = LLMJudge(
    criteria="Is the response accurate, helpful, and concise?",
    scale=10,
)

score: JudgeScore = await judge.score(
    input="What is Python?",
    output="Python is a high-level programming language.",
)
print(score.score)      # 0–10
print(score.rationale)

ConversationEval

from meshflow import ConversationEval, ConversationCase, EvalTurn, EvalConversationResult

conversation_eval = ConversationEval(agent=agent)  # avoid shadowing the builtin eval()
case = ConversationCase(
    name="multi-turn-support",
    turns=[
        EvalTurn(input="Hi, I need help.", expected_contains="help"),
        EvalTurn(input="What are your hours?", judge="llm",
                 criteria="Provides business hours or asks for clarification"),
    ]
)

result: EvalConversationResult = await conversation_eval.run(case)
for turn in result.turn_results:
    print(turn.passed, turn.score)

ABTest

from meshflow import ABTest, ABVariant, ABTestResult

test = ABTest(
    variants=[
        ABVariant(name="gpt4o",  agent=Agent(name="a", model="gpt-4o")),
        ABVariant(name="sonnet", agent=Agent(name="b", model="claude-sonnet-4-6")),
    ],
    inputs=["Summarize AI safety", "What is RAG?"],
    judge=LLMJudge(criteria="accuracy and conciseness"),
)

result: ABTestResult = await test.run()
print(result.winner)           # variant name
print(result.cost_comparison)  # dict of variant → total cost

QualityGate

import sys

from meshflow import QualityGate, QualityReport

gate = QualityGate(
    min_pass_rate=0.90,
    min_avg_score=7.5,
    judge=LLMJudge(criteria="correctness"),
)

report: QualityReport = await gate.evaluate(agent, eval_suite)
if report.failed:
    print(report.reason)
    sys.exit(1)

Production Feedback + Shadow

from meshflow import FeedbackStore, FeedbackRecord, shadow_run, ShadowResult

# Collect feedback
store = FeedbackStore("feedback.db")
store.record(FeedbackRecord(
    run_id="run-123",
    step_id="node-2",
    rating=4,
    comment="Good but could be more concise",
))

# Shadow run — run the new agent in parallel with production and compare outputs, without affecting prod
result: ShadowResult = await shadow_run(
    production_agent=prod_agent,
    shadow_agent=candidate_agent,
    task="Summarize the quarterly report",
)
print(result.production_output)
print(result.shadow_output)
print(result.regression_detected)