# Source code for scitex_agent_container.config._validation

"""YAML config validation."""

from __future__ import annotations

import os
import re
import sys
from pathlib import Path

import yaml

# F-CS6 — yaml-field rename for ``spec.runtime``.
#
# The internal codebase still keys dispatch on the original names
# (``claude-code`` / ``claude-session``), so the new aliases are
# normalised back to the canonical form at load time. A stderr
# warning is emitted once per shell session per renamed value so
# stale yamls keep working without a constant log nag.
#
# §5 of the scitex CLI conventions mandates HARD redirects, but that
# rule governs CLI commands; yaml field values can't be atomically
# rewritten across every host's checked-in agent definitions, so a
# soft alias is the right level of breakage here.
# Maps each yaml-facing alias (key) to the canonical internal runtime
# name (value). ``normalize_runtime`` looks values up here.
_RUNTIME_RENAMES = {
    "claude-cli-tui": "claude-code",
    "claude-sdk-persistent": "claude-session",
}


def _runtime_alias_warn_marker(old_name: str) -> Path:
    """Return the marker-file path for one renamed value in one shell session.

    Keying on the parent PID gives one warning per *interactive shell* —
    child invocations from the same shell don't re-print. Matches the
    pattern documented in scitex/general/03_interface_02_cli/
    11_deprecation.md §5a.

    Args:
        old_name: The runtime value whose alias warning is being tracked.

    Returns:
        A path under ``$XDG_RUNTIME_DIR`` (``/tmp`` when unset) whose name
        embeds the user, the parent PID and ``old_name``. The file is not
        created here.
    """
    runtime_dir = os.environ.get("XDG_RUNTIME_DIR", "/tmp")
    user = os.environ.get("USER", "u")
    # ``PPID`` is a shell variable that shells normally do not export, so
    # it is usually absent from ``os.environ``. Falling back to a constant
    # would collapse every shell onto one marker; ask the OS for the real
    # parent PID instead. An explicitly exported ``PPID`` still wins.
    ppid = os.environ.get("PPID") or str(os.getppid())
    return Path(runtime_dir) / f"sac-runtime-rename-{user}-{ppid}-{old_name}.flag"


def normalize_runtime(value: str | None) -> str | None:
    """Return the canonical runtime value; warn on first use of an alias.

    Accepts the new yaml-friendly aliases (``claude-cli-tui``,
    ``claude-sdk-persistent``) and returns the long-standing internal
    names (``claude-code``, ``claude-session``). Unknown / canonical
    values pass through unchanged. ``None`` becomes ``None``.
    """
    if value is None:
        return None
    canonical = _RUNTIME_RENAMES.get(value)
    if canonical is None:
        return value
    marker = _runtime_alias_warn_marker(value)
    if not marker.exists():
        try:
            marker.parent.mkdir(parents=True, exist_ok=True)
            marker.touch()
        except OSError:  # stx-allow: fallback (reason: marker is best-effort; missing /tmp shouldn't block load)
            pass
        sys.stderr.write(
            f"warning: spec.runtime: '{value}' is the new alias for "
            f"'{canonical}'. Both work; the alias will become canonical "
            "in a future major release. (F-CS6)\n"
        )
        sys.stderr.flush()
    return canonical


# Accepted shapes for ``spec.model`` (F-CS7).
#
# claude-agent-sdk silently rejects unknown aliases — the runner stays
# alive, the heartbeat is fresh, but every turn returns 0 input tokens
# and 0 output tokens because the SDK never makes the API call. Pin
# the validation here so the failure surfaces at yaml-validate time
# instead of as a hung-looking agent.
#
# Two acceptable shapes:
#   1. Bare alias: ``opus`` / ``sonnet`` / ``haiku`` / ``inherit`` /
#      ``default``, optionally with a context-suffix (``[1m]``).
#   2. Full versioned form: ``claude-<family>-N-M`` with optional date
#      tail (``-20251001``) and optional context-suffix.
#
# Reproduction (2026-05-05): ``claude-opus[1m]`` (abbreviated, missing
# the version digits) was accepted by the YAML loader but silently
# rejected by the SDK — every turn returned ``input_tokens=0``,
# ``output_tokens=0``, ``iterations=[]``. Other peers using
# ``claude-opus-4-7[1m]`` worked fine.
# The end anchor is ``\Z`` rather than ``$``: ``$`` also matches just
# before a trailing newline, which would let a value such as ``"opus\n"``
# (e.g. from a YAML block scalar) pass validation even though it is not
# one of the accepted shapes.
_VALID_MODEL_RE = re.compile(
    r"""
    ^(?:
        (?:opus|sonnet|haiku|inherit|default)
        |
        claude-(?:opus|sonnet|haiku)-\d+-\d+(?:-[a-z0-9]+)*
    )
    (?:\[[a-zA-Z0-9_]+\])?
    \Z
    """,
    re.VERBOSE,
)

# ``apiVersion`` values that ``validate_raw`` accepts.
_VALID_API_VERSIONS = ("scitex-agent-container/v3",)

# Keys allowed at the top level of the document; ``validate_raw`` reports
# every other top-level key as an error.
_KNOWN_TOP_LEVEL_KEYS = frozenset({"apiVersion", "kind", "metadata", "spec"})


def _legacy_runtime_redirect(old: str, new: str, image: str) -> str:
    """Build the §5-style hard-error text for a renamed runtime value.

    The message names the exact replacement fields so a stale yaml can be
    fixed in one pass, e.g.:

      spec.runtime: claude-session   ->   spec.runtime: docker
                                          spec.image:   scitex-agent-container:sdk-persistent
                                          spec.dockerfile: ./containers/Dockerfile.sdk-persistent

    Args:
        old: The legacy ``spec.runtime`` value found in the yaml.
        new: The ``spec.runtime`` value to use instead.
        image: The ``spec.image`` value to recommend.

    Returns:
        A multi-line message with no trailing newline.
    """
    message_lines = [
        f"spec.runtime: '{old}' was renamed to spec.runtime: '{new}'. Set:",
        f"  spec.runtime:    {new}",
        f"  spec.image:      {image}",
        "  spec.dockerfile: ./containers/Dockerfile.sdk-persistent",
        "(F-CS16 phase 2e: sac is container-only; the old runtime "
        "names are no longer accepted.)",
    ]
    return "\n".join(message_lines)


# F-CS16 phase 2e — every legacy runtime value hard-errors with a
# redirect that names the new shape. The mapping
# (``_LEGACY_RUNTIME_REDIRECTS``, defined below) is kept module-level so
# tests can pin individual messages without re-deriving the dict.
#
# ``_SDK_IMAGE`` is the image tag that the redirect messages recommend
# for ``spec.image``.
_SDK_IMAGE = "scitex-agent-container:sdk-persistent"


def legacy_runtime_redirect_message(runtime: str) -> str | None:
    """Return the §5-style redirect text for a legacy runtime, or None.

    Phase 2e.1 callers (lifecycle, dispatch helpers, error reporters)
    use this to surface "use ``runtime: docker`` + image + dockerfile"
    guidance even while the validator still accepts the legacy value.
    F-CS17's sweep flips the validator itself to call this function
    and append the result to ``errors``.
    """
    return _LEGACY_RUNTIME_REDIRECTS.get(runtime)


# Legacy ``spec.runtime`` value -> full error text shown to the operator.
# The two SDK renames share the ``_legacy_runtime_redirect`` template; the
# CLI/TUI and slurm entries carry hand-written messages because they were
# removed rather than renamed. Looked up by
# ``legacy_runtime_redirect_message``.
_LEGACY_RUNTIME_REDIRECTS = {
    "claude-session": _legacy_runtime_redirect("claude-session", "docker", _SDK_IMAGE),
    "claude-sdk-persistent": _legacy_runtime_redirect(
        "claude-sdk-persistent", "docker", _SDK_IMAGE
    ),
    "claude-code": (
        "spec.runtime: 'claude-code' (CLI/TUI runtime) is no longer "
        "supported by sac. The CLI/TUI path was removed in F-CS17. "
        "Use the SDK runner instead:\n"
        "  spec.runtime:    docker\n"
        f"  spec.image:      {_SDK_IMAGE}\n"
        "  spec.dockerfile: ./containers/Dockerfile.sdk-persistent"
    ),
    "claude-cli-tui": (
        "spec.runtime: 'claude-cli-tui' is no longer supported "
        "(CLI/TUI runtime removed in F-CS17). Use:\n"
        "  spec.runtime:    docker\n"
        f"  spec.image:      {_SDK_IMAGE}\n"
        "  spec.dockerfile: ./containers/Dockerfile.sdk-persistent"
    ),
    "slurm": (
        "spec.runtime: 'slurm' is no longer supported. Sac is a "
        "container wrapper; HPC scheduling is the operator's "
        "concern (submit your own sbatch and run 'sac agent start' "
        "inside the allocation). See F-CS16 design doc."
    ),
    "slurm-tenant": (
        "spec.runtime: 'slurm-tenant' is no longer supported (see "
        "the redirect for 'slurm'). Submit sbatch yourself and "
        "invoke 'sac agent start' inside the allocation."
    ),
}

# All spec keys read by load_v3, parsers, or a2a/_server.py.
# Unknown keys are rejected at parse time so typos surface at boot.
# Intentional extension data belongs under spec.extensions.
# ``validate_raw`` reports any ``spec`` key missing from this set as an
# "Unknown spec field" error.
_KNOWN_SPEC_KEYS = frozenset(
    {
        "runtime",
        "image",  # F-CS16 phase 2a — flattened from spec.container.image
        "dockerfile",  # F-CS16 phase 2a — auto-build source when image missing
        "model",
        "workdir",
        "python-venv",
        "env",
        "screen",
        "container",  # runtime / mount_host_claude / network are validated
        "claude",
        "health",  # health.method is validated
        "watchdog",
        "restart",  # restart.policy is validated
        "hooks",
        "telegram",
        "remote",
        "skills",
        "startup_commands",
        "startup",
        "context_management",
        "listen",
        "extensions",
        "mcp_servers",
        "multiplexer",  # 'screen' or 'tmux'
        "host",  # mutually exclusive with "hosts"
        "hosts",
        "session",  # shortcut alias for spec.claude.session
        "scheduling",  # rejected with a specific actionable message below
        "a2a",  # A2A sidecar config read by a2a/_server.py
        "orochi",  # Orochi-specific extension namespace
        "autonomous",  # F-CS3 — drive-until-done block
        "apptainer",  # F-CS18 — apptainer-specific build extension
    }
)


def _sorted_keys(keys) -> list:
    """Sort mapping keys for stable error output, tolerating mixed types."""
    try:
        return sorted(keys)
    except TypeError:
        # YAML allows non-string keys (``1:``, ``on:`` -> True). Mixed with
        # strings they are not mutually orderable, so order by their text.
        return sorted(keys, key=str)


def _spec_section(spec: dict, key: str, errors: list[str]) -> dict:
    """Return ``spec[key]`` as a mapping; record an error if it is not one."""
    section = spec.get(key)
    if not section:
        # Missing or empty (None, {}, "", ...) is treated as "no settings".
        return {}
    if not isinstance(section, dict):
        errors.append(f"spec.{key} must be a mapping, got {type(section).__name__}")
        return {}
    return section


def _validate_host_fields(spec: dict, errors: list[str]) -> None:
    """Check ``spec.host`` / ``spec.hosts`` (mutually exclusive) and their types."""
    has_host = "host" in spec
    has_hosts = "hosts" in spec
    if has_host and has_hosts:
        errors.append(
            "spec.host and spec.hosts are mutually exclusive — set "
            "exactly one (host: singleton, hosts: multi-instance)"
        )
    if has_host:
        host_val = spec.get("host")
        if host_val is not None and not isinstance(host_val, (str, list)):
            errors.append(
                f"spec.host must be a string, list of strings, or empty; "
                f"got {type(host_val).__name__}"
            )
        elif isinstance(host_val, list) and not all(
            isinstance(h, str) for h in host_val
        ):
            errors.append("spec.host list must contain only strings")
    if has_hosts:
        hosts_val = spec.get("hosts")
        if hosts_val is None:
            errors.append(
                "spec.hosts cannot be empty — use 'all' (every fleet "
                "host) or a list of host names"
            )
        elif isinstance(hosts_val, str) and hosts_val != "all":
            errors.append(f"spec.hosts string must be 'all', got '{hosts_val}'")
        elif isinstance(hosts_val, list) and not all(
            isinstance(h, str) for h in hosts_val
        ):
            errors.append("spec.hosts list must contain only strings")
        elif not isinstance(hosts_val, (str, list)):
            errors.append(
                f"spec.hosts must be 'all' or a list of strings; "
                f"got {type(hosts_val).__name__}"
            )


def _validate_autonomous(spec: dict, errors: list[str]) -> None:
    """Check the ``spec.autonomous`` block (F-CS3 phase 1 — drive-until-done)."""
    autonomous = spec.get("autonomous")
    if autonomous is None:
        return
    if not isinstance(autonomous, dict):
        errors.append(
            "spec.autonomous must be a mapping; got "
            f"{type(autonomous).__name__}"
        )
        return
    drive_until = autonomous.get("drive_until")
    if drive_until is not None and not isinstance(drive_until, str):
        errors.append("spec.autonomous.drive_until must be a string")
    elif drive_until == "":
        errors.append("spec.autonomous.drive_until must be non-empty")
    for fld in ("max_turns", "idle_kick_after_s"):
        val = autonomous.get(fld)
        if val is not None:
            # bool is a subclass of int; reject it explicitly.
            if not isinstance(val, int) or isinstance(val, bool):
                errors.append(f"spec.autonomous.{fld} must be an integer")
            elif val <= 0:
                errors.append(f"spec.autonomous.{fld} must be > 0")
    kick = autonomous.get("kick_text")
    if kick is not None and not isinstance(kick, str):
        errors.append("spec.autonomous.kick_text must be a string")
    enabled = autonomous.get("enabled")
    if enabled is not None and not isinstance(enabled, bool):
        errors.append("spec.autonomous.enabled must be a boolean")


def validate_raw(raw: dict, path: str) -> list[str]:
    """Validate a raw YAML dict and return the list of problems found.

    The function never raises on malformed input: wrong-typed sections
    (e.g. ``spec.container: "docker"`` or ``spec.runtime: [docker]``) are
    reported as error strings like every other problem.

    Args:
        raw: The parsed YAML document. Anything that is not a mapping
            yields a single "not a YAML mapping" error.
        path: Path of the source file; used only in that error message.

    Returns:
        Error strings in check order (top-level keys, apiVersion, kind,
        metadata, then the ``spec`` checks). Empty means valid.
    """
    if not isinstance(raw, dict):
        return [f"Config file is not a YAML mapping: {path}"]

    errors: list[str] = []

    # Unknown top-level keys
    valid_top = sorted(_KNOWN_TOP_LEVEL_KEYS)
    for k in _sorted_keys(set(raw) - _KNOWN_TOP_LEVEL_KEYS):
        errors.append(f"Unknown top-level field '{k}'. Valid keys: {valid_top}.")

    # apiVersion
    api_version = raw.get("apiVersion")
    if api_version not in _VALID_API_VERSIONS:
        errors.append(
            f"apiVersion must be one of {_VALID_API_VERSIONS}, got '{api_version}'"
        )

    # kind
    kind = raw.get("kind")
    if kind != "Agent":
        errors.append(f"kind must be 'Agent', got '{kind}'")

    # metadata (optional dict — agent name comes from parent dir, not from
    # metadata.name; the field is no longer accepted)
    metadata = raw.get("metadata")
    if metadata is not None and not isinstance(metadata, dict):
        errors.append("metadata, if present, must be a mapping")
    elif isinstance(metadata, dict) and "name" in metadata:
        errors.append(
            "metadata.name is no longer accepted; the agent name is "
            "derived from the parent directory (dir-as-SSoT). Remove "
            "the metadata.name field and ensure the YAML lives at "
            "<name>/<name>.yaml."
        )

    # spec
    spec = raw.get("spec")
    if not isinstance(spec, dict):
        errors.append("spec is required and must be a mapping")
        return errors

    # Unknown spec keys
    known_spec = sorted(_KNOWN_SPEC_KEYS)
    for k in _sorted_keys(set(spec) - _KNOWN_SPEC_KEYS):
        errors.append(
            f"Unknown spec field '{k}'. "
            f"Use spec.extensions for custom data; "
            f"known keys: {known_spec}."
        )

    # spec.runtime — F-CS17 stage 2.
    #
    # The migration's grace period (phase 2e.1) is over. Every legacy
    # value now hard-errors with the redirect string from
    # ``legacy_runtime_redirect_message``. Canonical engines (docker /
    # podman / apptainer) remain the only accepted values.
    runtime = spec.get("runtime")
    valid_runtimes = ("docker", "podman", "apptainer")
    # Only strings can be legacy names; guarding here keeps an unhashable
    # value (list / mapping) from raising inside the lookup.
    legacy_msg = (
        legacy_runtime_redirect_message(runtime) if isinstance(runtime, str) else None
    )
    if legacy_msg is not None:
        errors.append(legacy_msg)
    elif runtime and runtime not in valid_runtimes:
        errors.append(
            f"spec.runtime must be one of {valid_runtimes}, got '{runtime}'"
        )

    # spec.image (F-CS16 phase 2a) — top-level container image tag.
    # Empty string is allowed and falls back to the default at
    # dispatch time. Type check only here.
    image = spec.get("image")
    if image is not None and not isinstance(image, str):
        errors.append(f"spec.image must be a string, got {type(image).__name__}")

    # spec.dockerfile (F-CS16 phase 2a) — host-relative path to a
    # Dockerfile sac auto-builds when ``image`` is missing locally
    # (phase 2d wires the build). Type check only.
    dockerfile = spec.get("dockerfile")
    if dockerfile is not None and not isinstance(dockerfile, str):
        errors.append(
            f"spec.dockerfile must be a string, got {type(dockerfile).__name__}"
        )

    # spec.model — F-CS7: validate against accepted SDK aliases /
    # versioned forms. The SDK silently rejects unknown values
    # (heartbeat fresh, every turn returns 0 tokens), so we surface
    # bad strings at yaml-validate time. Empty / missing is allowed
    # — runtime falls back to its default.
    model = spec.get("model")
    if model is not None:
        if not isinstance(model, str):
            errors.append(f"spec.model must be a string, got {type(model).__name__}")
        # fullmatch: the whole value must match, so a trailing newline
        # (e.g. from a YAML block scalar) is rejected too.
        elif model and not _VALID_MODEL_RE.fullmatch(model):
            errors.append(
                f"spec.model '{model}' is not an accepted alias. "
                "Use a bare alias ('opus', 'sonnet', 'haiku', 'inherit', "
                "'default'), optionally with a context suffix like "
                "'opus[1m]'; OR the full versioned form "
                "'claude-<family>-N-M[-<tail>]' (e.g. 'claude-opus-4-7', "
                "'claude-opus-4-7[1m]', 'claude-haiku-4-5-20251001'). "
                "Abbreviated forms like 'claude-opus[1m]' are rejected "
                "by the SDK without raising — every turn returns 0 "
                "tokens."
            )

    # container.runtime
    container = _spec_section(spec, "container", errors)
    cr = container.get("runtime")
    if cr and cr not in ("none", "docker", "podman", "apptainer"):
        errors.append(
            f"spec.container.runtime must be none|docker|podman|apptainer, got '{cr}'"
        )

    # container.mount_host_claude (opt-in; default False)
    mhc = container.get("mount_host_claude")
    if mhc is not None and not isinstance(mhc, bool):
        errors.append(
            "spec.container.mount_host_claude must be a boolean, got "
            f"{type(mhc).__name__}"
        )

    # container.network
    network = container.get("network")
    if network and network not in ("host", "bridge", "none"):
        errors.append(
            f"spec.container.network must be host|bridge|none, got '{network}'"
        )

    # restart.policy
    policy = _spec_section(spec, "restart", errors).get("policy")
    if policy and policy not in ("never", "on-failure", "always"):
        errors.append(
            f"spec.restart.policy must be never|on-failure|always, got '{policy}'"
        )

    # multiplexer
    mux = spec.get("multiplexer")
    if mux and mux not in ("screen", "tmux"):
        errors.append(f"spec.multiplexer must be 'screen' or 'tmux', got '{mux}'")

    # health.method
    method = _spec_section(spec, "health", errors).get("method")
    if method and method not in ("multiplexer-alive",):
        errors.append(
            f"spec.health.method must be 'multiplexer-alive', got '{method}'"
        )

    # host / hosts (mutually exclusive)
    _validate_host_fields(spec, errors)

    # spec.autonomous (F-CS3 phase 1) — drive-until-done.
    _validate_autonomous(spec, errors)

    # Reject the old `scheduling:` block — replaced by host/hosts.
    if "scheduling" in spec:
        errors.append(
            "spec.scheduling block is no longer accepted. Use spec.host "
            "(singleton, optionally with fallback list) or spec.hosts "
            "(multi-instance, 'all' or list)."
        )

    return errors


[docs] def validate_config(path: str | Path) -> list[str]: """Validate a config file and return list of errors (empty = valid).""" path = Path(path).resolve() try: with open(path) as f: raw = yaml.safe_load(f) except ( FileNotFoundError ): # stx-allow: fallback (reason: file may not exist on first use) return [f"File not found: {path}"] except ( yaml.YAMLError ) as exc: # stx-allow: fallback (reason: expected failure — see inline comment) return [f"YAML parse error: {exc}"] return validate_raw(raw, str(path))