"""Dataclass definitions for agent configuration."""
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict
@dataclass
class ContainerSpec:
    """Container settings for an agent.

    Attributes:
        runtime: One of ``none``, ``docker`` or ``apptainer``.
        image: Container image reference.
        volumes: Volume entries; empty by default.
        network: Network setting; defaults to ``"host"``.
        mount_host_claude: Opt-in mount of the host's ``~/.claude``
            directory into the container (off by default).
    """

    runtime: str = "none"  # none | docker | apptainer
    image: str = "scitex-agent-container:latest"
    volumes: list[str] = field(default_factory=list)
    network: str = "host"
    # Opt-in auto-mount of the host's ``~/.claude`` directory at
    # ``/home/agent/.claude:ro`` inside the container. Default False: the
    # container is the isolation boundary, and auto-mounting leaks host
    # identity/skills/MCP/memory into every agent — a surprising default.
    # Set ``mount_host_claude: true`` in the YAML only when the agent
    # actually needs host-agent identity/memory/skills from ``~/.claude``.
    mount_host_claude: bool = False
@dataclass
class ClaudeSpec:
    """Claude launch options: channels, flags and session-restart policy."""

    channels: list[str] = field(default_factory=list)
    flags: list[str] = field(default_factory=list)
    # Session restart strategy. One of:
    #   continue-or-new  try --continue, fall back to a fresh launch if
    #                    no prior session (default)
    #   continue         always pass --continue (fails if no prior
    #                    session exists)
    #   new              never pass --continue
    #   resume           pass --resume <resume_id> (explicit session ID)
    session: str = "continue-or-new"
    # Only resume if the most recent session jsonl is newer than this
    # many minutes. None = no age check (always resume if session exists).
    continue_max_age_minutes: int | None = None
    # Explicit session ID to pass to --resume. Only used when
    # session="resume".
    resume_id: str = ""
    auto_accept: bool = True
@dataclass
class HealthSpec:
    """Health-check settings; disabled by default."""

    enabled: bool = False
    # NOTE(review): interval/timeout units are presumably seconds — confirm
    # against the code that consumes them.
    interval: int = 30
    timeout: int = 5
    method: str = "multiplexer-alive"
@dataclass
class WatchdogSpec:
    """Watchdog settings, kept for backward compatibility.

    Parsed for backward compat but not interpreted by the runtime; the
    watchdog lifecycle is managed externally via hooks.
    """

    enabled: bool = False
    interval: float = 1.5
    # NOTE(review): the ``resp_*`` values look like canned responses for
    # particular prompt states — confirm against the external watchdog.
    resp_y_n: str = "1"
    resp_y_y_n: str = "2"
    resp_waiting: str = "/speak-and-call"
# F-CS3 — autonomous drive-until-done (schema: ``AutonomousSpec``, defined further below).
#
# claude-session runners do ONE turn and idle by default; multi-turn
# tasks have to wrap externally with a2a peer post-turn loops, and
# every project ends up rewriting that scaffolding. The autonomous
# block lets the runner natively:
#
# 1. Watch each assistant turn for a text match (``drive_until``);
# hitting it exits the runner with code 0.
# 2. After ``idle_kick_after_s`` of no tool activity AND no match,
# post ``kick_text`` so the conversation keeps moving.
# 3. Cap at ``max_turns`` to prevent runaway loops.
#
# Phase 1 (this dataclass + parser + validator) lands the schema so
# yamls can author the contract today; the runner-side enforcement
# (consume these fields in _runners.claude_session) lands in phase 2.
# An ``enabled`` row authored under the schema before phase 2 ships
# is harmless — the runner just ignores it for now.
# F-CS18 — apptainer-specific extension hook.
#
# Apptainer reads OCI images natively (`apptainer build sif docker://...`),
# so for the no-extras case spec.image alone is enough — sac just
# `apptainer build`s the SIF and runs it. For HPC-specific layering
# (extra pip packages, system libs, env vars), the operator can either:
#
# * declare `spec.apptainer.post` — sac synthesises a `.def` with
# `Bootstrap: docker` + `%post` + `%environment` and builds from it.
# * declare `spec.apptainer.def_file` — sac runs `apptainer build`
# against the operator's hand-written `.def` (full control).
#
# All fields are optional; an `apptainer:` block with no fields set is
# equivalent to none at all.
@dataclass
class ApptainerSpec:
    """Optional Apptainer image-build and launch extensions (F-CS18)."""

    post: str = ""
    """Shell text for the SIF build's ``%post`` section. Lines are
    concatenated verbatim; an empty string means no extension."""

    environment: dict = field(default_factory=dict)
    """Environment variables baked into the SIF via ``%environment``,
    written as KEY: VALUE pairs (same shape as ``spec.env``)."""

    def_file: str = ""
    """Path to a hand-written ``.def`` file in Apptainer's native build
    language. Mutually exclusive with ``post``/``environment``: when
    set, sac uses the file verbatim and ignores ``post``."""

    nv: bool = False
    """Pass Apptainer's ``--nv`` to forward the host NVIDIA driver/libs
    into the container. Required for CUDA workloads on GPU nodes;
    harmless on CPU-only hosts, but only set it when needed."""

    rocm: bool = False
    """Pass Apptainer's ``--rocm`` to forward the host AMD ROCm libs.
    Mutually exclusive with ``nv`` in practice (no host has both)."""
@dataclass
class AutonomousSpec:
    """Schema for the autonomous drive-until-done contract (F-CS3).

    Attributes:
        enabled: Turns the autonomous block on; off by default.
        drive_until: Text looked for in each assistant turn; a match is
            meant to end the run with exit code 0.
        max_turns: Upper bound on turns, to prevent runaway loops.
        idle_kick_after_s: Seconds without tool activity (and without a
            match) before ``kick_text`` is posted.
        kick_text: Message posted to keep the conversation moving.

    Per the design comment in this module, this dataclass only lands the
    schema; runner-side enforcement is described as a later phase.
    """

    enabled: bool = False
    drive_until: str = "DONE"
    max_turns: int = 50
    idle_kick_after_s: int = 120
    kick_text: str = "Continue. Print DONE when finished."
@dataclass
class RestartSpec:
    """Restart policy and retry/backoff parameters for an agent."""

    policy: str = "never"  # never | on-failure | always
    max_retries: int = 3
    # NOTE(review): backoff values are presumably seconds, grown by
    # ``backoff_multiplier`` up to ``backoff_max`` — confirm with the
    # code that applies them.
    backoff_initial: int = 30
    backoff_max: int = 300
    backoff_multiplier: int = 2
@dataclass
class TelegramSpec:
    """Telegram settings, kept for backward compatibility.

    Parsed for backward compat but not interpreted by the runtime;
    Telegram setup is managed externally via hooks.
    """

    # Name of the environment variable that holds the bot token (the
    # token itself is not stored here).
    bot_token_env: str = "SCITEX_AGENT_CONTAINER_TELEGRAM_BOT_TOKEN"
    allowed_users: list[str] = field(default_factory=list)
    auto_connect: bool = True
    greeting: str = ""
@dataclass
class OrochiSpec:
    """Orochi connection settings; disabled by default."""

    enabled: bool = False
    hosts: list[str] = field(default_factory=list)
    port: int = 8559
    # Name of the environment variable that holds the token.
    token_env: str = "SCITEX_OROCHI_TOKEN"
    channels: list[str] = field(default_factory=list)
    # NOTE(review): unit presumably seconds — confirm.
    heartbeat_interval: int = 60
@dataclass
class RemoteSpec:
    """SSH deployment target, in chain (``hops``) or legacy (``host``) form."""

    # Chain-based remote: list of SSH config aliases (new format).
    # Populated when spec.remote is a str or list[str].
    # Empty when using the legacy dict format.
    hops: list[str] = field(default_factory=list)
    host: str = ""  # SSH host (hostname or IP)
    user: str = ""  # SSH user
    key: str = ""  # Path to SSH key (optional)
    port: int = 22  # SSH port
    timeout: int = 60  # SSH command timeout in seconds
    login_shell: bool = True  # Use bash -l -c (needed for PATH on most hosts)
    no_preflight: bool = False  # Skip preflight checks (HPC with module loads)

    @property
    def is_remote(self) -> bool:
        """Return True if this agent should be deployed via SSH.

        True when either ``hops`` is non-empty or ``host`` is set.
        """
        return bool(self.hops or self.host)
@dataclass
class ContextManagementConfig:
    """Context-lifecycle policy for an agent.

    Defaults mirror ``strategy="noop"`` so absence of the
    ``context_management`` block preserves existing behavior (sensor
    disabled).
    """

    trigger_at_percent: float = 70.0
    strategy: str = "noop"  # "compact" | "restart" | "noop"
    warn_before_n_checks: int = 0
    check_interval_seconds: int = 300
    state_file: str = "~/.scitex/agent-container/state/<agent>.json"

    @property
    def enabled(self) -> bool:
        """Return True for every strategy other than ``"noop"``."""
        return self.strategy != "noop"
@dataclass
class SkillsSpec:
    """Skill lists for an agent and how they are injected and resolved."""

    required: list[str] = field(default_factory=list)  # Auto-loaded at startup
    available: list[str] = field(default_factory=list)  # Available but not auto-loaded
    # How sac materializes the skill list into the agent's CLAUDE.md:
    #   "at-import" — resolve each name to file paths and emit `@<path>`
    #                 lines so Claude Code inlines the content at session
    #                 start (default — eager loading per Anthropic @-import).
    #   "block"     — emit a ```skills <name>``` block (legacy lazy form).
    injection_mode: str = "at-import"
    # Strategies used to resolve a skill name → file paths in at-import mode.
    # Each entry runs independently; results are unioned + deduped.
    #   "skill-id" — Anthropic-canonical: walk skill roots, for each
    #                ``<dir>/SKILL.md`` resolve identity as
    #                ``frontmatter.name`` (if set) ELSE ``<dir>.name``.
    #                Match if identity equals the requested value.
    #                See https://docs.claude.com/en/docs/claude-code/skills.
    #   "tag"      — files where frontmatter ``tags:`` contains the value
    #                (orchestration extension; not in Anthropic spec but
    #                used by ywatanabe ``tags-expand`` pattern).
    #   "filename" — files whose basename (without ``.md``) matches
    #                (opt-in; broader than ``skill-id``, can over-match).
    match_by: list[str] = field(default_factory=lambda: ["skill-id", "tag"])
    # Comparison style for ``match_by`` strategies.
    #   "exact"   — value == candidate (default)
    #   "partial" — value substring of candidate (case-sensitive)
    match_style: str = "exact"
@dataclass
class HostsSpec:
    """Where an agent should run, in singleton or multi-instance form.

    ``host`` and ``hosts`` are mutually exclusive — at most one of them
    may be set (leaving both empty selects the local singleton):

    * ``host`` (singular) — exactly one instance runs:
        - empty / absent: local singleton (runs wherever sac is invoked)
        - string: pinned to that host
        - list: priority order; first available host wins (fallback chain)
    * ``hosts`` (plural) — multiple instances run, one per host:
        - ``"all"``: one per fleet host (replaces the old per-host mode)
        - list of host names: one per listed host (subset)

    The validator (in ``_validation.py``) enforces mutual exclusion and
    types. The loader composes effective ids: ``hosts`` triggers the
    ``<name>-<HOST>`` suffix; ``host`` keeps the bare name.
    """

    host: str | list[str] = ""
    hosts: str | list[str] = field(default_factory=list)
@dataclass
class SchedulingSpec:
    """Fleet-wide scheduling policy for an agent (shared-host layout).

    ``mode`` controls effective-id composition and launch-skip behavior:

    * ``per-host`` (default): agent is started on every host that runs
      ``sac agent start <name>``; the effective id is
      ``<metadata.name>-<HOST>`` unless the name already ends with
      ``-<HOST>``.
    * ``singleton``: exactly one instance fleet-wide. The effective id
      stays as the bare ``<metadata.name>``. Only launched on
      ``preferred-host``; on other hosts the launch is a no-op.
      ``fallback-hosts`` is recorded for observability but not acted on
      automatically — manual failover today.
    """

    mode: str = "per-host"
    preferred_host: str = ""
    fallback_hosts: list[str] = field(default_factory=list)
@dataclass
class ListenPort:
    """Declaration of a port/socket an external tool binds on behalf of an agent.

    The container NEVER binds these — it just validates the shape and
    echoes them in ``status --json`` so orchestrators can see what
    sidecars are expected to exist. ``owner`` is free-form (e.g.
    ``"orochi"``) to identify the plugin that actually listens.
    """

    port: int = 0
    proto: str = "tcp"  # tcp | udp | unix
    path: str = ""  # unix-socket path (when proto == "unix")
    name: str = ""
    owner: str = ""
@dataclass
class HookSpec:
    """All hook points supported by the container.

    Each entry is a list of opaque commands — shell strings or http(s)
    URLs. The container executes them fire-and-forget; errors are
    logged but never raised to the caller. Absent keys default to
    empty lists (feature disabled).
    """

    pre_start: list[str] = field(default_factory=list)
    post_start: list[str] = field(default_factory=list)
    pre_stop: list[str] = field(default_factory=list)
    post_stop: list[str] = field(default_factory=list)
    on_compact: list[str] = field(default_factory=list)
    on_restart: list[str] = field(default_factory=list)
    on_diff: list[str] = field(default_factory=list)

    def counts(self) -> dict[str, int]:
        """Return the number of configured commands per hook point.

        Returns:
            A dict keyed by hook-point name (in declaration order) whose
            values are the lengths of the corresponding command lists.
        """
        return {
            "pre_start": len(self.pre_start),
            "post_start": len(self.post_start),
            "pre_stop": len(self.pre_stop),
            "post_stop": len(self.post_stop),
            "on_compact": len(self.on_compact),
            "on_restart": len(self.on_restart),
            "on_diff": len(self.on_diff),
        }
@dataclass
class StartupCommand:
    """A command to dispatch after startup, with an optional delay."""

    delay: int = 0  # seconds after startup
    command: str = ""
@dataclass
class ReadyPattern:
    """A single regex the pane content must match for the agent to be ready."""

    regex: str = ""
@dataclass
class StartupSpec:
    """Opt-in ready-state gate for startup commands (todo#291).

    When ``ready_patterns`` is empty, legacy fire-and-hope behavior is
    preserved. Otherwise ``agent_start`` polls the tmux pane content and
    only dispatches ``commands`` once all patterns match against the tail
    of the capture AND the pane has been byte-identical for
    ``ready_idle_ticks`` consecutive polls.
    """

    ready_patterns: list[ReadyPattern] = field(default_factory=list)
    ready_idle_ticks: int = 3
    ready_poll_interval_seconds: float = 0.5
    ready_timeout_seconds: float = 60.0
    # "capture_and_fail" | "capture_and_proceed"
    on_timeout: str = "capture_and_proceed"
    commands: list[StartupCommand] = field(default_factory=list)
@dataclass
class AgentConfig:
    """Parsed agent configuration from a YAML definition file.

    ``name`` is the only required field. When ``screen_name`` is left
    empty it is derived from ``name`` in ``__post_init__``.
    """

    name: str
    runtime: str = "claude-code"
    # F-CS16 phase 2a — top-level fields that flatten the old
    # spec.container.{image, dockerfile} block. Empty string means
    # "use the default" (resolved by phase 2d's auto-build path
    # against ContainerSpec.image / containers/Dockerfile.<target>).
    image: str = ""
    dockerfile: str = ""
    model: str = "sonnet"
    workdir: str = "~/proj"
    python_venv: str = ""  # resolved venv path (post _resolve_python_venv)
    env: dict[str, str] = field(default_factory=dict)
    env_files: list[str] = field(
        default_factory=list
    )  # .env file paths (workspace-relative ok)
    screen_name: str = ""
    labels: dict[str, str] = field(default_factory=dict)
    container: ContainerSpec = field(default_factory=ContainerSpec)
    claude: ClaudeSpec = field(default_factory=ClaudeSpec)
    health: HealthSpec = field(default_factory=HealthSpec)
    watchdog: WatchdogSpec = field(default_factory=WatchdogSpec)
    restart: RestartSpec = field(default_factory=RestartSpec)
    autonomous: AutonomousSpec = field(default_factory=AutonomousSpec)
    apptainer: ApptainerSpec = field(default_factory=ApptainerSpec)
    # NOTE(review): typed as a plain mapping rather than ``HookSpec`` —
    # confirm this is intentional.
    hooks: dict[str, list[str]] = field(default_factory=dict)
    listen: list[ListenPort] = field(default_factory=list)
    extensions: Dict[str, Any] = field(default_factory=dict)
    telegram: TelegramSpec = field(default_factory=TelegramSpec)
    remote: RemoteSpec = field(default_factory=RemoteSpec)
    skills: SkillsSpec = field(default_factory=SkillsSpec)
    context_management: ContextManagementConfig = field(
        default_factory=ContextManagementConfig
    )
    startup_commands: list[StartupCommand] = field(default_factory=list)
    # StartupSpec is defined earlier in this module, so the class itself
    # serves as the factory (no lambda or quoted annotation needed).
    startup: StartupSpec = field(default_factory=StartupSpec)
    mcp_servers: dict[str, dict] = field(default_factory=dict)
    multiplexer: str = "tmux"  # "tmux" (default) or "screen"
    hosts_spec: HostsSpec = field(default_factory=HostsSpec)
    scheduling: SchedulingSpec = field(default_factory=SchedulingSpec)
    orochi: OrochiSpec = field(default_factory=OrochiSpec)
    config_path: str = ""

    def __post_init__(self) -> None:
        """Default ``screen_name`` to ``cld-<name>`` when it is empty."""
        if not self.screen_name:
            self.screen_name = f"cld-{self.name}"

    @property
    def expanded_workdir(self) -> str:
        """Return ``workdir`` with a leading ``~`` expanded, as a string."""
        return str(Path(self.workdir).expanduser())