command: [Unit]
# NOTE(review): the `command: ` prefix above is not systemd syntax; it looks
# like a field label from a wrapping document. Confirm it is stripped before
# this text is installed as a unit file.
Description=Qwen3.6-35B-A3B MTP IQ4_XS (preserve_thinking)

# Ordering only: start after network.target (this does not pull it in).
After=network.target

# Start rate limit: at most 5 start attempts per 60-second window.
StartLimitIntervalSec=60
StartLimitBurst=5

[Service]
# The llama-server process started by ExecStart= is the unit's main process.
Type=simple
# Expose only CUDA device 0 to the process.
Environment="CUDA_VISIBLE_DEVICES=0"

# gpumod-ecr/gpumod-aeb: ExecStartPre runs `gpumod preflight all` to
# short-circuit the start when the model-file, VRAM, or RAM checks fail.
# Living in the unit itself, the check also covers direct systemctl starts,
# Restart=on-failure cascades, and autostart paths that bypass the Python
# API safeguard.
#
# NOTE(review): systemd needs the program in Exec*= lines to be an absolute
# path (or a bare file name) and does not expand `$HOME` in that position.
# The `$HOME/...` paths below look like placeholders; confirm they are
# substituted with real absolute paths before the unit is installed.
ExecStartPre=$HOME/AI/gpumod/.venv/bin/gpumod preflight all --service-id qwen36-35b-a3b-mtp-iq4xs-preserve

# Main process: llama-server listening on 127.0.0.1:7104.
# One option per line; each trailing backslash continues the same command,
# so no comment or blank line may appear between the lines below.
ExecStart=$HOME/bin/llama.cpp/build/bin/llama-server \
    --model $HOME/bin/Qwen3.6-35B-A3B-MTP-UD-IQ4_XS.gguf \
    --port 7104 \
    --host 127.0.0.1 \
    --ctx-size 131072 \
    --n-gpu-layers -1 \
    --jinja \
    --flash-attn on \
    --parallel 1 \
    --threads 16 \
    --cache-type-k q8_0 \
    --cache-type-v q8_0 \
    --spec-type draft-mtp \
    --spec-draft-n-max 2 \
    --chat-template-kwargs '{"preserve_thinking":true}'
