# Multi-stage build. Pick CUDA or CPU wheel of llama-cpp-python via TARGET arg.
#   docker build --build-arg TARGET=cuda -t my-app .   (default)
#   docker build --build-arg TARGET=cpu  -t my-app .
#
# Models are NOT baked into the image — they're uploaded directly to the
# RunPod network volume by `infera deploy runpod` (volume.upload_models).

# Global build arg: declared before the first FROM so it can be expanded in the
# final stage's `FROM deps-${TARGET}` line. Valid values mirror the stage names
# defined in this file: `cuda` (default) or `cpu`.
ARG TARGET=cuda

# ── CUDA stage: plain python slim base; CUDA runtime + cuBLAS come from pip wheels, not an NVIDIA OS image ──
FROM python:3.11-slim AS deps-cuda

# GNU OpenMP runtime from apt; the package lists are dropped in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Pinned NVIDIA wheels (cuBLAS and the CUDA runtime) installed into site-packages.
RUN pip install --no-cache-dir \
        nvidia-cublas-cu12==12.1.3.1 \
        nvidia-cuda-runtime-cu12==12.1.105

# llama-cpp-python resolved with the project's cu121 wheel index as an extra
# index, alongside the pinned runpod package.
RUN pip install --no-cache-dir \
        --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121 \
        llama-cpp-python==0.3.21 \
        runpod==1.8.2

# Expose the pip-installed NVIDIA shared-library directories to the dynamic loader.
ENV LD_LIBRARY_PATH=/usr/local/lib/python3.11/site-packages/nvidia/cuda_runtime/lib:/usr/local/lib/python3.11/site-packages/nvidia/cublas/lib

# ── CPU stage: slim image, prebuilt CPU-only wheel ──
FROM python:3.11-slim AS deps-cpu

# GNU OpenMP runtime from apt; the package lists are dropped in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Resolve llama-cpp-python against the project's CPU wheel index, mirroring how
# the CUDA stage uses the cu121 index. Without the extra index pip can only fall
# back to the PyPI source distribution, which needs a C/C++ toolchain and CMake
# that this stage does not install.
RUN pip install --no-cache-dir \
        --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu \
        llama-cpp-python==0.3.21 \
        runpod==1.8.2

# ── Final stage: application files layered on top of the deps stage chosen by TARGET ──
FROM deps-${TARGET} AS final
WORKDIR /app

# Application modules and the launcher script, copied into /app in one instruction.
COPY .build/engine.py \
     .build/handler.py \
     .build/upload_server.py \
     .build/start.sh \
     ./

# Make the launcher executable so the exec-form CMD can run it directly.
RUN chmod +x /app/start.sh

# Documents the container port; EXPOSE itself does not publish it.
EXPOSE 8080
CMD ["./start.sh"]
