# =============================================================================
# pybind11 zero-copy GPU bindings (Phase 3)
# =============================================================================
#
# This module replaces the disk-serialized ctypes path for the GPU backend.
# Phase 3 is staged across PRs so each commit is independently testable:
#
#   PR1 — skeleton + version queries (no engine link).
#         Validates: scikit-build descent, NVCC compile of a pybind11 .cu
#         translation unit, side-by-side install beside libmultigedi_gpu.so,
#         $ORIGIN RPATH.
#   PR2 — MultiGEDICudaModel.__init__ + add_modality + train(iterations=0).
#         Adds link to cuda_multigedi (engine static lib).  This file already
#         contains that link, so it reflects PR2 or later.
#   PR3 — Real train(iterations=N) with gil_scoped_release + std::function
#         progress callback.
#   PR4 — download() / download_shared_Bi() returning numpy.
#   PR5 — Cut over tools/_gedi.py; delete _io.py + ctypes shim.
#
# We compile bindings as .cu (NVCC) rather than .cpp (host compiler) because
# subsequent PRs include engine headers (gedi_params.hpp, multigedi_cuda_lowmem.cuh)
# that pull in __device__ lambdas via --expt-extended-lambda.  Doing the .cu
# split now means PR1's build wiring is the same as PR2-5's, which lets us
# catch NVCC-vs-pybind11 issues before they're entangled with binding logic.

# Target GPU architectures.  In the normal build these are already defined by
# the time this directory is processed (src/_multigedi_gpu/CMakeLists.txt is
# added first), so the guard below leaves them alone.  Only when nothing has
# defined CMAKE_CUDA_ARCHITECTURES yet (e.g. an uncommon standalone configure
# of this subdir) do we fall back to the same architecture matrix, stored as a
# cache entry so it can still be overridden by the user.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES
        70 75 80 86 90
        CACHE STRING
        "CUDA compute capabilities to build for"
    )
endif()

# Ensure the CUDA language is enabled in this directory scope.  The sibling
# subdir (src/_multigedi_gpu) normally enables CUDA before this file is
# processed, in which case this call is expected to be a no-op; it is kept so
# the file still works under a hypothetical standalone configure.
# NOTE(review): keep this call after the CMAKE_CUDA_ARCHITECTURES default set
# earlier in this file — presumably the default must be in place the first
# time CUDA is enabled; confirm against the minimum CMake version in use.
enable_language(CUDA)

# Declare the Python extension module.  The single source is a .cu file, so
# CMake compiles it with the CUDA compiler (NVCC); any .cpp sources added
# later would still be built by the host C++ compiler.  MODULE is spelled out
# for clarity — it is the library type pybind11_add_module uses by default.
pybind11_add_module(_multigedi_gpu_py MODULE bindings.cu)

# Language standard: the top-level CMakeLists pins CMAKE_CXX_STANDARD to 14
# because of a CPU-module constraint, but the engine headers and this binding
# rely on C++17 (structured bindings, std::optional, fold expressions in
# helpers).  Raise the standard to 17 — and make it mandatory — for both the
# host (CXX) and device (CUDA) compilers, on this target only, so no other
# target in the build is affected.
foreach(binding_lang IN ITEMS CXX CUDA)
    set_target_properties(_multigedi_gpu_py PROPERTIES
        ${binding_lang}_STANDARD 17
        ${binding_lang}_STANDARD_REQUIRED ON
    )
endforeach()

# NVCC-only flags, kept identical to the engine's flags in
# src/_multigedi_gpu/CMakeLists.txt so that engine headers can be included
# from this target without flag mismatches.  --expt-extended-lambda is needed
# because the engine headers pulled in from PR2 onward use __device__ lambdas
# inside thrust transforms.  The generator expression restricts both flags to
# CUDA translation units; host-compiled sources never see them.
target_compile_options(_multigedi_gpu_py PRIVATE
    "$<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda;--expt-relaxed-constexpr>"
)

# Link the engine static library (defined in the sibling subdir
# src/_multigedi_gpu/CMakeLists.txt as `cuda_multigedi`).  cuda_multigedi's
# PUBLIC link interface transitively pulls Eigen3::Eigen, OpenMP::OpenMP_CXX,
# CUDA::cudart, CUDA::cublas, CUDA::cusolver, CUDA::cusparse, plus the include
# directories for both the engine and Eigen.  Cross-subdir target resolution
# works because both subdirs are added under the same top-level project().
#
# We link `cuda_multigedi` (the static lib used by tests and FFI consumers)
# rather than `multigedi_gpu` (the SHARED lib for the Rust C ABI).  Going
# through the SHARED lib would route every call through the disk-format C ABI
# wrappers — exactly the layer Phase 3 is removing.
#
# Fail at configure time if the engine target is missing.  Without this
# check CMake would treat `cuda_multigedi` as a plain library name, and the
# problem would only surface later as missing engine headers or an
# unresolved -lcuda_multigedi at link time.  The check relies on the sibling
# subdir being added before this one, which is the documented order.
if(NOT TARGET cuda_multigedi)
    message(FATAL_ERROR
        "_multigedi_gpu_py: engine target 'cuda_multigedi' is not defined. "
        "Add src/_multigedi_gpu (which declares it) with add_subdirectory() "
        "before this directory.")
endif()
target_link_libraries(_multigedi_gpu_py PRIVATE cuda_multigedi)

# Bake the package version into the binding as MULTIGEDI_VERSION_STRING so
# that version() reports the same string as multigedi.__version__.
# Preference order:
#   1. SKBUILD_PROJECT_VERSION — the conventional pyproject -> CMake bridge
#      provided by scikit-build-core, used whenever it is defined.
#   2. PROJECT_VERSION — from the top-level project(... VERSION 0.1.0) line.
# The helper variable is temporary and is removed again once the definition
# has been attached to the target.
if(DEFINED SKBUILD_PROJECT_VERSION)
    set(multigedi_gpu_py_version "${SKBUILD_PROJECT_VERSION}")
else()
    set(multigedi_gpu_py_version "${PROJECT_VERSION}")
endif()
target_compile_definitions(_multigedi_gpu_py PRIVATE
    MULTIGEDI_VERSION_STRING="${multigedi_gpu_py_version}"
)
unset(multigedi_gpu_py_version)

# Code-generation settings for the extension module: separable CUDA
# compilation and position-independent code.
set_property(TARGET _multigedi_gpu_py PROPERTY CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET _multigedi_gpu_py PROPERTY POSITION_INDEPENDENT_CODE ON)

# Runtime search path.  $ORIGIN lets the loader locate the sibling
# libmultigedi_gpu.so (when it becomes a transitive dependency in PR2+)
# without relying on LD_LIBRARY_PATH, and matches the RPATH the multigedi_gpu
# target sets in the sibling subdir.  BUILD_WITH_INSTALL_RPATH applies the
# same RPATH to the build-tree artifact as to the installed one.
set_property(TARGET _multigedi_gpu_py PROPERTY INSTALL_RPATH "$ORIGIN")
set_property(TARGET _multigedi_gpu_py PROPERTY BUILD_WITH_INSTALL_RPATH ON)

# Installation layout: the module lands in multigedi/_gpu/, which makes it
# importable as
#   from multigedi._gpu import _multigedi_gpu_py
# and puts it next to libmultigedi_gpu.so.  Both libraries ship side by side
# during v1.1's rip-and-replace transition: per the deprecation plan in the
# Phase 3 design doc, the C ABI shared lib keeps shipping for one more minor
# version.
install(
    TARGETS _multigedi_gpu_py
    LIBRARY DESTINATION "multigedi/_gpu"
)
