# CMake 3.18 is the floor because this file relies on CMAKE_CUDA_ARCHITECTURES,
# which was introduced in that release.
cmake_minimum_required(VERSION 3.18)

# =============================================================================
# CUDA architecture matrix
# =============================================================================
#
# Default: build PTX/SASS for the production-relevant generations
#   70 = Volta    (V100)
#   75 = Turing   (T4, RTX 20-series)
#   80 = Ampere   (A100)
#   86 = Ampere   (A40, RTX 30-series)
#   90 = Hopper   (H100)
#
# Override with -DCMAKE_CUDA_ARCHITECTURES="<arches>" to trim the build
# matrix when iterating on a single GPU class (e.g. "70;90" on a heterogeneous
# CI fleet, or "90" for fastest H100-only iteration).  Setting an empty list
# is rejected by CMake; use "native" to autodetect the local device (the
# "native" keyword requires CMake >= 3.24, newer than this file's 3.18 minimum).
#
# This MUST run before `project(... LANGUAGES CUDA)` — once enable_language(CUDA)
# fires, CMake initializes the variable itself (from the CUDAARCHS environment
# variable on CMake >= 3.20, otherwise from the CUDA compiler's own default
# architecture), and a later `if(NOT DEFINED)` guard never trips.
#
# Because the cache entry is created here, CMake's own CUDAARCHS handling never
# gets a chance to run.  Honor the environment variable explicitly so that
# `CUDAARCHS=90 cmake ...` is not silently ignored; an explicit
# -DCMAKE_CUDA_ARCHITECTURES=... on the command line still wins over both.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    if(DEFINED ENV{CUDAARCHS} AND NOT "$ENV{CUDAARCHS}" STREQUAL "")
        set(_cuda_multigedi_default_archs "$ENV{CUDAARCHS}")
    else()
        set(_cuda_multigedi_default_archs "70;75;80;86;90")
    endif()
    set(CMAKE_CUDA_ARCHITECTURES "${_cuda_multigedi_default_archs}"
        CACHE STRING "CUDA compute capabilities to build for")
    unset(_cuda_multigedi_default_archs)
endif()

project(cuda_multigedi LANGUAGES CXX CUDA)

# Require the 2017 language standard for both host C++ and CUDA sources; a
# compiler that cannot provide it is a configure error rather than a silent
# downgrade.
foreach(_lang IN ITEMS CXX CUDA)
    set(CMAKE_${_lang}_STANDARD 17)
    set(CMAKE_${_lang}_STANDARD_REQUIRED ON)
endforeach()

# Locate the CUDA toolkit (headers and libraries); configuration fails if it
# is missing.
find_package(CUDAToolkit REQUIRED)

# Some CI environments end up without the CUDA::cudart imported target.  In
# that case set CUDA_USE_LEGACY_LINKING so the library targets below link the
# toolkit libraries by path instead of by imported target.
if(TARGET CUDA::cudart)
    # Imported targets are available; nothing to do.
else()
    message(STATUS "CUDA::cudart target not found, using legacy linking")
    set(CUDA_USE_LEGACY_LINKING TRUE)
endif()

# Eigen3 >= 3.3, located through its package configuration file only
# (NO_MODULE skips any Find-module lookup).
find_package(Eigen3 3.3 REQUIRED NO_MODULE)

# OpenMP is mandatory for this project.
find_package(OpenMP REQUIRED)

# Resolve EIGEN3_INCLUDE_DIR.  When the Eigen3::Eigen imported target is not
# available, search a few conventional locations (plus the EIGEN3_INCLUDE_DIR
# environment variable) and abort if Eigen/Dense cannot be found; otherwise
# read the include path straight from the imported target.
if(NOT TARGET Eigen3::Eigen)
    find_path(EIGEN3_INCLUDE_DIR Eigen/Dense
        PATHS
            /usr/include/eigen3
            /usr/local/include/eigen3
            $ENV{EIGEN3_INCLUDE_DIR}
    )
    if(NOT EIGEN3_INCLUDE_DIR)
        message(FATAL_ERROR "Eigen3 not found!")
    endif()
    message(STATUS "Found Eigen3 at: ${EIGEN3_INCLUDE_DIR}")
else()
    get_target_property(EIGEN3_INCLUDE_DIR Eigen3::Eigen INTERFACE_INCLUDE_DIRECTORIES)
    message(STATUS "Eigen3 include (from target): ${EIGEN3_INCLUDE_DIR}")
endif()

# Global compiler flags
#
# These are directory-wide variables, so they apply to every target defined in
# this file (both libraries and the test executables).
#
# NVCC flags:
#   -O3                        : optimization level, added for every build type
#   -Xcompiler=-fopenmp,-fPIC  : forwarded to the host compiler that NVCC
#                                delegates to; enables OpenMP in host code and
#                                requests position-independent host objects.
#                                Because it is set globally it also reaches the
#                                test executables, which do not set
#                                POSITION_INDEPENDENT_CODE themselves.
#   --expt-extended-lambda     : allow __device__ lambdas
#   --expt-relaxed-constexpr   : allow host constexpr functions in __device__
#                                context
#   -lineinfo (Release only)   : embed file:line information so
#                                compute-sanitizer / Nsight Compute can
#                                attribute findings to source lines without
#                                switching to an unoptimized debug build
string(APPEND CMAKE_CUDA_FLAGS
    " -O3"
    " -Xcompiler=-fopenmp,-fPIC"
    " --expt-extended-lambda --expt-relaxed-constexpr")
string(APPEND CMAKE_CUDA_FLAGS_RELEASE " -lineinfo")

# Host C++ flags: same optimization level, OpenMP enabled.
string(APPEND CMAKE_CXX_FLAGS " -O3 -fopenmp")

# GPU kernel translation units under src/kernels/.  Listed explicitly so that
# adding or removing a kernel shows up in the diff of this file.
set(KERNEL_SOURCES "")
list(APPEND KERNEL_SOURCES
    "src/kernels/dbi_optimized_gpu.cu"
    "src/kernels/normalize_b_gpu.cu"
    "src/kernels/solve_bi_gpu.cu"
    "src/kernels/solve_offsets_gpu.cu"
    "src/kernels/solve_qi_gpu.cu"
    "src/kernels/solve_shared_bi_gpu.cu"
    "src/kernels/solve_sigma2_gpu.cu"
    "src/kernels/solve_yi_gpu.cu"
    "src/kernels/solve_z_gpu.cu"
    "src/kernels/update_auxiliary_gpu.cu"
)

# Engine sources.  As of v1.1 the high-memory variants (gedi_cuda.cu and
# multigedi_cuda.cu) are gone, leaving the low-memory variants as the only
# public engine path.
set(CORE_SOURCES "")
list(APPEND CORE_SOURCES
    "src/gedi_cuda_lowmem.cu"
    "src/multigedi_cuda_lowmem.cu"
)

# =============================================================================
# Main CUDA library (static, for tests)
# =============================================================================
# Static archive containing every kernel and engine source.
add_library(cuda_multigedi STATIC ${KERNEL_SOURCES} ${CORE_SOURCES})

# Public headers of this project plus the Eigen include path resolved earlier;
# PUBLIC so that consumers of the library inherit both.
target_include_directories(cuda_multigedi
    PUBLIC
        "${CMAKE_CURRENT_SOURCE_DIR}/include"
        ${EIGEN3_INCLUDE_DIR}
)

# Prefer the imported Eigen target when the package provides one.
if(TARGET Eigen3::Eigen)
    target_link_libraries(cuda_multigedi PUBLIC Eigen3::Eigen)
endif()

# CUDA runtime and math libraries: imported targets by default, raw library
# paths when the legacy-linking fallback was selected above.
if(NOT CUDA_USE_LEGACY_LINKING)
    target_link_libraries(cuda_multigedi
        PUBLIC
            CUDA::cudart
            CUDA::cublas
            CUDA::cusolver
            CUDA::cusparse
    )
else()
    # Look each library up by name, hinting at the toolkit's library directory.
    find_library(CUDART_LIBRARY cudart HINTS ${CUDAToolkit_LIBRARY_DIR})
    find_library(CUBLAS_LIBRARY cublas HINTS ${CUDAToolkit_LIBRARY_DIR})
    find_library(CUSOLVER_LIBRARY cusolver HINTS ${CUDAToolkit_LIBRARY_DIR})
    find_library(CUSPARSE_LIBRARY cusparse HINTS ${CUDAToolkit_LIBRARY_DIR})
    target_link_libraries(cuda_multigedi
        PUBLIC
            ${CUDART_LIBRARY}
            ${CUBLAS_LIBRARY}
            ${CUSOLVER_LIBRARY}
            ${CUSPARSE_LIBRARY}
    )
    # Raw library paths carry no usage requirements, so add the toolkit
    # headers by hand.
    target_include_directories(cuda_multigedi PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
endif()

target_link_libraries(cuda_multigedi PUBLIC OpenMP::OpenMP_CXX)

# Compile device code as separable (relocatable) and build position-independent
# objects.
set_property(TARGET cuda_multigedi PROPERTY CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET cuda_multigedi PROPERTY POSITION_INDEPENDENT_CODE ON)

# =============================================================================
# Test executables
# =============================================================================
# v1.1 history: test_gpu_only, test_multigedi_cuda and test_memory_benchmark
# went away together with the high-memory MultiGEDICuda class they exercised.
# Two tests remain:
#   test_vram_sweep      - uses MultiGEDICudaLowMem, so it links the engine
#   test_compare_results - CPU-side helper with no engine dependency

add_executable(test_vram_sweep tests/test_vram_sweep.cu)
target_link_libraries(test_vram_sweep
    PRIVATE
        cuda_multigedi
        OpenMP::OpenMP_CXX
        Eigen3::Eigen
)

# Does not link cuda_multigedi, so the project include directory has to be
# added explicitly.
add_executable(test_compare_results tests/test_compare_results.cu)
target_include_directories(test_compare_results
    PRIVATE
        "${CMAKE_CURRENT_SOURCE_DIR}/include"
)
target_link_libraries(test_compare_results
    PRIVATE
        Eigen3::Eigen
        OpenMP::OpenMP_CXX
)

# =============================================================================
# MULTIGEDI Shared Library (for Rust CLI FFI)
# =============================================================================
# Shared object exposing the API in src/multigedi_api.cu.  The kernel and
# engine sources are compiled into it directly rather than linked from the
# static cuda_multigedi archive.
add_library(multigedi_gpu SHARED
    src/multigedi_api.cu
    ${KERNEL_SOURCES}
    ${CORE_SOURCES}
)

# Project headers plus the Eigen include path resolved earlier in this file.
target_include_directories(multigedi_gpu
    PUBLIC
        "${CMAKE_CURRENT_SOURCE_DIR}/include"
        ${EIGEN3_INCLUDE_DIR}
)

if(TARGET Eigen3::Eigen)
    target_link_libraries(multigedi_gpu PUBLIC Eigen3::Eigen)
endif()

# CUDA runtime and math libraries.  The legacy branch reuses the *_LIBRARY
# variables filled in by the find_library() calls made for cuda_multigedi.
if(NOT CUDA_USE_LEGACY_LINKING)
    target_link_libraries(multigedi_gpu
        PUBLIC
            CUDA::cudart
            CUDA::cublas
            CUDA::cusolver
            CUDA::cusparse
    )
else()
    target_link_libraries(multigedi_gpu
        PUBLIC
            ${CUDART_LIBRARY}
            ${CUBLAS_LIBRARY}
            ${CUSOLVER_LIBRARY}
            ${CUSPARSE_LIBRARY}
    )
    target_include_directories(multigedi_gpu PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
endif()

target_link_libraries(multigedi_gpu PUBLIC OpenMP::OpenMP_CXX)

# Separable device code and position-independent objects, as for the static
# library; VERSION / SOVERSION give the shared object its versioned file names.
set_property(TARGET multigedi_gpu PROPERTY CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET multigedi_gpu PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET multigedi_gpu PROPERTY VERSION 1.0.0)
set_property(TARGET multigedi_gpu PROPERTY SOVERSION 1)

# =============================================================================
# Wheel install layout
# =============================================================================
#
# When this subproject is built as part of the top-level scikit-build wheel
# (MULTIGEDI_BUILD_GPU=ON), drop the .so beside the Python `_gpu/` package
# so `_ctypes_api._find_lib` can locate it via `Path(__file__).parent`
# without an env-var override.  $ORIGIN RPATH lets the loader find sibling
# CUDA runtime / Eigen libs if a future packaging change bundles them.
#
# Standalone builds (cmake -S src/_multigedi_gpu -B build, the legacy
# developer workflow) still get the system-style `lib/` install for backwards
# compatibility with the dev MULTIGEDI_GPU_LIB pointer.
# Pick the library install directory.  An unset MULTIGEDI_BUILD_GPU evaluates
# to false, so standalone builds take the `lib` branch.
if(MULTIGEDI_BUILD_GPU)
    # Wheel build: bake an $ORIGIN-relative RPATH into the library already in
    # the build tree and install it inside the Python package directory.
    set_target_properties(multigedi_gpu PROPERTIES
        INSTALL_RPATH "$ORIGIN"
        BUILD_WITH_INSTALL_RPATH ON
    )
    set(_multigedi_gpu_libdir multigedi/_gpu)
else()
    set(_multigedi_gpu_libdir lib)
endif()
install(TARGETS multigedi_gpu LIBRARY DESTINATION "${_multigedi_gpu_libdir}")

# The public C API header is installed in both layouts.
install(FILES include/multigedi/multigedi_api.h DESTINATION include/multigedi)

# Configure-time notice that the shared library target is part of this build.
# The file name in the message is a fixed string, not derived from the target.
message(STATUS "MULTIGEDI shared library enabled: libmultigedi_gpu.so")

# =============================================================================
# Configuration Summary
# =============================================================================
# Echo the toolkit version reported by find_package(CUDAToolkit), the effective
# architecture list, and the host and CUDA compiler paths so they are visible
# in the configure log.
message(STATUS "=== CUDA MultiGEDI Configuration ===")
message(STATUS "CUDA Version: ${CUDAToolkit_VERSION}")
message(STATUS "CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS "C++ Compiler: ${CMAKE_CXX_COMPILER}")
message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}")
