cmake_minimum_required(VERSION 3.18)
project(auvux_dsp LANGUAGES C CXX)

# Project-wide language defaults. These CMAKE_* variables seed the matching
# target properties of every target created after this point.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_C_STANDARD 11)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Default to an optimized build when the user did not pick a build type.
# Only single-config generators use CMAKE_BUILD_TYPE; multi-config generators
# (Visual Studio, Xcode, Ninja Multi-Config) select the configuration at build
# time, so they are left untouched. The default is written to the cache so
# that ccmake/cmake-gui and CMakeCache.txt show the build type actually used.
get_property(auvux_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT auvux_is_multi_config AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type (defaulted to Release)" FORCE)
endif()

# pybind11 provides pybind11_add_module(); Threads provides Threads::Threads.
find_package(pybind11 CONFIG REQUIRED)
find_package(Threads REQUIRED)

# FFT backend selector. The vendored PFFFT sources are always compiled, which
# makes PFFFT the guaranteed fallback; vDSP is added on top on Apple unless
# AUVUX_FFT=pffft is requested.
set(AUVUX_FFT "auto" CACHE STRING "FFT backend: auto | vdsp | pffft")

# GPU backend selector. "auto" resolves to Metal on Apple, to CUDA when a CUDA
# compiler is found, and to the stub ("none") otherwise. Force a backend with
# -DAUVUX_GPU=metal|cuda|none; use the CUDACXX environment variable to pick a
# specific CUDA toolkit.
set(AUVUX_GPU "auto" CACHE STRING "GPU backend: auto | metal | cuda | none")
if("${AUVUX_GPU}" STREQUAL "auto")
  # The resolved value is a normal variable shadowing the cache entry, so the
  # cache keeps "auto" and detection runs again on every configure.
  set(AUVUX_GPU "none")
  if(APPLE)
    set(AUVUX_GPU "metal")
  else()
    include(CheckLanguage)
    check_language(CUDA)
    if(CMAKE_CUDA_COMPILER)
      set(AUVUX_GPU "cuda")
    endif()
  endif()
endif()
message(STATUS "auvux-dsp GPU backend: ${AUVUX_GPU}")

# The Python extension module. Sources are listed explicitly and appended in a
# fixed order, one group per area of the source tree.

# FFT front end plus the always-built PFFFT backend.
pybind11_add_module(_native
  src/fft/fft.cpp
  src/fft/fft_pffft.cpp)

# Shared host-side infrastructure.
target_sources(_native PRIVATE
  src/common/threadpool.cpp
  src/common/host_pool.cpp)

# CPU implementations of the signal-processing operators.
target_sources(_native PRIVATE
  src/ops/window.cpp
  src/ops/ola.cpp
  src/ops/stft/stft_cpu.cpp
  src/ops/istft/istft_cpu.cpp
  src/ops/mel/mel_filterbank.cpp
  src/ops/mel/mel_cpu.cpp
  src/ops/cqt/cqt_filterbank.cpp
  src/ops/cqt/cqt_cpu.cpp
  src/ops/chroma/chroma_cpu.cpp)

# GPU sources that are compiled regardless of the selected GPU backend.
target_sources(_native PRIVATE
  src/gpu/gpu_stub.cpp
  src/gpu/cqt_plan.cpp)

# Vendored PFFFT (C source).
target_sources(_native PRIVATE
  src/third_party/pffft.c)

# pybind11 binding translation units.
target_sources(_native PRIVATE
  src/bindings/module.cpp
  src/bindings/bind_fft.cpp
  src/bindings/bind_stft.cpp
  src/bindings/bind_mel.cpp
  src/bindings/bind_cqt.cpp
  src/bindings/bind_util.cpp)

# PFFFT is unconditionally available, so its feature macro is always defined.
target_compile_definitions(_native PRIVATE AUVUX_HAVE_PFFFT=1)
target_include_directories(_native PRIVATE src src/third_party)
target_link_libraries(_native PRIVATE Threads::Threads)

# Validate AUVUX_FFT and decide whether the vDSP backend is built in addition
# to PFFFT (which is always part of the module).
set(auvux_use_vdsp FALSE)
if(AUVUX_FFT STREQUAL "vdsp")
  # An explicit vDSP request is only satisfiable on Apple platforms.
  if(NOT APPLE)
    message(FATAL_ERROR "AUVUX_FFT=vdsp requires macOS")
  endif()
  set(auvux_use_vdsp TRUE)
elseif(AUVUX_FFT STREQUAL "auto")
  # "auto" adds vDSP on Apple and stays PFFFT-only elsewhere.
  if(APPLE)
    set(auvux_use_vdsp TRUE)
  endif()
elseif(NOT AUVUX_FFT STREQUAL "pffft")
  message(FATAL_ERROR "Unknown AUVUX_FFT='${AUVUX_FFT}' (use auto, vdsp, or pffft)")
endif()

if(auvux_use_vdsp)
  # vDSP is provided by the Accelerate framework.
  find_library(ACCELERATE Accelerate REQUIRED)
  target_sources(_native PRIVATE src/fft/fft_vdsp.cpp)
  target_compile_definitions(_native PRIVATE AUVUX_HAVE_VDSP=1)
  target_link_libraries(_native PRIVATE ${ACCELERATE})
  message(STATUS "auvux-dsp FFT backends: vdsp, pffft")
else()
  message(STATUS "auvux-dsp FFT backends: pffft")
endif()

# Wire the selected GPU backend into _native. "none" adds nothing (the stub
# sources are already part of the target); any other unknown value is an error.
if(AUVUX_GPU STREQUAL "metal")
  if(NOT APPLE)
    message(FATAL_ERROR "AUVUX_GPU=metal requires macOS")
  endif()
  enable_language(OBJCXX)
  # Defaults for any target created after this point.
  set(CMAKE_OBJCXX_STANDARD 17)
  set(CMAKE_OBJCXX_STANDARD_REQUIRED ON)
  # _native already exists, and the CMAKE_OBJCXX_STANDARD* variables only
  # initialize targets at creation time, so set its properties explicitly
  # instead of relying on the fallback to CXX_STANDARD.
  set_target_properties(_native PROPERTIES
    OBJCXX_STANDARD 17
    OBJCXX_STANDARD_REQUIRED ON)
  # Kernel sources live in real .metal files; a generated header embeds them
  # as strings for runtime compilation (no Metal toolchain needed at build).
  set(auvux_metal_kernels
      ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/metal/kernels/common.metal
      ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/metal/kernels/stft.metal
      ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/metal/kernels/mel.metal
      ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/metal/kernels/cqt.metal)
  set(auvux_metal_gen ${CMAKE_CURRENT_BINARY_DIR}/generated/auvux_metal_kernels.h)
  # The input list is passed comma-separated so it survives as a single -D
  # argument on the command line of the embedding script.
  string(JOIN "," auvux_metal_kernels_arg ${auvux_metal_kernels})
  add_custom_command(
    OUTPUT ${auvux_metal_gen}
    COMMAND ${CMAKE_COMMAND} -DOUT=${auvux_metal_gen} "-DINPUTS=${auvux_metal_kernels_arg}"
            -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/embed_text.cmake
    DEPENDS ${auvux_metal_kernels} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/embed_text.cmake
    COMMENT "Embedding Metal kernel sources"
    VERBATIM)
  set(auvux_metal_srcs
      src/gpu/metal/metal_common.mm
      src/gpu/metal/metal_stft.mm
      src/gpu/metal/metal_mel.mm
      src/gpu/metal/metal_cqt.mm)
  # Listing the generated header as a source makes the build run the custom
  # command above before compiling the target.
  target_sources(_native PRIVATE ${auvux_metal_srcs} ${auvux_metal_gen})
  # ARC is enabled only for the Objective-C++ translation units.
  set_source_files_properties(${auvux_metal_srcs} PROPERTIES COMPILE_OPTIONS "-fobjc-arc")
  target_include_directories(_native PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/generated)
  target_compile_definitions(_native PRIVATE AUVUX_METAL=1)
  find_library(METAL_FRAMEWORK Metal REQUIRED)
  find_library(FOUNDATION_FRAMEWORK Foundation REQUIRED)
  target_link_libraries(_native PRIVATE ${METAL_FRAMEWORK} ${FOUNDATION_FRAMEWORK})
elseif(AUVUX_GPU STREQUAL "cuda")
  # Decide whether the arch list is user-supplied before enable_language runs.
  # enable_language(CUDA) stores the toolkit default in the *cache* when the
  # variable is empty, so on a reconfigure of the same build tree a non-empty
  # CMAKE_CUDA_ARCHITECTURES alone does not prove the user set it. The default
  # seen on an earlier configure is remembered in an internal cache entry; a
  # value equal to it is treated as "not user-supplied". (Consequence: a user
  # who later passes exactly the toolkit default still gets the project list.)
  if(NOT CMAKE_CUDA_ARCHITECTURES)
    set(auvux_user_archs FALSE)
  elseif(DEFINED AUVUX_CUDA_TOOLKIT_DEFAULT_ARCHS
         AND "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "${AUVUX_CUDA_TOOLKIT_DEFAULT_ARCHS}")
    set(auvux_user_archs FALSE)
  else()
    set(auvux_user_archs TRUE)
  endif()
  enable_language(CUDA)
  if(NOT auvux_user_archs)
    # Record what enable_language left in CMAKE_CUDA_ARCHITECTURES so the next
    # configure can tell it apart from a user choice. This must happen before
    # the project list below overrides the variable.
    set(AUVUX_CUDA_TOOLKIT_DEFAULT_ARCHS "${CMAKE_CUDA_ARCHITECTURES}"
        CACHE INTERNAL "CUDA architectures defaulted by enable_language(CUDA)")
  endif()
  target_sources(_native PRIVATE
    src/gpu/cuda/cuda_common.cu
    src/gpu/cuda/cuda_stft.cu
    src/gpu/cuda/cuda_mel.cu
    src/gpu/cuda/cuda_cqt.cu)
  target_compile_definitions(_native PRIVATE AUVUX_CUDA=1)
  if(NOT auvux_user_archs)
    # Distributable fatbin: SASS per supported generation plus PTX (the plain
    # "90" entry embeds both) so future architectures JIT.
    set(auvux_archs 75-real 80-real 86-real 89-real 90)
    set(AUVUX_MIN_CC 75)
    if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13)
      list(PREPEND auvux_archs 60-real 70-real)  # CUDA 13 dropped pre-Turing
      set(AUVUX_MIN_CC 60)
    endif()
    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
      list(APPEND auvux_archs 100-real 120-real)  # Blackwell
    endif()
    # Normal variable: overrides the cached toolkit default for this run only.
    set(CMAKE_CUDA_ARCHITECTURES ${auvux_archs})
    # Devices below the oldest shipped SASS get a clean CPU fallback.
    target_compile_definitions(_native PRIVATE AUVUX_MIN_CC=${AUVUX_MIN_CC})
  endif()
  # _native was created before CUDA was enabled, so its CUDA_ARCHITECTURES
  # property was never initialized; set it explicitly.
  set_property(TARGET _native PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})
  # Static cudart: only the NVIDIA driver is needed at runtime; without one
  # the wheel degrades to gpu_available() == False.
  set_property(TARGET _native PROPERTY CUDA_RUNTIME_LIBRARY Static)
elseif(NOT AUVUX_GPU STREQUAL "none")
  message(FATAL_ERROR "Unknown AUVUX_GPU='${AUVUX_GPU}' (use auto, metal, cuda, or none)")
endif()

# Optimization flags, chosen per compiler family and applied per source
# language: one set for the host C/C++ sources, one for CUDA sources.
if(MSVC)
  set(auvux_host_opts /O2 /fp:fast)
  # nvcc forwards the host-compiler flag through -Xcompiler.
  set(auvux_cuda_opts -O3 -Xcompiler=/fp:fast)
else()
  set(auvux_host_opts -O3 -ffast-math -funroll-loops)
  set(auvux_cuda_opts -O3)
endif()
# Quoted so each generator expression, including the ';' separating its
# flags, is passed as a single argument.
target_compile_options(_native PRIVATE
  "$<$<COMPILE_LANGUAGE:C,CXX>:${auvux_host_opts}>"
  "$<$<COMPILE_LANGUAGE:CUDA>:${auvux_cuda_opts}>")

# Install the built module under auvux/dsp, relative to the install prefix.
install(TARGETS _native DESTINATION auvux/dsp)
