Add cuFFTDx-backed FFT2 JIT support

cliffburdick · cliffburdick · commit 49912be610b3 · 2026-05-26T13:34:04.000-07:00
Generate JIT classes and LTO IR for single-block C2C fft2/ifft2 fusions, including shared-memory tiling through cuFFTDx 1D passes.

Teach the JIT launcher about grouped 2D blocks and vectorized EPT indexing so FFT2 operators can return multiple columns per thread.

Document the supported FFT2 JIT shape/type limits and add forward/inverse FFT2 JIT fusion coverage.
diff --git a/docs_input/api/dft/fft/fft2d.rst b/docs_input/api/dft/fft/fft2d.rst
@@ -5,6 +5,10 @@ fft2
 
 Perform a 2D FFT. Batching is supported for any tensor with a rank higher than 2.
 
+FFT kernel fusion is supported by cuFFTDx for complex-to-complex power-of-two square
+transforms that fit in a single CUDA block when ``-DMATX_EN_MATHDX=ON`` is enabled.
+Unsupported 2D FFT sizes and real-valued 2D FFTs use the existing cuFFT execution path.
+
 .. versionadded:: 0.6.0
 
 .. doxygenfunction:: fft2(const OpA &a, FFTNorm norm = FFTNorm::BACKWARD)
@@ -31,4 +35,4 @@ Examples
   :language: cpp
   :start-after: example-begin fft2-2
   :end-before: example-end fft2-2
-  :dedent:  
+  :dedent:
diff --git a/docs_input/api/dft/fft/ifft2.rst b/docs_input/api/dft/fft/ifft2.rst
@@ -5,6 +5,10 @@ ifft2
 
 Perform a 2D inverse FFT. Batching is supported for any tensor with a rank higher than 2.
 
+IFFT kernel fusion is supported by cuFFTDx for complex-to-complex power-of-two square
+transforms that fit in a single CUDA block when ``-DMATX_EN_MATHDX=ON`` is enabled.
+Unsupported 2D IFFT sizes and real-valued inverse 2D FFTs use the existing cuFFT execution path.
+
 .. versionadded:: 0.6.0
 
 .. doxygenfunction:: ifft2(const OpA &a, FFTNorm norm = FFTNorm::BACKWARD)
@@ -31,4 +35,4 @@ Examples
   :language: cpp
   :start-after: example-begin ifft2-2
   :end-before: example-end ifft2-2
-  :dedent:  
+  :dedent:
diff --git a/docs_input/basics/fusion.rst b/docs_input/basics/fusion.rst
@@ -59,8 +59,9 @@ CUDA JIT Kernel Fusion
 
     CUDA JIT kernel fusion is considered an experimental feature. There may be bugs that don't occur with JIT disabled, and new features are being added over time.
 
-MatX supports CUDA JIT kernel fusion that compiles the entire expression into a single kernel. Currently this is enabled 
-for all standard MatX element-wise operators and FFT and GEMM operations via MathDx. To enable fusion with MathDx, 
+MatX supports CUDA JIT kernel fusion that compiles the entire expression into a single kernel. Currently this is enabled
+for all standard MatX element-wise operators and FFT and GEMM operations via MathDx. cuFFTDx supports 1D FFT fusion and
+single-block complex-to-complex 2D ``fft2``/``ifft2`` fusion for supported power-of-two square transforms. To enable fusion with MathDx,
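-the following options must be enabled: ``-DMATX_EN_MATHDX=ON``. Once enabled, the ``CUDAJITExecutor`` can be used perform JIT compilation
+the following options must be enabled: ``-DMATX_EN_MATHDX=ON``. Once enabled, the ``CUDAJITExecutor`` can be used to perform JIT compilation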
 in supported situations. If the expression cannot be JIT compiled, the JITExecutor may throw an error.
 
@@ -118,12 +119,10 @@ MathDx Compatibility
      - Enabled via ``-DMATX_EN_MATHDX=ON`` for GEMM fusion paths.
    * - cuFFTDx
      - Yes
-     - Enabled via ``-DMATX_EN_MATHDX=ON`` for FFT fusion paths.
+     - Enabled via ``-DMATX_EN_MATHDX=ON`` for 1D FFT fusion paths and supported single-block 2D C2C FFT fusion paths.
    * - cuSolverDx
      - No
      - Not supported yet by MatX CUDA JIT fusion.
    * - cuRandDx
      - No
      - Not supported yet by MatX CUDA JIT fusion.
-
-
diff --git a/include/matx/core/get_grid_dims.h b/include/matx/core/get_grid_dims.h
@@ -338,12 +338,17 @@ inline bool get_grid_dims_block_reduce(dim3 &blocks, dim3 &threads, const cuda::
 template <int RANK>
 inline bool get_grid_dims_block_2d(dim3 &blocks, dim3 &threads, 
                                     const cuda::std::array<index_t, RANK> &sizes,
-                                    int block_dim) {
+                                    int block_dim,
+                                    int groups_per_block = 1) {
-  // Threads are set to block_dim in x, y and z are 1
+  // Threads are set to block_dim in x and groups_per_block in y (1 by default); z is 1
   // All threads cooperate via flattened thread ID in the kernel
   threads.x = block_dim;
-  threads.y = 1;
+  threads.y = groups_per_block;
   threads.z = 1;
+
+  if (static_cast<int64_t>(threads.x) * static_cast<int64_t>(threads.y) > 1024) {
+    MATX_THROW(matxInvalidParameter, "Block2D launch exceeds CUDA maximum threads per block");
+  }
   
   // Grid covers batch dimensions only (dims 0 to RANK-3)
   blocks.x = 1;
@@ -372,7 +377,8 @@ inline bool get_grid_dims_block_2d(dim3 &blocks, dim3 &threads,
     }
   }
   
-  MATX_LOG_DEBUG("Block2D: Blocks {}x{}x{} Threads {}x{}x{}", blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z);
+  MATX_LOG_DEBUG("Block2D: Blocks {}x{}x{} Threads {}x{}x{} groups_per_block={}",
+                 blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, groups_per_block);
   
   // No stride needed for now - could be extended for very large batches
   return false;
diff --git a/include/matx/executors/jit_cuda.h b/include/matx/executors/jit_cuda.h
@@ -289,16 +289,20 @@ namespace matx
                 if (block_dim_range[0] == detail::capability_attributes<detail::OperatorCapability::BLOCK_DIM>::invalid) {
                   MATX_THROW(matxInvalidParameter, "No valid JIT block dimension satisfies the fused operator requirements");
                 }
+                auto group_range = detail::get_operator_capability<detail::OperatorCapability::GROUPS_PER_BLOCK>(op);
                 block_size = block_dim_range[0];
-                stride = detail::get_grid_dims_block_2d<RANK>(blocks, threads, sizes, block_size);
+                groups_per_block = group_range[0];
+                if (groups_per_block == detail::capability_attributes<detail::OperatorCapability::GROUPS_PER_BLOCK>::invalid) {
+                  MATX_THROW(matxInvalidParameter, "No valid JIT groups-per-block value satisfies the fused operator requirements");
+                }
+                stride = detail::get_grid_dims_block_2d<RANK>(blocks, threads, sizes, block_size, groups_per_block);
 
-                // EPT is 1 for 2D block operators - the operator handles elements internally
-                best_ept = detail::ElementsPerThread::ONE;
+                // Block-level operators can still return vectorized output lanes.
+                best_ept = jit_ept_bounds[1];
                 shm_size = detail::get_operator_capability<detail::OperatorCapability::DYN_SHM_SIZE>(op);
-                groups_per_block = 1;
 
-                MATX_LOG_DEBUG("Block2D: EPT {}, Shm size {}, Block size {}",
-                               static_cast<int>(best_ept), shm_size, block_size);
+                MATX_LOG_DEBUG("Block2D: EPT {}, Shm size {}, Block size {}, Groups per block {}",
+                               static_cast<int>(best_ept), shm_size, block_size, groups_per_block);
               } else if constexpr (is_dynamic_rank_op_v<Op>) {
                 // Dynamic tensor expressions: pre-compiled kernels don't exist for this Op type,
                 // so we cannot query register pressure. Use conservative defaults.
diff --git a/include/matx/executors/jit_kernel.h b/include/matx/executors/jit_kernel.h
@@ -395,8 +395,10 @@ namespace matx {
     template <class Op>\n\
     __global__ void matxOpT2KernelBlock2D(Op op, matx::index_t size0, matx::index_t size1) {\n\
       int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;\n\
-      matx::index_t idx = tid % size1;\n\
-      matx::index_t idy = tid / size1;\n\
+      constexpr int ept = static_cast<int>(CurrentCapabilities::ept);\n\
+      matx::index_t size1_vectors = (size1 + ept - 1) / ept;\n\
+      matx::index_t idx = tid % size1_vectors;\n\
+      matx::index_t idy = tid / size1_vectors;\n\
       if constexpr (cuda::std::is_pointer_v<Op>) {\n\
         (*op).template operator()<CurrentCapabilities>(idy, idx);\n\
       } else {\n\
@@ -407,8 +409,10 @@ namespace matx {
     template <class Op>\n\
     __global__ void matxOpT3KernelBlock2D(Op op, matx::index_t size0, matx::index_t size1, matx::index_t size2) {\n\
       int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;\n\
-      matx::index_t idx = tid % size2;\n\
-      matx::index_t idy = tid / size2;\n\
+      constexpr int ept = static_cast<int>(CurrentCapabilities::ept);\n\
+      matx::index_t size2_vectors = (size2 + ept - 1) / ept;\n\
+      matx::index_t idx = tid % size2_vectors;\n\
+      matx::index_t idy = tid / size2_vectors;\n\
       matx::index_t idz = blockIdx.x;\n\
       if constexpr (cuda::std::is_pointer_v<Op>) {\n\
         (*op).template operator()<CurrentCapabilities>(idz, idy, idx);\n\
@@ -420,8 +424,10 @@ namespace matx {
     template <class Op>\n\
     __global__ void matxOpT4KernelBlock2D(Op op, matx::index_t size0, matx::index_t size1, matx::index_t size2, matx::index_t size3) {\n\
       int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;\n\
-      matx::index_t idx = tid % size3;\n\
-      matx::index_t idy = tid / size3;\n\
+      constexpr int ept = static_cast<int>(CurrentCapabilities::ept);\n\
+      matx::index_t size3_vectors = (size3 + ept - 1) / ept;\n\
+      matx::index_t idx = tid % size3_vectors;\n\
+      matx::index_t idy = tid / size3_vectors;\n\
       matx::index_t idz = blockIdx.x;\n\
       matx::index_t idw = blockIdx.y;\n\
       if constexpr (cuda::std::is_pointer_v<Op>) {\n\
diff --git a/include/matx/operators/fft.h b/include/matx/operators/fft.h
diff --git a/include/matx/transforms/fft/fft_cufftdx.h b/include/matx/transforms/fft/fft_cufftdx.h
diff --git a/test/00_transform/FFT.cu b/test/00_transform/FFT.cu