fix: address review feedback on flash attention quick-build and pre-SM80 fallback

tianleiwu · tianleiwu · commit e81b491bbae9 · 2026-05-26T17:20:49.000-07:00
- Update the quick-build comment and status message to reflect that both FP16 and
  BF16 hdim128 kernels are intentionally retained (not just FP16).
- Add a fallback for pre-SM80 builds: when no SM80+ architectures are configured,
  the flash attention sources are added back to the parent target
  (onnxruntime_providers_cuda or onnxruntime_providers_cuda_plugin) so the linker
  can find the host-side symbols referenced by flash_api.cc.
diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake
@@ -14,11 +14,11 @@ function(onnxruntime_filter_cuda_cu_sources CU_SRC_LIST)
   set(_list "${${CU_SRC_LIST}}")
 
   # Quick build mode: Filter flash attention kernels for faster development iteration.
-  #   - We keep only hdim128 fp16 flash attention kernels in quick build mode.
+  #   - We keep only hdim128 fp16 and bf16 flash attention kernels in quick build mode.
   #   - All other listed head dimensions are excluded (e.g., 32, 64, 96, 192, 256).
   #     If new head dimensions are added or removed, update this list to match the supported set.
   if(onnxruntime_QUICK_BUILD)
-    message(STATUS "Quick build mode enabled: Only building hdim128 fp16 flash attention kernels")
+    message(STATUS "Quick build mode enabled: Only building hdim128 fp16/bf16 flash attention kernels")
     list(FILTER _list EXCLUDE REGEX "flash_fwd.*hdim(32|64|96|192|256)")
   endif()
 
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
@@ -480,6 +480,11 @@
           CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}"
           NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}"
           SOURCES ${onnxruntime_cuda_flash_attention_srcs})
+      else()
+        # No SM80+ architectures available: compile the flash sources in the parent target
+        # so the linker can find the host-side symbols referenced by flash_api.cc. The
+        # kernels themselves will be empty stubs due to the __CUDA_ARCH__ >= 800 guards.
+        target_sources(onnxruntime_providers_cuda PRIVATE ${onnxruntime_cuda_flash_attention_srcs})
       endif()
     endif()
 
diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -274,6 +274,11 @@ if(_cuda_plugin_flash_attention_srcs)
       NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}"
       COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
       SOURCES ${_cuda_plugin_flash_attention_srcs})
+  else()
+    # No SM80+ architectures available: compile the flash sources in the parent target
+    # so the linker can find the host-side symbols referenced by flash_api.cc. The
+    # kernels themselves will be empty stubs due to the __CUDA_ARCH__ >= 800 guards.
+    target_sources(onnxruntime_providers_cuda_plugin PRIVATE ${_cuda_plugin_flash_attention_srcs})
   endif()
 endif()