Skip to content

Commit fd6d304

Browse files
authored
win: fix cuda build (#3204)
1 parent e1e1399 commit fd6d304

File tree

8 files changed

+25
-7
lines changed

8 files changed

+25
-7
lines changed

mlx/backend/cuda/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,10 @@ target_compile_options(
118118
mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>")
119119

120120
# Required for generating optimized CUTLASS code.
121-
target_compile_options(
122-
mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fno-strict-aliasing>")
121+
if(NOT MSVC)
122+
target_compile_options(
123+
mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fno-strict-aliasing>")
124+
endif()
123125

124126
# Suppress nvcc warnings on C++ headers.
125127
target_compile_options(

mlx/backend/cuda/device.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,10 @@ Device::~Device() {
6666

6767
void Device::make_current() {
6868
// We need to set/get current CUDA device very frequently, cache it to reduce
69-
// actual calls of CUDA APIs.
70-
static thread_local int current = 0;
69+
// actual calls of CUDA APIs. Use -1 as sentinel so the first call on each
70+
// new thread always calls cudaSetDevice (which establishes the CUDA primary
71+
// context). Without this, device 0 would never get set on a new thread.
72+
static thread_local int current = -1;
7173
if (current != device_) {
7274
CHECK_CUDA_ERROR(cudaSetDevice(device_));
7375
current = device_;

mlx/backend/cuda/eval.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ void new_stream(Stream s) {
2121

2222
void eval(array& arr) {
2323
nvtx3::scoped_range r("gpu::eval");
24+
// Ensure CUDA context is active on this thread. Required when MLX is called
25+
// from threads that have not yet established a CUDA context (e.g. thread
26+
// pools, language runtimes that migrate work across OS threads).
27+
cu::device(arr.primitive().stream().device).make_current();
2428
auto outputs = arr.outputs();
2529
{
2630
// If the array is a tracer hold a reference

mlx/backend/cuda/quantized/qmm/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
target_sources(
22
mlx
3-
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/qmm.cpp
3+
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/qmm.cu
44
${CMAKE_CURRENT_SOURCE_DIR}/qmv.cu
55
${CMAKE_CURRENT_SOURCE_DIR}/fp_qmv.cu
66
${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n16_m1.cu

mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,9 @@ void qmm_sm90(
111111
reinterpret_cast<void*>(kernel),
112112
gemm.get_grid_shape(gemm.params()),
113113
GemmKernel::get_block_shape(),
114-
{get<0>(cluster), get<1>(cluster), get<2>(cluster)},
114+
{static_cast<unsigned>(get<0>(cluster)),
115+
static_cast<unsigned>(get<1>(cluster)),
116+
static_cast<unsigned>(get<2>(cluster))},
115117
GemmKernel::SharedStorageSize,
116118
kernel_params);
117119
}

mlx/backend/cuda/scaled_dot_product_attention.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,15 @@ bool supports_sdpa_cudnn(
318318
bool has_arr_mask,
319319
bool do_causal,
320320
Stream s) {
321+
#ifdef _WIN32
322+
// On Windows (WDDM), cuDNN SDPA has severe performance issues due to
323+
// high per-kernel-launch overhead in the WDDM driver model. cuDNN's
324+
// multi-kernel SDPA amplifies this, making it much slower than the
325+
// single-kernel sdpa_vector path for both prefill and generation.
326+
static bool enabled = env::get_var("MLX_CUDA_USE_CUDNN_SDPA", 0);
327+
#else
321328
static bool enabled = env::get_var("MLX_CUDA_USE_CUDNN_SDPA", 1);
329+
#endif
322330
if (!enabled) {
323331
return false;
324332
}

mlx/distributed/nccl/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
if(MLX_BUILD_CUDA)
1+
if(MLX_BUILD_CUDA AND NOT WIN32)
22
target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/nccl.cpp)
33
find_package(NCCL)
44
if(NCCL_FOUND)

0 commit comments

Comments (0)