From f36eb60cbbaba0593685d51eaa6f6f69482c2e95 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Mon, 18 May 2026 15:27:04 -0700
Subject: [PATCH 1/2] init

Move runtime/shims/rand.cu out of aoti_cuda_shims into a new shared library,
aoti_cuda_sampler_shims, and drop CUDA::curand from aoti_cuda_shims' link
line. The new library links aoti_cuda_shims and CUDA::curand.

Consumers that need the rand shims now link the new target explicitly:
test_aoti_torch_cuda_rand and the qwen3_5_moe example.

---
 backends/cuda/CMakeLists.txt                  | 74 ++++++++++++++++++-
 .../cuda/runtime/shims/tests/CMakeLists.txt   |  6 ++
 examples/models/qwen3_5_moe/CMakeLists.txt    |  3 +-
 3 files changed, 79 insertions(+), 4 deletions(-)

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 157cc05a54f..4a8a5c6efb3 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -110,7 +110,7 @@ set(_aoti_cuda_shim_sources runtime/shims/memory.cpp
 # Only build CUDA shims when CUDA language/toolchain is available.
 if(CMAKE_CUDA_COMPILER)
   list(APPEND _aoti_cuda_shim_sources runtime/shims/int4mm.cu
-       runtime/shims/sort.cu runtime/shims/rand.cu
+       runtime/shims/sort.cu
   )
 endif()
 
@@ -152,7 +152,7 @@ endif()
 # retention.
 if(_cuda_is_msvc_toolchain)
   target_link_libraries(
-    aoti_cuda_shims PRIVATE cuda_platform CUDA::cudart CUDA::curand
+    aoti_cuda_shims PRIVATE cuda_platform CUDA::cudart
                             ${CMAKE_DL_LIBS}
   )
   # Link object library directly so symbols are pulled exactly once while
@@ -163,7 +163,7 @@ else()
     aoti_cuda_shims
     PRIVATE cuda_platform
     PUBLIC -Wl,--whole-archive aoti_common_shims_slim -Wl,--no-whole-archive
-           CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS}
+           CUDA::cudart ${CMAKE_DL_LIBS}
   )
 endif()
 
@@ -177,6 +177,74 @@ install(
   DESTINATION lib
 )
 
+# CUDA-specific AOTI sampler shim symbols (rand/randint via curand). Split out
+# of aoti_cuda_shims so the curand fatbin (~3.5MB precalc tables + Philox
+# kernels per arch) and the CUDA::curand dependency are only paid by the
+# small set of consumers that actually use them (e.g. qwen3_5_moe). Other
+# CUDA examples (voxtral, parakeet, whisper, dinov2, ...) link only
+# aoti_cuda_shims and stay small.
+if(CMAKE_CUDA_COMPILER)
+  add_library(aoti_cuda_sampler_shims SHARED runtime/shims/rand.cu)
+
+  # Match aoti_cuda_shims preprocessor defines for symbol export.
+  target_compile_definitions(aoti_cuda_sampler_shims PRIVATE CUDA_AVAILABLE=1)
+  if(WIN32)
+    target_compile_definitions(
+      aoti_cuda_sampler_shims PRIVATE EXPORT_AOTI_FUNCTIONS
+    )
+    if(_cuda_is_windows_msvc)
+      set_target_properties(
+        aoti_cuda_sampler_shims PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF
+      )
+    endif()
+  endif()
+
+  target_include_directories(
+    aoti_cuda_sampler_shims
+    PUBLIC ${CUDAToolkit_INCLUDE_DIRS} $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+           $<INSTALL_INTERFACE:include>
+  )
+
+  target_compile_options(
+    aoti_cuda_sampler_shims
+    PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${_cuda_cxx_compile_options}>"
+  )
+
+  if(_cuda_export_dynamic_option)
+    target_link_options(
+      aoti_cuda_sampler_shims PUBLIC ${_cuda_export_dynamic_option}
+    )
+  endif()
+
+  # rand.cu calls into slim helpers (empty_strided, getCurrentCUDAStream,
+  # SlimTensor) which are linked into aoti_cuda_shims. Depend on that target
+  # so we resolve those symbols from the already-loaded shims library
+  # instead of duplicating slim's static archive into both DLLs.
+  if(_cuda_is_msvc_toolchain)
+    target_link_libraries(
+      aoti_cuda_sampler_shims
+      PRIVATE cuda_platform CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS}
+              aoti_cuda_shims
+    )
+  else()
+    target_link_libraries(
+      aoti_cuda_sampler_shims
+      PRIVATE cuda_platform
+      PUBLIC CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS} aoti_cuda_shims
+    )
+  endif()
+
+  if(NOT _cuda_is_msvc_toolchain)
+    executorch_target_link_options_shared_lib(aoti_cuda_sampler_shims)
+  endif()
+
+  install(
+    TARGETS aoti_cuda_sampler_shims
+    EXPORT ExecuTorchTargets
+    DESTINATION lib
+  )
+endif()
+
 # CUDA backend implementation
 set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp)
 
diff --git a/backends/cuda/runtime/shims/tests/CMakeLists.txt b/backends/cuda/runtime/shims/tests/CMakeLists.txt
index aec5219d680..5b948ac3a35 100644
--- a/backends/cuda/runtime/shims/tests/CMakeLists.txt
+++ b/backends/cuda/runtime/shims/tests/CMakeLists.txt
@@ -67,3 +67,9 @@ foreach(test_name ${CUDA_SHIM_TESTS})
 
   add_test(NAME ${test_name} COMMAND ${test_name})
 endforeach()
+
+# rand symbols live in the separate aoti_cuda_sampler_shims DLL to keep the
+# curand-induced binary-size cost out of aoti_cuda_shims.
+target_link_libraries(
+  test_aoti_torch_cuda_rand PRIVATE aoti_cuda_sampler_shims
+)
diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt
index d1cfe54a56f..e9c357c6c17 100644
--- a/examples/models/qwen3_5_moe/CMakeLists.txt
+++ b/examples/models/qwen3_5_moe/CMakeLists.txt
@@ -48,8 +48,9 @@ if(EXECUTORCH_BUILD_METAL)
   executorch_target_link_options_shared_lib(metal_backend)
 elseif(EXECUTORCH_BUILD_CUDA)
   find_package(CUDAToolkit REQUIRED)
-  list(APPEND link_libraries aoti_cuda_backend)
+  list(APPEND link_libraries aoti_cuda_backend aoti_cuda_sampler_shims)
   executorch_target_link_options_shared_lib(aoti_cuda_backend)
+  executorch_target_link_options_shared_lib(aoti_cuda_sampler_shims)
   add_compile_definitions(EXECUTORCH_BUILD_CUDA)
 else()
   message(

From eba9b71b086e3b157833fa2fcb80ddc8f626bb14 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Tue, 26 May 2026 00:04:43 -0700
Subject: [PATCH 2/2] lint

Re-wrap comments and argument lists in the CUDA CMake files.

Besides the formatting changes, this also links `slimtensor` directly into
aoti_cuda_sampler_shims on both the MSVC and non-MSVC branches; the rationale
is recorded in the comment added above the target_link_libraries() calls.

---
 backends/cuda/CMakeLists.txt                  | 30 ++++++++++++-------
 .../cuda/runtime/shims/tests/CMakeLists.txt   |  4 +--
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index dfccb15e74a..fee1a3262e1 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -152,8 +152,7 @@ endif()
 # retention.
 if(_cuda_is_msvc_toolchain)
   target_link_libraries(
-    aoti_cuda_shims PRIVATE cuda_platform CUDA::cudart
-                            ${CMAKE_DL_LIBS}
+    aoti_cuda_shims PRIVATE cuda_platform CUDA::cudart ${CMAKE_DL_LIBS}
   )
   # Link object library directly so symbols are pulled exactly once while
   # avoiding duplicate static/object inclusion and interface leakage.
@@ -179,10 +178,10 @@ install(
 
 # CUDA-specific AOTI sampler shim symbols (rand/randint via curand). Split out
 # of aoti_cuda_shims so the curand fatbin (~3.5MB precalc tables + Philox
-# kernels per arch) and the CUDA::curand dependency are only paid by the
-# small set of consumers that actually use them (e.g. qwen3_5_moe). Other
-# CUDA examples (voxtral, parakeet, whisper, dinov2, ...) link only
-# aoti_cuda_shims and stay small.
+# kernels per arch) and the CUDA::curand dependency are only paid by the small
+# set of consumers that actually use them (e.g. qwen3_5_moe). Other CUDA
+# examples (voxtral, parakeet, whisper, dinov2, ...) link only aoti_cuda_shims
+# and stay small.
 if(CMAKE_CUDA_COMPILER)
   add_library(aoti_cuda_sampler_shims SHARED runtime/shims/rand.cu)
 
@@ -217,19 +216,28 @@ if(CMAKE_CUDA_COMPILER)
   endif()
 
   # rand.cu calls into slim helpers (empty_strided, getCurrentCUDAStream,
-  # SlimTensor) which are linked into aoti_cuda_shims. Depend on that target
-  # so we resolve those symbols from the already-loaded shims library
-  # instead of duplicating slim's static archive into both DLLs.
+  # SlimTensor) which are linked into aoti_cuda_shims. Depend on that target so
+  # we resolve those symbols from the already-loaded shims library instead of
+  # duplicating slim's static archive into both DLLs.
+  #
+  # Also link `slimtensor` (INTERFACE / header-only) directly so the c10 include
+  # root (runtime/core/portable_type/c10) is on this target's compile command.
+  # aoti_cuda_shims links aoti_common_shims_slim PUBLIC on non-MSVC (so includes
+  # propagate transitively on Linux) but only PRIVATELY via the *_obj OBJECT lib
+  # on MSVC, which does NOT forward the slimtensor INTERFACE include dirs.
+  # Linking slimtensor here makes the include path explicit on both toolchains
+  # and keeps Windows MSVC happy without changing aoti_cuda_shims' propagation
+  # semantics.
   if(_cuda_is_msvc_toolchain)
     target_link_libraries(
       aoti_cuda_sampler_shims
       PRIVATE cuda_platform CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS}
-              aoti_cuda_shims
+              aoti_cuda_shims slimtensor
     )
   else()
     target_link_libraries(
       aoti_cuda_sampler_shims
-      PRIVATE cuda_platform
+      PRIVATE cuda_platform slimtensor
       PUBLIC CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS} aoti_cuda_shims
     )
   endif()
diff --git a/backends/cuda/runtime/shims/tests/CMakeLists.txt b/backends/cuda/runtime/shims/tests/CMakeLists.txt
index f9d28c68942..548b00c1947 100644
--- a/backends/cuda/runtime/shims/tests/CMakeLists.txt
+++ b/backends/cuda/runtime/shims/tests/CMakeLists.txt
@@ -93,6 +93,4 @@ endforeach()
 
 # rand symbols live in the separate aoti_cuda_sampler_shims DLL to keep the
 # curand-induced binary-size cost out of aoti_cuda_shims.
-target_link_libraries(
-  test_aoti_torch_cuda_rand PRIVATE aoti_cuda_sampler_shims
-)
+target_link_libraries(test_aoti_torch_cuda_rand PRIVATE aoti_cuda_sampler_shims)