@@ -325,41 +325,88 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
325325#define C10_HIP_HOST_DEVICE
326326#endif
327327
328- #if defined(USE_ROCM)
329328// C10_WARP_SIZE is only allowed for device code.
330- // Host code _must_ use at::cuda::warp_size()
329+ // Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
330+ // Host or device statically-sized arrays _must_ use either
331+ // C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
332+ //
331333// HIP header used to define warpSize as a constexpr that was either 32 or 64
332334// depending on the target device, and then always set it to 64 for host code.
333- // Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
334- // set it to something unreasonable to trigger obvious host code errors.
335-
335+ // For a time, that allowed C10_WARP_SIZE to be defined like so:
336+ //
337+ // #ifdef USE_ROCM
338+ // #define C10_WARP_SIZE warpSize
339+ // #else
340+ // #define C10_WARP_SIZE 32
341+ // #endif
342+ //
343+ // In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
344+ // We can now only use warpSize for C10_WARP_SIZE in device code and this is
345+ // enforced by using __device__ in its definition. In host code where
346+ // C10_WARP_SIZE was previously used as a compile-time constant, this will now
347+ // cause a compile-time error.
348+ //
349+ // If an array was previously expected to be sized at compile-time using
350+ // C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
351+ // C10_WARP_SIZE_LOWER_BOUND depending on the situation.
352+ //
353+ // If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
354+ // must now use at::cuda::warp_size() for the dynamic runtime query.
355+ //
356+ // Unfortunately, C10_WARP_SIZE has been public and available for both host and
357+ // device since approximately 2019, so forcing it to be device-only would break
358+ // existing code in the wild.
359+ #if defined(USE_ROCM)
336360namespace at ::cuda {
337361TORCH_CUDA_CPP_API int warp_size ();
338362}
339- #ifdef __HIPCC__
340- static inline int __host__ C10_WARP_SIZE_INTERNAL () {
363+ #if defined( __HIPCC__)
364+ static __host__ inline int C10_WARP_SIZE_INTERNAL () {
341365 return at::cuda::warp_size ();
342366}
343-
344- static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL () {
367+ // NOTE: __device__ C10_WARP_SIZE_INTERNAL
368+ // For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
369+ // we can use constexpr. This matches prior behavior. We preserve this for
370+ // backward compatibility instead of forcing old code to use dynamic warpSize
371+ // and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
372+ // could expose where C10_WARP_SIZE was used incorrectly where the dynamic
373+ // warpSize is not allowed.
374+ #if defined(__SPIRV__)
375+ static __device__ inline int C10_WARP_SIZE_INTERNAL () {
376+ return warpSize;
377+ }
378+ #else // __SPIRV__
379+ static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL () {
345380#if defined(__GFX9__)
346381 return 64 ;
347382#else // __GFX9__
348383 return 32 ;
349384#endif // __GFX9__
350385}
351- #else // __HIPCC__
386+ #endif // __SPIRV__
387+ #if defined(__SPIRV__)
388+ #define C10_WARP_SIZE_LOWER_BOUND 32
389+ #define C10_WARP_SIZE_UPPER_BOUND 64
390+ #elif defined(__GFX9__)
391+ #define C10_WARP_SIZE_LOWER_BOUND 64
392+ #define C10_WARP_SIZE_UPPER_BOUND 64
393+ #else
394+ #define C10_WARP_SIZE_LOWER_BOUND 32
395+ #define C10_WARP_SIZE_UPPER_BOUND 32
396+ #endif
397+ #else // !__HIPCC__
352398static inline int C10_WARP_SIZE_INTERNAL () {
353399 return at::cuda::warp_size ();
354400}
401+ #define C10_WARP_SIZE_LOWER_BOUND 32
402+ #define C10_WARP_SIZE_UPPER_BOUND 64
355403#endif // __HIPCC__
356-
357404#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
358- #define C10_WARP_SIZE_STATIC 64
359-
360- #else // defined(USE_ROCM)
405+ #else // !USE_ROCM
361406#define C10_WARP_SIZE 32
362- #endif
407+ #define C10_WARP_SIZE_LOWER_BOUND 32
408+ #define C10_WARP_SIZE_UPPER_BOUND 32
409+ #endif // USE_ROCM
363410
364411#if defined(_MSC_VER) && _MSC_VER <= 1900
365412#define __func__ __FUNCTION__
@@ -629,7 +676,7 @@ __host__ __device__
629676// This macro is used to find older C++ compilers
630677// that don't support move optimization for return values.
631678
632- #if (defined(__GNUC__) && __GNUC__ < 13) || \
679+ #if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L ) || \
633680 (defined (__clang_major__) && __clang_major__ < 13 )
634681#define C10_RETURN_MOVE_IF_OLD_COMPILER 1
635682#else
0 commit comments