Skip to content

Commit 5aff251

Browse files
committed
Bump PyTorch pin to nightly dev20260422
1 parent e7b38a3 commit 5aff251

4 files changed

Lines changed: 72 additions & 24 deletions

File tree

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
release/2.11
1+
9daaaa239547cafe96a1689972bcb34b8fe8afdd

runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h

Lines changed: 63 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -325,41 +325,88 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
325325
#define C10_HIP_HOST_DEVICE
326326
#endif
327327

328-
#if defined(USE_ROCM)
329328
// C10_WARP_SIZE is only allowed for device code.
330-
// Host code _must_ use at::cuda::warp_size()
329+
// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
330+
// Host or device statically-sized arrays _must_ use either
331+
// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
332+
//
331333
// HIP header used to define warpSize as a constexpr that was either 32 or 64
332334
// depending on the target device, and then always set it to 64 for host code.
333-
// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
334-
// set it to something unreasonable to trigger obvious host code errors.
335-
335+
// For a time, that allowed C10_WARP_SIZE to be defined like so:
336+
//
337+
// #ifdef USE_ROCM
338+
// #define C10_WARP_SIZE warpSize
339+
// #else
340+
// #define C10_WARP_SIZE 32
341+
// #endif
342+
//
343+
// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
344+
// We can now only use warpSize for C10_WARP_SIZE in device code and this is
345+
// enforced by using __device__ in its definition. In host code where
346+
// C10_WARP_SIZE was previously used as a compile-time constant, this will now
347+
// cause a compile-time error.
348+
//
349+
// If an array was previously expected to be sized at compile-time using
350+
// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
351+
// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
352+
//
353+
// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
354+
// must now use at::cuda::warp_size() for the dynamic runtime query.
355+
//
356+
// Unfortunately, C10_WARP_SIZE has been public and available for both host and
357+
// device since approximately 2019, so forcing it to be device-only would break
358+
// existing code in the wild.
359+
#if defined(USE_ROCM)
336360
namespace at::cuda {
337361
TORCH_CUDA_CPP_API int warp_size();
338362
}
339-
#ifdef __HIPCC__
340-
static inline int __host__ C10_WARP_SIZE_INTERNAL() {
363+
#if defined(__HIPCC__)
364+
static __host__ inline int C10_WARP_SIZE_INTERNAL() {
341365
return at::cuda::warp_size();
342366
}
343-
344-
static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
367+
// NOTE: __device__ C10_WARP_SIZE_INTERNAL
368+
// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
369+
// we can use constexpr. This matches prior behavior. We preserve this for
370+
// backward compatibility instead of forcing old code to use dynamic warpSize
371+
// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
372+
// could expose where C10_WARP_SIZE was used incorrectly where the dynamic
373+
// warpSize is not allowed.
374+
#if defined(__SPIRV__)
375+
static __device__ inline int C10_WARP_SIZE_INTERNAL() {
376+
return warpSize;
377+
}
378+
#else // __SPIRV__
379+
static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {
345380
#if defined(__GFX9__)
346381
return 64;
347382
#else // __GFX9__
348383
return 32;
349384
#endif // __GFX9__
350385
}
351-
#else // __HIPCC__
386+
#endif // __SPIRV__
387+
#if defined(__SPIRV__)
388+
#define C10_WARP_SIZE_LOWER_BOUND 32
389+
#define C10_WARP_SIZE_UPPER_BOUND 64
390+
#elif defined(__GFX9__)
391+
#define C10_WARP_SIZE_LOWER_BOUND 64
392+
#define C10_WARP_SIZE_UPPER_BOUND 64
393+
#else
394+
#define C10_WARP_SIZE_LOWER_BOUND 32
395+
#define C10_WARP_SIZE_UPPER_BOUND 32
396+
#endif
397+
#else // !__HIPCC__
352398
static inline int C10_WARP_SIZE_INTERNAL() {
353399
return at::cuda::warp_size();
354400
}
401+
#define C10_WARP_SIZE_LOWER_BOUND 32
402+
#define C10_WARP_SIZE_UPPER_BOUND 64
355403
#endif // __HIPCC__
356-
357404
#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
358-
#define C10_WARP_SIZE_STATIC 64
359-
360-
#else // defined(USE_ROCM)
405+
#else // !USE_ROCM
361406
#define C10_WARP_SIZE 32
362-
#endif
407+
#define C10_WARP_SIZE_LOWER_BOUND 32
408+
#define C10_WARP_SIZE_UPPER_BOUND 32
409+
#endif // USE_ROCM
363410

364411
#if defined(_MSC_VER) && _MSC_VER <= 1900
365412
#define __func__ __FUNCTION__
@@ -629,7 +676,7 @@ __host__ __device__
629676
// This macro is used to find older C++ compilers
630677
// that don't support move optimization for return values.
631678

632-
#if (defined(__GNUC__) && __GNUC__ < 13) || \
679+
#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \
633680
(defined(__clang_major__) && __clang_major__ < 13)
634681
#define C10_RETURN_MOVE_IF_OLD_COMPILER 1
635682
#else

runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#include <iosfwd>
1313
#include <ostream>
1414

15-
#if defined(__CUDACC__) && !defined(USE_ROCM)
15+
#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
1616
#include <cuda_bf16.h>
1717
#endif
1818

@@ -46,7 +46,7 @@ struct alignas(2) BFloat16 {
4646
/* implicit */ inline C10_HOST_DEVICE BFloat16(float value);
4747
inline C10_HOST_DEVICE operator float() const;
4848

49-
#if defined(__CUDACC__) && !defined(USE_ROCM)
49+
#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
5050
inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value);
5151
explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const;
5252
#endif
@@ -124,8 +124,9 @@ C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
124124
/// Constructors
125125
inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
126126
:
127-
#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \
128-
__CUDA_ARCH__ >= 800
127+
#if defined(__CUDACC__) && \
128+
(!defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || \
129+
defined(USE_ROCM) && (TORCH_HIP_VERSION >= 702))
129130
x(__bfloat16_as_ushort(__float2bfloat16(value)))
130131
#elif defined(__SYCL_DEVICE_ONLY__) && \
131132
defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
@@ -139,7 +140,7 @@ inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
139140

140141
/// Implicit conversions
141142
inline C10_HOST_DEVICE BFloat16::operator float() const {
142-
#if defined(__CUDACC__) && !defined(USE_ROCM)
143+
#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
143144
return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
144145
#elif defined(__SYCL_DEVICE_ONLY__) && \
145146
defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
@@ -149,7 +150,7 @@ inline C10_HOST_DEVICE BFloat16::operator float() const {
149150
#endif
150151
}
151152

152-
#if defined(__CUDACC__) && !defined(USE_ROCM)
153+
#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
153154
inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) {
154155
x = *reinterpret_cast<const unsigned short*>(&value);
155156
}

torch_pin.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
TORCH_VERSION = "2.11.0"
2-
# NIGHTLY_VERSION = "dev20260318" Temporarily pinning to stable release candidate. Revert https://github.com/pytorch/executorch/pull/18287
2+
NIGHTLY_VERSION = "dev20260422"

0 commit comments

Comments
 (0)