2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
release/2.11
9daaaa239547cafe96a1689972bcb34b8fe8afdd
79 changes: 63 additions & 16 deletions runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
@@ -325,41 +325,88 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
#define C10_HIP_HOST_DEVICE
#endif

#if defined(USE_ROCM)
// C10_WARP_SIZE is only allowed for device code.
// Host code _must_ use at::cuda::warp_size()
// Dynamically-sized launch configs in host code _must_ use at::cuda::warp_size().
// Host or device statically-sized arrays _must_ use either
// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
//
// The HIP header used to define warpSize as a constexpr that was either 32 or 64
// depending on the target device, and then always set it to 64 for host code.
// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
// set it to something unreasonable to trigger obvious host code errors.

// For a time, that allowed C10_WARP_SIZE to be defined like so:
//
// #ifdef USE_ROCM
// #define C10_WARP_SIZE warpSize
// #else
// #define C10_WARP_SIZE 32
// #endif
//
// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
// We can now only use warpSize for C10_WARP_SIZE in device code and this is
// enforced by using __device__ in its definition. In host code where
// C10_WARP_SIZE was previously used as a compile-time constant, this will now
// cause a compile-time error.
//
// If an array was previously expected to be sized at compile-time using
// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
//
// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
// must now use at::cuda::warp_size() for the dynamic runtime query.
//
// Unfortunately, C10_WARP_SIZE has been public and available for both host and
// device since approximately 2019, so forcing it to be device-only would break
// existing code in the wild.
#if defined(USE_ROCM)
namespace at::cuda {
TORCH_CUDA_CPP_API int warp_size();
}
#ifdef __HIPCC__
static inline int __host__ C10_WARP_SIZE_INTERNAL() {
#if defined(__HIPCC__)
static __host__ inline int C10_WARP_SIZE_INTERNAL() {
return at::cuda::warp_size();
}

static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
// NOTE: __device__ C10_WARP_SIZE_INTERNAL
// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
// we can use constexpr. This matches prior behavior. We preserve this for
// backward compatibility instead of forcing old code to use dynamic warpSize
// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
// could expose places where C10_WARP_SIZE was used incorrectly, i.e. where the
// dynamic warpSize is not allowed.
#if defined(__SPIRV__)
static __device__ inline int C10_WARP_SIZE_INTERNAL() {
return warpSize;
}
#else // __SPIRV__
static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {
#if defined(__GFX9__)
return 64;
#else // __GFX9__
return 32;
#endif // __GFX9__
}
#else // __HIPCC__
#endif // __SPIRV__
#if defined(__SPIRV__)
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 64
#elif defined(__GFX9__)
#define C10_WARP_SIZE_LOWER_BOUND 64
#define C10_WARP_SIZE_UPPER_BOUND 64
#else
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 32
#endif
#else // !__HIPCC__
static inline int C10_WARP_SIZE_INTERNAL() {
return at::cuda::warp_size();
}
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 64
#endif // __HIPCC__

#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
#define C10_WARP_SIZE_STATIC 64

#else // defined(USE_ROCM)
#else // !USE_ROCM
#define C10_WARP_SIZE 32
#endif
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 32
#endif // USE_ROCM

#if defined(_MSC_VER) && _MSC_VER <= 1900
#define __func__ __FUNCTION__
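
For orientation, a minimal sketch of how the new warp-size macros are meant to be split between host and device code, assuming a HIP/ROCm build where the declarations above are visible; the kernel and launcher names and the include paths below are illustrative assumptions, not code from this PR:

#include <ATen/cuda/CUDAContext.h>  // assumed location of at::cuda::warp_size()
#include <c10/macros/Macros.h>      // assumed include path for the macros above

// Device code: C10_WARP_SIZE is still usable, but statically-sized storage
// must be sized with the compile-time upper bound.
__global__ void example_copy_kernel(const float* in, float* out) {
  __shared__ float scratch[C10_WARP_SIZE_UPPER_BOUND];
  const int lane = threadIdx.x % C10_WARP_SIZE;
  scratch[lane] = in[threadIdx.x];
  __syncthreads();
  out[threadIdx.x] = scratch[lane];
}

// Host code: launch configuration must use the dynamic runtime query.
void launch_example_copy(const float* in, float* out) {
  const int threads = at::cuda::warp_size();
  example_copy_kernel<<<1, threads>>>(in, out);
}
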
@@ -629,7 +676,7 @@ __host__ __device__
// This macro is used to find older C++ compilers
// that don't support move optimization for return values.

#if (defined(__GNUC__) && __GNUC__ < 13) || \
#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \
(defined(__clang_major__) && __clang_major__ < 13)
#define C10_RETURN_MOVE_IF_OLD_COMPILER 1
#else
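The C10_RETURN_MOVE_IF_OLD_COMPILER change above adds a __cplusplus < 202002L term, presumably because C++20 broadened the implicit-move rules on return, so pre-13 GCC in C++20 mode no longer needs the explicit-move workaround. A hedged sketch of the pattern such a macro guards, assuming the unshown #else branch defines it to 0 (the helper below is hypothetical, not a call site from this PR):

#include <optional>
#include <utility>
#include <vector>

std::optional<std::vector<int>> maybe_make_vector(bool ok) {
  if (!ok) {
    return std::nullopt;
  }
  std::vector<int> v(16, 0);
#if C10_RETURN_MOVE_IF_OLD_COMPILER
  // Compilers matched by the macro may fail to apply the implicit move on
  // this return, so move explicitly to avoid a copy.
  return std::move(v);
#else
  // Other compilers move (or elide) automatically; an explicit std::move
  // here would be redundant.
  return v;
#endif
}
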
@@ -12,7 +12,7 @@
#include <iosfwd>
#include <ostream>

#if defined(__CUDACC__) && !defined(USE_ROCM)
#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
#include <cuda_bf16.h>
#endif

@@ -46,7 +46,7 @@ struct alignas(2) BFloat16 {
/* implicit */ inline C10_HOST_DEVICE BFloat16(float value);
inline C10_HOST_DEVICE operator float() const;

#if defined(__CUDACC__) && !defined(USE_ROCM)
#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value);
explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const;
#endif
@@ -124,8 +124,9 @@ C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
/// Constructors
inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
:
#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \
__CUDA_ARCH__ >= 800
#if defined(__CUDACC__) && \
(!defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || \
defined(USE_ROCM) && (TORCH_HIP_VERSION >= 702))
x(__bfloat16_as_ushort(__float2bfloat16(value)))
#elif defined(__SYCL_DEVICE_ONLY__) && \
defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
@@ -139,7 +140,7 @@ inline C10_HOST_DEVICE BFloat16::BFloat16(float value)

/// Implicit conversions
inline C10_HOST_DEVICE BFloat16::operator float() const {
#if defined(__CUDACC__) && !defined(USE_ROCM)
#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
#elif defined(__SYCL_DEVICE_ONLY__) && \
defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
@@ -149,7 +150,7 @@ inline C10_HOST_DEVICE BFloat16::operator float() const {
#endif
}

#if defined(__CUDACC__) && !defined(USE_ROCM)
#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) {
x = *reinterpret_cast<const unsigned short*>(&value);
}
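As a quick orientation for the BFloat16 header changes above (which extend the native __nv_bfloat16 path to ROCm builds where TORCH_HIP_VERSION >= 702), here is a small host-side sketch of the round trip those constructors implement; the include path and namespace follow the upstream c10 layout and are assumptions, not part of this PR:

#include <iostream>
#include <c10/util/BFloat16.h>  // assumed include path

int main() {
  c10::BFloat16 b = 1.5f;  // implicit float -> BFloat16 conversion
  float f = b;             // implicit BFloat16 -> float conversion
  std::cout << f << "\n";  // 1.5 is exactly representable in bfloat16, so prints 1.5
  return 0;
}
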
2 changes: 1 addition & 1 deletion torch_pin.py
@@ -1,2 +1,2 @@
TORCH_VERSION = "2.11.0"
# NIGHTLY_VERSION = "dev20260318" Temporarily pinning to stable release candidate. Revert https://github.com/pytorch/executorch/pull/18287
NIGHTLY_VERSION = "dev20260422"