
Commit b3931e0

mergennachin and claude committed
Update torch pin to nightly 2.12.0.dev20260410
Bumps torch/torchvision/torchaudio/torchcodec to the 2026-04-10 nightly, updates the PyTorch commit pin, grafts the corresponding c10/headeronly headers, and restores the guarding_hint_or_throw fallback in exir/sym_util.py so shape_env.size_hint is only used when the newer API is unavailable.

Co-authored-by: Claude <noreply@anthropic.com>
1 parent 8999865 commit b3931e0

11 files changed: 122 additions & 37 deletions
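
A quick sanity check after installing the pinned nightlies (a minimal sketch, not part of the commit; the package list and the dev date come from the commit message and the diffs below, and torchaudio's exact version string is not shown here, so only the date suffix is checked):

# Minimal sketch: confirm the installed nightlies carry the 2026-04-10 dev tag.
from importlib.metadata import PackageNotFoundError, version

EXPECTED_DATE = "dev20260410"
for pkg in ("torch", "torchvision", "torchaudio", "torchcodec"):
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
        continue
    status = "ok" if EXPECTED_DATE in installed else "MISMATCH"
    print(f"{pkg} {installed} [{status}]")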

PyTorch commit pin

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-659af3c353e49b35c191cdd2dba3b3c79d0e6822
+7527b8c5c21e98eaa88ba6a1e86a56c7019beb9e

.ci/scripts/test_model_e2e.sh

Lines changed: 1 addition & 1 deletion

@@ -260,7 +260,7 @@ if [ "$AUDIO_URL" != "" ]; then
 elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   conda install -y -c conda-forge "ffmpeg<8"
   pip install datasets soundfile
-  pip install torchcodec==0.11.0.dev20260217 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+  pip install torchcodec==0.12.0.dev20260410 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
   python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
 fi

examples/models/moshi/mimi/install_requirements.sh

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 set -x
 
 sudo apt install ffmpeg -y
-pip install torchcodec==0.11.0.dev20260217 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+pip install torchcodec==0.12.0.dev20260410 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 pip install moshi==0.2.11
 pip install bitsandbytes soundfile einops
 # Run llama2/install requirements for torchao deps

examples/models/parakeet/export_parakeet_tdt.py

Lines changed: 3 additions & 5 deletions

@@ -507,13 +507,11 @@ def _create_metal_partitioners(programs):
 
     # Run decompositions for non-preprocessor programs
     updated_programs = {}
+    decomp_table = torch.export.default_decompositions()
+    decomp_table[torch.ops.aten.linear.default] = _linear_bias_decomposition
     for key, ep in programs.items():
-        # print(f"Running decompositions for {key}")
-        # print(ep.graph_module)
         if key != "preprocessor":
-            updated_programs[key] = ep.run_decompositions(
-                {torch.ops.aten.linear.default: _linear_bias_decomposition}
-            )
+            updated_programs[key] = ep.run_decompositions(decomp_table)
         else:
             updated_programs[key] = ep
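
The change above swaps a per-call decomposition dict for a table seeded from torch.export.default_decompositions(), so the linear override is applied on top of the stock decompositions rather than instead of them. A minimal, self-contained sketch of the same pattern; the decomposition body below is a hypothetical stand-in for the repo's _linear_bias_decomposition:

import torch

# Hypothetical stand-in for _linear_bias_decomposition: rewrite aten.linear
# as an explicit matmul plus bias add.
def linear_bias_decomposition(input, weight, bias=None):
    out = torch.matmul(input, weight.t())
    if bias is not None:
        out = out + bias
    return out

class TinyLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(8, 4)

    def forward(self, x):
        return self.fc(x)

ep = torch.export.export(TinyLinear(), (torch.randn(2, 8),))

# Start from the default table and override a single op, as in the diff above.
decomp_table = torch.export.default_decompositions()
decomp_table[torch.ops.aten.linear.default] = linear_bias_decomposition
ep = ep.run_decompositions(decomp_table)
print(ep.graph_module.code)  # linear is now expressed via the override

The voxtral_realtime export below applies the same table-based pattern.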

examples/models/voxtral_realtime/export_voxtral_rt.py

Lines changed: 3 additions & 3 deletions

@@ -402,10 +402,10 @@ def lower_to_executorch(programs, metadata, backend="xnnpack"):
 
     # Run decompositions for Metal backend
     updated_programs = {}
+    decomp_table = torch.export.default_decompositions()
+    decomp_table[torch.ops.aten.linear.default] = _linear_bias_decomposition
     for key, ep in programs.items():
-        updated_programs[key] = ep.run_decompositions(
-            {torch.ops.aten.linear.default: _linear_bias_decomposition}
-        )
+        updated_programs[key] = ep.run_decompositions(decomp_table)
     programs = updated_programs
 
     partitioner = {}

exir/sym_util.py

Lines changed: 5 additions & 1 deletion

@@ -25,7 +25,11 @@ def eval_expr(symint: Union[int, torch.SymInt]) -> Optional[int]:
     shape_env = node.shape_env
     expr = node.expr
     try:
-        output = shape_env.size_hint(expr)
+        if hasattr(shape_env, "guarding_hint_or_throw"):
+            output = shape_env.guarding_hint_or_throw(expr)
+        else:
+            # size_hint is deprecated, delete this code path.
+            output = shape_env.size_hint(expr)
     except torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode:
         return None
     return int(output)
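
A quick way to check which branch a given PyTorch build will take (a minimal probe, not part of the commit):

import torch
from torch.fx.experimental.symbolic_shapes import ShapeEnv

shape_env = ShapeEnv()
# On nightlies that ship the newer API this prints True and eval_expr uses
# guarding_hint_or_throw; on older builds it prints False and the deprecated
# size_hint path is still exercised.
print(hasattr(shape_env, "guarding_hint_or_throw"))
print(torch.__version__)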

install_requirements.py

Lines changed: 1 addition & 1 deletion

@@ -119,7 +119,7 @@ def install_optional_example_requirements(use_pytorch_nightly):
     print("Installing torch domain libraries")
     DOMAIN_LIBRARIES = [
         (
-            f"torchvision==0.26.0.{NIGHTLY_VERSION}"
+            f"torchvision==0.27.0.{NIGHTLY_VERSION}"
             if use_pytorch_nightly
             else "torchvision"
         ),

runtime/core/portable_type/c10/c10/util/complex_math.h

Lines changed: 35 additions & 0 deletions

@@ -86,6 +86,41 @@ C10_HOST_DEVICE inline c10::complex<T> pow(
 #endif
 }
 
+// Regression in ROCm 7.2. See https://github.com/ROCm/rocm-libraries/pull/3836.
+// Specialized version for complex<float> on AMD GPUs to use FMA-based
+// multiplication
+#if defined(__HIPCC__)
+namespace detail {
+// FMA-aware complex multiplication for float precision on AMD GPUs.
+// This prevents SLP vectorizer from breaking FMA formation, which causes
+// numerical precision loss in complex arithmetic.
+// The issue occurs when vectorizer packs scalar multiplies before backend
+// can form FMA instructions, resulting in double rounding instead of single.
+C10_HOST_DEVICE inline thrust::complex<float> complex_mul_fma(
+    thrust::complex<float> a,
+    thrust::complex<float> b) {
+  // Complex multiplication: (a.r + a.i*i) * (b.r + b.i*i)
+  //   = (a.r*b.r - a.i*b.i) + (a.r*b.i + a.i*b.r)*i
+  // Using __builtin_fmaf ensures FMA at source level:
+  //   real: a.r*b.r + (-(a.i*b.i)) = FMA(a.r, b.r, -(a.i*b.i))
+  //   imag: a.i*b.r + a.r*b.i = FMA(a.r, b.i, a.i*b.r)
+  float real_part = __builtin_fmaf(a.real(), b.real(), -(a.imag() * b.imag()));
+  float imag_part = __builtin_fmaf(a.real(), b.imag(), a.imag() * b.real());
+  return thrust::complex<float>(real_part, imag_part);
+}
+} // namespace detail
+
+template <>
+C10_HOST_DEVICE inline c10::complex<float> pow(
+    const c10::complex<float>& x,
+    const c10::complex<float>& y) {
+  auto log_x = thrust::log(static_cast<thrust::complex<float>>(x));
+  auto y_log_x =
+      detail::complex_mul_fma(static_cast<thrust::complex<float>>(y), log_x);
+  return static_cast<c10::complex<float>>(thrust::exp(y_log_x));
+}
+#endif
+
 template <typename T>
 C10_HOST_DEVICE inline c10::complex<T> pow(
     const c10::complex<T>& x,

runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h

Lines changed: 63 additions & 16 deletions

@@ -325,41 +325,88 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 #define C10_HIP_HOST_DEVICE
 #endif
 
-#if defined(USE_ROCM)
 // C10_WARP_SIZE is only allowed for device code.
-// Host code _must_ use at::cuda::warp_size()
+// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
+// Host or device statically-sized arrays _must_ use either
+// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
+//
 // HIP header used to define warpSize as a constexpr that was either 32 or 64
 // depending on the target device, and then always set it to 64 for host code.
-// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
-// set it to something unreasonable to trigger obvious host code errors.
-
+// For a time, that allowed C10_WARP_SIZE to be defined like so:
+//
+// #ifdef USE_ROCM
+// #define C10_WARP_SIZE warpSize
+// #else
+// #define C10_WARP_SIZE 32
+// #endif
+//
+// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
+// We can now only use warpSize for C10_WARP_SIZE in device code and this is
+// enforced by using __device__ in its definition. In host code where
+// C10_WARP_SIZE was previously used as a compile-time constant, this will now
+// cause a compile-time error.
+//
+// If an array was previously expected to be sized at compile-time using
+// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
+// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
+//
+// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
+// must now use at::cuda::warp_size() for the dynamic runtime query.
+//
+// Unfortunately, C10_WARP_SIZE has been public and available for both host and
+// device since approximately 2019, so forcing it to be device-only would break
+// existing code in the wild.
+#if defined(USE_ROCM)
 namespace at::cuda {
 TORCH_CUDA_CPP_API int warp_size();
 }
-#ifdef __HIPCC__
-static inline int __host__ C10_WARP_SIZE_INTERNAL() {
+#if defined(__HIPCC__)
+static __host__ inline int C10_WARP_SIZE_INTERNAL() {
   return at::cuda::warp_size();
 }
-
-static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
+// NOTE: __device__ C10_WARP_SIZE_INTERNAL
+// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
+// we can use constexpr. This matches prior behavior. We preserve this for
+// backward compatibility instead of forcing old code to use dynamic warpSize
+// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
+// could expose where C10_WARP_SIZE was used incorrectly where the dynamic
+// warpSize is not allowed.
+#if defined(__SPIRV__)
+static __device__ inline int C10_WARP_SIZE_INTERNAL() {
+  return warpSize;
+}
+#else // __SPIRV__
+static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {
 #if defined(__GFX9__)
   return 64;
 #else // __GFX9__
   return 32;
 #endif // __GFX9__
 }
-#else // __HIPCC__
+#endif // __SPIRV__
+#if defined(__SPIRV__)
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 64
+#elif defined(__GFX9__)
+#define C10_WARP_SIZE_LOWER_BOUND 64
+#define C10_WARP_SIZE_UPPER_BOUND 64
+#else
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 32
+#endif
+#else // !__HIPCC__
 static inline int C10_WARP_SIZE_INTERNAL() {
   return at::cuda::warp_size();
 }
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 64
 #endif // __HIPCC__
-
 #define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
-#define C10_WARP_SIZE_STATIC 64
-
-#else // defined(USE_ROCM)
+#else // !USE_ROCM
 #define C10_WARP_SIZE 32
-#endif
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 32
+#endif // USE_ROCM
 
 #if defined(_MSC_VER) && _MSC_VER <= 1900
 #define __func__ __FUNCTION__

@@ -629,7 +676,7 @@ __host__ __device__
 // This macro is used to find older C++ compilers
 // that don't support move optimization for return values.
 
-#if (defined(__GNUC__) && __GNUC__ < 13) || \
+#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \
     (defined(__clang_major__) && __clang_major__ < 13)
 #define C10_RETURN_MOVE_IF_OLD_COMPILER 1
 #else
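
The comment block above pushes host code toward at::cuda::warp_size() for runtime queries and the new *_BOUND macros for static sizing. From Python, the analogous runtime query is the warp_size field on the device properties; a minimal sketch, only meaningful on a CUDA or ROCm build:

import torch

# Runtime warp-size query from Python: typically 32 on NVIDIA, 64 on GFX9-class
# AMD parts, matching the C10_WARP_SIZE_INTERNAL values above.
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(props.name, props.warp_size)
else:
    print("no CUDA/ROCm device visible")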

runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h

Lines changed: 7 additions & 6 deletions

@@ -12,7 +12,7 @@
 #include <iosfwd>
 #include <ostream>
 
-#if defined(__CUDACC__) && !defined(USE_ROCM)
+#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
 #include <cuda_bf16.h>
 #endif
 

@@ -46,7 +46,7 @@ struct alignas(2) BFloat16 {
   /* implicit */ inline C10_HOST_DEVICE BFloat16(float value);
   inline C10_HOST_DEVICE operator float() const;
 
-#if defined(__CUDACC__) && !defined(USE_ROCM)
+#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
   inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value);
   explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const;
 #endif

@@ -124,8 +124,9 @@ C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 /// Constructors
 inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
     :
-#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \
-    __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) && \
+    (!defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || \
+     defined(USE_ROCM) && (TORCH_HIP_VERSION >= 702))
       x(__bfloat16_as_ushort(__float2bfloat16(value)))
 #elif defined(__SYCL_DEVICE_ONLY__) && \
     defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)

@@ -139,7 +140,7 @@ inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
 
 /// Implicit conversions
 inline C10_HOST_DEVICE BFloat16::operator float() const {
-#if defined(__CUDACC__) && !defined(USE_ROCM)
+#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
   return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
 #elif defined(__SYCL_DEVICE_ONLY__) && \
     defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)

@@ -149,7 +150,7 @@ inline C10_HOST_DEVICE BFloat16::operator float() const {
 #endif
 }
 
-#if defined(__CUDACC__) && !defined(USE_ROCM)
+#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
 inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) {
   x = *reinterpret_cast<const unsigned short*>(&value);
 }
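
The guarded conversions above implement the float <-> bfloat16 round trip, which keeps float32's exponent range but only 7 explicit mantissa bits. A quick way to see that truncation from Python, independent of the CUDA/HIP intrinsics being gated here (a minimal sketch):

import torch

x = torch.tensor(1.0 / 3.0, dtype=torch.float32)
bf = x.to(torch.bfloat16)
# The round-trip value differs from the original in the low mantissa bits.
print(f"float32:  {x.item():.10f}")
print(f"bfloat16: {bf.to(torch.float32).item():.10f}")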
