Roll torch nightly forward from 04-08 to 04-09

mergennachin · mergennachin · commit 7a8f892456ce · 2026-04-24T17:31:09.000-04:00
The 04-08 nightly is broken for iOS builds: vec128_convert.h's VecConvert
specializations call convert_float_bfloat16 / convert_float_half, but
those functions are gated behind !defined(C10_MOBILE) in the _neon.h
headers. iOS defines C10_MOBILE, so the symbols are undeclared and the
compile fails in kernels/portable/cpu/op_where.cpp. The offending
upstream change (#177009) was reverted on 04-09, so move the pin one
day forward. Also refresh the grafted copy of
runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h from the
04-09 tip via .github/scripts/update_pytorch_pin.py.

Authored with Claude.
diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-e926f3c03a2188c24e2e15b7faebe24287ef6e93
+358117c166b75167a09bca81ac9925940feda339
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
@@ -260,7 +260,7 @@ if [ "$AUDIO_URL" != "" ]; then
 elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   conda install -y -c conda-forge "ffmpeg<8"
   pip install datasets soundfile
-  pip install torchcodec==0.12.0.dev20260408 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+  pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
   python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
 fi
 
diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh
@@ -8,7 +8,7 @@
 set -x
 
 sudo apt install ffmpeg -y
-pip install torchcodec==0.12.0.dev20260408 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 pip install moshi==0.2.11
 pip install bitsandbytes soundfile einops
 # Run llama2/install requirements for torchao deps
diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
@@ -325,41 +325,88 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 #define C10_HIP_HOST_DEVICE
 #endif
 
-#if defined(USE_ROCM)
 // C10_WARP_SIZE is only allowed for device code.
-// Host code _must_ use at::cuda::warp_size()
+// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
+// Host or device statically-sized arrays _must_ use either
+// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
+//
 // HIP header used to define warpSize as a constexpr that was either 32 or 64
 // depending on the target device, and then always set it to 64 for host code.
-// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
-// set it to something unreasonable to trigger obvious host code errors.
-
+// For a time, that allowed C10_WARP_SIZE to be defined like so:
+//
+// #ifdef USE_ROCM
+// #define C10_WARP_SIZE warpSize
+// #else
+// #define C10_WARP_SIZE 32
+// #endif
+//
+// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
+// We can now only use warpSize for C10_WARP_SIZE in device code and this is
+// enforced by using __device__ in its definition.  In host code where
+// C10_WARP_SIZE was previously used as a compile-time constant, this will now
+// cause a compile-time error.
+//
+// If an array was previously expected to be sized at compile-time using
+// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
+// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
+//
+// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
+// must now use at::cuda::warp_size() for the dynamic runtime query.
+//
+// Unfortunately, C10_WARP_SIZE has been public and available for both host and
+// device since approximately 2019, so forcing it to be device-only would break
+// existing code in the wild.
+#if defined(USE_ROCM)
 namespace at::cuda {
 TORCH_CUDA_CPP_API int warp_size();
 }
-#ifdef __HIPCC__
-static inline int __host__ C10_WARP_SIZE_INTERNAL() {
+#if defined(__HIPCC__)
+static __host__ inline int C10_WARP_SIZE_INTERNAL() {
   return at::cuda::warp_size();
 }
-
-static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
+// NOTE: __device__ C10_WARP_SIZE_INTERNAL
+// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
+// we can use constexpr. This matches prior behavior. We preserve this for
+// backward compatibility instead of forcing old code to use dynamic warpSize
+// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
+// could expose where C10_WARP_SIZE was used incorrectly where the dynamic
+// warpSize is not allowed.
+#if defined(__SPIRV__)
+static __device__ inline int C10_WARP_SIZE_INTERNAL() {
+  return warpSize;
+}
+#else // __SPIRV__
+static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {
 #if defined(__GFX9__)
   return 64;
 #else // __GFX9__
   return 32;
 #endif // __GFX9__
 }
-#else // __HIPCC__
+#endif // __SPIRV__
+#if defined(__SPIRV__)
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 64
+#elif defined(__GFX9__)
+#define C10_WARP_SIZE_LOWER_BOUND 64
+#define C10_WARP_SIZE_UPPER_BOUND 64
+#else
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 32
+#endif
+#else // !__HIPCC__
 static inline int C10_WARP_SIZE_INTERNAL() {
   return at::cuda::warp_size();
 }
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 64
 #endif // __HIPCC__
-
 #define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
-#define C10_WARP_SIZE_STATIC 64
-
-#else // defined(USE_ROCM)
+#else // !USE_ROCM
 #define C10_WARP_SIZE 32
-#endif
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 32
+#endif // USE_ROCM
 
 #if defined(_MSC_VER) && _MSC_VER <= 1900
 #define __func__ __FUNCTION__
diff --git a/torch_pin.py b/torch_pin.py
@@ -1,2 +1,2 @@
 TORCH_VERSION = "2.12.0"
-NIGHTLY_VERSION = "dev20260408"
+NIGHTLY_VERSION = "dev20260409"

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-e926f3c03a2188c24e2e15b7faebe24287ef6e93`
	`1`	`+358117c166b75167a09bca81ac9925940feda339`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`TORCH_VERSION = "2.12.0"`
`2`		`-NIGHTLY_VERSION = "dev20260408"`
	`2`	`+NIGHTLY_VERSION = "dev20260409"`