Skip to content

Commit 7a8f892

Browse files
committed
Roll torch nightly forward from 04-08 to 04-09
The 04-08 nightly is broken for iOS builds: vec128_convert.h's VecConvert specializations call convert_float_bfloat16 / convert_float_half, but those functions are gated behind !defined(C10_MOBILE) in the _neon.h headers. iOS defines C10_MOBILE, so the symbols are undeclared and the compile fails in kernels/portable/cpu/op_where.cpp. The offending upstream change (#177009) was reverted on 04-09, so move the pin one day forward. Refreshed the grafted c10/torch/headeronly/macros/Macros.h from the 04-09 tip via .github/scripts/update_pytorch_pin.py. Authored with Claude.
1 parent da3a6f1 commit 7a8f892

5 files changed

Lines changed: 66 additions & 19 deletions

File tree

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
e926f3c03a2188c24e2e15b7faebe24287ef6e93
1+
358117c166b75167a09bca81ac9925940feda339

.ci/scripts/test_model_e2e.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ if [ "$AUDIO_URL" != "" ]; then
260260
elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then
261261
conda install -y -c conda-forge "ffmpeg<8"
262262
pip install datasets soundfile
263-
pip install torchcodec==0.12.0.dev20260408 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
263+
pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
264264
python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
265265
fi
266266

examples/models/moshi/mimi/install_requirements.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
set -x
99

1010
sudo apt install ffmpeg -y
11-
pip install torchcodec==0.12.0.dev20260408 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
11+
pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
1212
pip install moshi==0.2.11
1313
pip install bitsandbytes soundfile einops
1414
# Run llama2/install requirements for torchao deps

runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h

Lines changed: 62 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -325,41 +325,88 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
325325
#define C10_HIP_HOST_DEVICE
326326
#endif
327327

328-
#if defined(USE_ROCM)
329328
// C10_WARP_SIZE is only allowed for device code.
330-
// Host code _must_ use at::cuda::warp_size()
329+
// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
330+
// Host or device statically-sized arrays _must_ use either
331+
// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
332+
//
331333
// HIP header used to define warpSize as a constexpr that was either 32 or 64
332334
// depending on the target device, and then always set it to 64 for host code.
333-
// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
334-
// set it to something unreasonable to trigger obvious host code errors.
335-
335+
// For a time, that allowed C10_WARP_SIZE to be defined like so:
336+
//
337+
// #ifdef USE_ROCM
338+
// #define C10_WARP_SIZE warpSize
339+
// #else
340+
// #define C10_WARP_SIZE 32
341+
// #endif
342+
//
343+
// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
344+
// We can now only use warpSize for C10_WARP_SIZE in device code and this is
345+
// enforced by using __device__ in its definition. In host code where
346+
// C10_WARP_SIZE was previously used as a compile-time constant, this will now
347+
// cause a compile-time error.
348+
//
349+
// If an array was previously expected to be sized at compile-time using
350+
// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
351+
// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
352+
//
353+
// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
354+
// must now use at::cuda::warp_size() for the dynamic runtime query.
355+
//
356+
// Unfortunately, C10_WARP_SIZE has been public and available for both host and
357+
// device since approximately 2019, so forcing it to be device-only would break
358+
// existing code in the wild.
359+
#if defined(USE_ROCM)
336360
namespace at::cuda {
337361
TORCH_CUDA_CPP_API int warp_size();
338362
}
339-
#ifdef __HIPCC__
340-
static inline int __host__ C10_WARP_SIZE_INTERNAL() {
363+
#if defined(__HIPCC__)
364+
static __host__ inline int C10_WARP_SIZE_INTERNAL() {
341365
return at::cuda::warp_size();
342366
}
343-
344-
static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
367+
// NOTE: __device__ C10_WARP_SIZE_INTERNAL
368+
// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
369+
// we can use constexpr. This matches prior behavior. We preserve this for
370+
// backward compatibility instead of forcing old code to use dynamic warpSize
371+
// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
372+
// could expose where C10_WARP_SIZE was used incorrectly where the dynamic
373+
// warpSize is not allowed.
374+
#if defined(__SPIRV__)
375+
static __device__ inline int C10_WARP_SIZE_INTERNAL() {
376+
return warpSize;
377+
}
378+
#else // __SPIRV__
379+
static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {
345380
#if defined(__GFX9__)
346381
return 64;
347382
#else // __GFX9__
348383
return 32;
349384
#endif // __GFX9__
350385
}
351-
#else // __HIPCC__
386+
#endif // __SPIRV__
387+
#if defined(__SPIRV__)
388+
#define C10_WARP_SIZE_LOWER_BOUND 32
389+
#define C10_WARP_SIZE_UPPER_BOUND 64
390+
#elif defined(__GFX9__)
391+
#define C10_WARP_SIZE_LOWER_BOUND 64
392+
#define C10_WARP_SIZE_UPPER_BOUND 64
393+
#else
394+
#define C10_WARP_SIZE_LOWER_BOUND 32
395+
#define C10_WARP_SIZE_UPPER_BOUND 32
396+
#endif
397+
#else // !__HIPCC__
352398
static inline int C10_WARP_SIZE_INTERNAL() {
353399
return at::cuda::warp_size();
354400
}
401+
#define C10_WARP_SIZE_LOWER_BOUND 32
402+
#define C10_WARP_SIZE_UPPER_BOUND 64
355403
#endif // __HIPCC__
356-
357404
#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
358-
#define C10_WARP_SIZE_STATIC 64
359-
360-
#else // defined(USE_ROCM)
405+
#else // !USE_ROCM
361406
#define C10_WARP_SIZE 32
362-
#endif
407+
#define C10_WARP_SIZE_LOWER_BOUND 32
408+
#define C10_WARP_SIZE_UPPER_BOUND 32
409+
#endif // USE_ROCM
363410

364411
#if defined(_MSC_VER) && _MSC_VER <= 1900
365412
#define __func__ __FUNCTION__

torch_pin.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
TORCH_VERSION = "2.12.0"
2-
NIGHTLY_VERSION = "dev20260408"
2+
NIGHTLY_VERSION = "dev20260409"

0 commit comments

Comments
 (0)