Skip to content

Commit 8ea7387

Browse files
authored
Merge branch 'main' into baris_mletorch-1945-Part2_2_vgf_integration
2 parents 1020f0f + 3a9230d commit 8ea7387

231 files changed

Lines changed: 15898 additions & 1382 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.ci/scripts/test_cortex_m_e2e.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@ MODEL=$1
1717
script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
1818
et_root_dir=$(realpath "${script_dir}/../..")
1919

20-
# Quantization is the default for the cortex-m55+int8 target; run.sh's
20+
# Quantization is the default for the cortex-m55 target; run.sh's
2121
# arg parser only recognizes --no_quantize, so we omit any explicit flag.
2222
bash "${et_root_dir}/examples/arm/run.sh" \
2323
--model_name="${MODEL}" \
24-
--target=cortex-m55+int8 \
24+
--target=cortex-m55 \
2525
--bundleio

.github/workflows/cuda.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,10 @@ jobs:
148148
# Run Qwen 3.5 MoE tests (quantize roundtrip + TurboQuant KV cache + sampler)
149149
python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py examples/models/qwen3_5_moe/test_sampler.py -v -o "addopts="
150150
151+
# Run Gemma 4 31B tests (quant unit tests + pipeline integration tests)
152+
pip install gguf
153+
python -m pytest examples/models/gemma4_31b/quant/tests/ examples/models/gemma4_31b/tests/ -v -o "addopts="
154+
151155
export-model-cuda-artifact:
152156
name: export-model-cuda-artifact
153157
# Skip this job if the pull request is from a fork (HuggingFace secrets are not available)

.github/workflows/mlx.yml

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -501,12 +501,26 @@ jobs:
501501
name: "gemma3-1b"
502502
use-custom: [false, true]
503503
qconfig: ["4w", "nvfp4"]
504+
runner: ["macos-14-xlarge"]
505+
include:
506+
- model:
507+
id: "google/gemma-4-E2B-it"
508+
name: "gemma4-e2b"
509+
use-custom: true
510+
qconfig: "4w"
511+
runner: "macos-15-xlarge"
512+
- model:
513+
id: "google/gemma-4-E2B-it"
514+
name: "gemma4-e2b"
515+
use-custom: false
516+
qconfig: "4w"
517+
runner: "macos-15-xlarge"
504518
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
505519
secrets: inherit
506520
with:
507521
default-packages: ""
508522
job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}-${{ matrix.qconfig }}
509-
runner: macos-14-xlarge
523+
runner: ${{ matrix.runner }}
510524
python-version: "3.12"
511525
submodules: recursive
512526
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -521,12 +535,16 @@ jobs:
521535
MODEL_NAME="${{ matrix.model.name }}"
522536
USE_CUSTOM="${{ matrix.use-custom }}"
523537
QCONFIG="${{ matrix.qconfig }}"
524-
525538
CUSTOM_ARGS=""
526539
if [ "${USE_CUSTOM}" = "true" ]; then
527540
CUSTOM_ARGS="--use-custom-sdpa --use-custom-kv-cache"
528541
fi
529542
543+
QEMBEDDING_ARGS="--qembedding ${QCONFIG}"
544+
if [ "${MODEL_ID}" = "google/gemma-4-E2B-it" ]; then
545+
QEMBEDDING_ARGS=""
546+
fi
547+
530548
echo "::group::Install ExecuTorch and configure MLX build"
531549
${CONDA_RUN} python install_executorch.py > /dev/null
532550
${CONDA_RUN} cmake --preset mlx-release
@@ -537,6 +555,13 @@ jobs:
537555
${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
538556
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
539557
${CONDA_RUN} pip install transformers "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
558+
if [ "${MODEL_ID}" = "google/gemma-4-E2B-it" ]; then
559+
# Gemma 4 requires a newer Transformers build than the CI-wide
560+
# optimum-executorch pin currently brings in. Keep this pinned to the
561+
# locally validated commit instead of floating on Transformers HEAD.
562+
GEMMA4_TRANSFORMERS_COMMIT=61461a7bcb458db7cf6eeea49678b9ab776a7821
563+
${CONDA_RUN} pip install -U "transformers @ git+https://github.com/huggingface/transformers.git@${GEMMA4_TRANSFORMERS_COMMIT}"
564+
fi
540565
echo "::endgroup::"
541566
542567
${CONDA_RUN} pip list
@@ -546,7 +571,7 @@ jobs:
546571
--model-id "${MODEL_ID}" \
547572
--output /tmp/${MODEL_NAME}.pte \
548573
--qlinear ${QCONFIG} \
549-
--qembedding ${QCONFIG} \
574+
${QEMBEDDING_ARGS} \
550575
${CUSTOM_ARGS}
551576
echo "::endgroup::"
552577

.github/workflows/pull.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1489,11 +1489,11 @@ jobs:
14891489
.ci/scripts/setup-linux.sh --build-tool "cmake"
14901490
14911491
# Custom operator tests
1492-
PYTHON_EXECUTABLE=python bash backends/vulkan/test/custom_ops/build_and_run.sh add
1493-
./cmake-out/backends/vulkan/test/custom_ops/q8csw_linear
1494-
./cmake-out/backends/vulkan/test/custom_ops/q8csw_conv2d
1495-
./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear
1496-
./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row
1492+
PYTHON_EXECUTABLE=python bash backends/vulkan/test/custom_ops/build_and_run.sh test_add
1493+
./cmake-out/backends/vulkan/test/custom_ops/test_q8csw_linear
1494+
./cmake-out/backends/vulkan/test/custom_ops/test_q8csw_conv2d
1495+
./cmake-out/backends/vulkan/test/custom_ops/test_q4gsw_linear
1496+
./cmake-out/backends/vulkan/test/custom_ops/test_choose_qparams_per_row
14971497
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq
14981498
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_clone
14991499
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_binary

.github/workflows/trunk.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,11 @@ jobs:
380380
381381
ARM_TEST=${{ matrix.test_arm_baremetal }}
382382
383+
# Output test report on pytest runs so that GitHub can surface failing tests.
384+
if [[ -n "${RUNNER_TEST_RESULTS_DIR:-}" ]]; then
385+
export PYTEST_ADDOPTS="--junit-xml=${RUNNER_TEST_RESULTS_DIR}/${ARM_TEST}.xml ${PYTEST_ADDOPTS:-}"
386+
fi
387+
383388
# Test test_arm_baremetal.sh with test
384389
backends/arm/test/test_arm_baremetal.sh "${ARM_TEST}"
385390
@@ -415,6 +420,11 @@ jobs:
415420
416421
ARM_TEST=${{ matrix.test_arm_baremetal }}
417422
423+
# Output test report on pytest runs so that GitHub can surface failing tests.
424+
if [[ -n "${RUNNER_TEST_RESULTS_DIR:-}" ]]; then
425+
export PYTEST_ADDOPTS="--junit-xml=${RUNNER_TEST_RESULTS_DIR}/${ARM_TEST}.xml ${PYTEST_ADDOPTS:-}"
426+
fi
427+
418428
backends/arm/test/test_arm_baremetal.sh "${ARM_TEST}"
419429
420430
test-arm-ootb-linux:

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,9 @@ if(NOT EXECUTORCH_ENABLE_PROGRAM_VERIFICATION)
189189
add_definitions(-DET_ENABLE_PROGRAM_VERIFICATION=0)
190190
endif()
191191

192+
# Disable the deprecated constant_buffer path.
193+
add_definitions(-DET_ENABLE_DEPRECATED_CONSTANT_BUFFER=0)
194+
192195
if(EXECUTORCH_ENABLE_EVENT_TRACER)
193196
add_definitions(-DET_EVENT_TRACER_ENABLED)
194197
endif()

Makefile

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
#
9292
# ==============================================================================
9393

94-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda qwen3_5_moe-metal clean help
94+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help
9595

9696
help:
9797
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -126,6 +126,7 @@ help:
126126
@echo " llava-cpu - Build Llava runner with CPU backend"
127127
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
128128
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
129+
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner with CUDA backend"
129130
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
130131
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
131132
@echo " clean - Clean build artifacts"
@@ -425,6 +426,15 @@ qwen3_5_moe-cuda:
425426
@echo "✓ Build complete!"
426427
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
427428

429+
gemma4_31b-cuda:
430+
@echo "==> Building and installing ExecuTorch with CUDA..."
431+
cmake --workflow --preset llm-release-cuda
432+
@echo "==> Building Gemma 4 31B runner with CUDA..."
433+
cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda
434+
@echo ""
435+
@echo "✓ Build complete!"
436+
@echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
437+
428438
qwen3_5_moe-metal:
429439
@echo "==> Building and installing ExecuTorch with Metal..."
430440
cmake --workflow --preset llm-release-metal

Package.swift

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
// https://pytorch.org/executorch/main/using-executorch-ios
1919

2020
import PackageDescription
21+
import Foundation
2122

2223
let debug_suffix = "_debug"
2324
let dependencies_suffix = "_with_dependencies"
@@ -126,6 +127,48 @@ for (key, value) in products {
126127
packageTargets.append(target)
127128
}
128129

130+
// Test fixtures. add_coreml.pte and add_mul_coreml.pte are generated at CI
131+
// time by extension/apple/ExecuTorch/__tests__/resources/generate_coreml_test_models.py
132+
// (invoked by scripts/build_apple_frameworks.sh before `swift test`). They
133+
// are gitignored, so include them in test resources only when present so
134+
// that `swift test` runs on dev machines without CoreML python deps don't
135+
// fail at the SwiftPM resolve stage.
136+
let testResourcesDir = "extension/apple/ExecuTorch/__tests__/resources"
137+
var testResources: [Resource] = [.copy("resources/add.pte")]
138+
if FileManager.default.fileExists(atPath: "\(testResourcesDir)/add_coreml.pte") {
139+
testResources.append(.copy("resources/add_coreml.pte"))
140+
}
141+
if FileManager.default.fileExists(atPath: "\(testResourcesDir)/add_mul_coreml.pte") {
142+
testResources.append(.copy("resources/add_mul_coreml.pte"))
143+
}
144+
145+
// SwiftPM resources must live under the target's path, so the ObjC test
146+
// target uses copies of the canonical resources directory's fixtures. The
147+
// copies themselves are gitignored and (re)created by scripts/build_apple_frameworks.sh.
148+
let objcTestsDir = "extension/apple/ExecuTorch/__tests__/ObjC"
149+
var objcTestResources: [Resource] = []
150+
if FileManager.default.fileExists(atPath: "\(objcTestsDir)/add.pte") {
151+
objcTestResources.append(.copy("add.pte"))
152+
}
153+
if FileManager.default.fileExists(atPath: "\(objcTestsDir)/add_coreml.pte") {
154+
objcTestResources.append(.copy("add_coreml.pte"))
155+
}
156+
if FileManager.default.fileExists(atPath: "\(objcTestsDir)/add_mul_coreml.pte") {
157+
objcTestResources.append(.copy("add_mul_coreml.pte"))
158+
}
159+
160+
let testLinkerSettings: [LinkerSetting] = [
161+
.unsafeFlags([
162+
"-Xlinker", "-force_load",
163+
"-Xlinker", "cmake-out/kernels_optimized.xcframework/macos-arm64/libkernels_optimized_macos.a",
164+
// CoreML backend registers itself with the global delegate registry via a
165+
// static initializer; -force_load ensures that initializer is pulled in so
166+
// the CoreML-delegated test fixtures can actually instantiate the backend.
167+
"-Xlinker", "-force_load",
168+
"-Xlinker", "cmake-out/backend_coreml.xcframework/macos-arm64/libbackend_coreml_macos.a",
169+
])
170+
]
171+
129172
let package = Package(
130173
name: "executorch",
131174
platforms: [
@@ -139,17 +182,24 @@ let package = Package(
139182
dependencies: [
140183
.target(name: "executorch\(debug_suffix)"),
141184
.target(name: "kernels_optimized\(dependencies_suffix)"),
185+
.target(name: "backend_coreml\(dependencies_suffix)"),
142186
],
143187
path: "extension/apple/ExecuTorch/__tests__",
144-
resources: [
145-
.copy("resources/add.pte"),
188+
exclude: ["ObjC", "resources/generate_coreml_test_models.py", "resources/.gitignore"],
189+
resources: testResources,
190+
linkerSettings: testLinkerSettings
191+
),
192+
.testTarget(
193+
name: "objc_tests",
194+
dependencies: [
195+
.target(name: "executorch\(debug_suffix)"),
196+
.target(name: "kernels_optimized\(dependencies_suffix)"),
197+
.target(name: "backend_coreml\(dependencies_suffix)"),
146198
],
147-
linkerSettings: [
148-
.unsafeFlags([
149-
"-Xlinker", "-force_load",
150-
"-Xlinker", "cmake-out/kernels_optimized.xcframework/macos-arm64/libkernels_optimized_macos.a",
151-
])
152-
]
199+
path: "extension/apple/ExecuTorch/__tests__/ObjC",
200+
exclude: [".gitignore"],
201+
resources: objcTestResources,
202+
linkerSettings: testLinkerSettings
153203
)
154204
]
155205
)

backends/aoti/slim/core/slim_tensor.h

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -433,13 +433,19 @@ class SlimTensor {
433433
/**
434434
* Copy data from another tensor to this tensor.
435435
*
436-
* Both tensors must have the same numel and dtype.
437-
* Currently only supports CPU-to-CPU copy (contiguous tensors only).
436+
* Both tensors must have the same numel, sizes and dtype.
438437
*
439438
* @param other The source tensor to copy from
440439
* @return Reference to this tensor
441440
*/
442441
SlimTensor& copy_(const SlimTensor& other) {
442+
ET_CHECK_MSG(
443+
this->dim() == other.dim(),
444+
"copy_: dim of tensors must match (%zu vs %zu)",
445+
this->dim(),
446+
other.dim());
447+
ET_CHECK_MSG(
448+
this->sizes() == other.sizes(), "copy_: sizes of tensors must match");
443449
ET_CHECK_MSG(
444450
this->numel() == other.numel(), "copy_: numel of tensors must match");
445451
ET_CHECK_MSG(this->dtype() == other.dtype(), "copy_: dtype must match");
@@ -463,29 +469,43 @@ class SlimTensor {
463469

464470
std::vector<int64_t> counter(this->dim(), 0);
465471
for (size_t i = 0; i < this->numel(); i++) {
466-
// Compute src offset in elements
467472
int64_t src_offset = 0;
468-
for (size_t d = 0; d < other.dim(); d++) {
469-
src_offset += counter[d] * other.stride(d);
470-
}
471-
472-
// Compute dst offset in elements
473473
int64_t dst_offset = 0;
474474
for (size_t d = 0; d < this->dim(); d++) {
475-
dst_offset += counter[d] * this->stride(d);
475+
int64_t src_term = 0;
476+
int64_t dst_term = 0;
477+
// src_offset = src_offset + counter[d] * other.stride(d)
478+
// dst_offset = dst_offset + counter[d] * this->stride(d)
479+
ET_CHECK_MSG(
480+
!::c10::mul_overflows(counter[d], other.stride(d), &src_term) &&
481+
!::c10::add_overflows(src_offset, src_term, &src_offset) &&
482+
!::c10::mul_overflows(counter[d], this->stride(d), &dst_term) &&
483+
!::c10::add_overflows(dst_offset, dst_term, &dst_offset),
484+
"copy_: offset computation overflow");
476485
}
486+
size_t src_byte_offset = 0;
487+
size_t dst_byte_offset = 0;
488+
// src_byte_offset = src_offset * elem_size
489+
// dst_byte_offset = dst_offset * elem_size
490+
ET_CHECK_MSG(
491+
src_offset >= 0 && dst_offset >= 0 &&
492+
!::c10::mul_overflows(
493+
static_cast<size_t>(src_offset),
494+
elem_size,
495+
&src_byte_offset) &&
496+
!::c10::mul_overflows(
497+
static_cast<size_t>(dst_offset), elem_size, &dst_byte_offset),
498+
"copy_: byte offset overflow");
477499

478500
// Copy elem_size bytes from src to dst
479501
if (this->device().is_cpu() && other.device().is_cpu()) {
480502
std::memcpy(
481-
dst_data + dst_offset * elem_size,
482-
src_data + src_offset * elem_size,
483-
elem_size);
503+
dst_data + dst_byte_offset, src_data + src_byte_offset, elem_size);
484504
} else if (this->device().is_cuda() || other.device().is_cuda()) {
485505
#if defined(CUDA_AVAILABLE)
486506
DeviceTraits<c10::DeviceType::CUDA>::memcpy(
487-
dst_data + dst_offset * elem_size,
488-
src_data + src_offset * elem_size,
507+
dst_data + dst_byte_offset,
508+
src_data + src_byte_offset,
489509
elem_size,
490510
device(), // dst device
491511
other.device() // src device

backends/arm/_passes/arm_pass_manager.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,6 @@
150150
from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
151151
from executorch.backends.arm.common.pipeline_config import (
152152
ArmPassPipelineConfig,
153-
FuseDuplicateUsersConfig,
154153
SoftmaxDecompositionConfig,
155154
)
156155
from executorch.backends.arm.tosa.specification import (
@@ -238,9 +237,6 @@ def configure_skip_passes(
238237
case SoftmaxDecompositionConfig.STABLE:
239238
skip_set.add(DecomposeMaskedFillPass)
240239

241-
if config.fuse_duplicate_users is FuseDuplicateUsersConfig.DISABLED:
242-
skip_set.add(FuseDuplicateUsersPass)
243-
244240
self._skip_pass_types = tuple(skip_set)
245241
skip_names = [skipped_pass.__name__ for skipped_pass in self._skip_pass_types]
246242
logger.debug(f"Passes in skip list: {skip_names}")
@@ -403,9 +399,6 @@ def _tosa_pipeline(
403399
ConvertToClampPass(),
404400
DecomposeTOSAUnsupportedClampPass(),
405401
DecomposeGroupNormPass(),
406-
DecomposeGruPass(),
407-
DecomposeLstmPass(),
408-
DecomposeRnnPass(),
409402
DecomposeLayerNormPass(),
410403
DecomposeVarPass(),
411404
DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec),

0 commit comments

Comments
 (0)