
Commit 76d7f72

Update on "[ET Device Support] DeviceAllocator interface and DeviceAllocatorRegistry"
This diff introduces the `DeviceAllocator` abstract interface and `DeviceAllocatorRegistry` for device-specific memory allocation. This is a foundational abstraction that lets the runtime dispatch memory operations to the appropriate non-CPU device backend (CUDA, etc.).

**DeviceAllocator interface provides:**

- `init_buffer()` - Initialize memory buffer pools for memory-planned tensors
- `get_offset_address()` - Get a pointer to an offset within a pre-allocated buffer
- `allocate()` / `deallocate()` - Dynamic device memory allocation
- `copy_host_to_device()` / `copy_device_to_host()` - Data transfer between host and device
- `device_type()` - Returns the device type this allocator handles

**DeviceAllocatorRegistry provides:**

- Singleton registry mapping DeviceType → DeviceAllocator
- `register_allocator()` / `get_allocator()` methods
- Fixed-size array indexed by device type (no dynamic allocation, embedded-friendly)

**Design notes:**

- The registry stores raw, non-owning pointers - allocators are expected to be singletons with static lifetime
- Follows ExecuTorch's embedded-first philosophy (no `std::unique_ptr`, no heap allocation in the registry)
- Convenience free functions `register_device_allocator()` and `get_device_allocator()` for ease of use

Differential Revision: [D93635656](https://our.internmc.facebook.com/intern/diff/D93635656/)

[ghstack-poisoned]
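To make the shape of the new API concrete, here is a minimal C++ sketch of the interface and registry described above. The method signatures, parameter names, and `DeviceType` enum values are assumptions inferred from this commit message, not the actual ExecuTorch headers.

```cpp
// Sketch only: signatures and DeviceType values are assumptions, not the
// real ExecuTorch headers.
#include <cstddef>
#include <cstdint>

namespace executorch::runtime {

enum class DeviceType : uint8_t { CPU = 0, CUDA = 1, /* ... */ MAX = 8 };

// Abstract interface; a concrete backend (e.g. a CUDA allocator) overrides
// these to route memory operations to its device.
class DeviceAllocator {
 public:
  virtual ~DeviceAllocator() = default;
  // Initialize a buffer pool for memory-planned tensors.
  virtual void init_buffer(size_t pool_index, size_t size) = 0;
  // Pointer to an offset within a pre-allocated pool buffer.
  virtual void* get_offset_address(size_t pool_index, size_t offset) = 0;
  // Dynamic device memory allocation.
  virtual void* allocate(size_t size) = 0;
  virtual void deallocate(void* ptr) = 0;
  // Host <-> device data transfer.
  virtual void copy_host_to_device(void* dst, const void* src, size_t n) = 0;
  virtual void copy_device_to_host(void* dst, const void* src, size_t n) = 0;
  // The device type this allocator handles.
  virtual DeviceType device_type() const = 0;
};

// Singleton registry: a fixed-size array indexed by device type, holding
// raw non-owning pointers -- no heap allocation, embedded-friendly.
class DeviceAllocatorRegistry {
 public:
  static DeviceAllocatorRegistry& instance() {
    static DeviceAllocatorRegistry registry;
    return registry;
  }
  void register_allocator(DeviceType type, DeviceAllocator* allocator) {
    allocators_[static_cast<size_t>(type)] = allocator;
  }
  DeviceAllocator* get_allocator(DeviceType type) {
    return allocators_[static_cast<size_t>(type)];
  }

 private:
  DeviceAllocator* allocators_[static_cast<size_t>(DeviceType::MAX)] = {};
};

// Convenience free functions.
inline void register_device_allocator(DeviceType t, DeviceAllocator* a) {
  DeviceAllocatorRegistry::instance().register_allocator(t, a);
}
inline DeviceAllocator* get_device_allocator(DeviceType t) {
  return DeviceAllocatorRegistry::instance().get_allocator(t);
}

} // namespace executorch::runtime
```

Under this sketch, a backend would define its allocator with static lifetime and call `register_device_allocator(DeviceType::CUDA, &cuda_allocator)` once at startup; the runtime then fetches it via `get_device_allocator()` whenever it needs to allocate or move device memory.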
2 parents 8127b3b + 531dfb7

1,056 files changed: 54,549 additions & 10,183 deletions


.ci/docker/build.sh

Lines changed: 9 additions & 0 deletions
@@ -40,6 +40,15 @@ case "${IMAGE_NAME}" in
     LINTRUNNER=""
     GCC_VERSION=11
     ;;
+  executorch-ubuntu-22.04-gcc11-aarch64-android)
+    LINTRUNNER=""
+    GCC_VERSION=11
+    ANDROID_NDK_VERSION=r28c
+    ;;
+  executorch-ubuntu-22.04-gcc11-aarch64-arm-sdk)
+    ARM_SDK=yes
+    GCC_VERSION=11
+    ;;
   executorch-ubuntu-22.04-linter)
     LINTRUNNER=yes
     CLANG_VERSION=12
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-5bf1aeb587e9b1f3572b0bd60265c5dafd007b73
+a9592258daacad7423fd5f39aaa59c6e36471520

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-7a79b41e29a790ebb4b530eb98a89381e2d7de29
+659af3c353e49b35c191cdd2dba3b3c79d0e6822

.ci/docker/common/install_android.sh

Lines changed: 12 additions & 3 deletions
@@ -40,8 +40,14 @@ install_ndk() {
   rm -rf "${NDK_INSTALLATION_DIR}" && mkdir -p "${NDK_INSTALLATION_DIR}"

   pushd /tmp
-  # The NDK installation is cached on ossci-android S3 bucket
-  curl -Os --retry 3 "https://ossci-android.s3.amazonaws.com/android-ndk-${ANDROID_NDK_VERSION}-linux.zip"
+  ARCH=$(uname -m)
+  if [ "${ARCH}" = "aarch64" ]; then
+    # aarch64 NDK is not cached on S3, download from Google directly
+    curl -Os --retry 3 "https://dl.google.com/android/repository/android-ndk-${ANDROID_NDK_VERSION}-linux.zip"
+  else
+    # The NDK installation is cached on ossci-android S3 bucket
+    curl -Os --retry 3 "https://ossci-android.s3.amazonaws.com/android-ndk-${ANDROID_NDK_VERSION}-linux.zip"
+  fi
   unzip -qo "android-ndk-${ANDROID_NDK_VERSION}-linux.zip"

   # Print the content for manual verification
@@ -73,7 +79,10 @@ install_sdk() {
   yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "build-tools;35.0.0"
   # And some more tools for future emulator tests
   yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "platform-tools"
-  yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "tools"
+  # The 'tools' package (emulator) is not available on aarch64
+  if [ "$(uname -m)" != "aarch64" ]; then
+    yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "tools"
+  fi
 }

 install_prerequiresites

.ci/docker/requirements-ci.txt

Lines changed: 1 addition & 1 deletion
@@ -30,6 +30,6 @@ sphinx-reredirects==0.1.4
 matplotlib>=3.9.4
 sphinx-copybutton==0.5.2
 # PyTorch Theme
-pytorch_sphinx_theme2==0.2.0
+pytorch_sphinx_theme2==0.4.4
 # script unit test requirements
 yaspin==3.1.0

.ci/scripts/build-qnn-sdk.sh

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ build_qnn_backend() {
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"

   parallelism=$(( $(nproc) - 1 ))
-  bash backends/qualcomm/scripts/build.sh --skip_linux_android --skip_linux_embedded --job_number ${parallelism} --release
+  bash backends/qualcomm/scripts/build.sh --skip_linux_android --job_number ${parallelism} --release
 }

 set_up_aot() {

.ci/scripts/export_model_artifact.sh

Lines changed: 117 additions & 11 deletions
@@ -9,7 +9,7 @@

 show_help() {
     cat << EOF
-Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir]
+Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir] [mode]

 Export a HuggingFace model to CUDA/Metal/XNNPACK format with optional quantization.

@@ -22,6 +22,7 @@ Arguments:
               - mistralai/Voxtral-Mini-4B-Realtime-2602
               - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
               - google/gemma-3-4b-it
+              - nvidia/diar_streaming_sortformer_4spk-v2
               - nvidia/parakeet-tdt

   quant_name  Quantization type (optional, default: non-quantized)
@@ -34,13 +35,23 @@ Arguments:

   output_dir  Output directory for artifacts (optional, default: current directory)

+  mode        Export mode (optional, default: vr-streaming)
+              Supported modes:
+              - vr-streaming: Voxtral Realtime streaming mode
+              - vr-offline: Voxtral Realtime offline mode
+
 Examples:
   export_model_artifact.sh metal "openai/whisper-small"
   export_model_artifact.sh metal "nvidia/parakeet-tdt" "quantized-int4-metal"
+  export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-int4-metal"
+  export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
   export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
+  export_model_artifact.sh cuda-windows "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./output"
   export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
   export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
   export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
+  export_model_artifact.sh xnnpack "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-8da4w" "./output"
+  export_model_artifact.sh xnnpack "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "./output" "vr-offline"
 EOF
 }

@@ -61,6 +72,26 @@ DEVICE="$1"
 HF_MODEL="$2"
 QUANT_NAME="${3:-non-quantized}"
 OUTPUT_DIR="${4:-.}"
+MODE="${5:-}"
+
+# Validate mode if specified
+if [ -n "$MODE" ]; then
+  case "$MODE" in
+    vr-streaming|vr-offline)
+      # Voxtral Realtime modes require Voxtral Realtime model
+      if [ "$HF_MODEL" != "mistralai/Voxtral-Mini-4B-Realtime-2602" ]; then
+        echo "Error: Mode '$MODE' can only be used with Voxtral Realtime model"
+        echo "Provided model: $HF_MODEL"
+        exit 1
+      fi
+      ;;
+    *)
+      echo "Error: Unsupported mode '$MODE'"
+      echo "Supported modes: vr-streaming, vr-offline"
+      exit 1
+      ;;
+  esac
+fi

 case "$DEVICE" in
   cuda)
@@ -112,6 +143,14 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
+  Qwen/Qwen3-0.6B)
+    MODEL_NAME="qwen3"
+    TASK="text-generation"
+    MAX_SEQ_LEN="64"
+    EXTRA_PIP=""
+    PREPROCESSOR_FEATURE_SIZE=""
+    PREPROCESSOR_OUTPUT=""
+    ;;
   nvidia/parakeet-tdt)
     MODEL_NAME="parakeet"
     TASK=""
@@ -120,6 +159,14 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
+  nvidia/diar_streaming_sortformer_4spk-v2)
+    MODEL_NAME="sortformer"
+    TASK=""
+    MAX_SEQ_LEN=""
+    EXTRA_PIP=""
+    PREPROCESSOR_FEATURE_SIZE=""
+    PREPROCESSOR_OUTPUT=""
+    ;;
   mistralai/Voxtral-Mini-4B-Realtime-2602)
     MODEL_NAME="voxtral_realtime"
     TASK=""
@@ -130,7 +177,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt"
     exit 1
     ;;
 esac
@@ -210,7 +257,43 @@ if [ "$MODEL_NAME" = "parakeet" ]; then
   exit 0
 fi

-# Voxtral Realtime uses a custom export script (streaming mode)
+# Sortformer uses a custom export script
+if [ "$MODEL_NAME" = "sortformer" ]; then
+  if [ "$QUANT_NAME" != "non-quantized" ]; then
+    echo "Error: Sortformer currently supports only non-quantized export"
+    exit 1
+  fi
+
+  pip install -r examples/models/sortformer/install_requirements.txt
+
+  SORTFORMER_BACKEND="$DEVICE"
+  if [ "$DEVICE" = "cuda-windows" ]; then
+    SORTFORMER_BACKEND="cuda-windows"
+  elif [ "$DEVICE" = "cuda" ]; then
+    SORTFORMER_BACKEND="cuda"
+  elif [ "$DEVICE" = "xnnpack" ]; then
+    SORTFORMER_BACKEND="xnnpack"
+  else
+    SORTFORMER_BACKEND="portable"
+  fi
+
+  python -m executorch.examples.models.sortformer.export_sortformer \
+    --hf-model "${HF_MODEL}" \
+    --backend "${SORTFORMER_BACKEND}" \
+    --output-dir "${OUTPUT_DIR}"
+
+  test -f "${OUTPUT_DIR}/sortformer.pte"
+  mv "${OUTPUT_DIR}/sortformer.pte" "${OUTPUT_DIR}/model.pte"
+  # CUDA saves named data to separate .ptd file, XNNPACK/portable do not.
+  if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
+    test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
+  fi
+  ls -al "${OUTPUT_DIR}"
+  echo "::endgroup::"
+  exit 0
+fi
+
+# Voxtral Realtime uses a custom export script
 if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   pip install safetensors huggingface_hub

@@ -220,25 +303,48 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then

   # Per-component quantization flags
   VR_QUANT_ARGS=""
+  VR_DTYPE_ARGS=""
   if [ "$QUANT_NAME" = "quantized-8da4w" ]; then
     VR_QUANT_ARGS="--qlinear-encoder 8da4w --qlinear 8da4w --qlinear-group-size 32 --qembedding 8w"
+  elif [ "$QUANT_NAME" = "quantized-int4-metal" ]; then
+    VR_QUANT_ARGS="--qlinear-encoder fpa4w --qlinear fpa4w"
+  elif [ "$QUANT_NAME" = "quantized-int4-tile-packed" ]; then
+    VR_QUANT_ARGS="--qlinear-encoder 4w --qlinear-encoder-packing-format tile_packed_to_4d --qlinear 4w --qlinear-packing-format tile_packed_to_4d --qembedding 8w"
+    VR_DTYPE_ARGS="--dtype bf16"
+  fi
+
+  # Determine streaming mode based on MODE parameter
+  USE_STREAMING="true"
+  if [ "$MODE" = "vr-offline" ]; then
+    USE_STREAMING="false"
+  fi
+
+  # Configure export and preprocessor based on streaming mode
+  STREAMING_ARG=""
+  PREPROCESSOR_ARGS="--feature_size 128 --output_file ${OUTPUT_DIR}/preprocessor.pte"
+  if [ "$USE_STREAMING" = "true" ]; then
+    STREAMING_ARG="--streaming"
+    PREPROCESSOR_ARGS="$PREPROCESSOR_ARGS --streaming"
+  else
+    PREPROCESSOR_ARGS="$PREPROCESSOR_ARGS --stack_output --max_audio_len 300"
   fi

   python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
     --model-path "$LOCAL_MODEL_DIR" \
-    --backend xnnpack \
-    --streaming \
+    --backend "$DEVICE" \
+    ${STREAMING_ARG} \
     --output-dir "${OUTPUT_DIR}" \
-    ${VR_QUANT_ARGS}
+    ${VR_QUANT_ARGS} \
+    ${VR_DTYPE_ARGS}

-  # Export streaming preprocessor (no chunk padding)
-  python -m executorch.extension.audio.mel_spectrogram \
-    --feature_size 128 \
-    --streaming \
-    --output_file "${OUTPUT_DIR}/preprocessor.pte"
+  # Export preprocessor
+  python -m executorch.extension.audio.mel_spectrogram ${PREPROCESSOR_ARGS}

   test -f "${OUTPUT_DIR}/model.pte"
   test -f "${OUTPUT_DIR}/preprocessor.pte"
+  if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
+    test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
+  fi
   # Copy tokenizer from downloaded model weights
   cp "$LOCAL_MODEL_DIR/tekken.json" "${OUTPUT_DIR}/tekken.json"
   ls -al "${OUTPUT_DIR}"

.ci/scripts/test_backend.sh

Lines changed: 4 additions & 1 deletion
@@ -85,7 +85,10 @@ else
 fi
 CMAKE_ARGS="$EXTRA_BUILD_ARGS" ${CONDA_RUN_CMD} $SETUP_SCRIPT --build-tool cmake --build-mode Release --editable true

+GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts"
+export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}"
+
 EXIT_CODE=0
-${CONDA_RUN_CMD} pytest -c /dev/nul -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
+${CONDA_RUN_CMD} pytest -c /dev/null -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
 # Generate markdown summary.
 ${CONDA_RUN_CMD} python -m executorch.backends.test.suite.generate_markdown_summary_json "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE

.ci/scripts/test_huggingface_optimum_model.py

Lines changed: 43 additions & 9 deletions
@@ -11,7 +11,6 @@
 from datasets import load_dataset

 from optimum.executorch import (
-    ExecuTorchModelForCausalLM,
     ExecuTorchModelForImageClassification,
     ExecuTorchModelForMaskedLM,
     ExecuTorchModelForSeq2SeqLM,
@@ -143,27 +142,62 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only
             "--qembedding",
             "8w",
         ]
+    elif recipe == "cuda":
+        command += [
+            "--dtype",
+            "bfloat16",
+            "--device",
+            "cuda",
+        ]
+        if quantize:
+            command += [
+                "--qlinear",
+                "4w",
+                "--qlinear_packing_format",
+                "tile_packed_to_4d",
+                "--qembedding",
+                "8w",
+            ]
     else:
         assert (
             not quantize
-        ), "Quantization is only supported for XnnPack and CoreML recipes at the moment."
+        ), "Quantization is only supported for XnnPack, CoreML, and CUDA recipes at the moment."

     if not run_only:
         cli_export(command, model_dir)

+    if recipe == "cuda":
+        model_path = Path(model_dir) / "model.pte"
+        cuda_blob_path = Path(model_dir) / "aoti_cuda_blob.ptd"
+        assert model_path.exists(), f"Main model file not found: {model_path}"
+        assert cuda_blob_path.exists(), f"CUDA blob not found: {cuda_blob_path}"
+
     tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.save_pretrained(model_dir)
-    model = ExecuTorchModelForCausalLM.from_pretrained(model_dir)
-    generated_text = model.text_generation(
-        tokenizer=tokenizer,
-        prompt="Simply put, the theory of relativity states that",
-        max_seq_len=64,
+    saved_files = tokenizer.save_pretrained(model_dir)
+    tokenizer_path = get_tokenizer_path(model_dir, saved_files)
+
+    from executorch.extension.llm.runner import GenerationConfig, TextLLMRunner
+
+    if recipe == "cuda":
+        runner = TextLLMRunner(
+            f"{model_dir}/model.pte",
+            tokenizer_path,
+            f"{model_dir}/aoti_cuda_blob.ptd",
+        )
+    else:
+        runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
+    tokens = []
+    runner.generate(
+        "Simply put, the theory of relativity states that",
+        GenerationConfig(seq_len=64, temperature=0, echo=True),
+        token_callback=lambda t: tokens.append(t),
     )
+    generated_text = "".join(tokens)
     print(f"\nGenerated text:\n\t{generated_text}")
     generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids

     # Free memory before loading eager for quality check
-    del model
+    del runner
     del tokenizer
     gc.collect()
.ci/scripts/test_lora.sh

Lines changed: 2 additions & 1 deletion
@@ -12,7 +12,8 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 cmake_install_executorch_libraries() {
   echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
   rm -rf cmake-out
-  cmake --workflow llm-release
+  cmake --preset llm-release -DEXECUTORCH_ENABLE_LOGGING=ON
+  cmake --build --preset llm-release-install
 }

 cmake_build_llama_runner() {