Skip to content

Commit 0c0b205

Browse files
author
ssjia
committed
Update on "[ET-VK][ez] Implement helper functions to get fastest moving dim"
Add C++ and GLSL helpers to query the fastest moving dimension (the dimension with stride 1 in buffer layout). This is useful for optimizing memory access patterns in shaders, as iterating along the fastest moving dimension maximizes cache locality. The C++ `fastest_whcn_dim()` method accounts for block-transposed layouts by returning `outer_packed_dim` instead of `packed_dim` when applicable. A corresponding GLSL macro extracts this info from the hashed layout. Differential Revision: [D92061369](https://our.internmc.facebook.com/intern/diff/D92061369/) [ghstack-poisoned]
2 parents 73b4a69 + e029e51 commit 0c0b205

100 files changed

Lines changed: 1902 additions & 778 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.ci/scripts/export_model_artifact.sh

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
77

8-
# Export model to CUDA/Metal format with optional quantization
8+
# Export model to CUDA/Metal/XNNPACK format with optional quantization
99

1010
show_help() {
1111
cat << EOF
1212
Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir]
1313
14-
Export a HuggingFace model to CUDA/Metal format with optional quantization.
14+
Export a HuggingFace model to CUDA/Metal/XNNPACK format with optional quantization.
1515
1616
Arguments:
17-
device cuda or metal (required)
17+
device cuda, metal, or xnnpack (required)
1818
1919
hf_model HuggingFace model ID (required)
2020
Supported models:
@@ -28,6 +28,7 @@ Arguments:
2828
- non-quantized
2929
- quantized-int4-tile-packed
3030
- quantized-int4-weight-only
31+
- quantized-8da4w (XNNPACK only)
3132
3233
output_dir Output directory for artifacts (optional, default: current directory)
3334
@@ -36,6 +37,7 @@ Examples:
3637
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
3738
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
3839
export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
40+
export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
3941
EOF
4042
}
4143

@@ -64,9 +66,11 @@ case "$DEVICE" in
6466
;;
6567
metal)
6668
;;
69+
xnnpack)
70+
;;
6771
*)
6872
echo "Error: Unsupported device '$DEVICE'"
69-
echo "Supported devices: cuda, cuda-windows, metal"
73+
echo "Supported devices: cuda, cuda-windows, metal, xnnpack"
7074
exit 1
7175
;;
7276
esac
@@ -139,9 +143,16 @@ case "$QUANT_NAME" in
139143
fi
140144
EXTRA_ARGS="--qlinear_encoder 4w"
141145
;;
146+
quantized-8da4w)
147+
if [ "$DEVICE" != "xnnpack" ]; then
148+
echo "Error: quantized-8da4w is only supported with xnnpack device"
149+
exit 1
150+
fi
151+
EXTRA_ARGS="--qlinear 8da4w --qlinear_group_size 32 --qlinear_encoder 8da4w --qlinear_encoder_group_size 32"
152+
;;
142153
*)
143154
echo "Error: Unsupported quantization '$QUANT_NAME'"
144-
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only"
155+
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only, quantized-8da4w"
145156
exit 1
146157
;;
147158
esac
@@ -157,10 +168,17 @@ pip list
157168
if [ "$MODEL_NAME" = "parakeet" ]; then
158169
pip install -r examples/models/parakeet/install_requirements.txt
159170

171+
# Set dtype based on backend (XNNPACK uses fp32, CUDA/Metal use bf16)
172+
if [ "$DEVICE" = "xnnpack" ]; then
173+
DTYPE_ARG=""
174+
else
175+
DTYPE_ARG="--dtype bf16"
176+
fi
177+
160178
python -m executorch.examples.models.parakeet.export_parakeet_tdt \
161179
--backend "$DEVICE" \
162180
--output-dir "${OUTPUT_DIR}" \
163-
--dtype bf16 \
181+
${DTYPE_ARG} \
164182
${EXTRA_ARGS}
165183

166184
test -f "${OUTPUT_DIR}/model.pte"

.ci/scripts/test_model_e2e.sh

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
77

8-
# Test CUDA/Metal model end-to-end, need to run .ci/scripts/export_model_artifact.sh first
8+
# Test CUDA/Metal/XNNPACK model end-to-end, need to run .ci/scripts/export_model_artifact.sh first
99

1010
show_help() {
1111
cat << EOF
1212
Usage: test_model_e2e.sh <device> <hf_model> <quant_name> [model_dir]
1313
14-
Build and run end-to-end tests for CUDA/Metal models.
14+
Build and run end-to-end tests for CUDA/Metal/XNNPACK models.
1515
1616
Arguments:
17-
device cuda or metal (required)
17+
device cuda, metal, or xnnpack (required)
1818
1919
hf_model HuggingFace model ID (required)
2020
Supported models:
@@ -28,6 +28,7 @@ Arguments:
2828
- non-quantized
2929
- quantized-int4-tile-packed
3030
- quantized-int4-weight-only
31+
- quantized-8da4w (XNNPACK only)
3132
3233
model_dir Directory containing model artifacts (optional, default: current directory)
3334
Expected files: model.pte, aoti_cuda_blob.ptd (CUDA only)
@@ -37,6 +38,7 @@ Examples:
3738
test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
3839
test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
3940
test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
41+
test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
4042
EOF
4143
}
4244

@@ -174,12 +176,17 @@ echo "::endgroup::"
174176

175177
echo "::group::Build $MODEL_NAME Runner"
176178

177-
if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ]; then
178-
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda' or 'metal'."
179+
if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ] && [ "$DEVICE" != "xnnpack" ]; then
180+
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda', 'metal', or 'xnnpack'."
179181
exit 1
180182
fi
181183

182-
MAKE_TARGET="${RUNNER_PATH}-${DEVICE}"
184+
# Map device to make target (xnnpack uses cpu target which includes XNNPACK)
185+
if [ "$DEVICE" = "xnnpack" ]; then
186+
MAKE_TARGET="${RUNNER_PATH}-cpu"
187+
else
188+
MAKE_TARGET="${RUNNER_PATH}-${DEVICE}"
189+
fi
183190
make "${MAKE_TARGET}"
184191
echo "::endgroup::"
185192

.github/workflows/add-unanswered-to-project.yml

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,16 +43,16 @@ jobs:
4343
"ethansfng", "ThomasJannaud", "nirvanagth", "marcinkwiatkowski", "3l1", "omerjerk", "nitish2112", "yipjustin",
4444
"ejnguyen", "andrewor14", "phaiting", "mgiordy", "LeeOHzzZ", "adicatana", "Polyomino", "ezrilow", "navsud",
4545
"michaelmaitland", "RahulC7", "seyeong-han", "thdusdl1219", "jaejunku", "felixweilbach", "apullin", "trviv", "junluan01",
46-
"YifanShenSZ", "RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat", "azad-meta", "junpi", "pytorchbot", "pytorchmergebot",
47-
"pytorchupdatebot", "facebook-github-bot", "app/dependabot", "Erik-Lundell", "zingo", "AdrianLundell", "oscarandersson8218",
48-
"per", "Sebastian-Larsson", "SaoirseARM", "robell", "mansnils", "martinlsm", "freddan80", "YufengShi-dudu", "tom-arm",
49-
"perheld", "Jerry-Ge", "gggekov", "fumchin", "wwwind", "benkli01", "Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed",
50-
"agrima1304", "emmakujala", "annietllnd", "MatthiasHertel80", "AlexTawseArm", "jmahbs", "morgolock", "Christoffer-JL",
51-
"ArmRyan", "xingguo01", "tgonzalezorlandoarm", "chizkiyahu", "sarah-blades", "haowhsu-quic", "shewu-quic", "winskuo-quic",
52-
"chunit-quic", "DannyYuyang-quic", "chuntl", "thchenqti", "jethroqti", "chenweng-quic", "cymbalrush", "DenisVieriu97",
53-
"billmguo", "StrycekSimon", "jirioc", "robert-kalmar", "skywall", "MartinPavella", "roman-janik-nxp", "novak-vaclav",
54-
"neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio", "ynimmaga", "daniil-lyakhov", "emmanuel-ferdman", "cavusmustafa",
55-
"anzr299", "Jiseong-oh", "alexdean08",
46+
"mvartani-meta", "abeakkas", "elpdumont", "corporateshark", "YifanShenSZ", "RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat",
47+
"azad-meta", "junpi", "pytorchbot", "pytorchmergebot", "pytorchupdatebot", "facebook-github-bot", "app/dependabot",
48+
"Erik-Lundell", "zingo", "AdrianLundell", "oscarandersson8218", "per", "Sebastian-Larsson", "SaoirseARM", "robell", "mansnils",
49+
"martinlsm", "freddan80", "YufengShi-dudu", "tom-arm", "perheld", "Jerry-Ge", "gggekov", "fumchin", "wwwind", "benkli01",
50+
"Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed", "agrima1304", "emmakujala", "annietllnd", "MatthiasHertel80",
51+
"AlexTawseArm", "jmahbs", "morgolock", "Christoffer-JL", "ArmRyan", "xingguo01", "tgonzalezorlandoarm", "chizkiyahu",
52+
"sarah-blades", "haowhsu-quic", "shewu-quic", "winskuo-quic", "chunit-quic", "DannyYuyang-quic", "chuntl", "thchenqti",
53+
"jethroqti", "chenweng-quic", "cymbalrush", "DenisVieriu97", "billmguo", "StrycekSimon", "jirioc", "robert-kalmar",
54+
"skywall", "MartinPavella", "roman-janik-nxp", "novak-vaclav", "neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio",
55+
"ynimmaga", "daniil-lyakhov", "emmanuel-ferdman", "cavusmustafa", "anzr299", "Jiseong-oh", "alexdean08",
5656
// explicitly include the dependabot bot login seen in PRs
5757
"dependabot[bot]"
5858
]);
@@ -139,6 +139,11 @@ jobs:
139139
} else {
140140
console.log(`Skipping issue #${issue.number} by ${issue.user && issue.user.login}`);
141141
}
142+
if (!issue.pull_request && !isBotOrExcluded(issue.user) && !(await isMemberOfExcludedOrg(issue.user))) {
143+
await addItem(issue.node_id, 'issue', issue.number);
144+
} else {
145+
console.log(`Skipping issue #${issue.number} by ${issue.user && issue.user.login}`);
146+
}
142147
}
143148
144149
// Add open, non-draft PRs (regardless of review state), exclude by author/bots
@@ -156,6 +161,11 @@ jobs:
156161
} else {
157162
console.log(`Skipping PR #${pr.number} by ${pr.user && pr.user.login}`);
158163
}
164+
if (!pr.draft && !isBotOrExcluded(pr.user) && !(await isMemberOfExcludedOrg(pr.user))) {
165+
await addItem(pr.node_id, 'pr', pr.number);
166+
} else {
167+
console.log(`Skipping PR #${pr.number} by ${pr.user && pr.user.login}`);
168+
}
159169
}
160170
} catch (error) {
161171
core.setFailed(`Workflow failed: ${error.message}`);

.github/workflows/pull.yml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,39 @@ jobs:
163163
# Build and test ExecuTorch
164164
PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}"
165165
166+
test-parakeet-xnnpack-linux:
167+
name: test-parakeet-xnnpack-linux
168+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
169+
permissions:
170+
id-token: write
171+
contents: read
172+
strategy:
173+
fail-fast: false
174+
with:
175+
runner: linux.4xlarge.memory
176+
docker-image: ci-image:executorch-ubuntu-22.04-clang12
177+
submodules: 'recursive'
178+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
179+
timeout: 120
180+
script: |
181+
set -eux
182+
183+
# The generic Linux job chooses to use base env, not the one setup by the image
184+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
185+
conda activate "${CONDA_ENV}"
186+
187+
echo "::group::Setup ExecuTorch"
188+
./install_executorch.sh
189+
echo "::endgroup::"
190+
191+
echo "::group::Export Parakeet with XNNPACK"
192+
bash .ci/scripts/export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./parakeet_output"
193+
echo "::endgroup::"
194+
195+
echo "::group::Test Parakeet with XNNPACK"
196+
bash .ci/scripts/test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./parakeet_output"
197+
echo "::endgroup::"
198+
166199
test-llama-runner-linux:
167200
# Test Both linux x86 and linux aarch64
168201
name: test-llama-runner-linux

.lintrunner.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,15 @@ include_patterns = [
510510
'backends/arm/vgf/**/*.py',
511511
'backends/arm/tosa/**/*.py',
512512
'backends/arm/ethosu/**/*.py',
513+
'backends/arm/operators/**/*.py',
514+
'backends/arm/common/**/*.py',
515+
'backends/arm/util/**/*.py',
516+
'backends/arm/runtime/**/*.py',
517+
'backends/arm/quantizer/**/*.py',
518+
'backends/arm/debug/**/*.py',
519+
'backends/arm/scripts/**/*.py',
520+
'backends/arm/operator_support/**/*.py',
521+
'backends/arm/*.py',
513522
]
514523
exclude_patterns = ['third-party/**', '**/third-party/**']
515524
command = [

CLAUDE.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,37 @@ Refer to the repo/framework/runtime "executorch" (in lower cases) or "ExecuTorch
44
camel cases), not "ExecutorTorch". With limited code or comment length, maybe refer
55
to the framework "ET" but consider it as very unofficial and not recommended.
66

7+
# Install
8+
9+
## Python
10+
11+
If the user is mostly importing `executorch` module and experimenting with Ahead-Of-Time
12+
export flow, installation means installing `executorch` python package.
13+
14+
Python virtual environment or conda environment is highly recommended for installing
15+
executorch from source. Double check if the user wants to enable virtual enablement before
16+
building from source.
17+
18+
First time install: run `install_executorch.sh` (or `install_executorch.bat` for Windows).
19+
20+
This script handles dependencies properly (since `executorch` depends on nightly versions
21+
of `torch`, those packages won't be available in pip so need special index url).
22+
23+
Subsequent install: run `pip install . -v --no-build-isolation` inside `executorch`
24+
directory.
25+
26+
Editable mode is available (either through the `install_executorch.sh` script or via `pip install -e .`).
27+
28+
Refer to more details in this [doc](docs/source/using-executorch-building-from-source.md).
29+
30+
## C++
31+
If the user is building basic executorch C++ libraries, refer to root level [CMakeLists.txt](CMakeLists.txt).
32+
33+
If working with LLM/ASR runners, prefer to use [Makefile](Makefile) and cmake [presets](CMakePresets.json).
34+
35+
Again refer to this [doc](docs/source/using-executorch-building-from-source.md#building-the-c-runtime)
36+
for more details.
37+
738
# Commit messages
839

940
Don't commit unless the user explicitly asks you to.

backends/aoti/CMakeLists.txt

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,48 @@ install(
6363
DESTINATION ${CMAKE_INSTALL_LIBDIR}
6464
)
6565

66+
# ==============================================================================
67+
# SlimTensor INTERFACE library (header-only) Provides lightweight tensor
68+
# operations for AOTI backends
69+
# ==============================================================================
70+
add_library(slimtensor INTERFACE)
71+
target_include_directories(
72+
slimtensor
73+
INTERFACE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
74+
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
75+
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/runtime/core/portable_type/c10>
76+
$<INSTALL_INTERFACE:include>
77+
)
78+
79+
# Use custom macros instead of cmake-generated ones (same as Buck build)
80+
target_compile_definitions(
81+
slimtensor INTERFACE C10_USING_CUSTOM_GENERATED_MACROS
82+
)
83+
84+
# Add CUDA support for SlimTensor when building with CUDA
85+
if(EXECUTORCH_BUILD_CUDA)
86+
find_package(CUDAToolkit REQUIRED)
87+
target_include_directories(slimtensor INTERFACE ${CUDAToolkit_INCLUDE_DIRS})
88+
target_link_libraries(slimtensor INTERFACE CUDA::cudart)
89+
endif()
90+
91+
install(
92+
TARGETS slimtensor
93+
EXPORT ExecuTorchTargets
94+
DESTINATION ${CMAKE_INSTALL_LIBDIR}
95+
)
96+
6697
# ==============================================================================
6798
# AOTI common shims using SlimTensor (for CUDA backend) Uses SlimTensor for all
6899
# tensor operations
69100
# TODO(gasoonjia): Replace aoti_common with this one after metal migration
70101
# ==============================================================================
71-
add_library(aoti_common_shims_slim STATIC common_shims_slim.cpp)
72-
target_include_directories(
73-
aoti_common_shims_slim
74-
PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
75-
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
76-
)
102+
set(_aoti_common_shims_slim_sources common_shims_slim.cpp)
103+
if(EXECUTORCH_BUILD_CUDA)
104+
list(APPEND _aoti_common_shims_slim_sources slim/cuda/guard.cpp)
105+
endif()
106+
107+
add_library(aoti_common_shims_slim STATIC ${_aoti_common_shims_slim_sources})
77108
target_compile_options(
78109
aoti_common_shims_slim
79110
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
@@ -83,18 +114,7 @@ target_compile_definitions(
83114
aoti_common_shims_slim PUBLIC $<$<PLATFORM_ID:Windows>:EXPORT_AOTI_FUNCTIONS>
84115
)
85116

86-
# Add CUDA include directories and link CUDA runtime when building with CUDA
87-
if(EXECUTORCH_BUILD_CUDA)
88-
find_package(CUDAToolkit REQUIRED)
89-
target_include_directories(
90-
aoti_common_shims_slim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
91-
)
92-
target_link_libraries(aoti_common_shims_slim PUBLIC CUDA::cudart)
93-
endif()
94-
95-
target_link_libraries(
96-
aoti_common_shims_slim PUBLIC slimtensor extension_tensor ${CMAKE_DL_LIBS}
97-
)
117+
target_link_libraries(aoti_common_shims_slim PUBLIC slimtensor ${CMAKE_DL_LIBS})
98118

99119
install(
100120
TARGETS aoti_common_shims_slim

backends/arm/_passes/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@
7575
from .decompose_select import DecomposeSelectPass # noqa
7676
from .decompose_select_scatter_pass import DecomposeSelectScatterPass # noqa
7777
from .decompose_sign_pass import DecomposeSignPass # noqa
78-
from .decompose_silu_pass import DecomposeSiluPass # noqa
7978
from .decompose_sinh_pass import DecomposeSinhPass # noqa
8079
from .decompose_softmax_pass import DecomposeSoftmaxPass # noqa
8180
from .decompose_softmax_unstable_pass import DecomposeSoftmaxUnstablePass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@
7676
DecomposeSelectPass,
7777
DecomposeSelectScatterPass,
7878
DecomposeSignPass,
79-
DecomposeSiluPass,
8079
DecomposeSinhPass,
8180
DecomposeSoftmaxPass,
8281
DecomposeSoftmaxUnstablePass,
@@ -434,7 +433,6 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
434433
DecomposeLeakyReLUPass(tfa_pass=True),
435434
DecomposeLinalgVectorNormPass(tfa_pass=True),
436435
DecomposeSqrtPass(tfa_pass=True),
437-
DecomposeSiluPass(tfa_pass=True),
438436
DecomposeAvgPool2dPass(tfa_pass=True),
439437
DecomposeSoftmaxUnstablePass(tfa_pass=True),
440438
DecomposeSoftmaxPass(tfa_pass=True),

0 commit comments

Comments
 (0)