From 64b7c92738daa67d8cfa6074f44ad6eec6ac5224 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 23 Jun 2026 09:38:07 -0700 Subject: [PATCH 1/2] shelve changes to: Arm backend: Add a repro command when VGF model-converter fails (#20443) --- .ci/scripts/export_model_artifact.sh | 3 +- .ci/scripts/test_model_e2e.sh | 101 ---- .claude/skills/qualcomm/new_op_development.md | 15 +- .flake8 | 1 - .github/workflows/build-cadence-runner.yml | 64 ++- .github/workflows/mlx.yml | 23 - .github/workflows/validate_flatbuffer_gen.yml | 16 +- .gitignore | 1 - .lintrunner.toml | 3 - Makefile | 12 +- backends/aoti/aoti_backend.py | 19 +- backends/aoti/aoti_delegate_handle.h | 26 - backends/aoti/aoti_partitioner.py | 120 ++-- backends/aoti/tests/TARGETS | 12 - .../apple/metal/runtime/metal_backend.cpp | 10 +- backends/arm/_passes/__init__.py | 2 - backends/arm/_passes/arm_pass_manager.py | 7 +- .../aten_to_tosa_activation_functions.py | 18 - backends/arm/_passes/decompose_round_pass.py | 14 +- .../arm/_passes/deduplicate_get_attr_pass.py | 27 +- backends/arm/_passes/exir_to_tosa_pass.py | 43 +- .../arm/_passes/insert_dynamic_padding.py | 14 +- backends/arm/_passes/insert_rescales_pass.py | 35 +- backends/arm/_passes/insert_table_ops.py | 1 - backends/arm/_passes/rewrite_conv_pass.py | 26 +- backends/arm/_passes/rewrite_mxfp_linear.py | 98 +--- .../arm/_passes/size_adjust_input_pass.py | 43 +- backends/arm/_passes/symbolic_value_range.py | 113 +--- backends/arm/ao_ext/mxfp.py | 79 +-- backends/arm/ao_ext/mxfp_tosa_lib.py | 1 - backends/arm/ao_ext/mxfp_transform.py | 3 - backends/arm/ao_ext/ops/__init__.py | 2 - backends/arm/ao_ext/ops/mxfp_linear_op.py | 77 +-- backends/arm/operator_support/TARGETS | 1 - backends/arm/operator_support/__init__.py | 1 - .../tosa_profile_supported_op_lists.py | 3 - .../tosa_supported_operators.py | 406 ++++--------- backends/arm/operators/__init__.py | 2 - .../operators/op_tosa_cast_to_block_scaled.py | 85 +-- .../op_tosa_matmul_t_block_scaled.py | 8 +- 
backends/arm/operators/op_tosa_shapes.py | 218 +------ backends/arm/process_node.py | 101 +--- .../arm/quantizer/quantization_annotator.py | 1 - .../arm/scripts/install_models_for_test.sh | 7 +- backends/arm/scripts/pre-push | 2 +- backends/arm/test/misc/test_mxfp_linear_ao.py | 103 +--- backends/arm/test/misc/test_process_node.py | 80 +-- backends/arm/test/misc/test_runner_utils.py | 116 ---- backends/arm/test/misc/test_vgf_backend.py | 188 +----- backends/arm/test/misc/test_vgf_check_env.py | 99 +--- .../test_tosa_dialect_cast_to_block_scaled.py | 50 +- .../test_tosa_dialect_mxfp_linear.py | 34 -- .../arm/test/ops/mxfp/test_mxfp_linear.py | 218 ++----- backends/arm/test/ops/test_round.py | 2 + .../test_insert_dynamic_padding_pass.py | 154 ++--- .../arm/test/passes/test_rewrite_conv_pass.py | 38 +- .../passes/test_rewrite_mxfp_linear_pass.py | 42 +- .../test/passes/test_symbolic_value_range.py | 13 - backends/arm/test/runner_utils.py | 142 +---- backends/arm/test/targets.bzl | 3 - backends/arm/tosa/dialect/__init__.py | 1 - .../tosa/dialect/ops/cast_to_block_scaled.py | 25 +- backends/arm/tosa/dialect/ops/conv2d.py | 31 +- backends/arm/tosa/dialect/ops/conv3d.py | 28 +- .../arm/tosa/dialect/ops/depthwise_conv2d.py | 19 +- .../tosa/dialect/ops/matmul_t_block_scaled.py | 44 +- backends/arm/tosa/mapping.py | 17 +- backends/arm/tosa/partitioner.py | 12 +- backends/arm/tosa/utils.py | 4 - backends/arm/vgf/backend.py | 160 +----- backends/arm/vgf/check_env.py | 177 +++++- backends/arm/vgf/model_converter.py | 180 +----- backends/cadence/fused_quant/op_add.cpp | 2 +- backends/cadence/fused_quant/op_add.h | 13 +- backends/cadence/fused_quant/op_bmm.cpp | 2 +- backends/cadence/fused_quant/op_bmm.h | 13 +- backends/cadence/fused_quant/op_hardswish.cpp | 2 +- backends/cadence/fused_quant/op_hardswish.h | 8 +- backends/cadence/fused_quant/op_mul.cpp | 2 +- backends/cadence/fused_quant/op_mul.h | 13 +- backends/cadence/fused_quant/op_relu.cpp | 2 +- 
backends/cadence/fused_quant/op_relu.h | 8 +- backends/cadence/fused_quant/quant_utils.h | 4 +- .../cadence/fused_quant/tests/test_op_add.cpp | 2 +- .../cadence/fused_quant/tests/test_op_bmm.cpp | 2 +- .../fused_quant/tests/test_op_hardswish.cpp | 2 +- .../cadence/fused_quant/tests/test_op_mul.cpp | 2 +- .../fused_quant/tests/test_op_relu.cpp | 2 +- .../generic/operators/op_avg_pool2d.cpp | 2 +- .../cadence/generic/operators/op_avg_pool2d.h | 5 +- .../generic/operators/op_fully_connected.cpp | 2 +- .../generic/operators/op_fully_connected.h | 2 +- .../generic/operators/op_linalg_svd.cpp | 2 +- .../cadence/generic/operators/op_linalg_svd.h | 2 +- .../operators/op_quantized_conv1d_nlc.cpp | 2 +- .../operators/op_quantized_conv1d_nlc.h | 2 +- .../generic/operators/op_quantized_conv2d.cpp | 4 +- .../generic/operators/op_quantized_conv2d.h | 2 +- .../op_quantized_depthwise_conv1d_nlc.cpp | 2 +- .../operators/op_quantized_embedding_byte.cpp | 2 +- .../operators/op_quantized_embedding_byte.h | 3 +- .../op_quantized_fully_connected.cpp | 2 +- .../operators/op_quantized_fully_connected.h | 8 +- .../operators/op_quantized_layer_norm.cpp | 2 +- .../generic/operators/op_quantized_linear.cpp | 2 +- .../generic/operators/op_quantized_linear.h | 4 +- .../generic/operators/op_quantized_matmul.cpp | 2 +- .../generic/operators/op_quantized_matmul.h | 2 +- .../generic/operators/op_quantized_mul.cpp | 2 +- .../generic/operators/op_quantized_relu.cpp | 2 +- .../generic/operators/op_requantize.cpp | 2 +- .../cadence/generic/operators/op_rope.cpp | 4 +- backends/cadence/generic/operators/op_rope.h | 4 +- .../cadence/generic/operators/op_softmax.cpp | 2 +- .../cadence/generic/operators/op_softmax.h | 2 +- .../operators/op_transposed_convolution.cpp | 2 +- backends/cadence/hifi/operators/op_mean.cpp | 2 +- .../operators/op_quantized_conv1d_nlc.cpp | 2 +- .../op_quantized_conv2d_nhwc_out.cpp | 2 +- .../op_quantized_depthwise_conv1d_nlc.cpp | 2 +- .../hifi/operators/op_quantized_matmul_out.h 
| 2 +- .../hifi/operators/op_softmax_f32_f32.cpp | 4 +- backends/cadence/hifi/operators/operators.h | 6 +- .../operators/op_quantized_conv_out.cpp | 2 +- .../op_quantized_fully_connected_out.cpp | 2 +- .../operators/op_quantized_linear_out.cpp | 4 +- .../operators/op_quantized_matmul_out.cpp | 4 +- .../cadence/vision/operators/op_softmax.cpp | 2 +- backends/cadence/vision/operators/operators.h | 2 +- backends/cortex_m/TARGETS | 13 +- .../ops/cmsis_scratch_buffer_context.h | 2 +- backends/cortex_m/ops/op_quantized_conv2d.cpp | 4 +- .../ops/op_quantized_depthwise_conv2d.cpp | 4 +- backends/cortex_m/ops/op_quantized_linear.cpp | 4 +- .../ops/op_quantized_transpose_conv2d.cpp | 4 +- backends/cortex_m/passes/BUCK | 2 - backends/cortex_m/passes/__init__.py | 30 + .../cortex_m/passes/aten_to_cortex_m_pass.py | 2 +- .../cortex_m/passes/scratch_buffer_sizes.py | 2 +- backends/cortex_m/target_config.py | 3 +- .../cortex_m/test/misc/test_cmsis_pybind.py | 3 +- .../cortex_m/test/misc/test_target_config.py | 3 +- backends/cortex_m/test/ops/test_avg_pool2d.py | 2 +- backends/cuda/runtime/cuda_backend.cpp | 25 +- backends/cuda/tests/test_cuda_partitioner.py | 160 +----- backends/cuda/tests/test_tq4_sdpa.py | 438 +------------- .../nxp/backend/edge_program_converter.py | 1 - backends/nxp/backend/graph_utils.py | 2 +- .../ops_converters/__init__.py | 4 - .../ops_converters/clamp_converter.py | 50 +- .../ops_converters/hardtanh_converter.py | 85 ++- .../ops_converters/mean_dim_converter.py | 125 +--- .../ir/converter/quantization_utils.py | 5 +- backends/nxp/backend/node_format_inference.py | 73 +-- backends/nxp/neutron_partitioner.py | 3 +- backends/nxp/nxp_backend.py | 55 +- backends/nxp/quantizer/neutron_quantizer.py | 2 - backends/nxp/quantizer/patterns.py | 43 +- backends/nxp/run_unittests.sh | 2 +- backends/nxp/tests/conftest.py | 2 +- backends/nxp/tests/executorch_pipeline.py | 4 - .../nxp/tests/generic_tests/test_cifarnet.py | 10 +- 
.../generic_tests/test_convert_div_to_mul.py | 3 +- .../tests/generic_tests/test_integration.py | 2 +- .../test_quantized_input_data.py | 43 +- .../node_converter/test_abs_converter.py | 6 +- .../test_adaptive_avg_pool2d_converter.py | 11 +- .../test_add_tensor_converter.py | 83 +-- .../test_avg_pool2d_converter.py | 19 +- .../node_converter/test_cat_converter.py | 24 +- .../node_converter/test_clamp_converter.py | 38 +- .../test_constant_pad_nd_converter.py | 29 +- .../node_converter/test_hardtanh_converter.py | 313 +++------- .../test_leaky_relu_converter.py | 18 +- .../node_converter/test_log_converter.py | 6 +- .../test_max_pool_2d_converter.py | 35 +- .../node_converter/test_mean_dim_converter.py | 340 +++-------- .../test_mul_tensor_converter.py | 13 +- .../test_permute_copy_converter.py | 49 +- .../node_converter/test_relu_converter.py | 18 +- .../node_converter/test_sigmoid_converter.py | 13 +- .../test_slice_tensor_converter.py | 40 +- .../test_sub_tensor_converter.py | 89 +-- .../node_converter/test_tanh_converter.py | 18 +- .../test_upsample_bilinear2d.py | 37 +- .../node_converter/test_upsample_nearest2d.py | 17 +- backends/nxp/tests/model_output_comparator.py | 30 +- backends/nxp/tests/nsys_testing.py | 106 +--- backends/nxp/tests/ops_aliases.py | 2 - backends/nxp/tests/utils.py | 32 -- backends/qualcomm/_passes/__init__.py | 4 - backends/qualcomm/_passes/decompose_acos.py | 4 +- backends/qualcomm/_passes/decompose_atan2.py | 4 +- .../_passes/decompose_log_variants.py | 6 +- .../qualcomm/_passes/decompose_remainder.py | 6 +- backends/qualcomm/_passes/decompose_var.py | 4 +- backends/qualcomm/_passes/qnn_pass_manager.py | 8 - backends/qualcomm/_passes/utils.py | 2 +- backends/qualcomm/aot/wrappers/targets.bzl | 3 +- backends/qualcomm/builders/README.md | 6 - backends/qualcomm/debugger/README.md | 4 +- backends/qualcomm/export_utils.py | 7 +- .../quantizer/annotators/htp_rules.py | 6 +- .../quantizer/annotators/lpai_rules.py | 6 +- 
backends/qualcomm/runtime/targets.bzl | 5 +- backends/qualcomm/targets.bzl | 32 +- backends/qualcomm/tests/models.py | 50 -- backends/qualcomm/tests/test_qnn_delegate.py | 166 +----- .../postpone_permute_below_squeeze_view.py | 6 +- .../test/test_permute_optimization_passes.py | 35 -- .../runtime/graph/ops/impl/Q8taBinary.cpp | 28 +- .../runtime/graph/ops/impl/Q8taConv2d.cpp | 52 +- .../runtime/graph/ops/impl/Q8taConv2d.h | 7 +- .../runtime/graph/ops/impl/Q8taConv2dDW.cpp | 52 +- .../graph/ops/impl/Q8taConv2dIm2Col.cpp | 72 +-- .../runtime/graph/ops/impl/Q8taConv2dPW.cpp | 89 +-- .../graph/ops/impl/Q8taConv2dTransposed.cpp | 58 +- .../runtime/graph/ops/impl/Q8taLinear.cpp | 30 +- .../graph/ops/impl/Q8taQuantizeDequantize.cpp | 23 +- .../test/custom_ops/impl/TestConv2dDw.cpp | 8 +- backends/vulkan/test/custom_ops/utils.cpp | 4 - backends/vulkan/test/custom_ops/utils.h | 18 - backends/webgpu/CMakeLists.txt | 39 +- backends/webgpu/runtime/WebGPUBackend.cpp | 11 +- backends/webgpu/runtime/WebGPUGraph.cpp | 247 ++------ backends/webgpu/runtime/WebGPUGraph.h | 52 +- backends/webgpu/runtime/WebGPUUtils.h | 22 - .../webgpu/scripts/test_webgpu_native_ci.sh | 125 ++-- backends/webgpu/test/test_build_webgpu.sh | 29 +- backends/webgpu/test/test_webgpu_native.cpp | 533 +++++------------- codegen/api/et_cpp.py | 3 +- codegen/api/types/types.py | 3 +- devtools/bundled_program/schema/README.md | 10 - devtools/bundled_program/serialize/BUCK | 3 +- .../bundled_program/serialize/__init__.py | 184 +----- docs/source/backends/nxp/op-support.csv | 1 - .../executor_runner/arm_memory_allocator.cpp | 25 +- .../executor_runner/arm_memory_allocator.h | 15 +- examples/espressif/README.md | 2 + .../espressif/executor_runner/CMakeLists.txt | 3 +- .../executor_runner/esp_executor_runner.cpp | 9 +- examples/models/BUCK | 3 - examples/models/__init__.py | 8 - examples/models/gemma4_31b/README.md | 21 +- .../gemma4_31b/cuda_source_transformations.py | 14 +- examples/models/gemma4_31b/export.py | 9 
- examples/models/gemma4_31b/gguf_loader.py | 70 +-- examples/models/gemma4_31b/main.cpp | 122 +--- examples/models/gemma4_31b/model.py | 2 +- examples/models/gemma4_31b/sampler.py | 7 +- .../gemma4_31b/tests/test_cuda_pipeline.py | 49 +- examples/models/parakeet/CMakeLists.txt | 61 +- examples/models/parakeet/CMakePresets.json | 12 +- examples/models/parakeet/README.md | 25 - examples/models/parakeet/main.cpp | 274 +++++++-- examples/models/qwen3_5_moe/CMakeLists.txt | 19 +- examples/models/qwen3_5_moe/CMakePresets.json | 43 +- examples/models/qwen3_5_moe/README.md | 83 +-- examples/models/qwen3_5_moe/export.py | 6 - .../models/qwen3_5_moe/qwen35_moe_engine.cpp | 111 +--- examples/qualcomm/oss_scripts/llama/README.md | 108 +--- examples/qualcomm/oss_scripts/llama/TARGETS | 131 +---- .../llama/encoder/encoder_config.py | 7 + examples/qualcomm/oss_scripts/llama/llama.py | 183 +++--- .../oss_scripts/llama/masking_utils.py | 91 +-- .../llama/mix_precision_analyzer.py | 50 +- .../qualcomm/oss_scripts/llama/tokenizer.py | 142 +++-- .../llama/wrappers/base_component.py | 15 +- .../llama/wrappers/llm_wrappers.py | 493 +++++++++++----- .../llm_utils/decoder_model_wrapper.py | 13 +- .../llm_utils/qnn_decoder_model_manager.py | 21 +- .../qualcomm/oss_scripts/qwen2_5/qwen2_5.py | 5 +- exir/backend/utils.py | 10 +- exir/pass_base.py | 91 --- exir/tensor.py | 4 +- exir/tests/test_pass_infra.py | 108 ---- exir/tests/test_tensor.py | 20 - .../make_aten_functor_from_et_functor.h | 6 +- ...make_aten_functor_from_et_functor_test.cpp | 90 +-- extension/data_loader/file_data_loader.cpp | 8 +- .../test/file_data_loader_test.cpp | 95 ---- .../flat_tensor/flat_tensor_data_map.cpp | 8 +- extension/flat_tensor/flat_tensor_data_map.h | 6 +- extension/llm/modules/turboquant/kv_cache.py | 12 +- extension/named_data_map/merged_data_map.cpp | 2 +- extension/named_data_map/merged_data_map.h | 6 +- kernels/portable/cpu/op_index_put.cpp | 3 +- kernels/portable/cpu/op_log_softmax.cpp | 8 +- 
kernels/portable/cpu/op_native_dropout.cpp | 2 +- kernels/test/op_native_dropout_test.cpp | 2 +- runtime/core/exec_aten/exec_aten.h | 6 +- .../core/exec_aten/util/scalar_type_util.h | 2 +- runtime/core/memory_allocator.h | 16 - runtime/core/named_data_map.h | 10 +- runtime/core/portable_type/optional.h | 19 +- runtime/core/portable_type/string_view.h | 6 +- .../core/portable_type/test/CMakeLists.txt | 10 +- runtime/core/portable_type/test/targets.bzl | 8 + runtime/core/test/memory_allocator_test.cpp | 83 --- runtime/executor/merged_data_map.h | 7 +- runtime/executor/pte_data_map.cpp | 3 +- runtime/executor/pte_data_map.h | 7 +- runtime/executor/test/method_meta_test.cpp | 4 +- test/utils/OSSTestConfig.json | 1 + 304 files changed, 2993 insertions(+), 8623 deletions(-) diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh index 0e28098a1e8..b5bf19f4155 100755 --- a/.ci/scripts/export_model_artifact.sh +++ b/.ci/scripts/export_model_artifact.sh @@ -422,9 +422,8 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then --no-compile echo "::endgroup::" - # Copy tokenizer files for the runner and model-specific serving launcher. + # Copy tokenizer for the runner cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json" - cp "$LOCAL_MODEL_DIR/tokenizer_config.json" "${OUTPUT_DIR}/tokenizer_config.json" # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues) echo "::group::Export" diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index d8bca45e695..503bd381a8d 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -447,105 +447,4 @@ case "$MODEL_NAME" in esac echo "::endgroup::" -if [ "$DEVICE" = "cuda" ] && [ "$MODEL_NAME" = "qwen3_5_moe" ]; then - echo "::group::Run $MODEL_NAME OpenAI serving smoke" - pip install -r examples/llm_server/python/requirements.txt "transformers==5.0.0rc1" - python -m pip install --no-deps --no-build-isolation --editable . 
-v - - PORT=$(python - <<'PY' -import socket - -with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("127.0.0.1", 0)) - print(s.getsockname()[1]) -PY -) - SERVER_LOG=$(mktemp) - WORKER_BIN="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker" - python -u -m executorch.examples.models.qwen3_5_moe.serve \ - --model-path "${MODEL_DIR}/model.pte" \ - --data-path "${MODEL_DIR}/aoti_cuda_blob.ptd" \ - --tokenizer-path "${MODEL_DIR}/tokenizer.json" \ - --hf-tokenizer "${MODEL_DIR}" \ - --model-id qwen3.5-moe \ - --max-context 4096 \ - --max-sessions 2 \ - --no-think \ - --worker-bin "$WORKER_BIN" \ - --host 127.0.0.1 \ - --port "$PORT" >"$SERVER_LOG" 2>&1 & - SERVER_PID=$! - - cleanup_qwen_server() { - if kill -0 "$SERVER_PID" 2>/dev/null; then - kill "$SERVER_PID" 2>/dev/null || true - wait "$SERVER_PID" 2>/dev/null || true - fi - rm -f "$SERVER_LOG" - } - trap cleanup_qwen_server EXIT - - if ! python - "$PORT" "$SERVER_LOG" <<'PY' -import json -import sys -import time -import urllib.request - -port = sys.argv[1] -log_path = sys.argv[2] -base = f"http://127.0.0.1:{port}" - - -def request(path, payload=None): - data = None - headers = {} - if payload is not None: - data = json.dumps(payload).encode("utf-8") - headers["Content-Type"] = "application/json" - req = urllib.request.Request(base + path, data=data, headers=headers) - with urllib.request.urlopen(req, timeout=120) as resp: - return json.loads(resp.read().decode("utf-8")) - - -last = None -for _ in range(180): - try: - request("/health") - break - except Exception as e: - last = e - time.sleep(1) -else: - print(open(log_path, encoding="utf-8", errors="replace").read()) - raise RuntimeError(f"server did not become healthy: {last}") - -models = request("/v1/models") -ids = {m["id"] for m in models["data"]} -if "qwen3.5-moe" not in ids: - raise AssertionError(f"qwen3.5-moe missing from /v1/models: {ids}") - -body = { - "model": "qwen3.5-moe", - "messages": [{"role": "user", "content": "What is 
the capital of France?"}], - "max_tokens": 32, - "temperature": 0, -} -resp = request("/v1/chat/completions", body) -content = resp["choices"][0]["message"].get("content") or "" -if "Paris" not in content: - raise AssertionError(f"expected Paris in serving response, got: {content!r}") - -print("Qwen3.5-MoE serving smoke passed") -PY - then - echo "Qwen3.5-MoE serving smoke failed; server log:" - cat "$SERVER_LOG" - exit 1 - fi - - cleanup_qwen_server - trap - EXIT - echo "::endgroup::" -fi - popd diff --git a/.claude/skills/qualcomm/new_op_development.md b/.claude/skills/qualcomm/new_op_development.md index 149940b0796..4133a92ea48 100644 --- a/.claude/skills/qualcomm/new_op_development.md +++ b/.claude/skills/qualcomm/new_op_development.md @@ -217,17 +217,8 @@ class DecomposeMyOp(ExportPass): ### Registration (all decompose passes) 1. `_passes/__init__.py` — import + `__all__` -2. `_passes/qnn_pass_manager.py` — The pass manager uses classmethods for pipeline definitions: - - **Import** — add to the import block at top of file - - **`get_annotation_passes()`** — add pass class to the returned list (runs before quantizer, ATen IR) - - **`get_export_passes()`** — add pass class if needed for float-only path (runs after quantization, before to-edge) - - **`get_default_pass_activations()`** — add `(PassClass, True)` ONLY if the pass also needs to run in the to-edge pipeline - - **`get_passes_dependency_for_capture_program()`** — add `PassClass: [RemoveRedundancy]` dependency ONLY if also in `get_default_pass_activations` - -**When to add to which pipeline:** -- **Annotation only** (most common for decompose passes): `get_annotation_passes()` — pass decomposes the op before the quantizer sees it -- **Export pipeline** too: if the float-only test fails without it (op doesn't get handled by PyTorch's built-in decomposition during to-edge) -- **Capture program** (to-edge) too: if the op can appear in edge dialect and needs decomposition there (e.g., `DecomposeVar`, 
`DecomposeCDist`, `DecomposeDiagonal`) +2. `_passes/qnn_pass_manager.py` — import + `transform_for_annotation_pipeline` + `transform_for_export_pipeline` + `get_capture_program_passes` +3. `_passes/utils.py` — add to `get_passes_dependency_for_capture_program()` with `[RemoveRedundancy]` dependency --- @@ -264,4 +255,4 @@ class DecomposeMyOp(ExportPass): **Native QNN Op:** `qnn_constants.py` → `op_my_op.py` → `builders/__init__.py` → `htp_rules.py` → `lpai_rules.py` → `layout_transform.py` → `tests/models.py` → `test_qnn_delegate.py` → `partition/utils.py` (skip decomp) → `common_defs.py` (remove to_be_implemented) → `builders/README.md` -**Decompose Pass:** `_passes/decompose_my_op.py` → `_passes/__init__.py` → `qnn_pass_manager.py` (`get_annotation_passes` + optionally `get_export_passes`; if also needed in to-edge: `get_default_pass_activations` + `get_passes_dependency_for_capture_program`) → `tests/models.py` → `test_qnn_delegate.py` → `common_defs.py` → `builders/README.md` +**Decompose Pass:** `_passes/decompose_my_op.py` → `_passes/__init__.py` → `qnn_pass_manager.py` (annotation + export + capture) → `_passes/utils.py` (dependency) → `tests/models.py` → `test_qnn_delegate.py` → `common_defs.py` → `builders/README.md` diff --git a/.flake8 b/.flake8 index ffb419da1e4..fc9feb45d8b 100644 --- a/.flake8 +++ b/.flake8 @@ -75,7 +75,6 @@ exclude = ./configurations, ./docs, ./exir/_serialize/generated/executorch_flatbuffer, - ./devtools/bundled_program/serialize/generated, ./third_party, *.pyi diff --git a/.github/workflows/build-cadence-runner.yml b/.github/workflows/build-cadence-runner.yml index c447e4f9a20..49f750eeea2 100644 --- a/.github/workflows/build-cadence-runner.yml +++ b/.github/workflows/build-cadence-runner.yml @@ -19,18 +19,36 @@ concurrency: cancel-in-progress: true jobs: - # Same-repo PRs run on pull_request, which reads the PR's own workflow AND code - # -- so CI changes, new test jobs, code, and tests are all validated pre-merge. 
- # Fork PRs can't get credentials (OIDC) on pull_request, so Meta-exported forks - # (labeled CLA Signed + meta-exported) run on pull_request_target instead. The - # run condition is inlined per job (GitHub Actions has no YAML anchors and env - # is unavailable in job-level if), so keep the copies in sync. + gate: + runs-on: ubuntu-latest + outputs: + run-cadence: ${{ steps.decide.outputs.run }} + steps: + - id: decide + env: + EVENT: ${{ github.event_name }} + IS_FORK: ${{ github.event.pull_request.head.repo.full_name != github.repository }} + HAS_CLA: ${{ contains(github.event.pull_request.labels.*.name, 'CLA Signed') }} + HAS_EXPORT: ${{ contains(github.event.pull_request.labels.*.name, 'meta-exported') }} + run: | + run=false + case "${EVENT}" in + push|schedule|workflow_dispatch) + run=true + ;; + pull_request) + [ "${IS_FORK}" = "false" ] && run=true + ;; + pull_request_target) + if [ "${IS_FORK}" = "true" ] && [ "${HAS_CLA}" = "true" ] && [ "${HAS_EXPORT}" = "true" ]; then + run=true + fi + ;; + esac + echo "run=${run}" >> "${GITHUB_OUTPUT}" + cpu-build: - if: >- - github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) || - (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository && - contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported')) + if: github.event_name != 'pull_request_target' uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write @@ -40,7 +58,7 @@ jobs: runner: linux.2xlarge docker-image: ci-image:executorch-ubuntu-22.04-clang12 submodules: recursive - ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }} + ref: ${{ 
github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: cadence-runner-build script: | @@ -57,28 +75,21 @@ jobs: cpu-test: needs: cpu-build - if: >- - github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) || - (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository && - contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported')) + if: github.event_name != 'pull_request_target' permissions: id-token: write contents: read uses: ./.github/workflows/_test_cadence.yml with: - ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # Cross-compile cadence_executor_runner for each Cadence Xtensa core, one job # per backend so they show as separate lines (no matrix grouping). Shared logic # lives in _xtensa_build.yml. fusion_g3 is omitted until the upstream fusion_g3 # <-> nnlib-FusionG3 API skew is fixed (its runner does not link). 
hifi-build: - if: >- - github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) || - (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository && - contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported')) + needs: gate + if: needs.gate.outputs.run-cadence == 'true' permissions: id-token: write contents: read @@ -88,11 +99,8 @@ jobs: ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }} vision-build: - if: >- - github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) || - (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository && - contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported')) + needs: gate + if: needs.gate.outputs.run-cadence == 'true' permissions: id-token: write contents: read diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml index 5a4ccbb4952..acc6b4840cf 100644 --- a/.github/workflows/mlx.yml +++ b/.github/workflows/mlx.yml @@ -161,29 +161,6 @@ jobs: fi echo "::endgroup::" - echo "::group::Verify chunked == unchunked prefill" - QWEN_TINY_PTE=/tmp/qwen35_moe_mlx_tiny/model.pte \ - ${CONDA_RUN} python -m pytest \ - examples/models/qwen3_5_moe/test_chunked_prefill.py -v - echo "::endgroup::" - - echo "::group::Build Qwen 3.5 MoE MLX C++ runner" - # Validates the MLX C++ runner build wiring (compile + link + metallib). 
- # The tiny model has no compatible tokenizer (vocab 256, random weights), - # so we don't run C++ inference here — only confirm it builds. - ${CONDA_RUN} make qwen3_5_moe-mlx - RUNNER=cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner - if [ ! -x "$RUNNER" ]; then - echo "Failed: runner not found at $RUNNER" - exit 1 - fi - if [ ! -f "$(dirname "$RUNNER")/mlx.metallib" ]; then - echo "Failed: mlx.metallib not copied next to runner" - exit 1 - fi - echo "Success: built $RUNNER" - echo "::endgroup::" - backend-tester: needs: run-decision if: | diff --git a/.github/workflows/validate_flatbuffer_gen.yml b/.github/workflows/validate_flatbuffer_gen.yml index 6c0455784c6..96eeda95e04 100644 --- a/.github/workflows/validate_flatbuffer_gen.yml +++ b/.github/workflows/validate_flatbuffer_gen.yml @@ -5,9 +5,7 @@ on: pull_request: paths: - "schema/**" - - "devtools/bundled_program/schema/**" - - "exir/_serialize/generated/**" - - "devtools/bundled_program/serialize/generated/**" + - "exir/_serialize/generated/executorch_flatbuffer/**" jobs: exir-flatbuffer: @@ -35,15 +33,3 @@ jobs: echo "Please run 'python exir/_serialize/generate_program.py' to regenerate the files and commit the changes." exit 1 fi - - - name: Generate bundled program flatbuffer Python - run: python devtools/bundled_program/serialize/generate_bundled_program.py - - - name: Validate bundled_program_flatbuffer is unchanged - run: | - git add -A devtools/bundled_program/serialize/generated - if ! git diff --cached --quiet -- devtools/bundled_program/serialize/generated; then - echo "Error: bundled_program_flatbuffer has uncommitted changes." - echo "Please run 'python devtools/bundled_program/serialize/generate_bundled_program.py' to regenerate the files and commit the changes." 
- exit 1 - fi diff --git a/.gitignore b/.gitignore index ee206e23d94..87772e21014 100644 --- a/.gitignore +++ b/.gitignore @@ -26,7 +26,6 @@ arm-scratch/ executorch.egg-info pip-out/ build-profiling/ -**/ddr_*_temp # Any exported models and profiling outputs *.bin diff --git a/.lintrunner.toml b/.lintrunner.toml index 98c46c78960..ab498a5d0ac 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -9,7 +9,6 @@ exclude_patterns = [ '.github/scripts/**', 'exir/serde/**', 'exir/_serialize/generated/executorch_flatbuffer/**', - 'devtools/bundled_program/serialize/generated/**', ] command = [ 'python', @@ -42,7 +41,6 @@ exclude_patterns = [ '**/third-party/**', 'exir/serde/**', 'exir/_serialize/generated/executorch_flatbuffer/**', - 'devtools/bundled_program/serialize/generated/**', ] command = [ 'python', @@ -391,7 +389,6 @@ exclude_patterns = [ '**/*.gif', 'extension/llm/tokenizers', 'extension/llm/tokenizers/**', - 'examples/llm_server', 'backends/cadence/utils/FACTO', 'examples/cuda', 'examples/qualcomm', diff --git a/Makefile b/Makefile index 552bbf89bd7..c93085115aa 100644 --- a/Makefile +++ b/Makefile @@ -91,7 +91,7 @@ # # ============================================================================== -.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal qwen3_5_moe-mlx clean help +.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda 
whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help help: @echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make \`. Available targets:" @@ -131,7 +131,6 @@ help: @echo " gemma4_31b-mlx - Build Gemma 4 31B runner with MLX backend" @echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend" @echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend" - @echo " qwen3_5_moe-mlx - Build Qwen3.5 MoE runner with MLX backend" @echo " clean - Clean build artifacts" voxtral-cuda: @@ -468,15 +467,6 @@ qwen3_5_moe-metal: @echo "✓ Build complete!" @echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner" -qwen3_5_moe-mlx: - @echo "==> Building and installing ExecuTorch with MLX..." - cmake --workflow --preset mlx-release - @echo "==> Building Qwen3.5 MoE runner with MLX..." - cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-mlx - @echo "" - @echo "✓ Build complete!" - @echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner" - clean: rm -rf cmake-out \ extension/llm/tokenizers/build \ diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index 91a8a60078e..a478b43cf0f 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. 
import contextlib -import hashlib import os import typing from abc import ABC, abstractmethod @@ -277,21 +276,18 @@ def preprocess( # Create named data store named_data_store = NamedDataStore() + method_name = cls.method_name_from_compile_specs(compile_specs) - # Key each blob by a content hash so partitions in one method get distinct - # keys (a method-name-only key collides). Runtime recovers them from - # processed_bytes below. - so_blob_key = hashlib.sha256(so_data).hexdigest() + "_so_blob" - weights_blob_key = hashlib.sha256(blob_data).hexdigest() + "_weights_blob" - - named_data_store.add_named_data(so_blob_key, so_data, 1, None) + named_data_store.add_named_data(method_name + "_so_blob", so_data, 1, None) # Determine whether to save named data externally based on backend setting # External: save to separate .ptd file, otherwise merge with .pte file external_tag = ( f"aoti_{device_name}_blob" if cls.save_data_externally() else None ) - named_data_store.add_named_data(weights_blob_key, blob_data, 1, external_tag) + named_data_store.add_named_data( + method_name + "_weights_blob", blob_data, 1, external_tag + ) # Clean up the generated files os.remove(so_path) @@ -303,11 +299,8 @@ def preprocess( # the next preprocess call (e.g. for the next method). cls.release_moved_tensors(device_edge_program, compile_specs) - # The runtime cannot recompute these hash keys, so carry them (one per line). 
- processed_bytes = (so_blob_key + "\n" + weights_blob_key).encode("utf-8") - return PreprocessResult( - processed_bytes=processed_bytes, + processed_bytes=b"", debug_handle_map={}, data_store_output=named_data_store.get_named_data_store_output(), ) diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h index fbd748306cc..2d1a3146ae5 100644 --- a/backends/aoti/aoti_delegate_handle.h +++ b/backends/aoti/aoti_delegate_handle.h @@ -10,7 +10,6 @@ #include #include -#include #include namespace executorch { @@ -18,7 +17,6 @@ namespace backends { namespace aoti { using executorch::runtime::Error; -using executorch::runtime::FreeableBuffer; using executorch::runtime::etensor::Tensor; extern "C" { @@ -150,30 +148,6 @@ struct AOTIDelegateHandle { update_user_managed_constant_buffer_pairs; }; -// New-format payload is "\n"; an empty payload is a -// pre-this-change artifact, so fall back to the legacy method-name keys. -inline Error resolve_blob_keys( - const FreeableBuffer* processed, - const std::string& method_name, - std::string& so_blob_key, - std::string& weights_blob_key) { - if (processed != nullptr && processed->size() > 0) { - const std::string keys( - static_cast(processed->data()), processed->size()); - const size_t newline = keys.find('\n'); - if (newline == std::string::npos) { - return Error::Internal; - } - so_blob_key = keys.substr(0, newline); - weights_blob_key = keys.substr(newline + 1); - } else { - so_blob_key = method_name.empty() ? "so_blob" : method_name + "_so_blob"; - weights_blob_key = - method_name.empty() ? 
"weights_blob" : method_name + "_weights_blob"; - } - return Error::Ok; -} - } // namespace aoti } // namespace backends } // namespace executorch diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index f84febbdc24..b263d0f9c81 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Callable, Dict, List, Mapping, Optional, Tuple +from typing import Callable, Dict, List, Optional, Tuple import torch from executorch.exir._warnings import experimental @@ -21,8 +21,6 @@ ) from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param from torch.export.exported_program import ExportedProgram -from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner -from torch.fx.passes.operator_support import OperatorSupportBase @experimental( @@ -32,10 +30,12 @@ class AotiPartitioner(Partitioner): """ Base partitioner for AOTInductor-driven backend integration. - Delegates the non-lowered operators to AOTInductor as one or more convex - partitions (a single partition when nothing else has claimed part of the - graph). It skips core ATen decomposition, letting the backend decompose via + This partitioner creates a single partition containing all operators from the input graph. + It skips core ATen decomposition, allowing the backend to handle decomposition using AOTInductor's backend-specific decomposition table. + + Only operators that cannot be handled by the aoti library will be excluded from + the partition and fall back to ExecuTorch's default or custom handling. 
""" def __init__(self, backend_name: str, compile_spec: List[CompileSpec]) -> None: @@ -49,76 +49,62 @@ def __init__(self, backend_name: str, compile_spec: List[CompileSpec]) -> None: self.delegation_spec = DelegationSpec(backend_name, compile_spec) def partition(self, exported_program: ExportedProgram) -> PartitionResult: - """Delegate the non-lowered ops to AOTInductor. - - Uses CapabilityBasedPartitioner rather than a single tag because a - delegated submodule must be convex: if a node that is not delegated sits - between the delegated ops, one tag would span a non-convex set and fusion - would fail with a dependency cycle. """ - # Only nodes not already lowered are candidates for this backend. - non_lowered_nodes = set(get_non_lowered_nodes(exported_program.graph)) - - control_flow_targets = [ - torch.ops.higher_order.cond, - torch.ops.higher_order.map_impl, - torch.ops.higher_order.while_loop, - torch.ops.higher_order.scan, - ] - - class AotiOperatorSupport(OperatorSupportBase): - def is_node_supported( - self, submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node - ) -> bool: - return node.op == "call_function" and node in non_lowered_nodes - - partitioner = CapabilityBasedPartitioner( - exported_program.graph_module, - AotiOperatorSupport(), - allows_single_node_partition=True, - ) + Fully delegate the graph to AOTInductor by tagging all nodes as a single partition. + """ partition_tags: Dict[str, DelegationSpec] = {} - for partition in partitioner.propose_partitions(): - tag = f"aoti_{partition.id}" - partition_tags[tag] = self.delegation_spec - for node in partition.nodes: - node.meta["delegation_tag"] = tag + tag = "tag0" + + # Tag torch.cond and other control flow operations + def is_control_flow(node: torch.fx.Node) -> bool: + return node.op == "call_function" and node.target in [ + torch.ops.higher_order.cond, + torch.ops.higher_order.map_impl, + torch.ops.higher_order.while_loop, + ] + + # Nodes already lowered by an earlier partitioner (e.g. 
a preceding + # TensorRT partition) appear as executorch_call_delegate calls and their + # output getitems; re-delegating them would nest a foreign delegate. Tag + # only the remaining non-lowered ops so this partitioner composes after + # others. + non_lowered_nodes = set(get_non_lowered_nodes(exported_program.graph)) - # A control-flow op carries its branch GraphModules as get_attr operands; - # they must share the op's tag so they land inside the same submodule. A - # branch module feeds a single control-flow op, so first match wins. for node in exported_program.graph.nodes: - if node.op != "get_attr": - continue - for user in node.users: - if ( - user.op == "call_function" - and user.target in control_flow_targets - and "delegation_tag" in user.meta - ): - node.meta["delegation_tag"] = user.meta["delegation_tag"] - break + if node.op == "call_function": + if node not in non_lowered_nodes: + continue + node.meta["delegation_tag"] = tag + # Tag get_attr nodes that are used by control flow operations + elif node.op == "get_attr": + # Check if any user is a control flow operation + for user in node.users: + if is_control_flow(user): + node.meta["delegation_tag"] = tag + break + + partition_tags[tag] = self.delegation_spec tag_constant_data(exported_program) tag_mutated_buffer(exported_program) - # tag_constant_data only tags constants that have users; tag the - # genuinely unused ones too so none are left dangling. - if partition_tags: - fallback_tag = next(iter(partition_tags)) - for node in exported_program.graph.nodes: - if ( - node.op == "placeholder" - and not node.users - and "delegation_tag" not in node.meta - and ( - is_param(exported_program, node) - or is_buffer(exported_program, node) - or is_lifted_tensor_constant(exported_program, node) - ) - ): - node.meta["delegation_tag"] = fallback_tag + # A constant that still has users feeds only a prior delegate; tagging it + # would fail backend lowering's same-tag check (its user keeps the prior + # tag). 
tag_constant_data already claimed the ones this partition uses, so + # tag only the genuinely unused constants here. + for node in exported_program.graph.nodes: + if ( + node.op == "placeholder" + and not node.users + and "delegation_tag" not in node.meta + and ( + is_param(exported_program, node) + or is_buffer(exported_program, node) + or is_lifted_tensor_constant(exported_program, node) + ) + ): + node.meta["delegation_tag"] = tag return PartitionResult( tagged_exported_program=exported_program, partition_tags=partition_tags diff --git a/backends/aoti/tests/TARGETS b/backends/aoti/tests/TARGETS index f41c1bfb517..d92e0e32a1f 100644 --- a/backends/aoti/tests/TARGETS +++ b/backends/aoti/tests/TARGETS @@ -3,18 +3,6 @@ load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils") oncall("executorch") -cpp_unittest( - name = "test_resolve_blob_keys", - srcs = [ - "test_resolve_blob_keys.cpp", - ], - deps = [ - "//executorch/backends/aoti:delegate_handle", - "//executorch/runtime/core:core", - "//executorch/runtime/core:evalue", - ], -) - cpp_unittest( name = "test_common_shims", srcs = [ diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp index b9579d59d9c..c0d996df62b 100644 --- a/backends/apple/metal/runtime/metal_backend.cpp +++ b/backends/apple/metal/runtime/metal_backend.cpp @@ -245,12 +245,8 @@ class ET_EXPERIMENTAL MetalBackend final } } - std::string so_blob_key; - std::string weights_blob_key; - ET_CHECK_OK_OR_RETURN_ERROR( - executorch::backends::aoti::resolve_blob_keys( - processed, method_name, so_blob_key, weights_blob_key), - "Malformed named-data key payload"); + std::string so_blob_key = + method_name.empty() ? 
"so_blob" : method_name + "_so_blob"; ET_LOG(Info, "MetalBackend::init - so_blob_key: %s", so_blob_key.c_str()); const NamedDataMap* named_data_map = context.get_named_data_map(); @@ -262,6 +258,8 @@ class ET_EXPERIMENTAL MetalBackend final // Prefetch the weights blob — trigger async readahead so pages are // resident by the time update_constants_from_blob memcpy's them. // This overlaps disk I/O with the .so write + dlopen (~200ms). + std::string weights_blob_key = + method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; { auto prefetch_buf = named_data_map->get_data(weights_blob_key.c_str()); if (prefetch_buf.ok() && prefetch_buf->data() != nullptr) { diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py index 27e5088fc72..29062b57579 100644 --- a/backends/arm/_passes/__init__.py +++ b/backends/arm/_passes/__init__.py @@ -171,14 +171,12 @@ from .rewrite_le_lt_to_ge_gt_pass import RewriteLeLtToGeGtPass # noqa from .rewrite_matmul import RewriteMatmulPass # noqa from .rewrite_max_pool2d_pass import RewriteMaxPool2dPass # noqa -from .rewrite_mxfp_conv2d import RewriteMXFPConv2dPass # noqa from .rewrite_mxfp_linear import RewriteMXFPLinearPass # noqa from .rewrite_pad import RewritePadPass # noqa from .rewrite_slice import RewriteSlicePass # noqa from .rewrite_upsample import RewriteUpsamplePass # noqa from .scalars_to_attribute_pass import ScalarsToAttributePass # noqa from .size_adjust_input_pass import SizeAdjustInputPass # noqa -from .symbolic_to_tosa_shape_pass import SymbolicToTosaShapesPass # noqa from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass # noqa from .unsqueeze_scalar_placeholders_pass import UnsqueezeScalarPlaceholdersPass # noqa from .replace_inf_and_limit_values_pass import ( # noqa # usort: skip diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index fedca6eb65b..5ec57ee1787 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ 
b/backends/arm/_passes/arm_pass_manager.py @@ -117,7 +117,6 @@ InsertConstShapesPass, InsertControlFlowRescalesPass, InsertDataLayoutCastsPass, - InsertDynamicPaddingPass, InsertInt32CastsAfterInt64PlaceholdersPass, InsertRescaleInt32Pass, InsertRescalePass, @@ -147,14 +146,12 @@ RewriteLeLtToGeGtPass, RewriteMatmulPass, RewriteMaxPool2dPass, - RewriteMXFPConv2dPass, RewriteMXFPLinearPass, RewritePadPass, RewriteSlicePass, RewriteUpsamplePass, ScalarsToAttributePass, SizeAdjustInputPass, - SymbolicToTosaShapesPass, UnsqueezeBeforeRepeatPass, UnsqueezeScalarPlaceholdersPass, ) @@ -613,7 +610,6 @@ def _tosa_pipeline( RewriteMaxPool2dPass(), DecomposeAdaptiveMaxPool2dPass(), RewriteConvPass(exported_program), - RewriteMXFPConv2dPass(exported_program), RewriteMXFPLinearPass(exported_program), RewriteMatmulPass(), RewritePadPass(), @@ -634,8 +630,6 @@ def _tosa_pipeline( [ CastInt64BuffersToInt32Pass(exported_program), FuseEqualPlaceholdersPass(exported_program), - SymbolicToTosaShapesPass(), - InsertDynamicPaddingPass(), FuseConsecutiveConcatShapesPass(), EnsureUniqueOutputNodesPass(), RemoveNoopPass(), @@ -683,6 +677,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule): InsertInt32CastsAfterInt64PlaceholdersPass(tfa_pass=True), DecomposeEmbeddingPass(tfa_pass=True), DecomposeScaledDotProductAttentionPass(tfa_pass=True), + DecomposeRoundPass(tfa_pass=True), DecomposeLogitPass(tfa_pass=True), PromoteBoolOperandsPass(tfa_pass=True), DecomposeSignPass(tfa_pass=True), diff --git a/backends/arm/_passes/aten_to_tosa_activation_functions.py b/backends/arm/_passes/aten_to_tosa_activation_functions.py index 8d51f092991..9b92b31e630 100644 --- a/backends/arm/_passes/aten_to_tosa_activation_functions.py +++ b/backends/arm/_passes/aten_to_tosa_activation_functions.py @@ -128,21 +128,3 @@ def rewrite_clamp(node: Node, pass_: AtenToDialectPass) -> DialectNodeSpec | Non exir_ops.backend.tosa.CLAMP.default, (node.args[0], *min_max_args), ) - - -def 
get_activation_replacement( - node: Node, pass_: AtenToDialectPass -) -> DialectNodeSpec | None: - # Dispatch activation rewrites from their ATen target to the matching TOSA - # dialect node builder. - match node.target: - case exir_ops.edge.aten.clamp.default: - return rewrite_clamp(node, pass_) - case exir_ops.edge.aten.erf.default: - return rewrite_erf(node, pass_) - case exir_ops.edge.aten.sigmoid.default: - return rewrite_sigmoid(node, pass_) - case exir_ops.edge.aten.tanh.default: - return rewrite_tanh(node, pass_) - case _: - return None diff --git a/backends/arm/_passes/decompose_round_pass.py b/backends/arm/_passes/decompose_round_pass.py index 48b26f1d027..476f75d6b56 100644 --- a/backends/arm/_passes/decompose_round_pass.py +++ b/backends/arm/_passes/decompose_round_pass.py @@ -5,6 +5,7 @@ from typing import Set, Type +import torch from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload @@ -32,6 +33,16 @@ def _get_round_decomposition_ops(op) -> tuple[Op, Op, Op, Op, Op, Op, Op]: exir_ops.edge.aten.ceil.default, exir_ops.edge.aten.where.self, ) + elif op == torch.ops.aten.round.default: + return ( + torch.ops.aten.full.default, + torch.ops.aten.ge.Tensor, + torch.ops.aten.add.Scalar, + torch.ops.aten.sub.Scalar, + torch.ops.aten.floor.default, + torch.ops.aten.ceil.default, + torch.ops.aten.where.self, + ) raise RuntimeError(f"Can't get round decomposition ops for op {op}") @@ -54,10 +65,11 @@ class DecomposeRoundPass(ArmOpTargetedPass): target_ops = { exir_ops.edge.aten.round.default, + torch.ops.aten.round.default, } def call_operator(self, op, args, kwargs, meta, updated=False): - if op not in self.target_ops or self._is_quantized_meta(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta, updated) x = args[0] input_dtype = x.node.meta["val"].dtype diff 
--git a/backends/arm/_passes/deduplicate_get_attr_pass.py b/backends/arm/_passes/deduplicate_get_attr_pass.py index f5760a2fcb8..201a9036e34 100644 --- a/backends/arm/_passes/deduplicate_get_attr_pass.py +++ b/backends/arm/_passes/deduplicate_get_attr_pass.py @@ -9,7 +9,6 @@ from executorch.backends.arm._passes import ArmPass from executorch.exir.pass_base import ExportPass, PassResult from torch.fx import GraphModule, Node -from torch.fx.node import map_arg from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix @@ -25,13 +24,6 @@ class DeduplicateGetAttrPass(ArmPass): _passes_required_after: Set[Type[ExportPass]] = set() - def _replace_input_node(self, node: Node, old_node: Node, new_node: Node) -> None: - def maybe_replace_node(arg: Any) -> Any: - return new_node if arg is old_node else arg - - node.args = map_arg(node.args, maybe_replace_node) - node.kwargs = map_arg(node.kwargs, maybe_replace_node) - def _get_attr(self, graph_module: GraphModule, target: str) -> Any: attr: Any = graph_module for target_atom in target.split("."): @@ -59,26 +51,9 @@ def _copy_attr(self, graph_module: GraphModule, node: Node) -> str: return attr_name - def _split_shared_get_attrs(self, graph_module: GraphModule) -> bool: - modified = False - - for node in list(graph_module.graph.find_nodes(op="get_attr")): - users = list(node.users) - if len(users) <= 1: - continue - - for user in users[1:]: - with graph_module.graph.inserting_before(user): - new_node = graph_module.graph.get_attr(node.target) - new_node.meta.update(node.meta) - self._replace_input_node(user, node, new_node) - modified = True - - return modified - def call(self, graph_module: GraphModule) -> PassResult: seen_targets: set[str] = set() - modified = self._split_shared_get_attrs(graph_module) + modified = False for node in graph_module.graph.find_nodes(op="get_attr"): diff --git a/backends/arm/_passes/exir_to_tosa_pass.py b/backends/arm/_passes/exir_to_tosa_pass.py index c0c6efb1a6c..b77171b9eaf 
100644 --- a/backends/arm/_passes/exir_to_tosa_pass.py +++ b/backends/arm/_passes/exir_to_tosa_pass.py @@ -5,38 +5,37 @@ import executorch.backends.arm.tosa.dialect # noqa: F401 from executorch.backends.arm._passes.aten_to_tosa_activation_functions import ( - get_activation_replacement, -) -from executorch.backends.arm._passes.aten_to_tosa_tensor_operators import rewrite_argmax -from executorch.backends.transforms.aten_to_dialect_pass import ( - AtenToDialectPass, - DialectNodeSpec, + rewrite_clamp, + rewrite_erf, + rewrite_sigmoid, + rewrite_tanh, ) +from executorch.backends.transforms.aten_to_dialect_pass import AtenToDialectPass from executorch.exir.dialects._ops import ops as exir_ops -from torch.fx import Node class ExirToTosaPass(AtenToDialectPass): """Rewrite simple EXIR ops to equivalent backend TOSA dialect ops. - Rewrite functions are registered with the shared ATen-to-dialect pass - infrastructure. + Rewrite functions are grouped by op category and registered with the shared + ATen-to-dialect pass infrastructure. 
""" -@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.argmax.default) -def _get_tensor_operators_replacement( - node: Node, pass_: AtenToDialectPass -) -> DialectNodeSpec: - return rewrite_argmax(node, pass_) +_ACTIVATION_FUNCTION_REWRITES = { + exir_ops.edge.aten.clamp.default: rewrite_clamp, + exir_ops.edge.aten.erf.default: rewrite_erf, + exir_ops.edge.aten.sigmoid.default: rewrite_sigmoid, + exir_ops.edge.aten.tanh.default: rewrite_tanh, +} +_DIRECT_REWRITE_CATEGORIES = { + "activation_functions": _ACTIVATION_FUNCTION_REWRITES, +} -@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.clamp.default) -@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.erf.default) -@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.sigmoid.default) -@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.tanh.default) -def _get_activation_replacement( - node: Node, pass_: AtenToDialectPass -) -> DialectNodeSpec | None: - return get_activation_replacement(node, pass_) +# Register each category's ATen targets with the function that builds the +# corresponding TOSA dialect node spec. 
+for _rewrite_category in _DIRECT_REWRITE_CATEGORIES.values(): + for _edge_target, _rewrite_fn in _rewrite_category.items(): + ExirToTosaPass.register_dialect_substitution(_edge_target)(_rewrite_fn) diff --git a/backends/arm/_passes/insert_dynamic_padding.py b/backends/arm/_passes/insert_dynamic_padding.py index b1d998268eb..bfc0382e4ad 100644 --- a/backends/arm/_passes/insert_dynamic_padding.py +++ b/backends/arm/_passes/insert_dynamic_padding.py @@ -29,7 +29,6 @@ class InsertDynamicPaddingPass(ArmOpTargetedPass): _passes_required_after: Set[Type[ExportPass]] = set() target_ops = ( exir_ops.backend.tosa.CONV2D.default, - exir_ops.backend.tosa.CONV3D.default, exir_ops.backend.tosa.DEPTHWISE_CONV2D.default, exir_ops.backend.tosa.MAX_POOL2D.default, exir_ops.backend.tosa.AVG_POOL2D.default, @@ -58,12 +57,11 @@ def call_operator(self, op, args, kwargs, meta, updated=False) -> ProxyValue: if not self._is_dynamic_padding(padding): return super().call_operator(op, args, kwargs, meta, updated) - # Create a pad op before the convolution/pool op. 
+ # Create a pad op before conv2d input_tensor = args[0] zero_padding_pair = [0, 0] - spatial_rank = 3 if op == exir_ops.backend.tosa.CONV3D.default else 2 - zero_spatial_padding = [0] * (spatial_rank * 2) + zero_spatial_padding = [0, 0, 0, 0] N_padding = super().call_shape_operator( exir_ops.backend.tosa.CONST_SHAPE.default, (zero_padding_pair,), @@ -95,7 +93,7 @@ def call_operator(self, op, args, kwargs, meta, updated=False) -> ProxyValue: meta, True, ) - new_args = list(args) - new_args[0] = pad_res - new_args[padding_index] = zero_spatial_padding - return super().call_operator(op, tuple(new_args), kwargs, meta, updated) + new_conv2d_args = list(args) + new_conv2d_args[0] = pad_res + new_conv2d_args[padding_index] = zero_spatial_padding + return super().call_operator(op, tuple(new_conv2d_args), kwargs, meta, updated) diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py index f84ec5b678e..45374c12c3b 100644 --- a/backends/arm/_passes/insert_rescales_pass.py +++ b/backends/arm/_passes/insert_rescales_pass.py @@ -18,7 +18,6 @@ from executorch.backends.arm._passes.quant_args import QuantArgs from executorch.backends.arm.constants import DQ_OPS, Q_OPS -from executorch.backends.arm.tosa.mapping import TosaSpecialDtype from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult from torch.fx import GraphModule, Node @@ -36,12 +35,6 @@ class InsertRescalePass(ArmPass): _passes_required_after: Set[Type[ExportPass]] = set() - _mxfp_payload_dtypes = { - TosaSpecialDtype.FP4E2M1, - TosaSpecialDtype.FP6E2M3, - TosaSpecialDtype.FP6E3M2, - } - def _ensure_uint8_io_only(self, graph_module: GraphModule) -> None: """Ensure uint8 tensors only appear at IO boundaries. 
@@ -58,23 +51,21 @@ def _ensure_uint8_io_only(self, graph_module: GraphModule) -> None: continue if node.op in ("placeholder", "output"): continue - if node.op == "call_function": - if node.target == operator.getitem and all( - user.op == "output" for user in node.users - ): - continue - if node.target == exir_ops.backend.tosa.RESCALE.default: - continue - if ( - node.target - == exir_ops.edge.dim_order_ops._to_dim_order_copy.default - ): - # dim_order is a view-like transform; allow it to preserve uint8 at IO. + if node.op == "call_function" and node.target == operator.getitem: + if all(user.op == "output" for user in node.users): continue - if node.meta.get(TosaSpecialDtype.meta_key()) in self._mxfp_payload_dtypes: - # Sub-byte FP types are stored uint8 arrays, so we need an exception for those. + if ( + node.op == "call_function" + and node.target + == exir_ops.edge.dim_order_ops._to_dim_order_copy.default + ): + # dim_order is a view-like transform; allow it to preserve uint8 at IO. + continue + if ( + node.op == "call_function" + and node.target == exir_ops.backend.tosa.RESCALE.default + ): continue - raise ValueError( f"Found internal uint8 tensor at node {node.name} " f"({node.target}). Uint8 is only allowed at IO boundaries." 
diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py index 82d2ff1dbe0..10b85149dad 100644 --- a/backends/arm/_passes/insert_table_ops.py +++ b/backends/arm/_passes/insert_table_ops.py @@ -58,7 +58,6 @@ class TableOps: exir_ops.edge.aten.acos.default: torch.acos, exir_ops.edge.aten.tan.default: torch.tan, exir_ops.edge.aten.silu.default: torch.nn.functional.silu, - exir_ops.edge.aten.round.default: torch.round, } # Targets that must be treated explicitly diff --git a/backends/arm/_passes/rewrite_conv_pass.py b/backends/arm/_passes/rewrite_conv_pass.py index 3ae5ae9f9fb..6f588a1a1f1 100644 --- a/backends/arm/_passes/rewrite_conv_pass.py +++ b/backends/arm/_passes/rewrite_conv_pass.py @@ -97,25 +97,23 @@ def _adjust_pad_if_needed( if isinstance(mod_remainder, torch.SymInt): shape_env = get_context_shape_env() - exact_values = evaluate_symbolic_expr_values(mod_remainder, shape_env) + exact_values = evaluate_symbolic_expr_values( + mod_remainder.node.expr, shape_env + ) if exact_values is not None: mod_remainder_upper = max(exact_values) - if len(exact_values) == 1: - mod_remainder = int(next(iter(exact_values))) - elif mod_remainder_upper == 0: - mod_remainder = 0 - else: - return pad - mod_remainder else: - # SizeAdjustInputPass already trims symbolic remainder classes - # that would force negative padding. Keep the symbolic - # expression here instead of asking ShapeEnv to normalize it. 
- return pad - mod_remainder - if mod_remainder > pad: + value_ranges = shape_env.bound_sympy(mod_remainder.node.expr) + mod_remainder_upper = int(value_ranges.upper) + if mod_remainder_upper == 0: + mod_remainder = 0 + else: + mod_remainder_upper = mod_remainder + + if mod_remainder_upper > pad: raise RuntimeError( - "This case should be handled by SizeAdjustInputPass, is it enabled?\n" + "This case should be handled by the SizeAdjustInputPass, is it enabled?\n" ) - return pad - mod_remainder def _is_depthwise_conv2d(self, node: torch.fx.Node) -> bool: diff --git a/backends/arm/_passes/rewrite_mxfp_linear.py b/backends/arm/_passes/rewrite_mxfp_linear.py index 6f4a475e46e..d4ca436dc41 100644 --- a/backends/arm/_passes/rewrite_mxfp_linear.py +++ b/backends/arm/_passes/rewrite_mxfp_linear.py @@ -8,53 +8,16 @@ from typing import Any, cast, Sequence, Set, Type import torch -from executorch.backends.arm._passes import ArmOpTargetedPass +from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import ( create_node, get_first_fake_tensor, ) -from executorch.backends.arm.ao_ext.mxfp import ( - mxfp_dtype_to_str, - mxfp_str_to_dtype, - MXFPDType, -) -from executorch.backends.arm.tosa.mapping import TosaSpecialDtype from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2 - - -def _get_weights_payload_dtype( - qdata_node: torch.fx.Node, - dtype: str = "", -) -> MXFPDType: - if dtype: - return mxfp_str_to_dtype(dtype) - qdata = get_first_fake_tensor(qdata_node) - if qdata.dtype == torch.uint8: - return torch.float4_e2m1fn_x2 - return qdata.dtype - - -def _mark_mxfp_payload(node: torch.fx.Node, payload_dtype: MXFPDType) -> None: - """Annotate uint8-backed MXFP payload nodes with their TOSA dtype. 
- PyTorch represents sub-byte MXFP payloads as ``torch.uint8`` tensors, so - the tensor dtype alone cannot distinguish FP4E2M1, FP6E2M3, and FP6E3M2. - Store the logical TOSA dtype in node metadata so later lowering and - serialization treat the payload as MXFP data rather than ordinary uint8. - FP8 payloads have native PyTorch dtypes and do not need this metadata. - """ - if payload_dtype == torch.float4_e2m1fn_x2: - node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.FP4E2M1 - elif payload_dtype == DTYPE_FP6_E2M3: - node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.FP6E2M3 - elif payload_dtype == DTYPE_FP6_E3M2: - node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.FP6E3M2 - - -class RewriteMXFPLinearPass(ArmOpTargetedPass): +class RewriteMXFPLinearPass(ArmPass): """Rewrite ``tosa_mxfp.linear`` into explicit TOSA MXFP operators. For each MXFP linear custom op, the pass: @@ -69,24 +32,15 @@ class RewriteMXFPLinearPass(ArmOpTargetedPass): """ - target_ops = { - torch.ops.tosa_mxfp.linear.default, - exir_ops.edge.tosa_mxfp.linear.default, - } _passes_required_after: Set[Type[ExportPass]] = set() def __init__(self, exported_program: torch.export.ExportedProgram, *args, **kwargs): super().__init__(*args, **kwargs) self.exported_program = exported_program - def _get_linear_args(self, node: torch.fx.Node) -> tuple[ - torch.fx.Node, - torch.fx.Node, - torch.fx.Node, - torch.fx.Node | None, - int, - MXFPDType, - ]: + def _get_linear_args( + self, node: torch.fx.Node + ) -> tuple[torch.fx.Node, torch.fx.Node, torch.fx.Node, torch.fx.Node | None, int]: """Extract the MXFP linear operands from a custom-op node.""" input_node = cast(torch.fx.Node, node.args[0]) weight_qdata_node = cast(torch.fx.Node, node.args[1]) @@ -99,26 +53,7 @@ def _get_linear_args(self, node: torch.fx.Node) -> tuple[ int, node.args[4] if len(node.args) > 4 else node.kwargs.get("block_size", 32), ) - payload_dtype_str = cast( - str, - ( - node.args[5] - if len(node.args) > 5 - else 
node.kwargs.get( - "weight_payload_dtype", - node.kwargs.get("weight_dtype", ""), - ) - ), - ) - payload_dtype = _get_weights_payload_dtype(weight_qdata_node, payload_dtype_str) - return ( - input_node, - weight_qdata_node, - weight_scale_node, - bias_node, - block_size, - payload_dtype, - ) + return input_node, weight_qdata_node, weight_scale_node, bias_node, block_size def _reshape_with_view( self, @@ -149,15 +84,12 @@ def _create_block_scaled_inputs( weight_qdata_node: torch.fx.Node, weight_scale_node: torch.fx.Node, block_size: int, - payload_dtype: MXFPDType, ) -> tuple[torch.fx.Node, torch.fx.Node]: """Create rank-3 inputs for the block-scaled cast and matmul ops.""" graph = graph_module.graph input_fake = get_first_fake_tensor(input_node) weight_qdata_fake = get_first_fake_tensor(weight_qdata_node) weight_scale_fake = get_first_fake_tensor(weight_scale_node) - payload_dtype_str = mxfp_dtype_to_str(payload_dtype) - _mark_mxfp_payload(weight_qdata_node, payload_dtype) batches = reduce(operator.mul, input_fake.shape[:-1], 1) input_reshape_shape = [1, batches, input_fake.shape[-1]] @@ -177,13 +109,13 @@ def _create_block_scaled_inputs( graph=graph, op_target=exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default, args=(input_reshaped, block_size), - kwargs={"output_dtype": payload_dtype_str}, + kwargs={"output_dtype": weight_qdata_fake.dtype}, from_node=mxfp_linear_node, ) cast_node.meta["val"] = exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default( get_first_fake_tensor(input_reshaped), block_size, - output_dtype=payload_dtype_str, + output_dtype=weight_qdata_fake.dtype, ) input_qdata_node = create_node( @@ -194,7 +126,6 @@ def _create_block_scaled_inputs( from_node=mxfp_linear_node, ) input_qdata_node.meta["val"] = cast_node.meta["val"][0] - _mark_mxfp_payload(input_qdata_node, payload_dtype) input_scale_node = create_node( graph=graph, @@ -219,10 +150,8 @@ def _create_matmul_node( weight_qdata_node: torch.fx.Node, weight_scale_node: torch.fx.Node, block_size: int, 
- payload_dtype: MXFPDType, ) -> torch.fx.Node: """Insert ``MATMUL_T_BLOCK_SCALED`` with updated fake metadata.""" - payload_dtype_str = mxfp_dtype_to_str(payload_dtype) matmul_node = create_node( graph=graph_module.graph, op_target=exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default, @@ -233,7 +162,7 @@ def _create_matmul_node( weight_scale_node, block_size, ), - kwargs={"payload_dtype": payload_dtype_str}, + kwargs={}, from_node=mxfp_linear_node, ) matmul_node.meta["val"] = exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default( @@ -242,7 +171,6 @@ def _create_matmul_node( get_first_fake_tensor(weight_qdata_node), get_first_fake_tensor(weight_scale_node), block_size, - payload_dtype=payload_dtype_str, ) return matmul_node @@ -327,7 +255,6 @@ def _rewrite_mxfp_linear_node( weight_scale_node, bias_node, block_size, - payload_dtype, ) = self._get_linear_args(mxfp_linear_node) with graph.inserting_before(mxfp_linear_node): @@ -341,7 +268,6 @@ def _rewrite_mxfp_linear_node( weight_qdata_node, weight_scale_node, block_size, - payload_dtype, ) matmul_node = self._create_matmul_node( graph_module, @@ -351,7 +277,6 @@ def _rewrite_mxfp_linear_node( weight_qdata_node, weight_scale_node, block_size, - payload_dtype, ) with graph.inserting_after(matmul_node): @@ -374,7 +299,10 @@ def call(self, graph_module: torch.fx.GraphModule): graph = graph_module.graph for node in list(graph.nodes): - if node.op != "call_function" or node.target not in self.target_ops: + if node.op != "call_function" or node.target not in ( + torch.ops.tosa_mxfp.linear.default, + exir_ops.edge.tosa_mxfp.linear.default, + ): continue modified = True diff --git a/backends/arm/_passes/size_adjust_input_pass.py b/backends/arm/_passes/size_adjust_input_pass.py index 6028e618d65..1c331b9c329 100644 --- a/backends/arm/_passes/size_adjust_input_pass.py +++ b/backends/arm/_passes/size_adjust_input_pass.py @@ -62,41 +62,6 @@ def _greater_than(input: SymIntLike, other: int) -> bool | torch.SymBool: return input > 
other -def _get_slice_adjustment( - remainder: SymIntLike, - pad: int, - stride: int, -) -> SymIntLike | None: - """Return the amount to slice from the end of a conv dimension. - - The required trim is ``max(remainder - pad, 0)``. For symbolic shapes we - encode that clamp using only integer arithmetic that the TOSA shape - materializer already supports: a sum of floor-div terms over the possible - residue classes. - - """ - if not isinstance(remainder, torch.SymInt): - return remainder - pad if remainder > pad else None - - shape_env = get_context_shape_env() - exact_values = evaluate_symbolic_expr_values(remainder.node.expr, shape_env) - if exact_values is not None: - adjustments = {max(value - pad, 0) for value in exact_values} - if len(adjustments) == 1: - adjustment = next(iter(adjustments)) - return adjustment if adjustment > 0 else None - - if pad >= stride - 1: - return None - - adjustment: SymIntLike | None = None # type: ignore[no-redef] - for threshold in range(pad + 1, stride): - term = (remainder + stride - threshold) // stride - adjustment = term if adjustment is None else adjustment + term - - return adjustment - - def get_slices_convolution(conv_node: torch.fx.Node) -> Slices: slices: Slices = [] @@ -120,12 +85,8 @@ def get_slices_convolution(conv_node: torch.fx.Node) -> Slices: remainder = conv_remainder( input_shape[dim], pad, dilation, weight_shape[dim], stride ) - adjustment = _get_slice_adjustment( - remainder, - pad, - stride, - ) - if adjustment is not None: + if _greater_than(remainder, pad): + adjustment = remainder - pad args = (dim, 0, input_shape[dim] - adjustment) slices.append(args) diff --git a/backends/arm/_passes/symbolic_value_range.py b/backends/arm/_passes/symbolic_value_range.py index 609a84edc54..0753fefa270 100644 --- a/backends/arm/_passes/symbolic_value_range.py +++ b/backends/arm/_passes/symbolic_value_range.py @@ -39,70 +39,11 @@ def _symbol_values(symbol: sympy.Symbol, shape_env: ShapeEnv) -> _ExactValues: return 
frozenset(sympy.Integer(value) for value in range(lower, upper + 1)) -def _expr_symbols_to_values( - expr: sympy.Basic, - shape_env: ShapeEnv, -) -> dict[sympy.Symbol, _ExactValues]: - return {symbol: _symbol_values(symbol, shape_env) for symbol in expr.free_symbols} - - -def _try_expr_to_int(expr: sympy.Basic) -> Optional[int]: - integer_value = _expr_to_int(expr) - if integer_value is not None: - return integer_value - - try: - return _expr_to_int(sympy.simplify(expr)) - except (RecursionError, TypeError): - return None - - -def _constant_expr_values(expr: sympy.Basic) -> Optional[set[int]]: - if expr.free_symbols: - return None - - integer_value = _try_expr_to_int(expr) - return {integer_value} if integer_value is not None else None - - -def _evaluate_exact_values( - expr: sympy.Basic, - shape_env: ShapeEnv, -) -> _ExactValues: - try: - return sympy_interp( - _ExactValueAnalysis, - _expr_symbols_to_values(expr, shape_env), - expr, - missing_handler=lambda symbol: _symbol_values(symbol, shape_env), - ) - except (RecursionError, TypeError): - return None - - -def _exact_values_to_ints(exact_values: _ExactValues) -> Optional[set[int]]: - if exact_values is None: - return None - - result: set[int] = set() - for value in exact_values: - integer_value = _try_expr_to_int(value) - if integer_value is None: - return None - result.add(integer_value) - return result - - def _map_values(values: _ExactValues, fn) -> _ExactValues: if values is None: return None - result = set() - for value in values: - try: - result.add(fn(value)) - except (RecursionError, TypeError): - return None + result = {sympy.simplify(fn(value)) for value in values} if len(result) > _MAX_SET_SIZE: return None return frozenset(result) @@ -114,13 +55,7 @@ def _combine_values(lhs: _ExactValues, rhs: _ExactValues, fn) -> _ExactValues: if len(lhs) * len(rhs) > _MAX_SET_SIZE * _MAX_SET_SIZE: return None - result = set() - for a in lhs: - for b in rhs: - try: - result.add(fn(a, b)) - except (RecursionError, 
TypeError): - return None + result = {sympy.simplify(fn(a, b)) for a in lhs for b in rhs} if len(result) > _MAX_SET_SIZE: return None return frozenset(result) @@ -145,12 +80,6 @@ def mod(lhs: _ExactValues, rhs: _ExactValues) -> _ExactValues: return None return _combine_values(lhs, rhs, lambda a, b: sympy.Mod(a, b)) - @staticmethod - def floordiv(lhs: _ExactValues, rhs: _ExactValues) -> _ExactValues: - if rhs is None or any(value == 0 for value in rhs): - return None - return _combine_values(lhs, rhs, lambda a, b: sympy.floor(a / b)) - @staticmethod def pow(lhs: _ExactValues, rhs: _ExactValues) -> _ExactValues: return _combine_values(lhs, rhs, lambda a, b: a**b) @@ -175,15 +104,35 @@ def evaluate_symbolic_expr_values( ) -> Optional[set[int]]: """Return a best-effort finite set of possible integer values. - The helper avoids ShapeEnv bound queries here because some exported dynamic - expressions trigger very deep SymPy normalization. Instead, it relies on a - small exact-set analysis over bounded symbols using ``sympy_interp``. + The helper first relies on ``bound_sympy`` for cheap singleton detection. + When interval bounds are not precise enough, it falls back to a small + exact-set analysis over bounded symbols using ``sympy_interp``. 
""" - root_expr = expr.node.expr if isinstance(expr, torch.SymInt) else expr - - constant_values = _constant_expr_values(root_expr) - if constant_values is not None: - return constant_values + root_expr = sympy.simplify( + expr.node.expr if isinstance(expr, torch.SymInt) else expr + ) + value_range = shape_env.bound_sympy(root_expr) + if value_range.is_int and value_range.is_singleton(): + singleton = _expr_to_int(value_range.lower) + return {singleton} if singleton is not None else None + + exact_values = sympy_interp( + _ExactValueAnalysis, + { + symbol: _symbol_values(symbol, shape_env) + for symbol in root_expr.free_symbols + }, + root_expr, + missing_handler=lambda symbol: _symbol_values(symbol, shape_env), + ) + if exact_values is None: + return None - return _exact_values_to_ints(_evaluate_exact_values(root_expr, shape_env)) + result: set[int] = set() + for value in exact_values: + integer_value = _expr_to_int(sympy.simplify(value)) + if integer_value is None: + return None + result.add(integer_value) + return result diff --git a/backends/arm/ao_ext/mxfp.py b/backends/arm/ao_ext/mxfp.py index f3b611ce14c..783da92590e 100644 --- a/backends/arm/ao_ext/mxfp.py +++ b/backends/arm/ao_ext/mxfp.py @@ -10,85 +10,12 @@ from executorch.exir._warnings import experimental from torchao.core.config import AOBaseConfig from torchao.prototype.mx_formats.config import ScaleCalculationMode -from torchao.prototype.mx_formats.mx_tensor import ( - DTYPE_FP6_E2M3, - DTYPE_FP6_E3M2, - to_dtype, - to_mx, -) from torchao.quantization import quantize_ -# Pytorch lacks dtypes for the FP6 types, so we use ao's string representations for those. -MXFPDType = torch.dtype | str - - -SUPPORTED_MXFP_DTYPES: set[MXFPDType] = { - torch.float4_e2m1fn_x2, - torch.float8_e4m3fn, - torch.float8_e5m2, - # Use ao's string representations. 
- DTYPE_FP6_E2M3, - DTYPE_FP6_E3M2, -} - - -_DTYPE_TO_STR: dict[MXFPDType, str] = { - DTYPE_FP6_E2M3: "fp6e2m3", - DTYPE_FP6_E3M2: "fp6e3m2", - torch.float4_e2m1fn_x2: "f4e2m1", - torch.float8_e4m3fn: "f8e4m3", - torch.float8_e5m2: "f8e5m2", -} - - -_STR_TO_DTYPE = {value: key for (key, value) in _DTYPE_TO_STR.items()} - - -def mxfp_dtype_to_str(dtype: MXFPDType) -> str: - try: - return _DTYPE_TO_STR[dtype] - except KeyError as e: - supported = ", ".join(str(dtype) for dtype in _DTYPE_TO_STR) - raise ValueError( - f"Unsupported MXFP dtype {dtype}. Supported dtypes: {supported}" - ) from e - - -def mxfp_str_to_dtype(dtype: str) -> MXFPDType: - try: - return _STR_TO_DTYPE[dtype] - except KeyError as e: - supported = ", ".join(sorted(_STR_TO_DTYPE)) - raise ValueError( - f"Unsupported MXFP dtype string {dtype!r}. Supported strings: {supported}" - ) from e - - def _match_supported_modules(module: torch.nn.Module, _name: str) -> bool: """Default filter function that matches supported modules.""" - return isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)) - - -def _cast_to_block_scaled_cpu_ref( - input: torch.Tensor, - output_dtype: MXFPDType, - block_size: int, -) -> torch.Tensor: - """Emulate the current TOSA activation cast in eager mode.""" - input_scale, input_qdata = to_mx( - input.to(torch.float32).contiguous(), - elem_dtype=output_dtype, - block_size=block_size, - scaling_mode=ScaleCalculationMode.RCEIL, - ) - return to_dtype( - input_qdata, - input_scale, - output_dtype, - block_size, - torch.float32, - ) + return isinstance(module, torch.nn.Linear) @experimental("This API is experimental and may change without notice.") @@ -96,7 +23,7 @@ def _cast_to_block_scaled_cpu_ref( class MXFPOpConfig(AOBaseConfig): """Configuration for Arm MXFP source transforms.""" - weight_dtype: MXFPDType = torch.float8_e4m3fn + weight_dtype: torch.dtype = torch.float8_e4m3fn weight_scaling_mode: ScaleCalculationMode = ScaleCalculationMode.RCEIL # Only block size of 32 is 
currently supported for now, so we hardcode it here. @@ -105,7 +32,7 @@ def block_size(self) -> int: return 32 def __post_init__(self) -> None: - if self.weight_dtype not in SUPPORTED_MXFP_DTYPES: + if self.weight_dtype not in (torch.float8_e4m3fn, torch.float8_e5m2): raise ValueError(f"Unsupported weight_dtype: {self.weight_dtype}") if not isinstance(self.weight_scaling_mode, ScaleCalculationMode): raise ValueError( diff --git a/backends/arm/ao_ext/mxfp_tosa_lib.py b/backends/arm/ao_ext/mxfp_tosa_lib.py index 911d944c720..4459ec59126 100644 --- a/backends/arm/ao_ext/mxfp_tosa_lib.py +++ b/backends/arm/ao_ext/mxfp_tosa_lib.py @@ -8,5 +8,4 @@ # MXFP TOSA library definition for the Arm backend containing. # This library will generate custom ops like the following example: # torch.ops.tosa_mxfp.linear.default -# torch.ops.tosa_mxfp.conv2d.default MXFP_TOSA_LIB = Library("tosa_mxfp", "DEF") diff --git a/backends/arm/ao_ext/mxfp_transform.py b/backends/arm/ao_ext/mxfp_transform.py index e1f119aa0a0..b7823524475 100644 --- a/backends/arm/ao_ext/mxfp_transform.py +++ b/backends/arm/ao_ext/mxfp_transform.py @@ -6,7 +6,6 @@ import torch from executorch.backends.arm.ao_ext.mxfp import MXFPOpConfig -from executorch.backends.arm.ao_ext.ops.mxfp_conv2d_op import transform_conv2d_to_mxfp from executorch.backends.arm.ao_ext.ops.mxfp_linear_op import transform_linear_to_mxfp from torchao.quantization.transform_module import register_quantize_module_handler @@ -21,7 +20,5 @@ def _transform_to_mxfp( """ if isinstance(module, torch.nn.Linear): return transform_linear_to_mxfp(module, config) - elif isinstance(module, torch.nn.Conv2d): - return transform_conv2d_to_mxfp(module, config) else: return module diff --git a/backends/arm/ao_ext/ops/__init__.py b/backends/arm/ao_ext/ops/__init__.py index d4c602154fe..a690c4b7b02 100644 --- a/backends/arm/ao_ext/ops/__init__.py +++ b/backends/arm/ao_ext/ops/__init__.py @@ -3,10 +3,8 @@ # This source code is licensed under the BSD-style license 
found in the # LICENSE file in the root directory of this source tree. -from .mxfp_conv2d_op import MXFPConv2dOp from .mxfp_linear_op import MXFPLinearOp __all__ = [ - "MXFPConv2dOp", "MXFPLinearOp", ] diff --git a/backends/arm/ao_ext/ops/mxfp_linear_op.py b/backends/arm/ao_ext/ops/mxfp_linear_op.py index 565d8695c5a..5238f85a847 100644 --- a/backends/arm/ao_ext/ops/mxfp_linear_op.py +++ b/backends/arm/ao_ext/ops/mxfp_linear_op.py @@ -12,50 +12,17 @@ import torch import torch.nn.functional as F -from executorch.backends.arm.ao_ext.mxfp import ( - _cast_to_block_scaled_cpu_ref, - mxfp_dtype_to_str, - mxfp_str_to_dtype, - MXFPDType, - MXFPOpConfig, -) +from executorch.backends.arm.ao_ext.mxfp import MXFPOpConfig from executorch.backends.arm.ao_ext.mxfp_tosa_lib import MXFP_TOSA_LIB +from torchao.prototype.mx_formats.config import ScaleCalculationMode from torchao.prototype.mx_formats.mx_tensor import to_dtype, to_mx - -# Define the custom TOSA operator. Note that weight_payload_dtype is needed as -# an extra argument because sub-byte dtypes (FP4 and FP6) are contained -# in uint8 tensors, meaning the weight tensor itself does not contain -# the dtype. MXFP_TOSA_LIB.define( "linear(Tensor input, Tensor weight_qdata, Tensor weight_scale, " - "Tensor? bias=None, SymInt block_size=32, str weight_payload_dtype='') -> Tensor" + "Tensor? 
bias=None, SymInt block_size=32) -> Tensor" ) -def _get_mx_elem_dtype( - weight_qdata: torch.Tensor, - weight_payload_dtype: str = "", -) -> MXFPDType: - if weight_payload_dtype: - return mxfp_str_to_dtype(weight_payload_dtype) - if weight_qdata.dtype == torch.uint8: - return torch.float4_e2m1fn_x2 - return weight_qdata.dtype - - -def _get_num_input_features( - weight_qdata: torch.Tensor, weight_payload_dtype: str = "" -) -> int: - num_input_features = weight_qdata.shape[-1] - if weight_qdata.dtype == torch.uint8 and weight_payload_dtype == mxfp_dtype_to_str( - torch.float4_e2m1fn_x2 - ): - # FP4 elements are packed pairwise in each byte in a uint8 tensor. - num_input_features *= 2 - return num_input_features - - @torch.library.register_fake("tosa_mxfp::linear", lib=MXFP_TOSA_LIB) # type: ignore[misc] def _mxfp_linear_fake( input: torch.Tensor, @@ -63,7 +30,6 @@ def _mxfp_linear_fake( weight_scale: torch.Tensor, bias: torch.Tensor | None = None, block_size: int = 32, - weight_payload_dtype: str = "", ) -> torch.Tensor: if weight_qdata.ndim != 3: raise ValueError( @@ -73,16 +39,15 @@ def _mxfp_linear_fake( raise ValueError( f"Expected weight_qdata batch dim to be 1, got {weight_qdata.shape[0]}" ) - num_input_features = _get_num_input_features(weight_qdata, weight_payload_dtype) - if input.shape[-1] != num_input_features: + if input.shape[-1] != weight_qdata.shape[-1]: raise ValueError( f"Input last dim {input.shape[-1]} must match linear in_features " - f"{num_input_features}" + f"{weight_qdata.shape[-1]}" ) expected_scale_shape = ( 1, weight_qdata.shape[1], - num_input_features // block_size, + weight_qdata.shape[-1] // block_size, ) if tuple(weight_scale.shape) != expected_scale_shape: raise ValueError( @@ -93,6 +58,27 @@ def _mxfp_linear_fake( return input.new_empty(output_shape, dtype=torch.float32) +def _cast_to_block_scaled_cpu_ref( + input: torch.Tensor, + output_dtype: torch.dtype, + block_size: int, +) -> torch.Tensor: + """Emulate the current TOSA 
activation cast in eager mode.""" + input_scale, input_qdata = to_mx( + input.to(torch.float32).contiguous(), + elem_dtype=output_dtype, + block_size=block_size, + scaling_mode=ScaleCalculationMode.RCEIL, + ) + return to_dtype( + input_qdata, + input_scale, + output_dtype, + block_size, + torch.float32, + ) + + @torch.library.impl("tosa_mxfp::linear", "cpu", lib=MXFP_TOSA_LIB) def _mxfp_linear_cpu( input: torch.Tensor, @@ -100,26 +86,23 @@ def _mxfp_linear_cpu( weight_scale: torch.Tensor, bias: torch.Tensor | None = None, block_size: int = 32, - weight_payload_dtype: str = "", ) -> torch.Tensor: """CPU reference implementation of the MXFP linear op.""" if weight_qdata.ndim != 3 or weight_scale.ndim != 3: raise ValueError("Expected rank-3 weight tensors for MXFP linear") - elem_dtype = _get_mx_elem_dtype(weight_qdata, weight_payload_dtype) - # Cast the input to block-scaled format and back again to match the # expected input format of the TOSA dequantized_input = _cast_to_block_scaled_cpu_ref( input, - elem_dtype, + weight_qdata.dtype, block_size, ) dequantized_weight = to_dtype( weight_qdata, weight_scale, - elem_dtype, + weight_qdata.dtype, block_size, torch.float32, ) @@ -141,7 +124,6 @@ def __init__( ) -> None: super().__init__() self.config = config - self.weight_dtype = mxfp_dtype_to_str(config.weight_dtype) self.register_buffer("weight_qdata", weight_qdata, persistent=True) self.register_buffer("weight_scale", weight_scale, persistent=True) @@ -164,7 +146,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: self.weight_scale, self.bias, self.config.block_size, - self.weight_dtype, ) diff --git a/backends/arm/operator_support/TARGETS b/backends/arm/operator_support/TARGETS index 88e112feac5..a2fd054d472 100644 --- a/backends/arm/operator_support/TARGETS +++ b/backends/arm/operator_support/TARGETS @@ -4,7 +4,6 @@ runtime.python_library( name = "operator_support", srcs = glob(["*.py"]), deps = [ - "//executorch/backends/arm:ao_ext", 
"//executorch/backends/arm:constants", "//executorch/backends/arm/_passes:passes", "//executorch/backends/arm/tosa:resize_utils", diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py index 4d48d6ad0ff..066b5462f64 100644 --- a/backends/arm/operator_support/__init__.py +++ b/backends/arm/operator_support/__init__.py @@ -21,7 +21,6 @@ reduce_sum_support, right_shift_support, slice_copy_support, - sym_size_int_support, to_dim_order_copy_support, tosa_supported_operators, unfold_copy_support, diff --git a/backends/arm/operator_support/tosa_profile_supported_op_lists.py b/backends/arm/operator_support/tosa_profile_supported_op_lists.py index dc448ba0d5f..fab4e6c60c1 100644 --- a/backends/arm/operator_support/tosa_profile_supported_op_lists.py +++ b/backends/arm/operator_support/tosa_profile_supported_op_lists.py @@ -99,7 +99,6 @@ exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, exir_ops.edge.aten.pad.default, exir_ops.edge.aten.constant_pad_nd.default, - exir_ops.edge.aten.argmax.default, exir_ops.edge.aten.amax.default, exir_ops.edge.aten.amin.default, exir_ops.edge.aten.eye.default, @@ -129,7 +128,6 @@ exir_ops.edge.aten.tan.default, exir_ops.edge.aten.silu.default, exir_ops.edge.aten.detach_copy.default, - exir_ops.edge.aten.round.default, } @@ -239,7 +237,6 @@ operator.getitem, exir_ops.edge.aten.pad.default, exir_ops.edge.aten.constant_pad_nd.default, - exir_ops.edge.aten.argmax.default, exir_ops.edge.aten.amax.default, exir_ops.edge.aten.amin.default, exir_ops.edge.aten.eye.default, diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py index 82a529d62a2..2e640b758d2 100644 --- a/backends/arm/operator_support/tosa_supported_operators.py +++ b/backends/arm/operator_support/tosa_supported_operators.py @@ -14,12 +14,9 @@ import typing from typing import final, Optional, Sequence, Type -# Register Arm-specific torch.library ops 
and MXFP transforms at package -# import time. -import executorch.backends.arm.ao_ext # noqa: F401 - import torch import torch.fx as fx + from executorch.backends.arm._passes.arm_pass_utils import ( get_first_fake_tensor, is_submodule_node, @@ -87,7 +84,7 @@ def __init__(self, tosa_spec: TosaSpecification, reporter: WhyNoPartitionReporte # Class attributes populated by subclasses tosa_specs: list[TosaSpecification] = TosaSpecification.all_versions_and_profiles() - targets: list[object] = [] + targets: list[str] = [] @final def is_node_supported( @@ -243,10 +240,7 @@ def get_registered_tosa_support_checks( class MXOpsSupportList(OperatorSupportBase): """Accept Arm MX custom ops when the active spec enables MX support.""" - targets = ( - exir_ops.edge.tosa_mxfp.conv2d.default, - exir_ops.edge.tosa_mxfp.linear.default, - ) + targets = (exir_ops.edge.tosa_mxfp.linear.default,) def is_node_supported( self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node @@ -254,141 +248,88 @@ def is_node_supported( return node.op == "call_function" and node.target in self.targets -def _profile_support_check( +def tosa_support_factory( tosa_spec: TosaSpecification, -) -> Optional[OperatorSupportBase]: - if tosa_spec.support_integer() and tosa_spec.support_float(): - return TOSAProINTFPSupportList() - if tosa_spec.support_integer(): - return TOSAProINTSupportList() - if tosa_spec.support_float(): - return TOSAProFPSupportList() - return None + exported_program: ExportedProgram, + reporter: WhyNoPartitionReporter, + additional_checks: Optional[Sequence[OperatorSupportBase]] = None, +) -> OperatorSupportBase: + """Create an OperatorSupport composite for a TOSA spec. + Combine profile-specific positive checks, registered operator checks, and + negative checks into a single :py:class:`OperatorSupportBase` chain. 
-def _registered_support_checks( - tosa_spec: TosaSpecification, - reporter: WhyNoPartitionReporter, -) -> list[OperatorSupportBase]: - return [ - check(tosa_spec, reporter) - for check in get_registered_tosa_support_checks(tosa_spec) - ] + Args: + tosa_spec (TosaSpecification): Active TOSA specification. + exported_program (ExportedProgram): Program context for checks. + reporter (WhyNoPartitionReporter): Reporter for rejections. + additional_checks (Optional[Sequence[OperatorSupportBase]]): Extra + negative checks to apply. + Returns: + OperatorSupportBase: Composite checker for the given spec. -def _positive_checks( - tosa_spec: TosaSpecification, - exported_program: ExportedProgram, - reporter: WhyNoPartitionReporter, -) -> list[OperatorSupportBase]: - checks: list[OperatorSupportBase] = [ + """ + # Postive checks: Add nodes to partitioning + positive_checks: list[OperatorSupportBase] = [ ControlFlowSubmoduleSupported(exported_program, tosa_spec, reporter), ControlFlowOpSupported(exported_program, tosa_spec, reporter), ] - if profile_check := _profile_support_check(tosa_spec): - checks.append(profile_check) - + if tosa_spec.support_integer() and tosa_spec.support_float(): + positive_checks.append(TOSAProINTFPSupportList()) + elif tosa_spec.support_integer(): + positive_checks.append(TOSAProINTSupportList()) + elif tosa_spec.support_float(): + positive_checks.append(TOSAProFPSupportList()) if tosa_spec.support_extension("mxfp"): - checks.append(MXOpsSupportList()) - + positive_checks.append(MXOpsSupportList()) # TODO: Refactor to use TOSAProSupportLists + negtive checks - checks.extend(_registered_support_checks(tosa_spec, reporter)) + positive_checks += [ + check(tosa_spec, reporter) + for check in get_registered_tosa_support_checks(tosa_spec) + ] - return checks + # Negative checks: Remove nodes from partitioning + negative_checks: list[OperatorSupportBase] = [ + CheckInt64InputsAndOutputs(exported_program, reporter), + RankCheck(reporter, max_rank=MAX_RANK), 
+ *[ + reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}") + for check in (additional_checks if additional_checks else []) + ], + ] + if tosa_spec.support_float(): + negative_checks.append(CheckMixedFloatingInputs(reporter)) + else: + negative_checks.append(CheckArmQuantized(reporter)) + negative_checks.append(CheckProperQuantization(reporter)) -def _disallowed_dtypes(tosa_spec: TosaSpecification) -> list[torch.dtype]: - dtypes = [torch.float64] + disallowed_dtypes = [torch.float64] if not tosa_spec.support_extension("bf16"): - dtypes.append(torch.bfloat16) + disallowed_dtypes.append(torch.bfloat16) if not ( tosa_spec.support_extension("fp8e4m3") or tosa_spec.support_extension("mxfp") ): - dtypes.append(torch.float8_e4m3fn) + disallowed_dtypes.append(torch.float8_e4m3fn) if not ( tosa_spec.support_extension("fp8e5m2") or tosa_spec.support_extension("mxfp") ): - dtypes.append(torch.float8_e5m2) + disallowed_dtypes.append(torch.float8_e5m2) if tosa_spec.is_U55_subset: - dtypes.append(torch.bool) - return dtypes - - -def _wrapped_additional_checks( - additional_checks: Optional[Sequence[OperatorSupportBase]], - reporter: WhyNoPartitionReporter, -) -> list[OperatorSupportBase]: - if not additional_checks: - return [] - return [ - reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}") - for check in additional_checks - ] - - -def _negative_checks( - tosa_spec: TosaSpecification, - exported_program: ExportedProgram, - reporter: WhyNoPartitionReporter, - additional_checks: Optional[Sequence[OperatorSupportBase]], -) -> list[OperatorSupportBase]: - checks: list[OperatorSupportBase] = [RankCheck(reporter, MAX_RANK)] - - if not tosa_spec.support_extension("int64"): - checks.append(CheckInt64InputsAndOutputs(exported_program, reporter, tosa_spec)) - - checks.extend(_wrapped_additional_checks(additional_checks, reporter)) - - if tosa_spec.support_float(): - checks.append(CheckMixedFloatingInputs(reporter)) - else: - 
checks.append(CheckArmQuantized(reporter)) - checks.append(CheckProperQuantization(reporter)) - - checks.append( + disallowed_dtypes.append(torch.bool) + negative_checks.append( CheckDtypeInputsAndOutputs( - exported_program, reporter, _disallowed_dtypes(tosa_spec), tosa_spec + exported_program, reporter, disallowed_dtypes, tosa_spec ) ) - if tosa_spec.is_U55_subset: - checks.append(EthosU55NotSupported(reporter)) - checks.append(EthosU55DtypeSupport(reporter)) - checks.append(EthosU55CastCheck(reporter)) - + negative_checks.append(EthosU55NotSupported(reporter)) + negative_checks.append(EthosU55DtypeSupport(reporter)) + negative_checks.append(EthosU55CastCheck(reporter)) if not tosa_spec.support_extension("shape"): - checks.append(SymbolicShapeSupportCheck(reporter)) - - return checks - - -def tosa_support_factory( - tosa_spec: TosaSpecification, - exported_program: ExportedProgram, - reporter: WhyNoPartitionReporter, - additional_checks: Optional[Sequence[OperatorSupportBase]] = None, -) -> OperatorSupportBase: - """Create an OperatorSupport composite for a TOSA spec. - - Combine profile-specific positive checks, registered operator checks, and - negative checks into a single :py:class:`OperatorSupportBase` chain. - - Args: - tosa_spec (TosaSpecification): Active TOSA specification. - exported_program (ExportedProgram): Program context for checks. - reporter (WhyNoPartitionReporter): Reporter for rejections. - additional_checks (Optional[Sequence[OperatorSupportBase]]): Extra - negative checks to apply. - - Returns: - OperatorSupportBase: Composite checker for the given spec. 
- - """ - positive_checks = _positive_checks(tosa_spec, exported_program, reporter) - negative_checks = _negative_checks( - tosa_spec, exported_program, reporter, additional_checks - ) + negative_checks.append(SymbolicShapeSupportCheck(reporter)) return chain( reporter.wrap_check( @@ -427,40 +368,6 @@ def _has_symbolic_shape(node: fx.Node) -> bool: return False - def _partition_dynamic_upmsample_nearest2d(self, node: fx.Node) -> bool: - """Check if the node is an upsample_nearest2d with symbolic shapes. - - Args: - node (fx.Node): FX node to check. - - Returns: - bool: True if the node is an upsample_nearest2d with symbolic - shapes; otherwise, False. - - """ - if node.target != exir_ops.edge.aten.upsample_nearest2d.vec: - return False - - try: - input_tensor = get_first_fake_tensor(node.all_input_nodes[0]) - output_tensor = get_first_fake_tensor(node) - except Exception as exc: - self.reporter.report_reject( - node, - f"upsample_nearest2d symbolic shapes need tensor metadata: {exc}", - ) - return False - - input_size_xy = input_tensor.shape[2:4] - output_size_xy = output_tensor.shape[2:4] - if len(input_size_xy) != 2 or len(output_size_xy) != 2: - self.reporter.report_reject( - node, "upsample_nearest2d expects 2D spatial input/output." 
- ) - return False - - return True - def is_node_supported( self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node ) -> bool: @@ -487,13 +394,14 @@ def is_node_supported( self._has_symbolic_shape(input_node) for input_node in node.all_input_nodes ): if node.target == exir_ops.edge.aten.upsample_nearest2d.vec: - return self._partition_dynamic_upmsample_nearest2d(node) - else: - self.reporter.report_reject( - node, - "Node has symbolic shape, has the TOSA spec shape extension support?", - ) - return False + return True + + self.reporter.report_reject( + node, + "Node has symbolic shape but the TOSA spec does not support " + "the shape extension.", + ) + return False return True @@ -654,10 +562,7 @@ def is_node_supported( self.reporter.report_reject(node, "One or more inputs were not quantized.") return False - all_q_users = all( - output_node.target in (*Q_OPS, torch.ops.aten.sym_size.int) - for output_node in node.users - ) + all_q_users = all((output_node.target in Q_OPS) for output_node in node.users) output_dtype = get_first_fake_tensor(node).dtype output_quantized = ( output_quantized or all_q_users or _is_integer_dtype(output_dtype) @@ -683,10 +588,7 @@ class CheckInt64InputsAndOutputs(OperatorSupportBase): """ def __init__( - self, - exported_program: ExportedProgram, - reporter: WhyNoPartitionReporter, - tosa_spec: TosaSpecification, + self, exported_program: ExportedProgram, reporter: WhyNoPartitionReporter ): """Initialize the check with program context and reporter.""" self.input_names = [ @@ -695,7 +597,6 @@ def __init__( if spec.kind == InputKind.USER_INPUT ] self.reporter = reporter - self.tosa_spec = tosa_spec self.int32_min = torch.iinfo(torch.int32).min self.int32_max = torch.iinfo(torch.int32).max super().__init__() @@ -708,138 +609,6 @@ def inside_int32_bounds(self, node: torch.fx.Node) -> bool: min_val, max_val = int(torch.min(data)), int(torch.max(data)) return min_val >= self.int32_min and max_val <= self.int32_max - def 
has_rejected_int64_output( - self, node: torch.fx.Node, tensor_list: Sequence[typing.Any] - ) -> bool: - if node.target in ( - torch.ops.aten.argmax.default, - exir_ops.edge.aten.argmax.default, - ): - return not self._is_tosa_argmax_supported(node) - return any( - tensor.dtype == torch.int64 - for tensor in tensor_list - if isinstance(tensor, FakeTensor) - ) - - def _is_tosa_argmax_dtype_supported( - self, node: torch.fx.Node, input_dtype: torch.dtype - ) -> bool: - if input_dtype == torch.int8: - if not self.tosa_spec.support_integer(): - self.reporter.report_reject( - node, "TOSA ARGMAX requires PRO-INT for int8 input." - ) - return False - elif input_dtype == torch.int16: - if not ( - self.tosa_spec.support_integer() - and self.tosa_spec.support_extension("int16") - ): - self.reporter.report_reject( - node, "TOSA ARGMAX requires EXT-INT16 for int16 input." - ) - return False - elif input_dtype in (torch.float16, torch.float32): - if not self.tosa_spec.support_float(): - self.reporter.report_reject( - node, f"TOSA ARGMAX requires PRO-FP for {input_dtype} input." - ) - return False - elif input_dtype == torch.bfloat16: - if not ( - self.tosa_spec.support_float() - and self.tosa_spec.support_extension("bf16") - ): - self.reporter.report_reject( - node, "TOSA ARGMAX requires EXT-BF16 for bfloat16 input." - ) - return False - else: - self.reporter.report_reject( - node, f"TOSA ARGMAX does not support {input_dtype} input." - ) - return False - return True - - def _is_tosa_argmax_supported(self, node: torch.fx.Node) -> bool: - dim = node.kwargs.get("dim", node.args[1] if len(node.args) > 1 else None) - if dim is None: - self.reporter.report_reject( - node, "TOSA ARGMAX requires an explicit reduction dimension." - ) - return False - if not isinstance(dim, int): - self.reporter.report_reject( - node, "TOSA ARGMAX requires a statically known reduction dimension." 
- ) - return False - - input_node = typing.cast(torch.fx.Node, node.args[0]) - input_tensor = get_first_fake_tensor(input_node) - if not self._is_tosa_argmax_dtype_supported(node, input_tensor.dtype): - return False - - input_rank = len(input_tensor.shape) - if input_rank == 0: - self.reporter.report_reject( - node, "TOSA ARGMAX requires an input with rank at least 1." - ) - return False - - axis = dim + input_rank if dim < 0 else dim - if axis < 0 or axis >= input_rank: - self.reporter.report_reject( - node, - f"TOSA ARGMAX axis must be in [0, {input_rank - 1}] but got {dim}.", - ) - return False - - keepdim = node.kwargs.get( - "keepdim", node.args[2] if len(node.args) > 2 else False - ) - if keepdim: - self.reporter.report_reject( - node, "TOSA ARGMAX does not support keepdim=True." - ) - return False - - return True - - def _check_int64_input_nodes(self, node: torch.fx.Node) -> bool: - """Check if all int64 input nodes are constant and will be - partitioned. - """ - for input_node in ( - input_node - for input_node in node.all_input_nodes - if input_node.op != "get_attr" - ): - if isinstance(input_node.meta["val"], torch.SymInt): - continue - tensor_in = get_first_fake_tensor(input_node) - if tensor_in.dtype != torch.int64: - continue - # Constant placeholder - if ( - input_node.op != "call_function" - and input_node.name not in self.input_names - ): - continue - # Constant operator - if input_node.op == "call_function": - if input_node.target in ComputeConstantOpsAOTPass.targeted_ops: - # This is not perfect since the input_node can still be rejected by other checks but - # this should cover the majority of cases. 
- if self.is_node_supported({}, input_node): - continue - self.reporter.report_reject( - node, f"Non-constant int64 input {input_node.name}" - ) - return False - - return True - def is_node_supported( self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node ) -> bool: @@ -849,7 +618,7 @@ def is_node_supported( vals = node.meta["val"] tensor_list = vals if isinstance(vals, (list, tuple)) else [vals] - any_int64 = self.has_rejected_int64_output(node, tensor_list) + any_int64 = any(tensor.dtype == torch.int64 for tensor in tensor_list) # Don't partition nodes with int64 output... if any_int64: # ... Except for constant ops that are directly cast to something non-int64. @@ -883,7 +652,35 @@ def is_node_supported( ) return False - return self._check_int64_input_nodes(node) + # Ops with int64 inputs are only partitioned if input nodes are constant and will be partitioned. + # If it is not partitioned, the partition will get an int64 input and fail. + for input_node in ( + input_node + for input_node in node.all_input_nodes + if input_node.op != "get_attr" + ): + tensor_in = get_first_fake_tensor(input_node) + if tensor_in.dtype != torch.int64: + continue + # Constant placeholder + if ( + input_node.op != "call_function" + and input_node.name not in self.input_names + ): + continue + # Constant operator + if input_node.op == "call_function": + if input_node.target in ComputeConstantOpsAOTPass.targeted_ops: + # This is not perfect since the input_node can still be rejected by other checks but + # this should cover the majority of cases. 
+ if self.is_node_supported({}, input_node): + continue + self.reporter.report_reject( + node, f"Non-constant int64 input {input_node.name}" + ) + return False + + return True class CheckDtypeInputsAndOutputs(OperatorSupportBase): @@ -915,9 +712,6 @@ def is_node_supported( for input_node in node.all_input_nodes if input_node.op != "get_attr" ): - if isinstance(input_node.meta["val"], torch.SymInt): - continue - tensor = get_first_fake_tensor(input_node) if tensor.dtype in self.disallowed_dtypes: self.reporter.report_reject( @@ -978,8 +772,6 @@ def is_node_supported( for input_node in node.all_input_nodes if input_node.op != "get_attr" ): - if isinstance(input_node.meta["val"], torch.SymInt): - continue dtype = get_first_fake_tensor(input_node).dtype if dtype.is_floating_point: floating_dtypes.add(dtype) @@ -1017,8 +809,6 @@ def is_node_supported( ) # check if any input node has an unsupported rank for input_node in input_nodes: - if isinstance(input_node.meta["val"], torch.SymInt): - continue input_node_shape = get_first_fake_tensor(input_node).shape if len(input_node_shape) > self.max_rank: self.reporter.report_reject( diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 1acaf4e65ef..aa988a1ccd7 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -42,13 +42,11 @@ op_sub, op_sum, op_to_dim_order_copy, - op_tosa_argmax, op_tosa_avg_pool2d, op_tosa_avg_pool2d_adaptive, op_tosa_cast_to_block_scaled, op_tosa_clamp, op_tosa_conv2d, - op_tosa_conv2d_block_scaled, op_tosa_conv3d, op_tosa_custom, op_tosa_depthwise_conv2d, diff --git a/backends/arm/operators/op_tosa_cast_to_block_scaled.py b/backends/arm/operators/op_tosa_cast_to_block_scaled.py index b49fa521866..454c28ddfe2 100644 --- a/backends/arm/operators/op_tosa_cast_to_block_scaled.py +++ b/backends/arm/operators/op_tosa_cast_to_block_scaled.py @@ -5,7 +5,7 @@ """Provide a visitor for lowering block-scaled casts to TOSA.""" import operator 
-from typing import Any, List +from typing import Any, cast, List import torch import tosa_serializer as ts @@ -16,36 +16,25 @@ ) from executorch.backends.arm.operators.operator_validation_utils import ( validate_num_inputs, - validate_valid_dtype, ) -from executorch.backends.arm.tosa.mapping import TosaArg, TosaSpecialDtype +from executorch.backends.arm.tosa.mapping import TosaArg from executorch.backends.arm.tosa.specification import TosaSpecification -def _getitem_index(node: torch.fx.Node) -> int: - index = node.args[1] - if not isinstance(index, int): - raise ValueError( - f"CAST_TO_BLOCK_SCALED: expected integer getitem index, got {index!r}" - ) - return index - - -def _ordered_getitem_outputs(node: torch.fx.Node) -> list[torch.fx.Node]: +def _ordered_getitem_output_names(node: torch.fx.Node) -> list[str]: getitem_users = [ user for user in node.users if user.op == "call_function" and user.target == operator.getitem ] - ordered_users = sorted(getitem_users, key=_getitem_index) + ordered_users = sorted(getitem_users, key=lambda user: cast(int, user.args[1])) if len(ordered_users) != 2: raise ValueError( - f"{CastToBlockScaledVisitor.target}: Expected exactly two getitem " - f"outputs, got {len(ordered_users)}" + f"{CastToBlockScaledVisitor.target}: Expected exactly two getitem outputs, got {len(ordered_users)}" ) - return ordered_users + return [user.name for user in ordered_users] @register_node_visitor @@ -69,67 +58,15 @@ def define_node( raise ValueError(f"{self.target} requires the TOSA mxfp extension") input_tensor = inputs[0] - block_size = inputs[1].number if hasattr(inputs[1], "number") else None - if not isinstance(block_size, int) or isinstance(block_size, bool): - raise ValueError(f"{self.target}: missing block_size argument") - - validate_valid_dtype( - self.target, - input_tensor, - [ts.DType.FP32, ts.DType.BF16, ts.DType.FP16], - self.tosa_spec, - ) - - if not isinstance(node.meta.get("val"), tuple) or len(node.meta["val"]) != 2: - raise 
ValueError( - f"{self.target}: expected tuple metadata with two outputs, got {node.meta.get('val')!r}" - ) + block_size = inputs[1].number output_data_tensor, output_scale_tensor = node.meta["val"] - output_getitems = _ordered_getitem_outputs(node) - output_names = [user.name for user in output_getitems] - output_payload_dtype = output_getitems[0].meta.get(TosaSpecialDtype.meta_key()) - - if output_payload_dtype in ( - TosaSpecialDtype.FP4E2M1, - TosaSpecialDtype.FP6E2M3, - TosaSpecialDtype.FP6E3M2, - ): - output_data_dtype = output_payload_dtype.get_tosa_dtype() - elif output_data_tensor.dtype == torch.float8_e4m3fn: - output_data_dtype = ts.DType.FP8E4M3 - elif output_data_tensor.dtype == torch.float8_e5m2: - output_data_dtype = ts.DType.FP8E5M2 - else: - raise ValueError( - f"{self.target}: unsupported payload dtype {output_data_tensor.dtype}" - ) - if output_data_dtype not in ( - ts.DType.FP4E2M1, - ts.DType.FP6E2M3, - ts.DType.FP6E3M2, - ts.DType.FP8E4M3, - ts.DType.FP8E5M2, - ): - raise ValueError( - f"{self.target}: unsupported payload dtype {output_data_dtype}" - ) - if output_scale_tensor.dtype != torch.float8_e8m0fnu: - raise ValueError( - f"{self.target}: unsupported scale dtype {output_scale_tensor.dtype}" - ) - if not hasattr(ts.Op, "CAST_TO_BLOCK_SCALED"): - raise NotImplementedError( - "tosa_serializer does not provide CAST_TO_BLOCK_SCALED yet" - ) + # TODO(MLETORCH-2018): This is a local workaround for multi-output TOSA ops. + # Remove it once twe can handle multiple outputs generally. 
+ output_names = _ordered_getitem_output_names(node) attr = ts.TosaSerializerAttribute() - attr_ctor = getattr(attr, "CastToBlockScaledAttribute", None) - if attr_ctor is None: - raise NotImplementedError( - "tosa_serializer does not provide CastToBlockScaledAttribute yet" - ) - attr_ctor(block_size) + attr.CastToBlockScaledAttribute(block_size) self._serialize_operator( node, diff --git a/backends/arm/operators/op_tosa_matmul_t_block_scaled.py b/backends/arm/operators/op_tosa_matmul_t_block_scaled.py index 4c3a8ba99b2..2f1bd88c2bb 100644 --- a/backends/arm/operators/op_tosa_matmul_t_block_scaled.py +++ b/backends/arm/operators/op_tosa_matmul_t_block_scaled.py @@ -53,13 +53,7 @@ def define_node( validate_valid_dtype( self.target, [A_data, B_data], - [ - ts.DType.FP4E2M1, - ts.DType.FP6E2M3, - ts.DType.FP6E3M2, - ts.DType.FP8E4M3, - ts.DType.FP8E5M2, - ], + [ts.DType.FP8E4M3, ts.DType.FP8E5M2], self.tosa_spec, ) validate_valid_dtype( diff --git a/backends/arm/operators/op_tosa_shapes.py b/backends/arm/operators/op_tosa_shapes.py index b7480d78a4d..25c861a403d 100644 --- a/backends/arm/operators/op_tosa_shapes.py +++ b/backends/arm/operators/op_tosa_shapes.py @@ -13,7 +13,6 @@ NodeVisitor, register_node_visitor, ) -from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.mapping import TosaArg from executorch.backends.arm.tosa.utils import normalize_symint @@ -22,6 +21,9 @@ class TosaConstShapeVisitor(NodeVisitor): target = "tosa.CONST_SHAPE.default" + def __init__(self, *args): + super().__init__(*args) + def define_node( self, node: torch.fx.Node, @@ -41,217 +43,3 @@ def define_node( vals=vals, name=output.name, ) - - -class TosaShapeNodeVisitor(NodeVisitor): - - tosa_specs = TosaSpecification.all_profiles_for_version("1.1") - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: Any, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - tosa_graph = cast(ts.TosaSerializer, tosa_graph) - 
tosa_graph.currRegion.currBasicBlock.addShape( - output.name, - output.shape[0], - ) - - -class TosaBasicShapeVisitor(TosaShapeNodeVisitor): - tosa_op: ts.Op - attr_method: str - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: Any, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - super().define_node(node, tosa_graph, inputs, output) - self.serialize( - node, - tosa_graph, - tosa_op=self.tosa_op, - inputs=inputs, - output=output, - attr_method=self.attr_method, - ) - - -@register_node_visitor -class TosaDimShapeVisitor(TosaShapeNodeVisitor): - target = "tosa.DIM.default" - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: Any, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - super().define_node(node, tosa_graph, inputs, output) - - attr = ts.TosaSerializerAttribute() - attr.DimAttribute(axis=node.kwargs["axis"]) - self._serialize_operator( - node, - tosa_graph, - ts.Op.DIM, - [inputs[0].name], - [output.name], - attr, - ) - - -@register_node_visitor -class TosaAddShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.ADD_SHAPE.default" - - tosa_op = ts.Op.ADD_SHAPE - attr_method = "AddShapeAttribute" - - -@register_node_visitor -class TosaSubShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.SUB_SHAPE.default" - - tosa_op = ts.Op.SUB_SHAPE - attr_method = "SubShapeAttribute" - - -@register_node_visitor -class TosaAssertEqualShapeVisitor(TosaShapeNodeVisitor): - target = "tosa.ASSERT_EQUAL_SHAPE.default" - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: Any, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - super().define_node(node, tosa_graph, inputs, output) - tosa_graph = cast(ts.TosaSerializer, tosa_graph) - attr = ts.TosaSerializerAttribute() - attr.AssertEqualShapeAttribute(allow_broadcast=node.kwargs["allow_broadcast"]) - self._serialize_operator( - node, - tosa_graph, - ts.Op.ASSERT_EQUAL_SHAPE, - [inputs[0].name, inputs[1].name], - [output.name], - attr, - ) - - 
-@register_node_visitor -class TosaCatShapeVisitor(TosaShapeNodeVisitor): - target = "tosa.CONCAT_SHAPE.default" - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: Any, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - super().define_node(node, tosa_graph, inputs, output) - tosa_graph = cast(ts.TosaSerializer, tosa_graph) - - input_shape_list = [input.name for input in inputs[0].special] - - attr = ts.TosaSerializerAttribute() - attr.ConcatShapeAttribute() - self._serialize_operator( - node, - tosa_graph, - ts.Op.CONCAT_SHAPE, - input_shape_list, - [output.name], - attr, - ) - - -@register_node_visitor -class TosaDivCeilShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.DIV_CEIL_SHAPE.default" - - tosa_op = ts.Op.DIV_CEIL_SHAPE - attr_method = "DivCeilShapeAttribute" - - -@register_node_visitor -class TosaDivShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.DIV_FLOOR_SHAPE.default" - - tosa_op = ts.Op.DIV_FLOOR_SHAPE - attr_method = "DivFloorShapeAttribute" - - -@register_node_visitor -class TosaExp2ShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.EXP2_SHAPE.default" - - tosa_op = ts.Op.EXP2_SHAPE - attr_method = "Exp2ShapeAttribute" - - -@register_node_visitor -class TosaLog2CeilShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.LOG2_CEIL_SHAPE.default" - - tosa_op = ts.Op.LOG2_CEIL_SHAPE - attr_method = "Log2CeilShapeAttribute" - - -@register_node_visitor -class TosaLog2FloorShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.LOG2_FLOOR_SHAPE.default" - - tosa_op = ts.Op.LOG2_FLOOR_SHAPE - attr_method = "Log2FloorShapeAttribute" - - -@register_node_visitor -class TosaMaxShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.MAX_SHAPE.default" - - tosa_op = ts.Op.MAX_SHAPE - attr_method = "MaxShapeAttribute" - - -@register_node_visitor -class TosaMinShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.MIN_SHAPE.default" - - tosa_op = ts.Op.MIN_SHAPE - attr_method = "MinShapeAttribute" - - -@register_node_visitor -class 
TosaMulShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.MUL_SHAPE.default" - - tosa_op = ts.Op.MUL_SHAPE - attr_method = "MulShapeAttribute" - - -@register_node_visitor -class TosaSliceShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.SLICE_SHAPE.default" - - tosa_op = ts.Op.SLICE_SHAPE - attr_method = "SliceShapeAttribute" - - -@register_node_visitor -class TosaModShapeVisitor(TosaBasicShapeVisitor): - target = "tosa.MOD_SHAPE.default" - - tosa_op = ts.Op.MOD_SHAPE - attr_method = "ModShapeAttribute" diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py index a0c2dbeb1fb..5f9c3e3938c 100644 --- a/backends/arm/process_node.py +++ b/backends/arm/process_node.py @@ -56,82 +56,14 @@ def _tensor_to_numpy(tensor: torch.Tensor) -> np.ndarray: def _prepare_const_values_for_tosa_dtype( - values: np.ndarray, tosa_arg: TosaArg + values: np.ndarray, tosa_dtype: ts.DType ) -> np.ndarray: """Normalize constant storage to the expected TOSA serializer dtype.""" - if tosa_arg.dtype == ts.DType.INT48 and values.dtype != np.int64: + if tosa_dtype == ts.DType.INT48 and values.dtype != np.int64: return values.astype(np.int64) - if tosa_arg.dtype in (ts.DType.FP6E2M3, ts.DType.FP6E3M2): - if values.dtype == np.uint8: - try: - import ml_dtypes # type: ignore[import-not-found] - except ImportError as e: - raise RuntimeError( - "ml_dtypes is required to serialize FP6 tensors for TOSA. " - "Have you run setup.sh?" 
- ) from e - ml_dtype = { - ts.DType.FP6E2M3: ml_dtypes.float6_e2m3fn, - ts.DType.FP6E3M2: ml_dtypes.float6_e3m2fn, - }[tosa_arg.dtype] - return values.view(ml_dtype) return values -def _get_const_shape(values: np.ndarray, tosa_arg: TosaArg) -> list[int]: - """Return the TOSA logical shape for a serialized constant.""" - if tosa_arg.dtype == ts.DType.FP4E2M1: - return normalize_symint(tosa_arg.shape) - return normalize_symint(values.shape) - - -def _is_packed_fp4_const(values: np.ndarray, tosa_arg: TosaArg) -> bool: - """FP4 elements are pairwise in each byte of a uint8 tensor. - - This function checks if the given values and TOSA argument represent a - packed FP4 constant. - - """ - - return ( - tosa_arg.dtype == ts.DType.FP4E2M1 - and values.dtype == np.uint8 - and values.shape[-1] * 2 == tosa_arg.shape[-1] - ) - - -def _add_const( - tosa_graph: Any, - values: np.ndarray, - tosa_arg: TosaArg, - name: str, -) -> None: - """Add a constant, preserving packed FP4 storage when required.""" - if _is_packed_fp4_const(values, tosa_arg): - # TOSA FP4 tensors have logical FP4 shape, but constants are stored as - # packed bytes (two values per byte). Add the raw bytes as INT8 first - # then set TOSA dtype and shape correctly on the tensor metadata. 
- tosa_graph.addConst( - normalize_symint(values.shape), - ts.DType.INT8, - values, - name=name, - ) - tensor = tosa_graph.currRegion.currBasicBlock.tensors[name] - tensor.setDtype(ts.DType.FP4E2M1) - for dim, size in enumerate(normalize_symint(tosa_arg.shape)): - tensor.SetDimSize(dim, size) - return - - prepared_values = _prepare_const_values_for_tosa_dtype(values, tosa_arg) - tosa_graph.addConst( - _get_const_shape(prepared_values, tosa_arg), - tosa_arg.dtype, - prepared_values, - name=name, - ) - - def process_call_function( node: torch.fx.Node, tosa_graph: Any, @@ -222,7 +154,16 @@ def process_inputs_to_parameters( f"{type(parameter_data).__name__}" ) parameter_values = _tensor_to_numpy(parameter_data) - _add_const(tosa_graph, parameter_values, tosa_arg, name=tosa_arg.name) + parameter_values = _prepare_const_values_for_tosa_dtype( + parameter_values, tosa_arg.dtype + ) + + tosa_graph.addConst( + normalize_symint(parameter_values.shape), + tosa_arg.dtype, + parameter_values, + name=tosa_arg.name, + ) def process_inputs_to_buffers( @@ -247,7 +188,14 @@ def process_inputs_to_buffers( f"{type(buffer_data).__name__}" ) buffer_values = _tensor_to_numpy(buffer_data) - _add_const(tosa_graph, buffer_values, tosa_arg, name=tosa_arg.name) + buffer_values = _prepare_const_values_for_tosa_dtype(buffer_values, tosa_arg.dtype) + + tosa_graph.addConst( + normalize_symint(buffer_values.shape), + tosa_arg.dtype, + buffer_values, + name=tosa_arg.name, + ) def process_inputs_to_lifted_tensor_constants( @@ -269,7 +217,14 @@ def process_inputs_to_lifted_tensor_constants( f"{type(tensor).__name__}" ) tensor_values = _tensor_to_numpy(tensor) - _add_const(tosa_graph, tensor_values, tosa_arg, name=tosa_arg.name) + tensor_values = _prepare_const_values_for_tosa_dtype(tensor_values, tosa_arg.dtype) + + tosa_graph.addConst( + normalize_symint(tensor_values.shape), + tosa_arg.dtype, + tensor_values, + name=tosa_arg.name, + ) def _is_submodule_input( diff --git 
a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py index 3b713659e84..7810077a679 100644 --- a/backends/arm/quantizer/quantization_annotator.py +++ b/backends/arm/quantizer/quantization_annotator.py @@ -532,7 +532,6 @@ def _get_fixed_qparams_qspec( torch.ops.aten.selu.default, torch.ops.aten.celu.default, torch.ops.aten.floor.default, - torch.ops.aten.round.default, torch.ops.aten.log.default, torch.ops.aten.reciprocal.default, torch.ops.aten.rsqrt.default, diff --git a/backends/arm/scripts/install_models_for_test.sh b/backends/arm/scripts/install_models_for_test.sh index 1e91cd9c08f..d6a7b9cdec0 100644 --- a/backends/arm/scripts/install_models_for_test.sh +++ b/backends/arm/scripts/install_models_for_test.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Copyright 2025-2026 Arm Limited and/or its affiliates. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -8,8 +8,7 @@ set -e pip install -r backends/arm/requirements-arm-models-test.txt # Install model gym repository -MODEL_GYM_REF="${MODEL_GYM_REF:-v0.3.0}" -git clone --depth 1 --branch "$MODEL_GYM_REF" https://github.com/arm/neural-graphics-model-gym.git +git clone https://github.com/arm/neural-graphics-model-gym.git cd neural-graphics-model-gym # Remove model-converter installation from model-gym repository (to prevent overwriting executorch version) if [[ "$(uname)" == "Darwin" ]]; then @@ -19,4 +18,4 @@ else fi pip install . --no-deps cd .. 
-rm -rf neural-graphics-model-gym +rm -rf neural-graphics-model-gym \ No newline at end of file diff --git a/backends/arm/scripts/pre-push b/backends/arm/scripts/pre-push index 9c324e0d784..1aa51a8f9ac 100755 --- a/backends/arm/scripts/pre-push +++ b/backends/arm/scripts/pre-push @@ -177,7 +177,7 @@ for COMMIT in ${COMMITS}; do for committed_file in "${license_files[@]}"; do # Skip files with certain extensions case "$committed_file" in - *.md|*.md.in|*.json|*.yml|*.yaml|*.cmake|*.patch|.gitignore|*.bzl|BUCK|*/BUCK|TARGETS|*/TARGETS|*/generated/*) + *.md|*.md.in|*.json|*.yml|*.yaml|*.cmake|*.patch|.gitignore|*.bzl|BUCK|*/BUCK|TARGETS|*/TARGETS) echo -e "${INFO} Skipping license check for ${committed_file} (excluded extension)" continue ;; diff --git a/backends/arm/test/misc/test_mxfp_linear_ao.py b/backends/arm/test/misc/test_mxfp_linear_ao.py index 1412d8ffdfe..0f2b6b9198c 100644 --- a/backends/arm/test/misc/test_mxfp_linear_ao.py +++ b/backends/arm/test/misc/test_mxfp_linear_ao.py @@ -5,11 +5,9 @@ import torch from executorch.backends.arm.ao_ext import MXFPOpConfig, to_mxfp -from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str, MXFPDType from executorch.backends.arm.ao_ext.ops import MXFPLinearOp from torch.export import export -from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2 class LinearModule(torch.nn.Module): @@ -21,86 +19,21 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return self.linear(x) -def _test_mxfp_linear_quantize_swaps_module( - weight_dtype: MXFPDType, - expected_weight_qdata_dtype: torch.dtype, - expected_weight_qdata_shape: tuple[int, ...], -) -> None: +def test_mxfp_linear_quantize_swaps_module() -> None: model = LinearModule().eval() - to_mxfp( - model, - MXFPOpConfig(weight_dtype=weight_dtype), - ) + to_mxfp(model, MXFPOpConfig()) assert isinstance(model.linear, MXFPLinearOp) - assert model.linear.weight_qdata.dtype == expected_weight_qdata_dtype - assert model.linear.weight_dtype == 
mxfp_dtype_to_str(weight_dtype) + assert model.linear.weight_qdata.dtype == torch.float8_e4m3fn assert model.linear.weight_scale.dtype == torch.float8_e8m0fnu - assert tuple(model.linear.weight_qdata.shape) == expected_weight_qdata_shape + assert tuple(model.linear.weight_qdata.shape) == (1, 8, 32) assert tuple(model.linear.weight_scale.shape) == (1, 8, 1) -def test_mxfp8_e4m3_linear_quantize_swaps_module() -> None: - _test_mxfp_linear_quantize_swaps_module( - torch.float8_e4m3fn, - torch.float8_e4m3fn, - (1, 8, 32), - ) - - -def test_mxfp4_linear_quantize_swaps_module() -> None: - _test_mxfp_linear_quantize_swaps_module( - torch.float4_e2m1fn_x2, - torch.uint8, - (1, 8, 16), - ) - - -def test_mxfp6_e2m3_linear_quantize_swaps_module() -> None: - _test_mxfp_linear_quantize_swaps_module( - DTYPE_FP6_E2M3, - torch.uint8, - (1, 8, 32), - ) - - -def test_mxfp6_e3m2_linear_quantize_swaps_module() -> None: - _test_mxfp_linear_quantize_swaps_module( - DTYPE_FP6_E3M2, - torch.uint8, - (1, 8, 32), - ) - - -def test_mxfp_linear_quantize_filter_fn_selects_modules() -> None: - class TwoLinearModule(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.selected = torch.nn.Linear(32, 8) - self.skipped = torch.nn.Linear(32, 8) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.selected(x) + self.skipped(x) - - def _is_selected_linear(module: torch.nn.Module, fqn: str) -> bool: - return isinstance(module, torch.nn.Linear) and fqn == "selected" - - model = TwoLinearModule().eval() - - to_mxfp( - model, - MXFPOpConfig(weight_dtype=torch.float8_e4m3fn), - filter_fn=_is_selected_linear, - ) - - assert isinstance(model.selected, MXFPLinearOp) - assert isinstance(model.skipped, torch.nn.Linear) - - -def _test_mxfp_linear_export_preserves_custom_op(config: MXFPOpConfig) -> None: +def test_mxfp_linear_export_preserves_custom_op() -> None: model = LinearModule().eval() - to_mxfp(model, config) + to_mxfp(model, MXFPOpConfig()) exported = export(model, 
(torch.randn(4, 32),), strict=False) @@ -111,27 +44,3 @@ def _test_mxfp_linear_export_preserves_custom_op(config: MXFPOpConfig) -> None: ] assert torch.ops.tosa_mxfp.linear.default in targets - - -def test_mxfp8_e4m3_linear_export_preserves_custom_op() -> None: - _test_mxfp_linear_export_preserves_custom_op( - MXFPOpConfig(weight_dtype=torch.float8_e4m3fn) - ) - - -def test_mxfp4_linear_export_preserves_custom_op() -> None: - _test_mxfp_linear_export_preserves_custom_op( - MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2) - ) - - -def test_mxfp6_e2m3_linear_export_preserves_custom_op() -> None: - _test_mxfp_linear_export_preserves_custom_op( - MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3) - ) - - -def test_mxfp6_e3m2_linear_export_preserves_custom_op() -> None: - _test_mxfp_linear_export_preserves_custom_op( - MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2) - ) diff --git a/backends/arm/test/misc/test_process_node.py b/backends/arm/test/misc/test_process_node.py index 02d2a5e012b..1ef348abdbf 100644 --- a/backends/arm/test/misc/test_process_node.py +++ b/backends/arm/test/misc/test_process_node.py @@ -3,19 +3,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from types import SimpleNamespace -from typing import cast - import numpy as np import torch import tosa_serializer as ts -from executorch.backends.arm.process_node import _add_const, process_placeholder -from executorch.backends.arm.tosa.mapping import TosaArg, TosaSpecialDtype +from executorch.backends.arm.process_node import process_placeholder +from executorch.backends.arm.tosa.mapping import TosaSpecialDtype from executorch.backends.arm.tosa.specification import TosaSpecification from executorch.exir import to_edge from torch._export.utils import is_param -from tosa.TosaGraph import TosaGraph # type: ignore[import-not-found, import-untyped] -from tosa_serializer.numpy_utils import pack_6bit_array class Int32BiasModule(torch.nn.Module): @@ -99,74 +94,3 @@ def test_process_placeholder_int48_normalizes_int32_const_values() -> None: assert tosa_graph.values is not None assert tosa_graph.values.dtype == np.int64 assert tosa_graph.serialized_bytes == _expected_int48_bytes(module.bias) - - -def test_add_const_fp4_in_packed_storage() -> None: - packed_values = np.array([0xDE, 0xFE, 0x6D, 0x55], dtype=np.uint8).reshape( - 1, - 1, - 4, - ) - tosa_arg = cast( - TosaArg, - SimpleNamespace(dtype=ts.DType.FP4E2M1, shape=(1, 1, 8)), - ) - tosa_graph = ts.TosaSerializer() - - _add_const(tosa_graph, packed_values, tosa_arg, name="fp4_weight") - - graph = TosaGraph.GetRootAs(bytes(tosa_graph.serialize()), 0) - block = graph.Regions(0).Blocks(0) - tensors = { - block.Tensors(index).Name().decode(): block.Tensors(index) - for index in range(block.TensorsLength()) - } - tensor = tensors["fp4_weight"] - - assert tensor.Type() == ts.DType.FP4E2M1 - assert [tensor.Shape(index) for index in range(tensor.ShapeLength())] == [1, 1, 8] - assert [tensor.Data(index) for index in range(tensor.DataLength())] == [ - 0xDE, - 0xFE, - 0x6D, - 0x55, - ] - - -def _test_add_const_fp6_in_packed_storage(dtype: int) -> None: - values = np.arange(32, dtype=np.uint8).reshape(1, 1, 32) - - tosa_arg = 
cast( - TosaArg, - SimpleNamespace(dtype=dtype, shape=(1, 1, 32)), - ) - tosa_graph = ts.TosaSerializer() - - _add_const(tosa_graph, values, tosa_arg, name="fp6_weight") - - graph = TosaGraph.GetRootAs(bytes(tosa_graph.serialize()), 0) - block = graph.Regions(0).Blocks(0) - tensors = { - block.Tensors(index).Name().decode(): block.Tensors(index) - for index in range(block.TensorsLength()) - } - tensor = tensors["fp6_weight"] - - assert tensor.Type() == dtype - assert [tensor.Shape(index) for index in range(tensor.ShapeLength())] == [ - 1, - 1, - 32, - ] - assert tensor.DataLength() == 24 - assert [tensor.Data(index) for index in range(tensor.DataLength())] == ( - pack_6bit_array(values).reshape(-1).tolist() - ) - - -def test_add_const_fp6e2m3_in_packed_storage() -> None: - _test_add_const_fp6_in_packed_storage(ts.DType.FP6E2M3) - - -def test_add_const_fp6e3m2_in_packed_storage() -> None: - _test_add_const_fp6_in_packed_storage(ts.DType.FP6E3M2) diff --git a/backends/arm/test/misc/test_runner_utils.py b/backends/arm/test/misc/test_runner_utils.py index 54d41548a22..3c78b21e008 100644 --- a/backends/arm/test/misc/test_runner_utils.py +++ b/backends/arm/test/misc/test_runner_utils.py @@ -3,13 +3,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import json from pathlib import Path -from types import SimpleNamespace from typing import Any, cast -import numpy as np -import torch from executorch.backends.arm.test import runner_utils @@ -117,115 +113,3 @@ def test_get_elf_path_accepts_nested_runner_output(monkeypatch, tmp_path: Path) monkeypatch.setattr(runner_utils, "_elf_search_roots", lambda: [tmp_path]) assert runner_utils.get_elf_path("corstone-300") == str(elf_path) - - -def test_shape_inference_json_uses_tosa_input_layout(tmp_path: Path) -> None: - test_case_path = tmp_path / "test_case.json" - artifact_path = tmp_path / "model.tosa" - input_tensor = torch.randn(1, 3, 4, 5).to(memory_format=torch.channels_last) - - runner_utils.TosaReferenceModelDispatch()._generate_shape_inference_json( - b"", - artifact_path, - test_case_path, - ["input"], - (input_tensor,), - ) - - test_case = json.loads(test_case_path.read_text(encoding="utf-8")) - - assert test_case == { - "tosa_file": str(artifact_path), - "shapes": {"input": [1, 4, 5, 3]}, - } - - -def test_numpy_to_torch_tensor_converts_dynamic_nhwc_output(monkeypatch) -> None: - symbolic_dim = object() - output_tensor = SimpleNamespace( - shape=(1, 3, symbolic_dim, 5), - dtype=torch.float32, - dim_order=lambda: runner_utils.NHWC_ORDER, - ) - monkeypatch.setattr( - runner_utils, "get_first_fake_tensor", lambda output_node: output_tensor - ) - array = np.arange(60, dtype=np.float32).reshape(1, 4, 5, 3) - - result = runner_utils.numpy_to_torch_tensor(array, cast(Any, object())) - - assert result.shape == (1, 3, 4, 5) - assert result.is_contiguous(memory_format=torch.channels_last) - torch.testing.assert_close(result, torch.from_numpy(array).permute(0, 3, 1, 2)) - - -def test_numpy_to_torch_tensor_converts_dynamic_nnhwc_output(monkeypatch) -> None: - symbolic_dim = object() - output_tensor = SimpleNamespace( - shape=(1, 2, 3, symbolic_dim, 5), - dtype=torch.float32, - dim_order=lambda: runner_utils.NNHWC_ORDER, - ) - monkeypatch.setattr( - runner_utils, 
"get_first_fake_tensor", lambda output_node: output_tensor - ) - array = np.arange(120, dtype=np.float32).reshape(1, 2, 4, 5, 3) - - result = runner_utils.numpy_to_torch_tensor(array, cast(Any, object())) - - assert result.shape == (1, 2, 3, 4, 5) - assert result.dim_order() == runner_utils.NNHWC_ORDER - torch.testing.assert_close(result, torch.from_numpy(array).permute(0, 1, 4, 2, 3)) - - -def _program_with_user_input(name: str) -> SimpleNamespace: - return SimpleNamespace( - graph_signature=SimpleNamespace(user_inputs=[name]), - graph=SimpleNamespace(nodes=[SimpleNamespace(op="placeholder", name=name)]), - ) - - -def test_user_inputs_need_shape_inference_rejects_static_input(monkeypatch) -> None: - monkeypatch.setattr( - runner_utils, - "get_first_fake_tensor", - lambda node: SimpleNamespace(shape=(1, 2)), - ) - - assert not runner_utils.user_inputs_need_shape_inference( - cast(Any, _program_with_user_input("input")) - ) - - -def test_user_inputs_need_shape_inference_accepts_symbolic_input(monkeypatch) -> None: - symbolic_dim = object() - monkeypatch.setattr( - runner_utils, - "get_first_fake_tensor", - lambda node: SimpleNamespace(shape=(1, symbolic_dim)), - ) - - assert runner_utils.user_inputs_need_shape_inference( - cast(Any, _program_with_user_input("input")) - ) - - -def test_user_inputs_need_shape_inference_ignores_non_user_inputs(monkeypatch) -> None: - program = SimpleNamespace( - graph_signature=SimpleNamespace(user_inputs=["input"]), - graph=SimpleNamespace( - nodes=[ - SimpleNamespace(op="placeholder", name="input"), - SimpleNamespace(op="placeholder", name="param"), - ] - ), - ) - - def fake_tensor(node): - if node.name == "input": - return SimpleNamespace(shape=(1, 2)) - return SimpleNamespace(shape=(1, object())) - - monkeypatch.setattr(runner_utils, "get_first_fake_tensor", fake_tensor) - - assert not runner_utils.user_inputs_need_shape_inference(cast(Any, program)) diff --git a/backends/arm/test/misc/test_vgf_backend.py 
b/backends/arm/test/misc/test_vgf_backend.py index 406ba1b405a..22a8607fbc7 100644 --- a/backends/arm/test/misc/test_vgf_backend.py +++ b/backends/arm/test/misc/test_vgf_backend.py @@ -3,10 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import os from types import SimpleNamespace from typing import cast -from unittest import mock import pytest @@ -16,14 +14,7 @@ clear_registered_pass_insertions, PassInsertions, ) - -from executorch.backends.arm.vgf import backend, backend as vgf_backend, VgfCompileSpec -from executorch.backends.arm.vgf.backend import ( - _copy_failure_artifacts, - _format_repro_command, - _replace_converter_input_path, - vgf_compile, -) +from executorch.backends.arm.vgf import backend as vgf_backend, VgfCompileSpec from executorch.exir.backend.backend_details import PreprocessResult from executorch.exir.pass_base import ExportPass from torch.export.exported_program import ExportedProgram @@ -114,180 +105,3 @@ def _raise(*args, **kwargs): assert _registry_state() == original_registry finally: clear_registered_pass_insertions() - - -def test_format_repro_command_quotes_shell_metacharacters(): - command = [ - "model-converter", - "--flag=value with spaces", - "-i", - "input file.tosa", - "-o", - "output file.vgf", - ] - - formatted = _format_repro_command(command) - - assert formatted == ( - "model-converter " - "'--flag=value with spaces' " - "-i " - "'input file.tosa' " - "-o " - "'output file.vgf'" - ) - - -def test_replace_converter_input_path_replaces_input_after_i(): - command = [ - "model-converter", - "--some-flag", - "-i", - "original.tosa", - "-o", - "output.vgf", - ] - - replaced = _replace_converter_input_path(command, "preserved.tosa") - - assert replaced == [ - "model-converter", - "--some-flag", - "-i", - "preserved.tosa", - "-o", - "output.vgf", - ] - assert command[3] == "original.tosa" - - -def 
test_copy_failure_artifacts_returns_none_without_artifact_path(tmp_path): - tosa_path = tmp_path / "input.tosa" - tosa_path.write_bytes(b"tosa bytes") - - copied_path = _copy_failure_artifacts( - str(tosa_path), - artifact_path=None, - tag_name="delegate_0", - ) - - assert copied_path is None - - -def test_copy_failure_artifacts_copies_tosa_with_tag_name(tmp_path): - tosa_path = tmp_path / "input.tosa" - artifact_path = tmp_path / "artifacts" - tosa_path.write_bytes(b"tosa bytes") - - copied_path = _copy_failure_artifacts( - str(tosa_path), - str(artifact_path), - tag_name="delegate_0", - ) - - assert copied_path == os.path.join( - str(artifact_path), - "failed_model_converter_input_delegate_0.tosa", - ) - assert os.path.exists(copied_path) - assert open(copied_path, "rb").read() == b"tosa bytes" - - -def test_copy_failure_artifacts_copies_tosa_without_tag_name(tmp_path): - tosa_path = tmp_path / "input.tosa" - artifact_path = tmp_path / "artifacts" - tosa_path.write_bytes(b"tosa bytes") - - copied_path = _copy_failure_artifacts( - str(tosa_path), - str(artifact_path), - tag_name="", - ) - - assert copied_path == os.path.join( - str(artifact_path), - "failed_model_converter_input.tosa", - ) - assert os.path.exists(copied_path) - assert open(copied_path, "rb").read() == b"tosa bytes" - - -@mock.patch("executorch.backends.arm.vgf.backend.model_converter_env") -@mock.patch("executorch.backends.arm.vgf.backend.require_model_converter_binary") -@mock.patch("executorch.backends.arm.vgf.backend.subprocess.run") -def test_vgf_compile_failure_includes_repro_command_and_copies_tosa( - mock_run, - mock_require_model_converter_binary, - mock_model_converter_env, - tmp_path, -): - artifact_path = tmp_path / "artifacts" - - mock_require_model_converter_binary.return_value = "model-converter" - mock_model_converter_env.return_value = {"PATH": "/test/bin"} - mock_run.side_effect = backend.subprocess.CalledProcessError( - returncode=1, - cmd=["model-converter"], - 
output=b"converter stdout", - stderr=b"converter stderr", - ) - - with pytest.raises(RuntimeError) as exc_info: - vgf_compile( - b"serialized tosa", - ["--flag=value with spaces"], - artifact_path=str(artifact_path), - tag_name="delegate_0", - ) - - copied_tosa_path = os.path.join( - str(artifact_path), - "failed_model_converter_input_delegate_0.tosa", - ) - - assert os.path.exists(copied_tosa_path) - assert open(copied_tosa_path, "rb").read() == b"serialized tosa" - - error = str(exc_info.value) - assert "Vgf compiler failed." in error - assert "Repro command:" in error - assert "model-converter '--flag=value with spaces' -i" in error - assert copied_tosa_path in error - assert " -o " in error - assert "Stderr:\nconverter stderr" in error - assert "Stdout:\nconverter stdout" in error - - -@mock.patch("executorch.backends.arm.vgf.backend.model_converter_env") -@mock.patch("executorch.backends.arm.vgf.backend.require_model_converter_binary") -@mock.patch("executorch.backends.arm.vgf.backend.subprocess.run") -def test_vgf_compile_failure_includes_temp_repro_command_without_artifact_path( - mock_run, - mock_require_model_converter_binary, - mock_model_converter_env, -): - mock_require_model_converter_binary.return_value = "model-converter" - mock_model_converter_env.return_value = {"PATH": "/test/bin"} - mock_run.side_effect = backend.subprocess.CalledProcessError( - returncode=1, - cmd=["model-converter"], - output=b"converter stdout", - stderr=b"converter stderr", - ) - - with pytest.raises(RuntimeError) as exc_info: - vgf_compile( - b"serialized tosa", - ["--some-flag"], - artifact_path=None, - tag_name="delegate_0", - ) - - error = str(exc_info.value) - assert "Vgf compiler failed." 
in error - assert "Repro command:" in error - assert "model-converter --some-flag -i" in error - assert "output_delegate_0.tosa.vgf" in error - assert "failed_model_converter_input_delegate_0.tosa" not in error - assert "Stderr:\nconverter stderr" in error - assert "Stdout:\nconverter stdout" in error diff --git a/backends/arm/test/misc/test_vgf_check_env.py b/backends/arm/test/misc/test_vgf_check_env.py index 646eb6b9a79..499a9f35db0 100644 --- a/backends/arm/test/misc/test_vgf_check_env.py +++ b/backends/arm/test/misc/test_vgf_check_env.py @@ -9,10 +9,8 @@ from pathlib import Path import executorch.backends.arm.vgf.check_env as check_env -import executorch.backends.arm.vgf.model_converter as model_converter import pytest -from executorch.backends.arm.vgf import backend as vgf_backend from executorch.backends.arm.vgf.compile_spec import VgfCompileSpec @@ -121,7 +119,7 @@ def test_is_vgf_runtime_available(monkeypatch): def test_model_converter_check_fails_when_missing(monkeypatch): - monkeypatch.setattr(model_converter, "find_model_converter_binary", lambda: None) + monkeypatch.setattr(check_env, "find_model_converter_binary", lambda: None) result = check_env._check_model_converter() @@ -141,7 +139,7 @@ def test_model_converter_check_reports_version(monkeypatch, tmp_path): "raise SystemExit(1)\n", ) monkeypatch.setattr( - model_converter, "find_model_converter_binary", lambda: str(converter) + check_env, "find_model_converter_binary", lambda: str(converter) ) result = check_env._check_model_converter() @@ -174,20 +172,20 @@ def test_find_existing_lib_finds_libvgf(tmp_path): def test_runtime_backend_check_passes_when_vgf_registered(monkeypatch): class BackendRegistry: - registered_backend_names = [vgf_backend.VGF_BACKEND_NAME] + registered_backend_names = [check_env.VGF_BACKEND_NAME] def is_available(self, backend_name): - return backend_name == vgf_backend.VGF_BACKEND_NAME + return backend_name == check_env.VGF_BACKEND_NAME class Runtime: backend_registry = 
BackendRegistry() - monkeypatch.setattr(vgf_backend, "_load_runtime", lambda: Runtime()) + monkeypatch.setattr(check_env, "_load_runtime", lambda: Runtime()) result = check_env._check_runtime_vgf_backend() assert result.status == check_env.STATUS_OK - assert vgf_backend.VGF_BACKEND_NAME in result.detail + assert check_env.VGF_BACKEND_NAME in result.detail def test_runtime_backend_check_fails_when_vgf_not_registered(monkeypatch): @@ -200,12 +198,12 @@ def is_available(self, backend_name): class Runtime: backend_registry = BackendRegistry() - monkeypatch.setattr(vgf_backend, "_load_runtime", lambda: Runtime()) + monkeypatch.setattr(check_env, "_load_runtime", lambda: Runtime()) result = check_env._check_runtime_vgf_backend() assert result.status == check_env.STATUS_FAIL - assert vgf_backend.VGF_BACKEND_NAME in result.detail + assert check_env.VGF_BACKEND_NAME in result.detail assert "XnnpackBackend" in result.detail @@ -359,84 +357,3 @@ def test_main_source_build_mode(monkeypatch, capsys): def test_main_rejects_build_dir_without_source_build(): with pytest.raises(SystemExit): check_env.main(["--build-dir", "cmake-out-vkml"]) - - -def test_check_env_model_converter_probe_delegates_to_model_converter_module( - monkeypatch, -): - monkeypatch.setattr( - model_converter, - "check_model_converter_environment", - lambda: model_converter.ModelConverterEnvironmentCheck( - "converter", model_converter.STATUS_OK, "from-owner" - ), - ) - - result = check_env._check_model_converter() - - assert result.status == check_env.STATUS_OK - assert result.detail == "from-owner" - - -def test_check_env_model_converter_lib_dir_probe_delegates_to_model_converter_module( - monkeypatch, -): - monkeypatch.setattr( - model_converter, - "check_model_converter_lib_dir_environment", - lambda: model_converter.ModelConverterEnvironmentCheck( - "lib-dir", model_converter.STATUS_OK, "from-owner" - ), - ) - - result = check_env._check_model_converter_lib_dir() - - assert result.status == 
check_env.STATUS_OK - assert result.detail == "from-owner" - - -def test_check_env_runtime_probe_delegates_to_backend_module(monkeypatch): - monkeypatch.setattr( - vgf_backend, - "check_vgf_runtime_backend_environment", - lambda: vgf_backend.VgfRuntimeEnvironmentCheck( - "runtime", vgf_backend.STATUS_OK, "from-owner" - ), - ) - - result = check_env._check_runtime_vgf_backend() - - assert result.status == check_env.STATUS_OK - assert result.detail == "from-owner" - - -def test_model_converter_preflight_and_vgf_compile_share_executable_resolution( - monkeypatch, - tmp_path, -): - converter = _make_executable( - tmp_path / "model-converter", - "#!/usr/bin/env python3\n" - "from pathlib import Path\n" - "import sys\n" - "\n" - "if '--version' in sys.argv:\n" - " print('model-converter integration-test')\n" - " raise SystemExit(0)\n" - "\n" - "out_index = sys.argv.index('-o') + 1\n" - "Path(sys.argv[out_index]).write_bytes(b'compiled-vgf')\n" - "raise SystemExit(0)\n", - ) - - monkeypatch.setenv("MODEL_CONVERTER_PATH", str(converter)) - - preflight = check_env._check_model_converter() - compiled = vgf_backend.vgf_compile( - tosa_flatbuffer=b"fake-tosa-flatbuffer", - compile_flags=[], - ) - - assert preflight.status == check_env.STATUS_OK - assert str(converter) in preflight.detail - assert compiled == b"compiled-vgf" diff --git a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py index 77c42bf9f24..940023fa624 100644 --- a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py +++ b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py @@ -5,7 +5,6 @@ import pytest import torch -from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str from executorch.backends.arm.tosa.dialect.lib import TosaValueError from executorch.backends.arm.tosa.dialect.ops import cast_to_block_scaled # noqa: F401 from 
executorch.backends.arm.tosa.specification import ( @@ -14,7 +13,6 @@ ) from executorch.exir.dialects._ops import ops as exir_ops from torch._subclasses.fake_tensor import FakeTensorMode -from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2 def test_cast_to_block_scaled_requires_mxfp_extension() -> None: @@ -29,7 +27,7 @@ def test_cast_to_block_scaled_requires_mxfp_extension() -> None: exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default( mode.from_tensor(sample_input), 32, - output_dtype=mxfp_dtype_to_str(torch.float8_e4m3fn), + output_dtype=torch.float8_e4m3fn, ) @@ -41,7 +39,7 @@ def test_cast_to_block_scaled_tosa_fp_mxfp() -> None: output_data, output_scale = exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default( mode.from_tensor(sample_input), 32, - output_dtype=mxfp_dtype_to_str(torch.float8_e4m3fn), + output_dtype=torch.float8_e4m3fn, ) assert output_data.dtype == torch.float8_e4m3fn @@ -50,48 +48,6 @@ def test_cast_to_block_scaled_tosa_fp_mxfp() -> None: assert tuple(output_scale.shape) == (2, 1) -def test_cast_to_block_scaled_tosa_fp_mxfp4() -> None: - tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp") - sample_input = torch.randn((2, 32), dtype=torch.float32) - - with TosaLoweringContext(tosa_spec), FakeTensorMode() as mode: - output_data, output_scale = exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default( - mode.from_tensor(sample_input), - 32, - output_dtype=mxfp_dtype_to_str(torch.float4_e2m1fn_x2), - ) - - assert output_data.dtype == torch.uint8 - assert tuple(output_data.shape) == (2, 16) - assert output_scale.dtype == torch.float8_e8m0fnu - assert tuple(output_scale.shape) == (2, 1) - - -def _test_cast_to_block_scaled_tosa_fp_mxfp6(dtype: str) -> None: - tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp") - sample_input = torch.randn((2, 32), dtype=torch.float32) - - with TosaLoweringContext(tosa_spec), FakeTensorMode() as mode: - output_data, output_scale = 
exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default( - mode.from_tensor(sample_input), - 32, - output_dtype=mxfp_dtype_to_str(dtype), - ) - - assert output_data.dtype == torch.uint8 - assert tuple(output_data.shape) == (2, 32) - assert output_scale.dtype == torch.float8_e8m0fnu - assert tuple(output_scale.shape) == (2, 1) - - -def test_cast_to_block_scaled_tosa_fp_mxfp6e2m3() -> None: - _test_cast_to_block_scaled_tosa_fp_mxfp6(DTYPE_FP6_E2M3) - - -def test_cast_to_block_scaled_tosa_fp_mxfp6e3m2() -> None: - _test_cast_to_block_scaled_tosa_fp_mxfp6(DTYPE_FP6_E3M2) - - def test_cast_to_block_scaled_invalid_shape() -> None: tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp") @@ -103,5 +59,5 @@ def test_cast_to_block_scaled_invalid_shape() -> None: exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default( mode.from_tensor(torch.randn((2, 30), dtype=torch.float32)), 32, - output_dtype=mxfp_dtype_to_str(torch.float8_e4m3fn), + output_dtype=torch.float8_e4m3fn, ) diff --git a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py index 7dcffdeb4d9..74ce04bf3c1 100644 --- a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py +++ b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py @@ -5,7 +5,6 @@ import pytest import torch -from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str, MXFPDType from executorch.backends.arm.tosa.dialect.lib import TosaValueError from executorch.backends.arm.tosa.dialect.ops import matmul_t_block_scaled # noqa: F401 from executorch.backends.arm.tosa.specification import ( @@ -14,7 +13,6 @@ ) from executorch.exir.dialects._ops import ops as exir_ops from torch._subclasses.fake_tensor import FakeTensorMode -from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E3M2 def test_matmul_t_block_scaled_tosa_fp_mxfp() -> None: @@ -37,38 +35,6 @@ def test_matmul_t_block_scaled_tosa_fp_mxfp() -> None: assert 
tuple(output.shape) == (1, 4, 8) -def _test_matmul_t_block_scaled_tosa_fp_subbyte( - payload_dtype: MXFPDType, - qdata_last_dim: int, -) -> None: - tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp") - a_data = torch.empty((1, 4, qdata_last_dim), dtype=torch.uint8) - a_scale = torch.empty((1, 4, 1), dtype=torch.float8_e8m0fnu) - b_data = torch.empty((1, 8, qdata_last_dim), dtype=torch.uint8) - b_scale = torch.empty((1, 8, 1), dtype=torch.float8_e8m0fnu) - - with TosaLoweringContext(tosa_spec), FakeTensorMode() as mode: - output = exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default( - mode.from_tensor(a_data), - mode.from_tensor(a_scale), - mode.from_tensor(b_data), - mode.from_tensor(b_scale), - 32, - payload_dtype=mxfp_dtype_to_str(payload_dtype), - ) - - assert output.dtype == torch.float32 - assert tuple(output.shape) == (1, 4, 8) - - -def test_matmul_t_block_scaled_tosa_fp_mxfp4() -> None: - _test_matmul_t_block_scaled_tosa_fp_subbyte(torch.float4_e2m1fn_x2, 16) - - -def test_matmul_t_block_scaled_tosa_fp_mxfp6() -> None: - _test_matmul_t_block_scaled_tosa_fp_subbyte(DTYPE_FP6_E3M2, 32) - - def test_matmul_t_block_scaled_invalid_scale_shape() -> None: tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp") a_data = torch.randn((1, 4, 32), dtype=torch.float32).to(torch.float8_e4m3fn) diff --git a/backends/arm/test/ops/mxfp/test_mxfp_linear.py b/backends/arm/test/ops/mxfp/test_mxfp_linear.py index fbec9307795..5cdd44cf138 100644 --- a/backends/arm/test/ops/mxfp/test_mxfp_linear.py +++ b/backends/arm/test/ops/mxfp/test_mxfp_linear.py @@ -10,7 +10,7 @@ import torch from executorch.backends.arm.ao_ext import MXFPOpConfig, to_mxfp -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common as arm_common from executorch.backends.arm.test.ops.mxfp.common import ( MXFPTosaPipelineFP, MXFPVgfPipeline, @@ -18,12 +18,14 @@ from executorch.backends.arm.test.tester.analyze_output_utils import ( 
compare_rel_frobenius_and_cosine_similarity, ) -from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2 aten_op = "torch.ops.tosa_mxfp.linear.default" input_t1 = Tuple[torch.Tensor] +_MXFP_FROBENIUS_THRESHOLD = 0.06 +_MXFP_COSINE_THRESHOLD = 0.995 + def _block_input_rank1() -> torch.Tensor: """Create a rank-1 input with distinct MXFP activation block scales.""" @@ -159,7 +161,6 @@ def _channels_last_rank4_input() -> torch.Tensor: test_data_vgf_fp = test_data_fp -# TODO: MLETORCH-2141 _vgf_xfail_reason = ( "MXFP is not yet supported in the VGF toolchain. Enable this test when " "toolchain support is available." @@ -214,45 +215,35 @@ def _is_linear(module: torch.nn.Module, _fqn: str) -> bool: return isinstance(module, torch.nn.Linear) -def _test_mxfp_linear_eager_cpu( - test_data, - config: MXFPOpConfig, - frobenius_threshold=0.3, - cosine_threshold=0.95, -) -> None: +@arm_common.parametrize("test_data", test_data_fp) +def test_mxfp_linear_tosa_FP(test_data) -> None: test_input, out_features, has_bias, set_block_weights = test_data() in_features = test_input.shape[-1] - ref_model = Linear( + module = Linear( in_features=in_features, out_features=out_features, bias=has_bias, ).eval() if set_block_weights: - ref_model.set_block_test_weights() - test_model = copy.deepcopy(ref_model).eval() - - to_mxfp(test_model, config, filter_fn=_is_linear) - - test_output = test_model(test_input) - ref_output = ref_model(test_input) + module.set_block_test_weights() - compare_rel_frobenius_and_cosine_similarity( - ref_output, - test_output, - quantization_parameters=None, - frobenius_threshold=frobenius_threshold, - cosine_threshold=cosine_threshold, - clean_reference=False, + pipeline = MXFPTosaPipelineFP[input_t1]( + module, + (test_input,), + aten_op, + filter_fn=_is_linear, + frobenius_threshold=_MXFP_FROBENIUS_THRESHOLD, + cosine_threshold=_MXFP_COSINE_THRESHOLD, + tosa_version="1.1", + tosa_extensions=["mxfp"], ) + pipeline.run() -def 
_test_mxfp_linear_vgf( - test_data, - config: MXFPOpConfig, - frobenius_threshold, - cosine_threshold, -) -> None: +@arm_common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails) +@arm_common.SkipIfNoModelConverter +def test_mxfp_linear_vgf(test_data) -> None: test_input, out_features, has_bias, set_block_weights = test_data() in_features = test_input.shape[-1] module = Linear( @@ -269,169 +260,36 @@ def _test_mxfp_linear_vgf( (test_input,), aten_op, filter_fn=_is_linear, - frobenius_threshold=frobenius_threshold, - cosine_threshold=cosine_threshold, - mxfp_config=config, + frobenius_threshold=_MXFP_FROBENIUS_THRESHOLD, + cosine_threshold=_MXFP_COSINE_THRESHOLD, tosa_spec="TOSA-1.1+FP+mxfp", ) pipeline.run() -def _test_mxfp_linear_tosa_FP( - test_data, - config: MXFPOpConfig, - frobenius_threshold=0.08, - cosine_threshold=0.995, -) -> None: +@arm_common.parametrize("test_data", test_data_fp) +def test_mxfp_linear_eager_cpu(test_data) -> None: test_input, out_features, has_bias, set_block_weights = test_data() in_features = test_input.shape[-1] - module = Linear( + ref_model = Linear( in_features=in_features, out_features=out_features, bias=has_bias, ).eval() - if set_block_weights: - module.set_block_test_weights() - - pipeline = MXFPTosaPipelineFP[input_t1]( - module, - (test_input,), - aten_op, - filter_fn=_is_linear, - frobenius_threshold=frobenius_threshold, - cosine_threshold=cosine_threshold, - mxfp_config=config, - tosa_version="1.1", - tosa_extensions=["mxfp"], - ) - pipeline.run() - - -@common.parametrize("test_data", test_data_fp) -def test_mxfp8_linear_tosa_FP(test_data: torch.Tensor) -> None: - _test_mxfp_linear_tosa_FP( - test_data, - MXFPOpConfig(weight_dtype=torch.float8_e4m3fn), - ) - - -@common.parametrize("test_data", test_data_fp) -def test_mxfp4_linear_tosa_FP(test_data: torch.Tensor) -> None: - _test_mxfp_linear_tosa_FP( - test_data, - MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2), - frobenius_threshold=0.3, - 
cosine_threshold=0.95, - ) - - -@common.parametrize("test_data", test_data_fp) -def test_mxfp6_e2m3_linear_tosa_FP(test_data: torch.Tensor) -> None: - _test_mxfp_linear_tosa_FP( - test_data, - MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3), - frobenius_threshold=0.2, - cosine_threshold=0.98, - ) - - -@common.parametrize("test_data", test_data_fp) -def test_mxfp6_e3m2_linear_tosa_FP(test_data: torch.Tensor) -> None: - _test_mxfp_linear_tosa_FP( - test_data, - MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2), - frobenius_threshold=0.2, - cosine_threshold=0.98, - ) - - -@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails) -@common.SkipIfNoModelConverter -def test_mxfp8_linear_vgf(test_data: torch.Tensor) -> None: - _test_mxfp_linear_vgf( - test_data, - MXFPOpConfig(weight_dtype=torch.float8_e4m3fn), - frobenius_threshold=0.08, - cosine_threshold=0.995, - ) - - -@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails) -@common.SkipIfNoModelConverter -def test_mxfp4_linear_vgf(test_data: torch.Tensor) -> None: - _test_mxfp_linear_vgf( - test_data, - MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2), - frobenius_threshold=0.3, - cosine_threshold=0.95, - ) - - -@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails) -@common.SkipIfNoModelConverter -def test_mxfp6_e2m3_linear_vgf(test_data: torch.Tensor) -> None: - _test_mxfp_linear_vgf( - test_data, - MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3), - frobenius_threshold=0.2, - cosine_threshold=0.98, - ) - - -@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails) -@common.SkipIfNoModelConverter -def test_mxfp6_e3m2_linear_vgf(test_data: torch.Tensor) -> None: - _test_mxfp_linear_vgf( - test_data, - MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2), - frobenius_threshold=0.2, - cosine_threshold=0.98, - ) - - -@common.parametrize("test_data", test_data_fp) -def test_mxfp8_linear_eager_cpu(test_data: torch.Tensor) -> None: - """Check eager MXFP implementation. 
- - The Arm lowering tests compare lowered output against the eager CPU - implementation, so the eager implementation must be accurate for it to be - used as a reference in other tests. - - """ - _test_mxfp_linear_eager_cpu( - test_data, - MXFPOpConfig(weight_dtype=torch.float8_e4m3fn), - frobenius_threshold=0.08, - cosine_threshold=0.995, - ) - - -@common.parametrize("test_data", test_data_fp) -def test_mxfp4_linear_eager_cpu(test_data: torch.Tensor) -> None: - _test_mxfp_linear_eager_cpu( - test_data, - MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2), - frobenius_threshold=0.3, - cosine_threshold=0.95, - ) - + ref_model.set_block_test_weights() + test_model = copy.deepcopy(ref_model).eval() -@common.parametrize("test_data", test_data_fp) -def test_mxfp6_e2m3_linear_eager_cpu(test_data: torch.Tensor) -> None: - _test_mxfp_linear_eager_cpu( - test_data, - MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3), - frobenius_threshold=0.2, - cosine_threshold=0.98, - ) + to_mxfp(test_model, MXFPOpConfig(), filter_fn=_is_linear) + test_output = test_model(test_input) + ref_output = ref_model(test_input) -@common.parametrize("test_data", test_data_fp) -def test_mxfp6_e3m2_linear_eager_cpu(test_data: torch.Tensor) -> None: - _test_mxfp_linear_eager_cpu( - test_data, - MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2), - frobenius_threshold=0.2, - cosine_threshold=0.98, + compare_rel_frobenius_and_cosine_similarity( + ref_output, + test_output, + quantization_parameters=None, + frobenius_threshold=_MXFP_FROBENIUS_THRESHOLD, + cosine_threshold=_MXFP_COSINE_THRESHOLD, + clean_reference=False, ) diff --git a/backends/arm/test/ops/test_round.py b/backends/arm/test/ops/test_round.py index bcc71b70725..ff86dbffff0 100644 --- a/backends/arm/test/ops/test_round.py +++ b/backends/arm/test/ops/test_round.py @@ -6,6 +6,7 @@ from typing import Tuple +import pytest import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -66,6 +67,7 
@@ def test_round_tosa_INT(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 +@pytest.mark.xfail(reason="where.self not supported on U55") def test_round_u55_INT(test_data: torch.Tensor): pipeline = EthosU55PipelineINT[input_t1]( Round(), diff --git a/backends/arm/test/passes/test_insert_dynamic_padding_pass.py b/backends/arm/test/passes/test_insert_dynamic_padding_pass.py index 69e1830e3ee..64594403dae 100644 --- a/backends/arm/test/passes/test_insert_dynamic_padding_pass.py +++ b/backends/arm/test/passes/test_insert_dynamic_padding_pass.py @@ -3,122 +3,72 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import executorch.backends.arm.tosa.dialect # noqa: F401 import torch from executorch.backends.arm._passes.insert_dynamic_padding import ( InsertDynamicPaddingPass, ) -from executorch.backends.arm.tosa.mapping import TosaSpecialDtype +from executorch.backends.arm._passes.rewrite_conv_pass import RewriteConvPass from executorch.backends.arm.tosa.specification import ( TosaLoweringContext, TosaSpecification, ) -from executorch.backends.test.graph_builder import GraphBuilder +from executorch.exir import to_edge from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass -from torch.fx import GraphModule -from torch.fx.passes.infra.pass_base import PassResult +from torch._export.utils import _get_shape_env_from_gm +from torch.export import Dim, export -SPEC = TosaSpecification.create_from_string("TOSA-1.1+FP+shape") +class ConvModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, kernel_size=2, stride=3, padding=2) - -def _build_conv_graph( - target_op, - input_shape: tuple[int, ...], - weight_shape: tuple[int, ...], - padding: list[int], - stride: list[int], - dilation: list[int], -) -> GraphModule: - with TosaLoweringContext(SPEC): 
- builder = GraphBuilder() - input_tensor = builder.placeholder("input", torch.randn(input_shape)) - weight = builder.placeholder("weight", torch.randn(weight_shape)) - bias = builder.placeholder("bias", torch.randn(weight_shape[0])) - padding_shape = builder.call_operator( - exir_ops.backend.tosa.CONST_SHAPE.default, (padding,) - ) - padding_shape.node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.SHAPE - conv = builder.call_operator( - target_op, - (input_tensor, weight, bias, stride, padding_shape, dilation), - ) - builder.output([conv]) - return ExportPass().call(builder.get_graph_module()).graph_module - - -def _run_insert_dynamic_padding(graph_module: GraphModule) -> GraphModule: - with TosaLoweringContext(SPEC): - result = InsertDynamicPaddingPass()(graph_module) - assert isinstance(result, PassResult) - return result.graph_module - - -def _assert_inserted_padding( - graph_module: GraphModule, - target_op, - zero_spatial_padding: list[int], - expected_full_padding_len: int, -) -> None: - nodes = graph_module.graph.nodes - conv_node = next(n for n in nodes if n.target == target_op) - assert conv_node.args[4] == zero_spatial_padding - - padding_node = next( - n for n in nodes if n.target == exir_ops.backend.tosa.PAD.default - ) - padding_shape_node = padding_node.args[1] - assert padding_shape_node.target == exir_ops.backend.tosa.CONCAT_SHAPE.default - - n_padding, spatial_padding, c_padding = padding_shape_node.args[0] - assert n_padding.meta["val"] == [0, 0] - assert spatial_padding.target == exir_ops.backend.tosa.CONST_SHAPE.default - assert c_padding.meta["val"] == [0, 0] - - pad_list = padding_shape_node.meta["val"] - spatial_padding_value = spatial_padding.meta["val"] - assert len(pad_list) == expected_full_padding_len - assert pad_list[:2] == [0, 0] - assert pad_list[2:-2] == spatial_padding_value - assert pad_list[-2:] == [0, 0] + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.conv(x) def test_insert_dynamic_padding(): - 
graph_module = _build_conv_graph( - exir_ops.backend.tosa.CONV2D.default, - input_shape=(1, 8, 8, 3), - weight_shape=(16, 2, 2, 3), - padding=[2, 2, 2, 2], - stride=[3, 3], - dilation=[1, 1], - ) - - graph_module = _run_insert_dynamic_padding(graph_module) - - _assert_inserted_padding( - graph_module, - exir_ops.backend.tosa.CONV2D.default, - zero_spatial_padding=[0, 0, 0, 0], - expected_full_padding_len=8, - ) - - -def test_insert_dynamic_padding_conv3d(): - graph_module = _build_conv_graph( - exir_ops.backend.tosa.CONV3D.default, - input_shape=(1, 8, 8, 8, 3), - weight_shape=(16, 2, 2, 2, 3), - padding=[2, 2, 2, 2, 2, 2], - stride=[3, 3, 3], - dilation=[1, 1, 1], + model = ConvModule() + example_inputs = (torch.randn(1, 3, 8, 8),) + ep = export( + model, + example_inputs, + dynamic_shapes={ + "x": {2: Dim("height", min=4, max=10), 3: Dim("width", min=4, max=10)} + }, ) + edge_model = to_edge(ep) + shape_env = _get_shape_env_from_gm(edge_model.exported_program().graph_module) + with TosaLoweringContext( + TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env + ): + edge_model = edge_model.transform( + [RewriteConvPass(edge_model.exported_program())] + ) + nodes = edge_model.exported_program().graph.nodes + conv_node = next( + n for n in nodes if n.target == exir_ops.backend.tosa.CONV2D.default + ) + initial_padding = conv_node.args[4] + assert any(isinstance(p, torch.SymInt) for p in initial_padding) - graph_module = _run_insert_dynamic_padding(graph_module) - - _assert_inserted_padding( - graph_module, - exir_ops.backend.tosa.CONV3D.default, - zero_spatial_padding=[0, 0, 0, 0, 0, 0], - expected_full_padding_len=10, - ) + edge_model = edge_model.transform( + [ + InsertDynamicPaddingPass(), + ] + ) + nodes = edge_model.exported_program().graph.nodes + conv_node = next( + n for n in nodes if n.target == exir_ops.backend.tosa.CONV2D.default + ) + padding = conv_node.args[4] + assert padding == [0, 0, 0, 0] + padding_node = next( + n for n in nodes if 
n.target == exir_ops.backend.tosa.PAD.default + ) + assert padding_node is not None + pad_list = padding_node.args[1].meta["val"] + assert len(pad_list) == 8 + assert pad_list[:2] == [0, 0] # N-padding + assert pad_list[2:6] == initial_padding # HW-padding in NHWC order + assert pad_list[6:] == [0, 0] # C-padding diff --git a/backends/arm/test/passes/test_rewrite_conv_pass.py b/backends/arm/test/passes/test_rewrite_conv_pass.py index 736aa685b86..fc8478afee5 100644 --- a/backends/arm/test/passes/test_rewrite_conv_pass.py +++ b/backends/arm/test/passes/test_rewrite_conv_pass.py @@ -336,15 +336,11 @@ def test_rewrite_conv_dynamic_keeps_static_padding_when_symbolic_remainder_is_ze assert all(not isinstance(p, torch.SymInt) for p in padding) -def test_rewrite_conv_adjust_pad_if_needed_static_allows_negative_padding_until_later_validation(): +def test_rewrite_conv_adjust_pad_if_needed_static_raises_before_negative_padding(): rewrite_pass, _, _ = _make_rewrite_pass((torch.randn(1, 3, 9, 12),)) - try: + with pytest.raises(RuntimeError, match="SizeAdjustInputPass"): rewrite_pass._adjust_pad_if_needed(6, 2, 3, 0, 1) - except RuntimeError as e: - assert "SizeAdjustInputPass" in str(e) - else: - pytest.fail("Expected RuntimeError was not raised") def test_rewrite_conv_adjust_pad_if_needed_static_positive_padding_stays_non_negative(): @@ -391,7 +387,7 @@ def test_rewrite_conv_adjust_pad_if_needed_symbolic_exact_zero_keeps_positive_pa assert adjusted_pad == 1 -def test_rewrite_conv_adjust_pad_if_needed_symbolic_positive_padding_range_returns_symbolic_padding(): +def test_rewrite_conv_adjust_pad_if_needed_symbolic_positive_padding_range_raises_before_negative_padding(): rewrite_pass, shape_env, input_len = _make_rewrite_pass( (torch.randn(1, 3, 8, 8),), dynamic_shapes={ @@ -403,9 +399,8 @@ def test_rewrite_conv_adjust_pad_if_needed_symbolic_positive_padding_range_retur with TosaLoweringContext( TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env ): - 
adjusted_pad = rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 1, 1) - - assert isinstance(adjusted_pad, torch.SymInt) + with pytest.raises(RuntimeError, match="SizeAdjustInputPass"): + rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 1, 1) def test_rewrite_conv_symbolic_comparison_with_int_specializes_to_hint(): @@ -443,12 +438,11 @@ def unsafe_adjust(input_len, input_weight, stride, pad, dilation): with TosaLoweringContext( TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env ): - adjusted_pad = rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1) - - assert isinstance(adjusted_pad, torch.SymInt) + with pytest.raises(RuntimeError, match="SizeAdjustInputPass"): + rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1) -def test_rewrite_conv_adjust_pad_if_needed_symbolic_zero_padding_range_returns_symbolic_padding(): +def test_rewrite_conv_adjust_pad_if_needed_symbolic_zero_padding_range_raises_before_negative_padding(): rewrite_pass, shape_env, input_len = _make_rewrite_pass( (torch.randn(1, 3, 8, 8),), dynamic_shapes={ @@ -457,22 +451,8 @@ def test_rewrite_conv_adjust_pad_if_needed_symbolic_zero_padding_range_returns_s }, ) - with TosaLoweringContext( - TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env - ): - adjusted_pad = rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1) - - assert isinstance(adjusted_pad, torch.SymInt) - - -def test_rewrite_conv_adjust_pad_if_needed_symbolic_singleton_overflow_still_raises(): - rewrite_pass, shape_env, input_len = _make_rewrite_pass( - (torch.randn(1, 3, 9, 12),), - dynamic_shapes=_multiples_of_three_dynamic_shapes(), - ) - with TosaLoweringContext( TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env ): with pytest.raises(RuntimeError, match="SizeAdjustInputPass"): - rewrite_pass._adjust_pad_if_needed(input_len, 3, 3, 1, 1) + rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1) diff --git 
a/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py b/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py index f89872f93b8..572a2b247e9 100644 --- a/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py +++ b/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py @@ -9,15 +9,12 @@ import torch from executorch.backends.arm._passes.rewrite_mxfp_linear import RewriteMXFPLinearPass from executorch.backends.arm.ao_ext import MXFPOpConfig, to_mxfp -from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str -from executorch.backends.arm.tosa.mapping import TosaSpecialDtype from executorch.backends.arm.tosa.specification import ( TosaLoweringContext, TosaSpecification, ) from executorch.exir.dialects._ops import ops as exir_ops from torch.export import export -from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3 class _LinearModule(torch.nn.Module): @@ -52,11 +49,9 @@ def _get_nodes_from_target( ] -def _rewrite_linear_module( - config: MXFPOpConfig, -) -> tuple[torch.fx.GraphModule, list[torch.fx.Node], list[torch.fx.Node]]: +def test_rewrite_mxfp_linear_replaces_custom_op() -> None: model = _LinearModule(bias=True).eval() - to_mxfp(model, config, filter_fn=_is_linear) + to_mxfp(model, MXFPOpConfig(), filter_fn=_is_linear) exported = export(model, (torch.randn(4, 5, 32),), strict=False) tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp") @@ -71,11 +66,6 @@ def _rewrite_linear_module( matmul_nodes = _get_nodes_from_target( graph_module, exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default ) - return graph_module, cast_nodes, matmul_nodes - - -def test_rewrite_mxfp_linear_replaces_custom_op() -> None: - graph_module, cast_nodes, matmul_nodes = _rewrite_linear_module(MXFPOpConfig()) assert ( len(_get_nodes_from_target(graph_module, torch.ops.tosa_mxfp.linear.default)) @@ -98,34 +88,6 @@ def test_rewrite_mxfp_linear_replaces_custom_op() -> None: assert tuple(output_node.meta["val"][0].shape) == (4, 5, 8) -def 
test_rewrite_mxfp6_linear_marks_payload_dtype() -> None: - graph_module, cast_nodes, matmul_nodes = _rewrite_linear_module( - MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3) - ) - cast_node = cast_nodes[0] - matmul_node = matmul_nodes[0] - input_qdata_node = next( - node - for node in graph_module.graph.nodes - if node.op == "call_function" - and node.target == operator.getitem - and node.args[0] == cast_node - and node.args[1] == 0 - ) - weight_qdata_node = matmul_node.args[2] - assert isinstance(weight_qdata_node, torch.fx.Node) - - assert cast_node.kwargs["output_dtype"] == mxfp_dtype_to_str(DTYPE_FP6_E2M3) - assert matmul_node.kwargs["payload_dtype"] == mxfp_dtype_to_str(DTYPE_FP6_E2M3) - assert tuple(cast_node.meta["val"][0].shape) == (1, 4 * 5, 32) - assert ( - input_qdata_node.meta[TosaSpecialDtype.meta_key()] == TosaSpecialDtype.FP6E2M3 - ) - assert ( - weight_qdata_node.meta[TosaSpecialDtype.meta_key()] == TosaSpecialDtype.FP6E2M3 - ) - - def test_rewrite_mxfp_dual_linear() -> None: model = _DualLinearModule().eval() to_mxfp(model, MXFPOpConfig(), filter_fn=_is_linear) diff --git a/backends/arm/test/passes/test_symbolic_value_range.py b/backends/arm/test/passes/test_symbolic_value_range.py index 99dfafc93a6..7a6ecfdf79c 100644 --- a/backends/arm/test/passes/test_symbolic_value_range.py +++ b/backends/arm/test/passes/test_symbolic_value_range.py @@ -68,16 +68,3 @@ def test_evaluate_symbolic_expr_values_bails_out_for_large_symbol_ranges() -> No shape_env, symint = _make_shape_env(hint=3, compiler_min=1, compiler_max=400) assert evaluate_symbolic_expr_values(symint, shape_env) is None - - -def test_evaluate_symbolic_expr_values_does_not_require_shape_env_bounds( - monkeypatch, -) -> None: - shape_env, symint = _make_shape_env(hint=3, compiler_min=2, compiler_max=6) - - def raise_recursion(_expr): - raise RecursionError - - monkeypatch.setattr(shape_env, "bound_sympy", raise_recursion) - - assert evaluate_symbolic_expr_values(symint, shape_env) == {2, 3, 4, 5, 6} 
diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 9a63452e325..ff26d17ee13 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -2,10 +2,10 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + import importlib.resources as _resources import json import logging -import numbers import os import re import shutil @@ -14,11 +14,13 @@ import tempfile from collections.abc import Iterable from pathlib import Path + from types import NoneType from typing import Any, cast, Dict, List, Optional, Tuple import executorch.backends.arm.test as arm_test_package import executorch.backends.arm.tosa.schemas as tosa_schemas_package + import numpy as np import torch from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor @@ -29,6 +31,7 @@ NNHWC_INVERSE_ORDER, NNHWC_ORDER, ) + from executorch.backends.arm.ethosu import EthosUCompileSpec from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec from executorch.backends.arm.tosa.specification import Tosa_1_00, TosaSpecification @@ -40,6 +43,7 @@ from executorch.exir import ExecutorchProgramManager, ExportedProgram from executorch.exir.lowered_backend_module import LoweredBackendModule from torch.fx.node import Node + from torch.overrides import TorchFunctionMode from tosa.TosaGraph import TosaGraph # type: ignore[import-not-found, import-untyped] @@ -75,7 +79,6 @@ "corstone-320", "vkml_emulation_layer", } -INFER_SHAPES_PATH = "infer_shapes" class QuantizationParams: @@ -99,9 +102,7 @@ def __init__( self.dtype = dtype -def get_input_names( - program: ExportedProgram, is_lowered_module: bool = False -) -> list[str]: +def get_input_names(program: ExportedProgram) -> list[str]: """Get a list[str] with the names of the inputs to this model. Args: @@ -110,15 +111,7 @@ def get_input_names( A list of strings with the names of the model input. 
""" - - if not is_lowered_module: - return [spec.arg.name for spec in program.graph_signature.input_specs] - else: - return [ - user_input - for user_input in program.graph_signature.user_inputs - if isinstance(user_input, str) - ] + return [spec.arg.name for spec in program.graph_signature.input_specs] def get_input_quantization_params( @@ -211,59 +204,25 @@ def torch_tensor_to_numpy(tensor: torch.Tensor) -> np.ndarray: return tensor.numpy() -def torch_tensor_to_tosa_shape(tensor: torch.Tensor) -> list[int]: - shape = list(tensor.shape) - dim_order = tensor.dim_order() - if dim_order in (NHWC_ORDER, NNHWC_ORDER): - shape = [shape[index] for index in dim_order] - return [int(dim) for dim in shape] - - -def user_inputs_need_shape_inference(program: ExportedProgram) -> bool: - user_inputs = { - user_input - for user_input in program.graph_signature.user_inputs - if isinstance(user_input, str) - } - for node in program.graph.nodes: - if node.op != "placeholder" or node.name not in user_inputs: - continue - input_tensor = get_first_fake_tensor(node) - if any(not isinstance(dim, numbers.Integral) for dim in input_tensor.shape): - return True - return False - - def numpy_to_torch_tensor(array: np.ndarray, output_node: Node) -> torch.Tensor: output_tensor = get_first_fake_tensor(output_node) shape = output_tensor.shape dim_order = output_tensor.dim_order() - - def is_concrete_shape(shape_like) -> bool: - return all(isinstance(dim, numbers.Integral) for dim in shape_like) - - def to_torch_tensor() -> torch.Tensor: - if array.dtype.type is np.void: - # If dtype is void, "cheat" and use the output_tensor dtype. 
- return torch.frombuffer(array, dtype=output_tensor.dtype) - return torch.from_numpy(array) - if dim_order == NHWC_ORDER: - tensor = to_torch_tensor() - if is_concrete_shape(shape): - tensor = tensor.reshape([shape[i] for i in NHWC_ORDER]) + shape_with_dim_order = [shape[i] for i in NHWC_ORDER] + tensor = torch.from_numpy(array).reshape(shape_with_dim_order) return tensor.permute(NHWC_INVERSE_ORDER).to(memory_format=torch.channels_last) elif dim_order == NNHWC_ORDER: - tensor = to_torch_tensor() - if is_concrete_shape(shape): - tensor = tensor.reshape([shape[i] for i in NNHWC_ORDER]) - return tensor.permute(NNHWC_INVERSE_ORDER) + shape_with_dim_order = [shape[i] for i in NNHWC_ORDER] + tensor = torch.from_numpy(array).reshape(shape_with_dim_order) + return tensor.permute(NNHWC_INVERSE_ORDER).to(memory_format=torch.channels_last) else: - tensor = to_torch_tensor() - - if is_concrete_shape(shape): - return tensor.reshape(shape) - return tensor + if array.dtype.type is np.void: + # If dtype is void, "cheat" and use the output_tensor dtype. 
+ tensor = torch.frombuffer(array, dtype=output_tensor.dtype) + else: + tensor = torch.from_numpy(array) + return tensor.reshape(shape) class TosaReferenceModelDispatch(TorchFunctionMode): @@ -275,65 +234,12 @@ def __init__(self): self.ran_tosa_dispatch = False super().__init__() - def _generate_shape_inference_json( - self, - tosa_buffer: bytes, - artifact_path: Path, - test_case_path: Path, - input_names: list[str], - inputs: Tuple[torch.Tensor, ...], - ): - shapes = dict( - zip(input_names, [torch_tensor_to_tosa_shape(input) for input in inputs]) - ) - with open(test_case_path, "w", encoding="utf-8") as f: - json.dump({"tosa_file": str(artifact_path), "shapes": shapes}, f, indent=2) - - def _run_infer_shapes( - self, - tosa_buffer: bytes, - input_names: list[str], - inputs: Tuple[torch.Tensor, ...], - temp_dir_path: Path, - infer_shapes_path: str = INFER_SHAPES_PATH, - ) -> bytes: - model_suffix = "model.tosa" - tosa_sym_int_model = temp_dir_path / model_suffix - tosa_sym_int_model.write_bytes(tosa_buffer) - test_case_file = temp_dir_path / "test_case.json" - - self._generate_shape_inference_json( - tosa_buffer, tosa_sym_int_model, test_case_file, input_names, inputs - ) - subprocess.run( - [ - infer_shapes_path, - f"{test_case_file}", - ], - check=True, - capture_output=True, - text=True, - ) # nosec - resolved_file = temp_dir_path / f"resolved_{model_suffix}" - with open(resolved_file, "rb") as f: - return f.read() - def _tosa_dispatch(self, lowered_backend_module: LoweredBackendModule, inputs): tosa_buffer = lowered_backend_module.processed_bytes compile_spec = TosaCompileSpec._from_list(lowered_backend_module.compile_specs) - tosa_spec = compile_spec.tosa_spec - output_node = lowered_backend_module.original_module.graph.output_node() - if tosa_spec.support_extension("shape") and user_inputs_need_shape_inference( - lowered_backend_module.original_module - ): - input_names = get_input_names(lowered_backend_module.original_module, True) - # Generate json file 
for shape inference extension, which is required by the reference model. - with tempfile.TemporaryDirectory() as temp_dir: - tosa_buffer = self._run_infer_shapes( - tosa_buffer, input_names, inputs, Path(temp_dir) - ) - return run_tosa_graph(tosa_buffer, tosa_spec, inputs, output_node) + output_node = lowered_backend_module.original_module.graph.output_node() + return run_tosa_graph(tosa_buffer, compile_spec.tosa_spec, inputs, output_node) def __exit__(self, exc_type, exc_val, exc_tb): super().__exit__(exc_type, exc_val, exc_tb) @@ -376,7 +282,7 @@ def __torch_function__(self, func, types, args=..., kwargs=None): def run_target( executorch_program_manager: ExecutorchProgramManager, - inputs: Tuple[torch.Tensor, ...], + inputs: Tuple[torch.Tensor], intermediate_path: str | Path, target_board: str, elf_path: str | Path, @@ -404,7 +310,7 @@ def run_target( def save_inputs_to_file( exported_program: ExportedProgram, - inputs: Tuple[torch.Tensor, ...], + inputs: Tuple[torch.Tensor], intermediate_path: str | Path, ): input_file_paths: list[str] = [] @@ -436,7 +342,7 @@ def get_output_from_file( def run_vkml_emulation_layer( executorch_program_manager: ExecutorchProgramManager, - inputs: Tuple[torch.Tensor, ...], + inputs: Tuple[torch.Tensor], intermediate_path: str | Path, elf_path: str | Path, ): @@ -484,7 +390,7 @@ def run_vkml_emulation_layer( def run_corstone( executorch_program_manager: ExecutorchProgramManager, - inputs: Tuple[torch.Tensor, ...], + inputs: Tuple[torch.Tensor], intermediate_path: str | Path, target_board: str, elf_path: str | Path, diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index d321766e8d8..4df310f6dc1 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -23,7 +23,6 @@ def define_arm_tests(): "ops/test_log10.py", "ops/test_max_pool1d.py", "ops/test_mul.py", - "ops/test_mxfp_conv2d.py", "ops/mxfp/test_mxfp_linear.py", "ops/test_permute.py", "ops/test_rsqrt.py", @@ -58,14 +57,12 @@ def 
define_arm_tests(): # "misc/test_evaluate_model.py", "misc/test_pass_pipeline_config.py", "misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py", - "misc/tosa_dialect/test_tosa_dialect_mxfp_conv2d.py", "misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py", "misc/tosa_dialect/test_tosa_resize.py", "misc/test_tosa_spec.py", "misc/test_bn_relu_folding_qat.py", "misc/test_custom_partition.py", "misc/test_debug_hook.py", - "misc/test_mxfp_conv2d_ao.py", "misc/test_mxfp_linear_ao.py", "misc/test_post_quant_device_switch.py", "misc/test_vgf_check_env.py", diff --git a/backends/arm/tosa/dialect/__init__.py b/backends/arm/tosa/dialect/__init__.py index 4d059b64efe..0585f7a1ff8 100644 --- a/backends/arm/tosa/dialect/__init__.py +++ b/backends/arm/tosa/dialect/__init__.py @@ -11,7 +11,6 @@ binary_elementwise, cast_to_block_scaled, conv2d, - conv2d_block_scaled, conv3d, custom, data_layout_ops, diff --git a/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py b/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py index 8dbff7c11c5..ed109be6124 100644 --- a/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py +++ b/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py @@ -5,28 +5,24 @@ from __future__ import annotations -from typing import cast - import torch -from executorch.backends.arm.ao_ext.mxfp import mxfp_str_to_dtype from executorch.backends.arm.tosa.dialect.lib import TosaValueError from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op from executorch.backends.arm.tosa.specification import ( get_context_spec, TosaSpecification, ) -from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2 @register_fake_tosa_op( - "CAST_TO_BLOCK_SCALED(Tensor input, SymInt block_size, str output_dtype) -> (Tensor, Tensor)", + "CAST_TO_BLOCK_SCALED(Tensor input, SymInt block_size, ScalarType output_dtype) -> (Tensor, Tensor)", [TosaSpecification.create_from_string("TOSA-1.1+FP")], ) def CAST_TO_BLOCK_SCALED( input: 
torch.Tensor, block_size: int, - output_dtype: str, + output_dtype: torch.dtype, ) -> tuple[torch.Tensor, torch.Tensor]: tosa_spec = get_context_spec() @@ -66,25 +62,12 @@ def CAST_TO_BLOCK_SCALED( ) scale_tensor_dtype = torch.float8_e8m0fnu - elem_dtype = mxfp_str_to_dtype(output_dtype) - if elem_dtype not in ( - torch.float4_e2m1fn_x2, - DTYPE_FP6_E2M3, - DTYPE_FP6_E3M2, - torch.float8_e4m3fn, - torch.float8_e5m2, - ): + if output_dtype not in (torch.float8_e4m3fn, torch.float8_e5m2): raise TosaValueError( f"Unsupported block-scaled output dtype {output_dtype}", op="CAST_TO_BLOCK_SCALED", ) scale_shape = (*input.shape[:-1], input.shape[-1] // block_size) - if elem_dtype == torch.float4_e2m1fn_x2: - output_shape = (*input.shape[:-1], input.shape[-1] // 2) - output_data = input.new_empty(output_shape, dtype=torch.uint8) - elif elem_dtype in (DTYPE_FP6_E2M3, DTYPE_FP6_E3M2): - output_data = input.new_empty(input.shape, dtype=torch.uint8) - else: - output_data = torch.empty_like(input, dtype=cast(torch.dtype, elem_dtype)) + output_data = torch.empty_like(input, dtype=output_dtype) output_scale = input.new_empty(scale_shape, dtype=scale_tensor_dtype) return output_data, output_scale diff --git a/backends/arm/tosa/dialect/ops/conv2d.py b/backends/arm/tosa/dialect/ops/conv2d.py index d0db2d60fcd..5af0ca1617a 100644 --- a/backends/arm/tosa/dialect/ops/conv2d.py +++ b/backends/arm/tosa/dialect/ops/conv2d.py @@ -3,6 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+import math from typing import Optional import torch @@ -88,23 +89,6 @@ def validate_conv2d_args_dtypes( # noqa: C901 return output_dtype -def conv_output_dim( - input_dim: int | torch.SymInt, - kernel_dim: int, - stride: int, - pad_before: int | torch.SymInt, - pad_after: int | torch.SymInt, - dilation: int, -) -> int | torch.SymInt: - receptive_field = dilation * (kernel_dim - 1) + 1 - total_pad = pad_before + pad_after - - if stride == 1: - return input_dim + total_pad - receptive_field + 1 - - return (input_dim + total_pad - receptive_field) // stride + 1 - - @register_fake_tosa_op( "CONV2D(Tensor input, " "Tensor weight, " @@ -126,14 +110,17 @@ def CONV2D( output_dtype = validate_conv2d_args_dtypes(tosa_spec, x, weight, bias, op="CONV2D") + torch_pad = [pad[0], pad[2]] N = x.shape[0] - H_in, W_in = x.shape[1:3] C_out = weight.shape[0] - H_out = conv_output_dim( - H_in, weight.shape[1], stride[0], pad[0], pad[1], dilation[0] + H_in, W_in = x.shape[1], x.shape[2] + H_out = math.floor( + (H_in + 2 * torch_pad[0] - dilation[0] * (weight.shape[1] - 1) - 1) / stride[0] + + 1 ) - W_out = conv_output_dim( - W_in, weight.shape[2], stride[1], pad[2], pad[3], dilation[1] + W_out = math.floor( + (W_in + 2 * torch_pad[1] - dilation[1] * (weight.shape[2] - 1) - 1) / stride[1] + + 1 ) output_shape = [N, H_out, W_out, C_out] return torch.empty(size=output_shape, dtype=output_dtype) diff --git a/backends/arm/tosa/dialect/ops/conv3d.py b/backends/arm/tosa/dialect/ops/conv3d.py index a81ae0dae53..67ceb0596c6 100644 --- a/backends/arm/tosa/dialect/ops/conv3d.py +++ b/backends/arm/tosa/dialect/ops/conv3d.py @@ -3,14 +3,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+import math from typing import Optional import torch from executorch.backends.arm.tosa.dialect.lib import TosaValueError -from executorch.backends.arm.tosa.dialect.ops.conv2d import ( - conv_output_dim, - validate_conv2d_args_dtypes, -) +from executorch.backends.arm.tosa.dialect.ops.conv2d import validate_conv2d_args_dtypes from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op from executorch.backends.arm.tosa.specification import ( get_context_spec, @@ -37,7 +35,7 @@ def validate_conv3d_args_dtypes( "Tensor weight, " "Tensor bias, " "int[3] stride, " - "SymInt[6] pad, " + "int[6] pad, " "int[3] dilation) -> Tensor", TosaSpecification.all_versions_and_profiles(), ) @@ -46,24 +44,28 @@ def CONV3D( weight: torch.Tensor, bias: torch.Tensor, stride: list[int], - pad: list[int | torch.SymInt], + pad: list[int], dilation: list[int], ) -> torch.Tensor: tosa_spec = get_context_spec() output_dtype = validate_conv3d_args_dtypes(tosa_spec, x, weight, bias) + torch_pad = [pad[0], pad[2], pad[4]] N = x.shape[0] C_out = weight.shape[0] - D_in, H_in, W_in = x.shape[1:4] - D_out = conv_output_dim( - D_in, weight.shape[1], stride[0], pad[0], pad[1], dilation[0] + D_in, H_in, W_in = x.shape[1], x.shape[2], x.shape[3] + D_out = math.floor( + (D_in + 2 * torch_pad[0] - dilation[0] * (weight.shape[1] - 1) - 1) / stride[0] + + 1 ) - H_out = conv_output_dim( - H_in, weight.shape[2], stride[1], pad[2], pad[3], dilation[1] + H_out = math.floor( + (H_in + 2 * torch_pad[1] - dilation[1] * (weight.shape[2] - 1) - 1) / stride[1] + + 1 ) - W_out = conv_output_dim( - W_in, weight.shape[3], stride[2], pad[4], pad[5], dilation[2] + W_out = math.floor( + (W_in + 2 * torch_pad[2] - dilation[2] * (weight.shape[3] - 1) - 1) / stride[2] + + 1 ) output_shape = [N, D_out, H_out, W_out, C_out] return torch.empty(size=output_shape, dtype=output_dtype) diff --git a/backends/arm/tosa/dialect/ops/depthwise_conv2d.py b/backends/arm/tosa/dialect/ops/depthwise_conv2d.py index 
83ef3ff72fb..ae864f29d62 100644 --- a/backends/arm/tosa/dialect/ops/depthwise_conv2d.py +++ b/backends/arm/tosa/dialect/ops/depthwise_conv2d.py @@ -3,11 +3,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import math + import torch -from executorch.backends.arm.tosa.dialect.ops.conv2d import ( - conv_output_dim, - validate_conv2d_args_dtypes, -) +from executorch.backends.arm.tosa.dialect.ops.conv2d import validate_conv2d_args_dtypes from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op from executorch.backends.arm.tosa.specification import ( @@ -39,11 +38,17 @@ def DEPTHWISE_CONV2D( tosa_spec, x, weight, bias, op="DEPTHWISE_CONV2D" ) + torch_pad = [pad[0], pad[2]] + # Weight format is [KH, KW, IC, M], where C_out = IC * M. kernel_h, kernel_w = weight.shape[0], weight.shape[1] C_out = weight.shape[2] * weight.shape[3] N = x.shape[0] - H_in, W_in = x.shape[1:3] - H_out = conv_output_dim(H_in, kernel_h, stride[0], pad[0], pad[1], dilation[0]) - W_out = conv_output_dim(W_in, kernel_w, stride[1], pad[2], pad[3], dilation[1]) + H_in, W_in = x.shape[1], x.shape[2] + H_out = math.floor( + (H_in + 2 * torch_pad[0] - dilation[0] * (kernel_h - 1) - 1) / stride[0] + 1 + ) + W_out = math.floor( + (W_in + 2 * torch_pad[1] - dilation[1] * (kernel_w - 1) - 1) / stride[1] + 1 + ) output_shape = [N, H_out, W_out, C_out] return torch.empty(size=output_shape, dtype=output_dtype) diff --git a/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py b/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py index fcea104320f..b42e2855e4c 100644 --- a/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py +++ b/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py @@ -7,11 +7,6 @@ import torch -from executorch.backends.arm.ao_ext.mxfp import ( - mxfp_str_to_dtype, - MXFPDType, - SUPPORTED_MXFP_DTYPES, -) from executorch.backends.arm.tosa.dialect.lib import TosaValueError 
from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op from executorch.backends.arm.tosa.specification import ( @@ -33,39 +28,18 @@ def _validate_block_size(block_size: int) -> None: ) -def _get_payload_dtype( - data: torch.Tensor, - payload_dtype: str = "", -) -> MXFPDType: - if payload_dtype: - return mxfp_str_to_dtype(payload_dtype) - if data.dtype == torch.uint8: - return torch.float4_e2m1fn_x2 - return data.dtype - - -def _get_logical_last_dim(data: torch.Tensor, payload_dtype: str = "") -> int: - last_dim = data.shape[-1] - if _get_payload_dtype(data, payload_dtype) == torch.float4_e2m1fn_x2: - return last_dim * 2 - return last_dim - - def _validate_dtypes( A_data: torch.Tensor, A_scale: torch.Tensor, B_data: torch.Tensor, B_scale: torch.Tensor, - payload_dtype: str = "", ) -> None: - A_dtype = _get_payload_dtype(A_data, payload_dtype) - B_dtype = _get_payload_dtype(B_data, payload_dtype) - if A_dtype not in SUPPORTED_MXFP_DTYPES: + if A_data.dtype not in (torch.float8_e4m3fn, torch.float8_e5m2): raise TosaValueError( f"Unsupported A_data dtype {A_data.dtype}", op="MATMUL_T_BLOCK_SCALED", ) - if B_dtype != A_dtype: + if B_data.dtype != A_data.dtype: raise TosaValueError( f"B_data dtype {B_data.dtype} must match A_data dtype {A_data.dtype}", op="MATMUL_T_BLOCK_SCALED", @@ -83,7 +57,6 @@ def _validate_shapes( B_data: torch.Tensor, B_scale: torch.Tensor, block_size: int, - payload_dtype: str = "", ) -> tuple[int, int, int]: if A_data.ndim != 3 or A_scale.ndim != 3 or B_data.ndim != 3 or B_scale.ndim != 3: raise TosaValueError( @@ -91,10 +64,8 @@ def _validate_shapes( op="MATMUL_T_BLOCK_SCALED", ) - N, H = A_data.shape[:2] - D, W = B_data.shape[:2] - C = _get_logical_last_dim(A_data, payload_dtype) - Cb = _get_logical_last_dim(B_data, payload_dtype) + N, H, C = A_data.shape + D, W, Cb = B_data.shape if C != Cb: raise TosaValueError( f"A_data last dim {C} must match B_data last dim {Cb}", @@ -129,8 +100,7 @@ def _validate_shapes( 
@register_fake_tosa_op( - "MATMUL_T_BLOCK_SCALED(Tensor A_data, Tensor A_scale, Tensor B_data, " - "Tensor B_scale, SymInt block_size, str payload_dtype='') -> Tensor", + "MATMUL_T_BLOCK_SCALED(Tensor A_data, Tensor A_scale, Tensor B_data, Tensor B_scale, SymInt block_size) -> Tensor", [TosaSpecification.create_from_string("TOSA-1.1+FP")], ) def MATMUL_T_BLOCK_SCALED( @@ -139,7 +109,6 @@ def MATMUL_T_BLOCK_SCALED( B_data: torch.Tensor, B_scale: torch.Tensor, block_size: int, - payload_dtype: str = "", ) -> torch.Tensor: tosa_spec = get_context_spec() @@ -150,13 +119,12 @@ def MATMUL_T_BLOCK_SCALED( ) _validate_block_size(block_size) - _validate_dtypes(A_data, A_scale, B_data, B_scale, payload_dtype) + _validate_dtypes(A_data, A_scale, B_data, B_scale) output_shape = _validate_shapes( A_data, A_scale, B_data, B_scale, block_size, - payload_dtype, ) return A_data.new_empty(output_shape, dtype=torch.float32) diff --git a/backends/arm/tosa/mapping.py b/backends/arm/tosa/mapping.py index 5e661676149..245a9c00235 100644 --- a/backends/arm/tosa/mapping.py +++ b/backends/arm/tosa/mapping.py @@ -35,9 +35,6 @@ class TosaSpecialDtype(Enum): """Special TOSA dtypes not natively expressed in PyTorch.""" - FP4E2M1 = ts.DType.FP4E2M1 - FP6E2M3 = ts.DType.FP6E2M3 - FP6E3M2 = ts.DType.FP6E3M2 INT48 = ts.DType.INT48 INT4 = ts.DType.INT4 SHAPE = ts.DType.SHAPE @@ -105,7 +102,6 @@ def map_dtype(data_type: torch.dtype) -> Any: torch.float8_e4m3fn: ts.DType.FP8E4M3, torch.float8_e5m2: ts.DType.FP8E5M2, torch.float8_e8m0fnu: ts.DType.FP8UE8M0, - torch.float4_e2m1fn_x2: ts.DType.FP4E2M1, torch.int8: ts.DType.INT8, # TOSA uses signless int8; unsigned semantics are expressed via RESCALE. 
torch.uint8: ts.DType.INT8, @@ -160,10 +156,8 @@ def extract_tensor_meta(meta): raise ValueError( f"Expected first value in node.meta['val'] to be FakeTensor, got {val.__class__}" ) - shape = tuple(val.size()) - if special_dtype == TosaSpecialDtype.FP4E2M1 and val.dtype == torch.uint8: - shape = (*shape[:-1], shape[-1] * 2) dtype = map_dtype(val.dtype) + shape = tuple(val.size()) return (dtype, shape) @@ -255,15 +249,6 @@ def __validate(self, tosa_spec: TosaSpecification) -> bool: or tosa_spec.support_extension("mxfp") ): return False - case ts.DType.FP4E2M1: - if not tosa_spec.support_extension("mxfp"): - return False - case ts.DType.FP6E2M3: - if not tosa_spec.support_extension("mxfp"): - return False - case ts.DType.FP6E3M2: - if not tosa_spec.support_extension("mxfp"): - return False return True diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py index 8c4257e9472..37b9cd7cc2a 100644 --- a/backends/arm/tosa/partitioner.py +++ b/backends/arm/tosa/partitioner.py @@ -309,9 +309,7 @@ def _detag_boundary_nodes( elif detag_first_fp_node and not is_q_node and not is_dq_node: # For non Q/DQ nodes, remove tag from first node in partition if any input has fp dtype for input in node.all_input_nodes: - if is_partitioned(input, tag) or isinstance( - input.meta["val"], torch.SymInt - ): + if is_partitioned(input, tag): continue if get_first_fake_tensor(input).dtype.is_floating_point: reporter.report_reject( @@ -358,13 +356,7 @@ def _partition_has_invalid_uint8(self, partition: Partition, tag: str) -> bool: if dtype is None: try: dtype = get_first_fake_tensor(node).dtype - except ( - AttributeError, - KeyError, - RuntimeError, - ValueError, - TypeError, - ): + except (AttributeError, KeyError, RuntimeError, ValueError): dtype = None if dtype is None: continue diff --git a/backends/arm/tosa/utils.py b/backends/arm/tosa/utils.py index b1d727d7d01..b44793cec5f 100644 --- a/backends/arm/tosa/utils.py +++ b/backends/arm/tosa/utils.py @@ -164,10 +164,6 
@@ def build_reshape_tosa( def normalize_symint(shape): """Dynamic shapes in executorch are represented with torch.SymInt objects in the shapes, in TOSA we do not have this concept and instead use -1. - - This function replaces each symbolic dimension with -1. Static dimensions - are preserved unchanged. - """ removed_symints = tuple([-1 if isinstance(d, torch.SymInt) else d for d in shape]) return list(removed_symints) diff --git a/backends/arm/vgf/backend.py b/backends/arm/vgf/backend.py index cc2e5a088f4..f062cdc90c6 100644 --- a/backends/arm/vgf/backend.py +++ b/backends/arm/vgf/backend.py @@ -14,12 +14,10 @@ import logging import os # nosec B404 - used alongside subprocess for tool invocation -import shlex import shutil import subprocess # nosec B404 - required to drive external converter CLI import tempfile -from dataclasses import dataclass -from typing import Any, final, List +from typing import final, List from executorch.backends.arm._passes import RewriteConvPass from executorch.backends.arm._passes.arm_pass_manager import ( @@ -40,7 +38,7 @@ ) from executorch.backends.arm.vgf.model_converter import ( # type: ignore[import-not-found] model_converter_env, - require_model_converter_executable, + require_model_converter_binary, ) from executorch.exir.backend.backend_details import ( # type: ignore[import-not-found] BackendDetails, @@ -54,94 +52,6 @@ # debug functionality logger = logging.getLogger(__name__) -STATUS_OK = "PASS" -STATUS_FAIL = "FAIL" -VGF_BACKEND_NAME = "VgfBackend" - - -@dataclass(frozen=True) -class VgfRuntimeEnvironmentCheck: - """One VGF runtime backend environment preflight result. - - This lives next to the Python VGF backend name and backend implementation, - while importing the actual ExecuTorch runtime lazily so AoT import behavior - remains unchanged. 
- - """ - - name: str - status: str - detail: str - action: str | None = None - - @property - def ok(self) -> bool: - return self.status != STATUS_FAIL - - def to_dict(self) -> dict[str, str | None]: - return { - "name": self.name, - "status": self.status, - "detail": self.detail, - "action": self.action, - } - - -def _load_runtime() -> Any: - from executorch.runtime import Runtime - - return Runtime.get() - - -def check_vgf_runtime_backend_environment() -> VgfRuntimeEnvironmentCheck: - """Check whether the installed runtime exposes the VGF backend.""" - - try: - runtime = _load_runtime() - except Exception as exc: - return VgfRuntimeEnvironmentCheck( - "VGF runtime backend", - STATUS_FAIL, - f"Could not initialize executorch.runtime.Runtime: {exc}", - "Install or rebuild ExecuTorch with runtime pybindings. For source " - "builds, enable the VGF runtime backend and reinstall the package.", - ) - - try: - registered_backend_names = list( - runtime.backend_registry.registered_backend_names - ) - is_available = runtime.backend_registry.is_available( - backend_name=VGF_BACKEND_NAME - ) - except Exception as exc: - return VgfRuntimeEnvironmentCheck( - "VGF runtime backend", - STATUS_FAIL, - f"Runtime backend registry query failed: {exc}", - "Reinstall or rebuild ExecuTorch with backend registry pybindings.", - ) - - if is_available: - return VgfRuntimeEnvironmentCheck( - "VGF runtime backend", - STATUS_OK, - f"{VGF_BACKEND_NAME} is available in the runtime backend registry.", - ) - - rendered = ", ".join(registered_backend_names[:20]) - if len(registered_backend_names) > 20: - rendered += ", ..." - - return VgfRuntimeEnvironmentCheck( - "VGF runtime backend", - STATUS_FAIL, - f"{VGF_BACKEND_NAME} is not available. Registered backends: " - f"{rendered or ''}.", - "Use a runtime build/package that includes the VGF backend. 
For source " - "builds, configure with -DEXECUTORCH_BUILD_VGF=ON and reinstall.", - ) - def _register_grid_sampler_rewrite_pass() -> None: """Register VGF-only custom shader lowering passes.""" @@ -252,52 +162,6 @@ def preprocess( return PreprocessResult(processed_bytes=binary) -def _format_repro_command(command: List[str]) -> str: - """Return a shell-safe command string for reproducing converter failures.""" - return " ".join(shlex.quote(arg) for arg in command) - - -def _copy_failure_artifacts( - tosa_path: str, - artifact_path: str | None, - tag_name: str, -) -> str | None: - """Copy the failing TOSA input to the artifact directory, if configured. - - Args: - tosa_path: Temporary TOSA flatbuffer passed to model-converter. - artifact_path: User-configured intermediate artifact directory. - tag_name: Optional delegation tag used to disambiguate artifacts. - - Returns: - Path to the copied TOSA file, or None if no artifact path was configured. - - """ - if not artifact_path: - return None - - os.makedirs(artifact_path, exist_ok=True) - - suffix = f"_{tag_name}" if tag_name else "" - failure_tosa_path = os.path.join( - artifact_path, - f"failed_model_converter_input{suffix}.tosa", - ) - shutil.copy2(tosa_path, failure_tosa_path) - return failure_tosa_path - - -def _replace_converter_input_path( - conversion_command: List[str], - input_path: str, -) -> List[str]: - """Return a converter command that uses a preserved TOSA input path.""" - input_flag_index = conversion_command.index("-i") - repro_command = list(conversion_command) - repro_command[input_flag_index + 1] = input_path - return repro_command - - def vgf_compile( tosa_flatbuffer: bytes, compile_flags: List[str], @@ -327,7 +191,7 @@ def vgf_compile( f.write(tosa_flatbuffer) compile_flags = [f for f in compile_flags if f and f.strip()] - converter_binary = str(require_model_converter_executable()) + converter_binary = require_model_converter_binary() vgf_path = tosa_path + ".vgf" conversion_command = [ 
converter_binary, @@ -346,21 +210,11 @@ def vgf_compile( env=model_converter_env(), ) except subprocess.CalledProcessError as process_error: - failure_tosa_path = _copy_failure_artifacts( - tosa_path, - artifact_path, - tag_name, - ) - repro_command = ( - _replace_converter_input_path(conversion_command, failure_tosa_path) - if failure_tosa_path - else conversion_command - ) + conversion_command_str = " ".join(conversion_command) raise RuntimeError( - "Vgf compiler failed.\n" - f"Repro command:\n {_format_repro_command(repro_command)}\n" - f"Stderr:\n{process_error.stderr.decode()}\n" - f"Stdout:\n{process_error.stdout.decode()}" + f"Vgf compiler ('{conversion_command_str}') failed with error:\n \ + {process_error.stderr.decode()}\n \ + Stdout:\n{process_error.stdout.decode()}" ) if artifact_path: diff --git a/backends/arm/vgf/check_env.py b/backends/arm/vgf/check_env.py index 2c7fb9c5396..576964df160 100644 --- a/backends/arm/vgf/check_env.py +++ b/backends/arm/vgf/check_env.py @@ -26,18 +26,25 @@ import os import re import shutil +import subprocess # nosec B404 - invoked only for trusted local tools import sys from collections.abc import Sequence from dataclasses import dataclass from pathlib import Path from typing import Any -from executorch.backends.arm.vgf import model_converter +from executorch.backends.arm.vgf.model_converter import ( + find_model_converter_binary, + model_converter_env, +) + STATUS_OK = "PASS" STATUS_WARN = "WARN" STATUS_FAIL = "FAIL" +VGF_BACKEND_NAME = "VgfBackend" + _REQUIRED_VKML_INSTANCE_LAYERS = { "VK_LAYER_ML_Graph_Emulation", "VK_LAYER_ML_Tensor_Emulation", @@ -209,17 +216,6 @@ def _format_check(check: VgfEnvironmentCheck) -> str: return "\n".join(lines) -def _as_environment_check(check: Any) -> VgfEnvironmentCheck: - """Convert a module-owned preflight result into the CLI report type.""" - - return VgfEnvironmentCheck( - check.name, - check.status, - check.detail, - getattr(check, "action", None), - ) - - def _repo_root() -> Path: 
resolved = Path(__file__).resolve() for parent in resolved.parents: @@ -301,22 +297,165 @@ def _check_tosa_serializer() -> VgfEnvironmentCheck: ) +def _resolve_executable(binary: str) -> Path | None: + path = Path(binary) + if path.is_absolute() or path.parent != Path("."): + if _safe_is_file(path) and os.access(path, os.X_OK): + return path + return None + + resolved = shutil.which(binary) + if resolved: + return Path(resolved) + return None + + +def _command_output(result: subprocess.CompletedProcess[str]) -> str: + text = "\n".join( + part.strip() for part in (result.stdout, result.stderr) if part.strip() + ) + lines = text.splitlines() + if not lines: + return "" + return "\n".join(lines[:4]) + + def _check_model_converter() -> VgfEnvironmentCheck: - """Convert a module-owned preflight result into the CLI report type.""" - return _as_environment_check(model_converter.check_model_converter_environment()) + binary = find_model_converter_binary() + if binary is None: + return VgfEnvironmentCheck( + "MLSDK model converter", + STATUS_FAIL, + "Could not find model-converter on PATH and MODEL_CONVERTER_PATH " + "does not point to an executable file.", + "Install VGF AoT dependencies with " + "python -m pip install 'executorch[vgf]' or, in a source checkout, " + "python -m pip install -r backends/arm/requirements-arm-vgf.txt. 
" + "Alternatively set MODEL_CONVERTER_PATH to the converter executable.", + ) + + executable = _resolve_executable(binary) + if executable is None: + return VgfEnvironmentCheck( + "MLSDK model converter", + STATUS_FAIL, + f"Resolved converter candidate {binary!r}, but it is not executable.", + "Fix MODEL_CONVERTER_PATH or place model-converter on PATH.", + ) + + try: + result = subprocess.run( # nosec B603 - local converter executable + [str(executable), "--version"], + check=False, + capture_output=True, + text=True, + timeout=20, + env=model_converter_env(), + ) + except Exception as exc: + return VgfEnvironmentCheck( + "MLSDK model converter", + STATUS_FAIL, + f"Found {executable}, but running '--version' failed: {exc}", + "Check MODEL_CONVERTER_LIB_DIR and the process loader paths. " + "For source setup, source examples/arm/arm-scratch/setup_path.sh.", + ) + + if result.returncode != 0: + return VgfEnvironmentCheck( + "MLSDK model converter", + STATUS_FAIL, + f"{executable} --version exited with {result.returncode}:\n" + f"{_command_output(result)}", + "Check that the model-converter binary and its shared libraries are " + "from the same MLSDK install.", + ) + + return VgfEnvironmentCheck( + "MLSDK model converter", + STATUS_OK, + f"{executable} --version succeeded:\n{_command_output(result)}", + ) def _check_model_converter_lib_dir() -> VgfEnvironmentCheck: - """Convert a module-owned preflight result into the CLI report type.""" - return _as_environment_check( - model_converter.check_model_converter_lib_dir_environment() + lib_dir = os.environ.get("MODEL_CONVERTER_LIB_DIR") + if not lib_dir: + return VgfEnvironmentCheck( + "MODEL_CONVERTER_LIB_DIR", + STATUS_OK, + "MODEL_CONVERTER_LIB_DIR is not set; relying on the process loader " + "paths. 
This is OK when model-converter --version succeeds.", + ) + + path = Path(lib_dir).expanduser() + if _safe_is_dir(path): + return VgfEnvironmentCheck( + "MODEL_CONVERTER_LIB_DIR", + STATUS_OK, + f"MODEL_CONVERTER_LIB_DIR points to existing directory: {path}", + ) + + return VgfEnvironmentCheck( + "MODEL_CONVERTER_LIB_DIR", + STATUS_FAIL, + f"MODEL_CONVERTER_LIB_DIR={lib_dir!r} does not exist or is not a directory.", + "Unset MODEL_CONVERTER_LIB_DIR or set it to the converter library directory.", ) +def _load_runtime() -> Any: + from executorch.runtime import Runtime + + return Runtime.get() + + def _check_runtime_vgf_backend() -> VgfEnvironmentCheck: - from executorch.backends.arm.vgf import backend as vgf_backend + try: + runtime = _load_runtime() + except Exception as exc: + return VgfEnvironmentCheck( + "VGF runtime backend", + STATUS_FAIL, + f"Could not initialize executorch.runtime.Runtime: {exc}", + "Install or rebuild ExecuTorch with runtime pybindings. For source " + "builds, enable the VGF runtime backend and reinstall the package.", + ) + + try: + registered_backend_names = list( + runtime.backend_registry.registered_backend_names + ) + is_available = runtime.backend_registry.is_available( + backend_name=VGF_BACKEND_NAME + ) + except Exception as exc: + return VgfEnvironmentCheck( + "VGF runtime backend", + STATUS_FAIL, + f"Runtime backend registry query failed: {exc}", + "Reinstall or rebuild ExecuTorch with backend registry pybindings.", + ) + + if is_available: + return VgfEnvironmentCheck( + "VGF runtime backend", + STATUS_OK, + f"{VGF_BACKEND_NAME} is available in the runtime backend registry.", + ) - return _as_environment_check(vgf_backend.check_vgf_runtime_backend_environment()) + rendered = ", ".join(registered_backend_names[:20]) + if len(registered_backend_names) > 20: + rendered += ", ..." + + return VgfEnvironmentCheck( + "VGF runtime backend", + STATUS_FAIL, + f"{VGF_BACKEND_NAME} is not available. 
Registered backends: " + f"{rendered or ''}.", + "Use a runtime build/package that includes the VGF backend. For source " + "builds, configure with -DEXECUTORCH_BUILD_VGF=ON and reinstall.", + ) def _package_dirs(package: str) -> list[Path]: diff --git a/backends/arm/vgf/model_converter.py b/backends/arm/vgf/model_converter.py index d76abbbcdf6..2d3868837b1 100644 --- a/backends/arm/vgf/model_converter.py +++ b/backends/arm/vgf/model_converter.py @@ -1,4 +1,4 @@ -# Copyright 2025-2026 Arm Limited and/or its affiliates. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -6,45 +6,12 @@ from __future__ import annotations import os -import subprocess # nosec B404 - invoked only for trusted local converter tools -from dataclasses import dataclass -from pathlib import Path from shutil import which from typing import Optional MODEL_CONVERTER_BINARY = "model-converter" _MODEL_CONVERTER_FALLBACK_BINARY = "model_converter" -STATUS_OK = "PASS" -STATUS_FAIL = "FAIL" - - -@dataclass(frozen=True) -class ModelConverterEnvironmentCheck: - """One model-converter environment preflight result. - - This lives in the same module that resolves and launches the converter so - the standalone VGF preflight CLI cannot drift from the actual compiler path. 
- - """ - - name: str - status: str - detail: str - action: str | None = None - - @property - def ok(self) -> bool: - return self.status != STATUS_FAIL - - def to_dict(self) -> dict[str, str | None]: - return { - "name": self.name, - "status": self.status, - "detail": self.detail, - "action": self.action, - } - def find_model_converter_binary() -> Optional[str]: """Return the path/name of the first model converter executable found.""" @@ -58,20 +25,6 @@ def find_model_converter_binary() -> Optional[str]: return None -def _safe_is_file(path: Path) -> bool: - try: - return path.is_file() - except OSError: - return False - - -def _safe_is_dir(path: Path) -> bool: - try: - return path.is_dir() - except OSError: - return False - - def model_converter_env() -> dict[str, str]: """Return an env dict suitable for running model-converter as a subprocess. @@ -99,134 +52,3 @@ def require_model_converter_binary() -> str: f"Tried: {tried}. Ensure the Model Converter is installed and on PATH." ) return binary - - -def resolve_model_converter_executable(binary: str) -> Path | None: - """Resolve a converter candidate to an executable path, if possible. - - This is shared by the VGF compiler path and the preflight checker so both - agree on what a usable converter executable means. - - """ - - path = Path(binary) - if path.is_absolute() or path.parent != Path("."): - if _safe_is_file(path) and os.access(path, os.X_OK): - return path - return None - - resolved = which(binary) - if resolved: - return Path(resolved) - return None - - -def require_model_converter_executable() -> Path: - """Return a usable converter executable path or raise a helpful error.""" - - binary = require_model_converter_binary() - executable = resolve_model_converter_executable(binary) - if executable is None: - raise RuntimeError( - f"Resolved converter candidate {binary!r}, but it is not executable. " - "Fix MODEL_CONVERTER_PATH or place model-converter on PATH." 
- ) - return executable - - -def _command_output(result: subprocess.CompletedProcess[str]) -> str: - text = "\n".join( - part.strip() for part in (result.stdout, result.stderr) if part.strip() - ) - lines = text.splitlines() - if not lines: - return "" - return "\n".join(lines[:4]) - - -def check_model_converter_environment() -> ModelConverterEnvironmentCheck: - """Check the model-converter dependency used by VGF compilation.""" - - binary = find_model_converter_binary() - if binary is None: - return ModelConverterEnvironmentCheck( - "MLSDK model converter", - STATUS_FAIL, - "Could not find model-converter on PATH and MODEL_CONVERTER_PATH " - "does not point to an executable file.", - "Install VGF AoT dependencies with " - "python -m pip install 'executorch[vgf]' or, in a source checkout, " - "python -m pip install -r backends/arm/requirements-arm-vgf.txt. " - "Alternatively set MODEL_CONVERTER_PATH to the converter executable.", - ) - - executable = resolve_model_converter_executable(binary) - if executable is None: - return ModelConverterEnvironmentCheck( - "MLSDK model converter", - STATUS_FAIL, - f"Resolved converter candidate {binary!r}, but it is not executable.", - "Fix MODEL_CONVERTER_PATH or place model-converter on PATH.", - ) - - try: - result = subprocess.run( # nosec B603 - local converter executable - [str(executable), "--version"], - check=False, - capture_output=True, - text=True, - timeout=20, - env=model_converter_env(), - ) - except Exception as exc: - return ModelConverterEnvironmentCheck( - "MLSDK model converter", - STATUS_FAIL, - f"Found {executable}, but running '--version' failed: {exc}", - "Check MODEL_CONVERTER_LIB_DIR and the process loader paths. 
" - "For source setup, source examples/arm/arm-scratch/setup_path.sh.", - ) - - if result.returncode != 0: - return ModelConverterEnvironmentCheck( - "MLSDK model converter", - STATUS_FAIL, - f"{executable} --version exited with {result.returncode}:\n" - f"{_command_output(result)}", - "Check that the model-converter binary and its shared libraries are " - "from the same MLSDK install.", - ) - - return ModelConverterEnvironmentCheck( - "MLSDK model converter", - STATUS_OK, - f"{executable} --version succeeded:\n{_command_output(result)}", - ) - - -def check_model_converter_lib_dir_environment() -> ModelConverterEnvironmentCheck: - """Check MODEL_CONVERTER_LIB_DIR used by model_converter_env().""" - - lib_dir = os.environ.get("MODEL_CONVERTER_LIB_DIR") - if not lib_dir: - return ModelConverterEnvironmentCheck( - "MODEL_CONVERTER_LIB_DIR", - STATUS_OK, - "MODEL_CONVERTER_LIB_DIR is not set; relying on the process loader " - "paths. This is OK when model-converter --version succeeds.", - ) - - path = Path(lib_dir).expanduser() - if _safe_is_dir(path): - return ModelConverterEnvironmentCheck( - "MODEL_CONVERTER_LIB_DIR", - STATUS_OK, - f"MODEL_CONVERTER_LIB_DIR points to existing directory: {path}", - ) - - return ModelConverterEnvironmentCheck( - "MODEL_CONVERTER_LIB_DIR", - STATUS_FAIL, - f"MODEL_CONVERTER_LIB_DIR={lib_dir!r} does not exist or is not a directory.", - "Unset MODEL_CONVERTER_LIB_DIR or set it to the converter library directory.", - ) diff --git a/backends/cadence/fused_quant/op_add.cpp b/backends/cadence/fused_quant/op_add.cpp index 1aea2ccfb6c..62e58c71c83 100644 --- a/backends/cadence/fused_quant/op_add.cpp +++ b/backends/cadence/fused_quant/op_add.cpp @@ -14,10 +14,10 @@ namespace cadence { namespace fused_quant { namespace native { +using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::KernelRuntimeContext; -using std::optional; namespace { diff --git 
a/backends/cadence/fused_quant/op_add.h b/backends/cadence/fused_quant/op_add.h index b32710f41de..9db1e907294 100644 --- a/backends/cadence/fused_quant/op_add.h +++ b/backends/cadence/fused_quant/op_add.h @@ -19,18 +19,19 @@ executorch::aten::Tensor& add_out( executorch::runtime::KernelRuntimeContext& ctx, const executorch::aten::Tensor& inp, const executorch::aten::Tensor& other, - const std::optional& inp_scale, - const std::optional& inp_zero_point, + const executorch::aten::optional& inp_scale, + const executorch::aten::optional& inp_zero_point, executorch::aten::ScalarType inp_dtype, int64_t inp_quant_min, int64_t inp_quant_max, - const std::optional& other_scale, - const std::optional& other_zero_point, + const executorch::aten::optional& other_scale, + const executorch::aten::optional& + other_zero_point, executorch::aten::ScalarType other_dtype, int64_t other_quant_min, int64_t other_quant_max, - const std::optional& out_scale, - const std::optional& out_zero_point, + const executorch::aten::optional& out_scale, + const executorch::aten::optional& out_zero_point, executorch::aten::ScalarType out_dtype, int64_t out_quant_min, int64_t out_quant_max, diff --git a/backends/cadence/fused_quant/op_bmm.cpp b/backends/cadence/fused_quant/op_bmm.cpp index 8d071b48a33..7204ab6c88f 100644 --- a/backends/cadence/fused_quant/op_bmm.cpp +++ b/backends/cadence/fused_quant/op_bmm.cpp @@ -14,10 +14,10 @@ namespace cadence { namespace fused_quant { namespace native { +using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::KernelRuntimeContext; -using std::optional; namespace { diff --git a/backends/cadence/fused_quant/op_bmm.h b/backends/cadence/fused_quant/op_bmm.h index c6a4502983f..ef9598eac98 100644 --- a/backends/cadence/fused_quant/op_bmm.h +++ b/backends/cadence/fused_quant/op_bmm.h @@ -19,18 +19,19 @@ executorch::aten::Tensor& bmm_out( executorch::runtime::KernelRuntimeContext& ctx, const 
executorch::aten::Tensor& inp, const executorch::aten::Tensor& other, - const std::optional& inp_scale, - const std::optional& inp_zero_point, + const executorch::aten::optional& inp_scale, + const executorch::aten::optional& inp_zero_point, executorch::aten::ScalarType inp_dtype, int64_t inp_quant_min, int64_t inp_quant_max, - const std::optional& other_scale, - const std::optional& other_zero_point, + const executorch::aten::optional& other_scale, + const executorch::aten::optional& + other_zero_point, executorch::aten::ScalarType other_dtype, int64_t other_quant_min, int64_t other_quant_max, - const std::optional& out_scale, - const std::optional& out_zero_point, + const executorch::aten::optional& out_scale, + const executorch::aten::optional& out_zero_point, executorch::aten::ScalarType out_dtype, int64_t out_quant_min, int64_t out_quant_max, diff --git a/backends/cadence/fused_quant/op_hardswish.cpp b/backends/cadence/fused_quant/op_hardswish.cpp index 4b968cebe6c..452ea90a405 100644 --- a/backends/cadence/fused_quant/op_hardswish.cpp +++ b/backends/cadence/fused_quant/op_hardswish.cpp @@ -16,10 +16,10 @@ namespace cadence { namespace fused_quant { namespace native { +using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::KernelRuntimeContext; -using std::optional; namespace { diff --git a/backends/cadence/fused_quant/op_hardswish.h b/backends/cadence/fused_quant/op_hardswish.h index de7d88b427b..ba9e09da23c 100644 --- a/backends/cadence/fused_quant/op_hardswish.h +++ b/backends/cadence/fused_quant/op_hardswish.h @@ -18,13 +18,13 @@ namespace native { executorch::aten::Tensor& hardswish_out( executorch::runtime::KernelRuntimeContext& ctx, const executorch::aten::Tensor& inp, - const std::optional& inp_scale, - const std::optional& inp_zero_point, + const executorch::aten::optional& inp_scale, + const executorch::aten::optional& inp_zero_point, executorch::aten::ScalarType inp_dtype, 
int64_t inp_quant_min, int64_t inp_quant_max, - const std::optional& out_scale, - const std::optional& out_zero_point, + const executorch::aten::optional& out_scale, + const executorch::aten::optional& out_zero_point, executorch::aten::ScalarType out_dtype, int64_t out_quant_min, int64_t out_quant_max, diff --git a/backends/cadence/fused_quant/op_mul.cpp b/backends/cadence/fused_quant/op_mul.cpp index a2595104ae8..3d071f7c2da 100644 --- a/backends/cadence/fused_quant/op_mul.cpp +++ b/backends/cadence/fused_quant/op_mul.cpp @@ -14,10 +14,10 @@ namespace cadence { namespace fused_quant { namespace native { +using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::KernelRuntimeContext; -using std::optional; namespace { diff --git a/backends/cadence/fused_quant/op_mul.h b/backends/cadence/fused_quant/op_mul.h index 62314c98003..f7afa016b79 100644 --- a/backends/cadence/fused_quant/op_mul.h +++ b/backends/cadence/fused_quant/op_mul.h @@ -19,18 +19,19 @@ executorch::aten::Tensor& mul_out( executorch::runtime::KernelRuntimeContext& ctx, const executorch::aten::Tensor& inp, const executorch::aten::Tensor& other, - const std::optional& inp_scale, - const std::optional& inp_zero_point, + const executorch::aten::optional& inp_scale, + const executorch::aten::optional& inp_zero_point, executorch::aten::ScalarType inp_dtype, int64_t inp_quant_min, int64_t inp_quant_max, - const std::optional& other_scale, - const std::optional& other_zero_point, + const executorch::aten::optional& other_scale, + const executorch::aten::optional& + other_zero_point, executorch::aten::ScalarType other_dtype, int64_t other_quant_min, int64_t other_quant_max, - const std::optional& out_scale, - const std::optional& out_zero_point, + const executorch::aten::optional& out_scale, + const executorch::aten::optional& out_zero_point, executorch::aten::ScalarType out_dtype, int64_t out_quant_min, int64_t out_quant_max, diff --git 
a/backends/cadence/fused_quant/op_relu.cpp b/backends/cadence/fused_quant/op_relu.cpp index e8e58522d2e..ebe7933a7b9 100644 --- a/backends/cadence/fused_quant/op_relu.cpp +++ b/backends/cadence/fused_quant/op_relu.cpp @@ -16,10 +16,10 @@ namespace cadence { namespace fused_quant { namespace native { +using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::KernelRuntimeContext; -using std::optional; namespace { diff --git a/backends/cadence/fused_quant/op_relu.h b/backends/cadence/fused_quant/op_relu.h index 522144eacd0..e8527c7633f 100644 --- a/backends/cadence/fused_quant/op_relu.h +++ b/backends/cadence/fused_quant/op_relu.h @@ -18,13 +18,13 @@ namespace native { executorch::aten::Tensor& relu_out( executorch::runtime::KernelRuntimeContext& ctx, const executorch::aten::Tensor& inp, - const std::optional& inp_scale, - const std::optional& inp_zero_point, + const executorch::aten::optional& inp_scale, + const executorch::aten::optional& inp_zero_point, executorch::aten::ScalarType inp_dtype, int64_t inp_quant_min, int64_t inp_quant_max, - const std::optional& out_scale, - const std::optional& out_zero_point, + const executorch::aten::optional& out_scale, + const executorch::aten::optional& out_zero_point, executorch::aten::ScalarType out_dtype, int64_t out_quant_min, int64_t out_quant_max, diff --git a/backends/cadence/fused_quant/quant_utils.h b/backends/cadence/fused_quant/quant_utils.h index 78884bfcceb..fff669a9e0e 100644 --- a/backends/cadence/fused_quant/quant_utils.h +++ b/backends/cadence/fused_quant/quant_utils.h @@ -64,8 +64,8 @@ struct QParams { }; inline QParams extract_qparams( - const std::optional& scale_tensor, - const std::optional& zp_tensor, + const executorch::aten::optional& scale_tensor, + const executorch::aten::optional& zp_tensor, int64_t quant_min, int64_t quant_max, const executorch::aten::Tensor& data_tensor) { diff --git 
a/backends/cadence/fused_quant/tests/test_op_add.cpp b/backends/cadence/fused_quant/tests/test_op_add.cpp index 61124f0b9b2..dca110cf0e1 100644 --- a/backends/cadence/fused_quant/tests/test_op_add.cpp +++ b/backends/cadence/fused_quant/tests/test_op_add.cpp @@ -14,10 +14,10 @@ #include #include +using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::testing::TensorFactory; -using std::optional; namespace { diff --git a/backends/cadence/fused_quant/tests/test_op_bmm.cpp b/backends/cadence/fused_quant/tests/test_op_bmm.cpp index bae04993a7a..5ede47ea8a9 100644 --- a/backends/cadence/fused_quant/tests/test_op_bmm.cpp +++ b/backends/cadence/fused_quant/tests/test_op_bmm.cpp @@ -14,10 +14,10 @@ #include #include +using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::testing::TensorFactory; -using std::optional; namespace { diff --git a/backends/cadence/fused_quant/tests/test_op_hardswish.cpp b/backends/cadence/fused_quant/tests/test_op_hardswish.cpp index eb6231161f2..502d680d2e3 100644 --- a/backends/cadence/fused_quant/tests/test_op_hardswish.cpp +++ b/backends/cadence/fused_quant/tests/test_op_hardswish.cpp @@ -14,10 +14,10 @@ #include #include +using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::testing::TensorFactory; -using std::optional; namespace { diff --git a/backends/cadence/fused_quant/tests/test_op_mul.cpp b/backends/cadence/fused_quant/tests/test_op_mul.cpp index da27c7287c9..0b9addabc5e 100644 --- a/backends/cadence/fused_quant/tests/test_op_mul.cpp +++ b/backends/cadence/fused_quant/tests/test_op_mul.cpp @@ -14,10 +14,10 @@ #include #include +using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::testing::TensorFactory; -using std::optional; namespace { diff 
--git a/backends/cadence/fused_quant/tests/test_op_relu.cpp b/backends/cadence/fused_quant/tests/test_op_relu.cpp index 1096daae202..6b83551fd2b 100644 --- a/backends/cadence/fused_quant/tests/test_op_relu.cpp +++ b/backends/cadence/fused_quant/tests/test_op_relu.cpp @@ -14,10 +14,10 @@ #include #include +using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::testing::TensorFactory; -using std::optional; namespace { diff --git a/backends/cadence/generic/operators/op_avg_pool2d.cpp b/backends/cadence/generic/operators/op_avg_pool2d.cpp index c33f91151fb..b04187db62e 100644 --- a/backends/cadence/generic/operators/op_avg_pool2d.cpp +++ b/backends/cadence/generic/operators/op_avg_pool2d.cpp @@ -19,11 +19,11 @@ namespace generic { namespace native { using ::executorch::aten::IntArrayRef; +using ::executorch::aten::optional; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::getLeadingDims; using ::executorch::runtime::KernelRuntimeContext; -using std::optional; // Compute the avg_pool2d for in_data in NCHW layout. IT is the input datatype, // and AT is the accumulation datatype. 
'quantized' is true when the input is diff --git a/backends/cadence/generic/operators/op_avg_pool2d.h b/backends/cadence/generic/operators/op_avg_pool2d.h index 85b5d55a84b..05f1810bb61 100644 --- a/backends/cadence/generic/operators/op_avg_pool2d.h +++ b/backends/cadence/generic/operators/op_avg_pool2d.h @@ -23,8 +23,9 @@ ::executorch::aten::Tensor& avg_pool2d_out( ::executorch::aten::IntArrayRef padding, bool ceil_mode, bool count_include_pad, - std::optional divisor_override, - const std::optional<::executorch::aten::Tensor>& in_zero_point_t, + ::executorch::aten::optional divisor_override, + const ::executorch::aten::optional<::executorch::aten::Tensor>& + in_zero_point_t, bool channel_last, ::executorch::aten::Tensor& out); diff --git a/backends/cadence/generic/operators/op_fully_connected.cpp b/backends/cadence/generic/operators/op_fully_connected.cpp index b65f8016880..36befc52102 100644 --- a/backends/cadence/generic/operators/op_fully_connected.cpp +++ b/backends/cadence/generic/operators/op_fully_connected.cpp @@ -15,10 +15,10 @@ namespace impl { namespace generic { namespace native { +using ::executorch::aten::optional; using ::executorch::aten::Tensor; using ::executorch::runtime::getLeadingDims; using ::executorch::runtime::KernelRuntimeContext; -using std::optional; void linear( const Tensor& input, diff --git a/backends/cadence/generic/operators/op_fully_connected.h b/backends/cadence/generic/operators/op_fully_connected.h index 7e03f5ef664..d23bcbeb70c 100644 --- a/backends/cadence/generic/operators/op_fully_connected.h +++ b/backends/cadence/generic/operators/op_fully_connected.h @@ -15,9 +15,9 @@ namespace impl { namespace generic { namespace native { +using ::executorch::aten::optional; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; -using std::optional; Tensor& fully_connected_out( KernelRuntimeContext& ctx, diff --git a/backends/cadence/generic/operators/op_linalg_svd.cpp 
b/backends/cadence/generic/operators/op_linalg_svd.cpp index 4cb4f6397ea..4974b617418 100644 --- a/backends/cadence/generic/operators/op_linalg_svd.cpp +++ b/backends/cadence/generic/operators/op_linalg_svd.cpp @@ -261,7 +261,7 @@ std::tuple linalg_svd_out( const Tensor& A, bool full_matrices, bool compute_uv, - std::optional driver, + ::executorch::aten::optional<::executorch::aten::string_view> driver, Tensor& U, Tensor& S, Tensor& Vh) { diff --git a/backends/cadence/generic/operators/op_linalg_svd.h b/backends/cadence/generic/operators/op_linalg_svd.h index e8335b7fa0e..7635276c4f5 100644 --- a/backends/cadence/generic/operators/op_linalg_svd.h +++ b/backends/cadence/generic/operators/op_linalg_svd.h @@ -26,7 +26,7 @@ linalg_svd_out( const ::executorch::aten::Tensor& A, bool full_matrices, bool compute_uv, - std::optional driver, + ::executorch::aten::optional<::executorch::aten::string_view> driver, ::executorch::aten::Tensor& U, ::executorch::aten::Tensor& S, ::executorch::aten::Tensor& Vh); diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp index 8a427045a83..6f42543cfc1 100644 --- a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp +++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp @@ -256,7 +256,7 @@ ::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out( int64_t output_zero_point, __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, - __ET_UNUSED const std::optional& offset, + __ET_UNUSED const ::executorch::aten::optional& offset, Tensor& out) { (void)ctx; quantized_conv1d_nlc( diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h index f1780497f73..4f4d2877b27 100644 --- a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h +++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h @@ -54,7 +54,7 @@ 
::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, - const std::optional& offset, + const ::executorch::aten::optional& offset, Tensor& out); } // namespace native diff --git a/backends/cadence/generic/operators/op_quantized_conv2d.cpp b/backends/cadence/generic/operators/op_quantized_conv2d.cpp index f6755f9dda8..0811267a3b8 100644 --- a/backends/cadence/generic/operators/op_quantized_conv2d.cpp +++ b/backends/cadence/generic/operators/op_quantized_conv2d.cpp @@ -16,11 +16,11 @@ namespace impl { namespace generic { namespace native { +using ::executorch::aten::optional; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; using ::impl::generic::kernels::quantize; -using std::optional; /* This implements a generic 2d conv kernel that operates on raw pointers. * The quantized version handles quantized convolutions for 2D inputs. @@ -936,7 +936,7 @@ Tensor& quantized_conv2d_nhwc_per_tensor_out( int64_t output_zero_point, ET_UNUSED int64_t out_multiplier, ET_UNUSED int64_t out_shift, - ET_UNUSED const std::optional& offset, + ET_UNUSED const ::executorch::aten::optional& offset, Tensor& out) { quantized_conv2d_nhwc( input, diff --git a/backends/cadence/generic/operators/op_quantized_conv2d.h b/backends/cadence/generic/operators/op_quantized_conv2d.h index 02740d3afec..bb9476e2644 100644 --- a/backends/cadence/generic/operators/op_quantized_conv2d.h +++ b/backends/cadence/generic/operators/op_quantized_conv2d.h @@ -205,7 +205,7 @@ ::executorch::aten::Tensor& quantized_conv2d_nhwc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, - const std::optional& offset, + const ::executorch::aten::optional& offset, Tensor& out); ::executorch::aten::Tensor& quantized_conv2d_depthwise_nhwc_out( diff --git a/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp 
b/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp index 05fb809cd51..a8f98a76ffc 100644 --- a/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp +++ b/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp @@ -57,7 +57,7 @@ ::executorch::aten::Tensor& quantized_depthwise_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, - std::optional(), + ::executorch::aten::optional(), out); } diff --git a/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp b/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp index d2e0d6a8bd9..55ca67648ca 100644 --- a/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp +++ b/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp @@ -19,11 +19,11 @@ namespace generic { namespace native { using ::executorch::aten::IntArrayRef; +using ::executorch::aten::optional; using ::executorch::aten::Scalar; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; -using std::optional; #define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \ _(uint8_t, Byte) \ diff --git a/backends/cadence/generic/operators/op_quantized_embedding_byte.h b/backends/cadence/generic/operators/op_quantized_embedding_byte.h index 84fc53620a0..a46bebe09df 100644 --- a/backends/cadence/generic/operators/op_quantized_embedding_byte.h +++ b/backends/cadence/generic/operators/op_quantized_embedding_byte.h @@ -19,7 +19,8 @@ ::executorch::aten::Tensor& quantized_embedding_byte_out( ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& weight, const ::executorch::aten::Tensor& weight_scales, - const std::optional<::executorch::aten::Tensor>& weight_zero_points, + const ::executorch::aten::optional<::executorch::aten::Tensor>& + weight_zero_points, const ::executorch::aten::Tensor& indices, bool pruned_weights, ::executorch::aten::Tensor& out); diff --git 
a/backends/cadence/generic/operators/op_quantized_fully_connected.cpp b/backends/cadence/generic/operators/op_quantized_fully_connected.cpp index ce74b5b8b7f..55e29cb7f52 100644 --- a/backends/cadence/generic/operators/op_quantized_fully_connected.cpp +++ b/backends/cadence/generic/operators/op_quantized_fully_connected.cpp @@ -16,10 +16,10 @@ namespace impl { namespace generic { namespace native { +using ::executorch::aten::optional; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; -using std::optional; Tensor& quantized_fully_connected_out( ET_UNUSED KernelRuntimeContext& ctx, diff --git a/backends/cadence/generic/operators/op_quantized_fully_connected.h b/backends/cadence/generic/operators/op_quantized_fully_connected.h index 408fbabe726..a7510fba95f 100644 --- a/backends/cadence/generic/operators/op_quantized_fully_connected.h +++ b/backends/cadence/generic/operators/op_quantized_fully_connected.h @@ -25,7 +25,7 @@ ::executorch::aten::Tensor& quantized_fully_connected_out( const ::executorch::aten::Tensor& out_multiplier, const ::executorch::aten::Tensor& out_shift, int64_t out_zero_point, - const std::optional<::executorch::aten::Tensor>& offset, + const ::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); ::executorch::aten::Tensor& quantized_fully_connected_per_tensor_out( @@ -38,7 +38,7 @@ ::executorch::aten::Tensor& quantized_fully_connected_per_tensor_out( int64_t out_multiplier, int64_t out_shift, int64_t out_zero_point, - const std::optional<::executorch::aten::Tensor>& offset, + const ::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); ::executorch::aten::Tensor& @@ -52,7 +52,7 @@ quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor_out( int64_t out_multiplier, int64_t out_shift, int64_t out_zero_point, - const std::optional<::executorch::aten::Tensor>& offset, + const 
::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); ::executorch::aten::Tensor& @@ -66,7 +66,7 @@ quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out( int64_t out_multiplier, int64_t out_shift, int64_t out_zero_point, - const std::optional<::executorch::aten::Tensor>& offset, + const ::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); } // namespace native diff --git a/backends/cadence/generic/operators/op_quantized_layer_norm.cpp b/backends/cadence/generic/operators/op_quantized_layer_norm.cpp index 85825cff94d..e34ed342d22 100644 --- a/backends/cadence/generic/operators/op_quantized_layer_norm.cpp +++ b/backends/cadence/generic/operators/op_quantized_layer_norm.cpp @@ -24,6 +24,7 @@ namespace generic { namespace native { using ::executorch::aten::IntArrayRef; +using ::executorch::aten::optional; using ::executorch::aten::Scalar; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; @@ -31,7 +32,6 @@ using ::executorch::runtime::getLeadingDims; using ::executorch::runtime::KernelRuntimeContext; using ::impl::generic::kernels::dequantize; using ::impl::generic::kernels::quantize; -using std::optional; // Compute quantized layer_norm. The current implementation assumes that the // input is per-tensor quantized. 
diff --git a/backends/cadence/generic/operators/op_quantized_linear.cpp b/backends/cadence/generic/operators/op_quantized_linear.cpp index 02ff97de74d..87f990a855b 100644 --- a/backends/cadence/generic/operators/op_quantized_linear.cpp +++ b/backends/cadence/generic/operators/op_quantized_linear.cpp @@ -18,11 +18,11 @@ namespace impl { namespace generic { namespace native { +using ::executorch::aten::optional; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; using ::executorch::runtime::toString; -using std::optional; Tensor& quantized_linear_out( ET_UNUSED KernelRuntimeContext& ctx, diff --git a/backends/cadence/generic/operators/op_quantized_linear.h b/backends/cadence/generic/operators/op_quantized_linear.h index 517357d5bf9..b5396cb9701 100644 --- a/backends/cadence/generic/operators/op_quantized_linear.h +++ b/backends/cadence/generic/operators/op_quantized_linear.h @@ -25,7 +25,7 @@ ::executorch::aten::Tensor& quantized_linear_out( const ::executorch::aten::Tensor& out_multiplier, const ::executorch::aten::Tensor& out_shift, int64_t out_zero_point, - const std::optional<::executorch::aten::Tensor>& offset, + const ::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); ::executorch::aten::Tensor& quantized_linear_per_tensor_out( @@ -38,7 +38,7 @@ ::executorch::aten::Tensor& quantized_linear_per_tensor_out( const int64_t out_multiplier, const int64_t out_shift, const int64_t out_zero_point, - const std::optional<::executorch::aten::Tensor>& offset, + const ::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); ::executorch::aten::Tensor& diff --git a/backends/cadence/generic/operators/op_quantized_matmul.cpp b/backends/cadence/generic/operators/op_quantized_matmul.cpp index b84c879e65d..e3fb0f00fdc 100644 --- a/backends/cadence/generic/operators/op_quantized_matmul.cpp +++ 
b/backends/cadence/generic/operators/op_quantized_matmul.cpp @@ -21,12 +21,12 @@ namespace generic { namespace native { using ::executorch::aten::IntArrayRef; +using ::executorch::aten::optional; using ::executorch::aten::Scalar; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; using ::impl::generic::kernels::quantize; -using std::optional; // The quantized matmul. The quantized matmul accumulates in a wider register, // whose type is TA. diff --git a/backends/cadence/generic/operators/op_quantized_matmul.h b/backends/cadence/generic/operators/op_quantized_matmul.h index c28862aa11e..70775380aac 100644 --- a/backends/cadence/generic/operators/op_quantized_matmul.h +++ b/backends/cadence/generic/operators/op_quantized_matmul.h @@ -15,9 +15,9 @@ namespace impl { namespace generic { namespace native { +using ::executorch::aten::optional; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; -using std::optional; Tensor& quantized_matmul_out( KernelRuntimeContext& ctx, diff --git a/backends/cadence/generic/operators/op_quantized_mul.cpp b/backends/cadence/generic/operators/op_quantized_mul.cpp index 359a305b020..30352ee9d52 100644 --- a/backends/cadence/generic/operators/op_quantized_mul.cpp +++ b/backends/cadence/generic/operators/op_quantized_mul.cpp @@ -21,13 +21,13 @@ namespace generic { namespace native { using ::executorch::aten::IntArrayRef; +using ::executorch::aten::optional; using ::executorch::aten::Scalar; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; using ::impl::generic::kernels::dequantize; using ::impl::generic::kernels::quantize; -using std::optional; DECLARE_POINTWISE_TENSOR_QUANTIZED_BINARY_OP(quantized_mul_, *); diff --git a/backends/cadence/generic/operators/op_quantized_relu.cpp b/backends/cadence/generic/operators/op_quantized_relu.cpp index ecb87bd1b90..9430951f65b 
100644 --- a/backends/cadence/generic/operators/op_quantized_relu.cpp +++ b/backends/cadence/generic/operators/op_quantized_relu.cpp @@ -21,12 +21,12 @@ namespace generic { namespace native { using ::executorch::aten::IntArrayRef; +using ::executorch::aten::optional; using ::executorch::aten::Scalar; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; using ::impl::generic::kernels::quantize; -using std::optional; template void quantized_relu_per_tensor_out_( diff --git a/backends/cadence/generic/operators/op_requantize.cpp b/backends/cadence/generic/operators/op_requantize.cpp index b9df6f1f355..f846a1964a3 100644 --- a/backends/cadence/generic/operators/op_requantize.cpp +++ b/backends/cadence/generic/operators/op_requantize.cpp @@ -19,13 +19,13 @@ namespace generic { namespace native { using ::executorch::aten::IntArrayRef; +using ::executorch::aten::optional; using ::executorch::aten::Scalar; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; using ::impl::generic::kernels::dequantize; using ::impl::generic::kernels::quantize; -using std::optional; // Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor. // The scale and zero_point for requantization are in the args. 
diff --git a/backends/cadence/generic/operators/op_rope.cpp b/backends/cadence/generic/operators/op_rope.cpp index fcc7d629cf7..17ee6d2a684 100644 --- a/backends/cadence/generic/operators/op_rope.cpp +++ b/backends/cadence/generic/operators/op_rope.cpp @@ -12,8 +12,8 @@ namespace impl { namespace generic { namespace native { +using ::executorch::aten::optional; using ::executorch::aten::Tensor; -using std::optional; Tensor& rope_out( ET_UNUSED ::executorch::runtime::KernelRuntimeContext& ctx, @@ -75,8 +75,8 @@ namespace impl { namespace generic { namespace native { +using ::executorch::aten::optional; using ::executorch::aten::Tensor; -using std::optional; Tensor& rope_rotate_stacked_halves_out( ET_UNUSED ::executorch::runtime::KernelRuntimeContext& ctx, diff --git a/backends/cadence/generic/operators/op_rope.h b/backends/cadence/generic/operators/op_rope.h index d738cfda6c1..638677bf118 100644 --- a/backends/cadence/generic/operators/op_rope.h +++ b/backends/cadence/generic/operators/op_rope.h @@ -20,7 +20,7 @@ ::executorch::aten::Tensor& rope_out( const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& sin_tensor, const ::executorch::aten::Tensor& cos_tensor, - const std::optional<::executorch::aten::Tensor>& pos, + const ::executorch::aten::optional<::executorch::aten::Tensor>& pos, ::executorch::aten::Tensor& out); ::executorch::aten::Tensor& rope_rotate_stacked_halves_out( @@ -28,7 +28,7 @@ ::executorch::aten::Tensor& rope_rotate_stacked_halves_out( const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& sin_tensor, const ::executorch::aten::Tensor& cos_tensor, - const std::optional<::executorch::aten::Tensor>& pos, + const ::executorch::aten::optional<::executorch::aten::Tensor>& pos, ::executorch::aten::Tensor& out); } // namespace native diff --git a/backends/cadence/generic/operators/op_softmax.cpp b/backends/cadence/generic/operators/op_softmax.cpp index b680d1e2471..97c64a22511 100644 --- 
a/backends/cadence/generic/operators/op_softmax.cpp +++ b/backends/cadence/generic/operators/op_softmax.cpp @@ -125,7 +125,7 @@ Tensor& _softmax_f32_f32_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& X, int64_t dim, - __ET_UNUSED std::optional half_to_float, + __ET_UNUSED ::executorch::aten::optional half_to_float, Tensor& Y) { _softmax_out(ctx, X, dim, false, Y); diff --git a/backends/cadence/generic/operators/op_softmax.h b/backends/cadence/generic/operators/op_softmax.h index d83703117b0..ec51b1d00c0 100644 --- a/backends/cadence/generic/operators/op_softmax.h +++ b/backends/cadence/generic/operators/op_softmax.h @@ -26,7 +26,7 @@ ::executorch::aten::Tensor& _softmax_f32_f32_out( __ET_UNUSED ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& X, int64_t dim, - __ET_UNUSED std::optional half_to_float, + __ET_UNUSED ::executorch::aten::optional half_to_float, ::executorch::aten::Tensor& Y); } // namespace native diff --git a/backends/cadence/generic/operators/op_transposed_convolution.cpp b/backends/cadence/generic/operators/op_transposed_convolution.cpp index b742ec635b2..121b479e65f 100644 --- a/backends/cadence/generic/operators/op_transposed_convolution.cpp +++ b/backends/cadence/generic/operators/op_transposed_convolution.cpp @@ -16,12 +16,12 @@ namespace generic { namespace native { using ::executorch::aten::IntArrayRef; +using ::executorch::aten::optional; using ::executorch::aten::Scalar; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; using ::impl::generic::kernels::quantize; -using std::optional; // This implements a generic 2d transposed_conv kernel that operates on raw // pointers. The version handles both quantized and fp32 convolutions. 
diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp index ccd54e80698..514813fbe05 100644 --- a/backends/cadence/hifi/operators/op_mean.cpp +++ b/backends/cadence/hifi/operators/op_mean.cpp @@ -17,8 +17,8 @@ using executorch::aten::RuntimeContext; using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::ArrayRef; -using std::optional; using torch::executor::Error; +using torch::executor::optional; namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp index 9d363469f74..5171c2908bc 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp @@ -238,7 +238,7 @@ void quantized_conv1d_nlc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, - __ET_UNUSED const std::optional& offset, + __ET_UNUSED const ::executorch::aten::optional& offset, Tensor& out) { // HiFi nnlib kernels only support dilation=1. // Fall back to generic implementation for dilation > 1. 
diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp index 86ef244711d..ea3a756f995 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp @@ -17,7 +17,7 @@ using Tensor = executorch::aten::Tensor; using KernelRuntimeContext = torch::executor::KernelRuntimeContext; using ScalarType = executorch::aten::ScalarType; using ::executorch::aten::IntArrayRef; -using std::optional; +using ::executorch::aten::optional; namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp index a8e2b42d77d..4299990b52a 100644 --- a/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp +++ b/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp @@ -206,7 +206,7 @@ void quantized_depthwise_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, - std::optional(), + ::executorch::aten::optional(), out); return; } diff --git a/backends/cadence/hifi/operators/op_quantized_matmul_out.h b/backends/cadence/hifi/operators/op_quantized_matmul_out.h index a567c7f650d..c53a07b58aa 100644 --- a/backends/cadence/hifi/operators/op_quantized_matmul_out.h +++ b/backends/cadence/hifi/operators/op_quantized_matmul_out.h @@ -21,7 +21,7 @@ ::executorch::aten::Tensor& quantized_matmul_out( int64_t X_zero_point, const ::executorch::aten::Tensor& Y, int64_t Y_zero_point, - const std::optional<::executorch::aten::Tensor>& bias, + const ::executorch::aten::optional<::executorch::aten::Tensor>& bias, int64_t out_multiplier, int64_t out_shift, int64_t out_zero_point, diff --git a/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp b/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp index 907156af1f7..074ff29b301 100644 --- 
a/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp +++ b/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp @@ -22,7 +22,7 @@ inline Tensor& _softmax_f32_f32_out( KernelRuntimeContext& ctx, const Tensor& in, int64_t dim, - std::optional half_to_float, + ::executorch::aten::optional half_to_float, Tensor& out) { constexpr int kNnlibMaxDim = 16; @@ -146,7 +146,7 @@ Tensor& softmax_f32_f32_out( KernelRuntimeContext& ctx, const Tensor& in, int64_t dim, - std::optional half_to_float, + ::executorch::aten::optional half_to_float, Tensor& out) { return _softmax_f32_f32_out(ctx, in, dim, half_to_float, out); } diff --git a/backends/cadence/hifi/operators/operators.h b/backends/cadence/hifi/operators/operators.h index fa6847f744b..3ca505d40cb 100644 --- a/backends/cadence/hifi/operators/operators.h +++ b/backends/cadence/hifi/operators/operators.h @@ -72,7 +72,7 @@ void quantized_linear_out( const ::executorch::aten::Tensor& out_multiplier, const ::executorch::aten::Tensor& out_shift, int64_t out_zero_point, - const std::optional<::executorch::aten::Tensor>& offset, + const ::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); void quantized_linear_per_tensor_out( @@ -85,7 +85,7 @@ void quantized_linear_per_tensor_out( int64_t out_multiplier, int64_t out_shift, int64_t out_zero_point, - const std::optional<::executorch::aten::Tensor>& offset, + const ::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); void quantized_conv2d_nhwc_out( @@ -158,7 +158,7 @@ void quantized_conv2d_nhwc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, - const std::optional<::executorch::aten::Tensor>& offset, + const ::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); ::executorch::aten::Tensor& cat_out( diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp 
b/backends/cadence/vision/operators/op_quantized_conv_out.cpp index aaba9f5696d..be4b34bff03 100644 --- a/backends/cadence/vision/operators/op_quantized_conv_out.cpp +++ b/backends/cadence/vision/operators/op_quantized_conv_out.cpp @@ -582,7 +582,7 @@ void quantized_conv2d_nhwc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, - ET_UNUSED const std::optional& offset, + ET_UNUSED const ::executorch::aten::optional& offset, Tensor& out) { quantized_conv_per_tensor_out( ctx, diff --git a/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp b/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp index c53f7f7667a..29aa8906414 100644 --- a/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp +++ b/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp @@ -13,10 +13,10 @@ namespace impl { namespace vision { namespace native { +using ::executorch::aten::optional; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; -using std::optional; void quantized_fully_connected_out( __ET_UNUSED KernelRuntimeContext& ctx, diff --git a/backends/cadence/vision/operators/op_quantized_linear_out.cpp b/backends/cadence/vision/operators/op_quantized_linear_out.cpp index 7b3daed8ef6..b6b7cdd17bc 100644 --- a/backends/cadence/vision/operators/op_quantized_linear_out.cpp +++ b/backends/cadence/vision/operators/op_quantized_linear_out.cpp @@ -84,7 +84,7 @@ void quantized_linear_out( const Tensor& out_multiplier, const Tensor& out_shift, int64_t out_zero_point, - __ET_UNUSED const std::optional& offset, + __ET_UNUSED const executorch::aten::optional& offset, Tensor& out) { // TODO: refactor to use switch case as quantized_linear_per_tensor_out if (out.scalar_type() == executorch::aten::ScalarType::Byte) { @@ -127,7 +127,7 @@ void quantized_linear_per_tensor_out( const int64_t out_multiplier, const int64_t out_shift, const 
int64_t out_zero_point, - __ET_UNUSED const std::optional& offset, + __ET_UNUSED const executorch::aten::optional& offset, Tensor& out) { #define typed_quantized_linear_per_tensor(ctype, dtype) \ case executorch::aten::ScalarType::dtype: { \ diff --git a/backends/cadence/vision/operators/op_quantized_matmul_out.cpp b/backends/cadence/vision/operators/op_quantized_matmul_out.cpp index e63ae5bdda1..54a303288c3 100644 --- a/backends/cadence/vision/operators/op_quantized_matmul_out.cpp +++ b/backends/cadence/vision/operators/op_quantized_matmul_out.cpp @@ -60,7 +60,7 @@ void inline _typed_quantized_matmul( int64_t X_zero_point, const Tensor& Y, int64_t Y_zero_point, - const std::optional& bias, + const executorch::aten::optional& bias, int64_t out_multiplier, int64_t out_shift, int64_t out_zero_point, @@ -114,7 +114,7 @@ void quantized_matmul_out( int64_t X_zero_point, const Tensor& Y, int64_t Y_zero_point, - const std::optional& bias, + const executorch::aten::optional& bias, int64_t out_multiplier, int64_t out_shift, int64_t out_zero_point, diff --git a/backends/cadence/vision/operators/op_softmax.cpp b/backends/cadence/vision/operators/op_softmax.cpp index 6b93709b226..58ca33c6a0b 100644 --- a/backends/cadence/vision/operators/op_softmax.cpp +++ b/backends/cadence/vision/operators/op_softmax.cpp @@ -50,7 +50,7 @@ Tensor& _softmax_out( // Adjust for negative dim dim = dim < 0 ? 
dim + executorch::runtime::nonzero_dim(in) : dim; - const std::optional& dim_t = dim; + const executorch::aten::optional& dim_t = dim; const size_t d = ET_NORMALIZE_IX(dim_t.value(), in.dim()); const size_t size = in.size(d); diff --git a/backends/cadence/vision/operators/operators.h b/backends/cadence/vision/operators/operators.h index 1c756c0b237..8b5db4161eb 100644 --- a/backends/cadence/vision/operators/operators.h +++ b/backends/cadence/vision/operators/operators.h @@ -31,7 +31,7 @@ using ::executorch::runtime::getLeadingDims; inline __attribute__((always_inline)) void linear_( const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& weight, - const std::optional<::executorch::aten::Tensor>& bias, + const ::executorch::aten::optional<::executorch::aten::Tensor>& bias, ::executorch::aten::Tensor& output) { const float* __restrict__ input_data = input.const_data_ptr(); const float* __restrict__ weight_data = weight.const_data_ptr(); diff --git a/backends/cortex_m/TARGETS b/backends/cortex_m/TARGETS index 1b73bb03bfc..b84add05516 100644 --- a/backends/cortex_m/TARGETS +++ b/backends/cortex_m/TARGETS @@ -20,23 +20,12 @@ python_library( ], ) -python_library( - name = "cmsis_nn", - srcs = [ - "library/__init__.py", - "library/cmsis_nn.py", - ], - deps = [ - "fbsource//third-party/cmsis-nn:cmsis_nn_py", - ], -) - python_library( name = "target_config", srcs = [ "target_config.py", ], deps = [ - ":cmsis_nn", + "fbsource//third-party/cmsis-nn:cmsis_nn_py", ], ) diff --git a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h index 807cf18cebc..656309abcee 100644 --- a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h +++ b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h @@ -49,7 +49,7 @@ class CMSISScratchBufferContext final { Tensor& scratch_buffer, const Tensor& weights, const Tensor& weight_zero_point, - const std::optional& bias) + const torch::executor::optional& bias) : 
scratch_ptr_(scratch_buffer.mutable_data_ptr()), total_size_(scratch_buffer.size(0)), base_ptr_(reinterpret_cast(scratch_ptr_)), diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp index 13e8b132410..3d4f19e10d0 100644 --- a/backends/cortex_m/ops/op_quantized_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp @@ -19,7 +19,7 @@ bool validate_conv2d_arguments( KernelRuntimeContext& context, const Tensor& input, const Tensor& weight, - const std::optional& bias, + const torch::executor::optional& bias, const Tensor& output, const Int64ArrayRef& stride, const Int64ArrayRef& padding, @@ -103,7 +103,7 @@ Tensor& quantized_conv2d_out( KernelRuntimeContext& context, const Tensor& input, const Tensor& weight, - const std::optional& bias, + const torch::executor::optional& bias, const Int64ArrayRef stride, const Int64ArrayRef padding, const Int64ArrayRef dilation, diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp index 0793606de44..a8e1fc21ed7 100644 --- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp @@ -19,7 +19,7 @@ bool validate_depthwise_conv2d_arguments( KernelRuntimeContext& context, const Tensor& input, const Tensor& weight, - const std::optional& bias, + const torch::executor::optional& bias, const Tensor& output, const Int64ArrayRef& stride, const Int64ArrayRef& padding, @@ -140,7 +140,7 @@ Tensor& quantized_depthwise_conv2d_out( KernelRuntimeContext& context, const Tensor& input, const Tensor& weight, - const std::optional& bias, + const torch::executor::optional& bias, const Int64ArrayRef stride, const Int64ArrayRef padding, const Int64ArrayRef dilation, diff --git a/backends/cortex_m/ops/op_quantized_linear.cpp b/backends/cortex_m/ops/op_quantized_linear.cpp index c92ec493cd5..7448058de8e 100644 --- a/backends/cortex_m/ops/op_quantized_linear.cpp +++ 
b/backends/cortex_m/ops/op_quantized_linear.cpp @@ -18,8 +18,8 @@ Tensor& quantized_linear_out( KernelRuntimeContext& context, const Tensor& input, const Tensor& weights, - const std::optional& bias, - const std::optional& kernel_sum, + const torch::executor::optional& bias, + const torch::executor::optional& kernel_sum, const int64_t input_offset, const int64_t filter_offset, const int64_t output_offset, diff --git a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp index 04d57d4c693..e7ecbc7c7b4 100644 --- a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp @@ -21,7 +21,7 @@ bool validate_transpose_conv2d_arguments( KernelRuntimeContext& context, const Tensor& input, const Tensor& weight, - const std::optional& bias, + const torch::executor::optional& bias, const Tensor& output, const Tensor& requantize_multipliers, const Tensor& requantize_shifts) { @@ -88,7 +88,7 @@ Tensor& quantized_transpose_conv2d_out( KernelRuntimeContext& context, const Tensor& input, const Tensor& weight, - const std::optional& bias, + const torch::executor::optional& bias, const Int64ArrayRef stride, const Int64ArrayRef padding, const Int64ArrayRef output_padding, diff --git a/backends/cortex_m/passes/BUCK b/backends/cortex_m/passes/BUCK index c792583f657..20444f16718 100644 --- a/backends/cortex_m/passes/BUCK +++ b/backends/cortex_m/passes/BUCK @@ -1,7 +1,6 @@ load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target") # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -41,7 +40,6 @@ fbcode_target(_kind = runtime.python_library, deps=[ "//caffe2:torch", "//executorch/backends/arm/_passes:passes", - "//executorch/backends/cortex_m:cmsis_nn", "//executorch/backends/cortex_m:target_config", "//executorch/backends/cortex_m/ops:ops", "//executorch/backends/cortex_m/passes:passes_utils", diff --git a/backends/cortex_m/passes/__init__.py b/backends/cortex_m/passes/__init__.py index ec3d67c4d31..6d6783488fe 100644 --- a/backends/cortex_m/passes/__init__.py +++ b/backends/cortex_m/passes/__init__.py @@ -3,6 +3,36 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from importlib.util import find_spec + + +def _missing_dependencies_error(missing: str) -> ModuleNotFoundError: + return ModuleNotFoundError( + "Cortex-M backend dependencies are not installed " + f"(missing: {missing}). Install ExecuTorch with " + "`pip install executorch[cortex_m]`, or if building from source run " + "`examples/arm/setup.sh --i-agree-to-the-contained-eula`." 
+ ) + + +def _ensure_cortex_m_dependencies() -> None: + required_modules = { + "cmsis_nn": "cmsis_nn", + } + missing_packages = [] + for module_name, package_name in required_modules.items(): + try: + if find_spec(module_name) is None: + missing_packages.append(package_name) + except (ImportError, ValueError): + missing_packages.append(package_name) + + if missing_packages: + raise _missing_dependencies_error(", ".join(missing_packages)) + + +_ensure_cortex_m_dependencies() + from .cortex_m_pass import CortexMPass # noqa # usort: skip from .activation_fusion_pass import ActivationFusionPass # noqa from .aten_to_cortex_m_pass import AtenToCortexMPass # noqa diff --git a/backends/cortex_m/passes/aten_to_cortex_m_pass.py b/backends/cortex_m/passes/aten_to_cortex_m_pass.py index 3f5a6055331..ecc7187797d 100644 --- a/backends/cortex_m/passes/aten_to_cortex_m_pass.py +++ b/backends/cortex_m/passes/aten_to_cortex_m_pass.py @@ -8,12 +8,12 @@ import math from typing import cast, Optional +import cmsis_nn # type: ignore[import-not-found, import-untyped] import executorch.backends.cortex_m.ops.operators # noqa import executorch.exir as exir import torch import torch.fx from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor -from executorch.backends.cortex_m.library import cmsis_nn from executorch.backends.cortex_m.passes.passes_utils import ( build_activation_lut, diff --git a/backends/cortex_m/passes/scratch_buffer_sizes.py b/backends/cortex_m/passes/scratch_buffer_sizes.py index b247e2be944..95a9c441f61 100644 --- a/backends/cortex_m/passes/scratch_buffer_sizes.py +++ b/backends/cortex_m/passes/scratch_buffer_sizes.py @@ -6,11 +6,11 @@ from collections.abc import Callable from typing import Any, cast +import cmsis_nn # type: ignore[import-not-found, import-untyped] import executorch.backends.cortex_m.ops.operators # noqa import torch import torch.fx -from executorch.backends.cortex_m.library import cmsis_nn from executorch.exir.dialects._ops import 
ops as exir_ops diff --git a/backends/cortex_m/target_config.py b/backends/cortex_m/target_config.py index 341ae612cb5..23cb15c4a53 100644 --- a/backends/cortex_m/target_config.py +++ b/backends/cortex_m/target_config.py @@ -1,6 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -11,7 +10,7 @@ from enum import auto, Enum from typing import Optional -from executorch.backends.cortex_m.library import cmsis_nn +import cmsis_nn # type: ignore[import-not-found, import-untyped] class CortexM(Enum): diff --git a/backends/cortex_m/test/misc/test_cmsis_pybind.py b/backends/cortex_m/test/misc/test_cmsis_pybind.py index 08a1d973234..f85a4bacece 100644 --- a/backends/cortex_m/test/misc/test_cmsis_pybind.py +++ b/backends/cortex_m/test/misc/test_cmsis_pybind.py @@ -1,4 +1,5 @@ # Copyright 2026 Arm Limited and/or its affiliates. +# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -10,7 +11,7 @@ def _import_cmsis_nn(): try: - return importlib.import_module("executorch.backends.cortex_m.library.cmsis_nn") + return importlib.import_module("cmsis_nn") except Exception as exc: pytest.fail(f"Failed to resolve cmsis_nn: {exc}") diff --git a/backends/cortex_m/test/misc/test_target_config.py b/backends/cortex_m/test/misc/test_target_config.py index 472d1927886..3e648b0a81c 100644 --- a/backends/cortex_m/test/misc/test_target_config.py +++ b/backends/cortex_m/test/misc/test_target_config.py @@ -1,13 +1,12 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+import cmsis_nn # type: ignore[import-not-found, import-untyped] import pytest -from executorch.backends.cortex_m.library import cmsis_nn from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig diff --git a/backends/cortex_m/test/ops/test_avg_pool2d.py b/backends/cortex_m/test/ops/test_avg_pool2d.py index a2992b50905..315d968188f 100644 --- a/backends/cortex_m/test/ops/test_avg_pool2d.py +++ b/backends/cortex_m/test/ops/test_avg_pool2d.py @@ -93,7 +93,7 @@ def test_dialect_avg_pool2d(test_case, cortex_m_target): qtol=1, ) - from executorch.backends.cortex_m.library import cmsis_nn + import cmsis_nn # type: ignore[import-not-found, import-untyped] module = tester.get_artifact(StageType.RUN_PASSES).exported_program().module() pool_target = exir_ops.edge.cortex_m.quantized_avg_pool2d.default diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index c3d7446eaa2..f7d095540ad 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -319,12 +319,8 @@ class ET_EXPERIMENTAL CudaBackend final } } - std::string so_blob_key; - std::string weights_blob_key; - ET_CHECK_OK_OR_RETURN_ERROR( - executorch::backends::aoti::resolve_blob_keys( - processed, method_name, so_blob_key, weights_blob_key), - "Malformed named-data key payload"); + std::string so_blob_key = + method_name.empty() ? "so_blob" : method_name + "_so_blob"; const NamedDataMap* named_data_map = context.get_named_data_map(); auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str()); @@ -398,11 +394,11 @@ class ET_EXPERIMENTAL CudaBackend final // methods are independent sub-graphs that may have FQN collisions // (e.g. parakeet). 
if (is_weight_sharing_across_methods_enabled()) { - ET_CHECK_OK_OR_RETURN_ERROR(load_constants_with_cache( - handle, named_data_map, method_name, weights_blob_key)); + ET_CHECK_OK_OR_RETURN_ERROR( + load_constants_with_cache(handle, named_data_map, method_name)); } else { ET_CHECK_OK_OR_RETURN_ERROR( - load_constants_legacy(handle, named_data_map, weights_blob_key)); + load_constants_legacy(handle, named_data_map, method_name)); } // Use shared CUDA stream if enabled via options, otherwise create one. @@ -1015,14 +1011,13 @@ class ET_EXPERIMENTAL CudaBackend final Error load_constants_with_cache( cuda::CudaDelegateHandle* handle, const NamedDataMap* named_data_map, - const std::string& method_name, - const std::string& weights_blob_key) const { + const std::string& method_name) const { // Check if the required APIs are available if (!handle->get_num_constants || !handle->get_constant_name || !handle->get_constant_original_fqn || !handle->extract_constants_map || !handle->update_user_managed_constant_buffer_pairs) { // Fall back to the legacy path - return load_constants_legacy(handle, named_data_map, weights_blob_key); + return load_constants_legacy(handle, named_data_map, method_name); } // Step 1: Enumerate constants and partition into cached/uncached @@ -1074,6 +1069,8 @@ class ET_EXPERIMENTAL CudaBackend final if (!uncached_fqns.empty()) { // Need to load from blob — use update_constants_from_blob for all, // then extract the new constants into the cache. + std::string weights_blob_key = + method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); ET_CHECK_OR_RETURN_ERROR( @@ -1193,7 +1190,9 @@ class ET_EXPERIMENTAL CudaBackend final Error load_constants_legacy( cuda::CudaDelegateHandle* handle, const NamedDataMap* named_data_map, - const std::string& weights_blob_key) const { + const std::string& method_name) const { + std::string weights_blob_key = + method_name.empty() ? 
"weights_blob" : method_name + "_weights_blob"; auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) { ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str()); diff --git a/backends/cuda/tests/test_cuda_partitioner.py b/backends/cuda/tests/test_cuda_partitioner.py index 89c1204ea00..0ee345be08a 100644 --- a/backends/cuda/tests/test_cuda_partitioner.py +++ b/backends/cuda/tests/test_cuda_partitioner.py @@ -12,18 +12,17 @@ from executorch.backends.cuda.cuda_partitioner import CudaPartitioner from executorch.exir.backend.partitioner import PartitionResult from executorch.exir.delegate import executorch_call_delegate -from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param +from torch._export.utils import is_buffer from torch.export import export -from torch.fx.passes.utils.fuser_utils import validate_partition class TestCudaPartitioner(unittest.TestCase): """ Test CUDA partitioner functionality. - A fully delegatable graph collapses to a single partition. When a - non-delegated node splits the delegatable ops, the partitioner emits one - convex partition per island. + After CUDA partitioning, there should be exactly one partitioned graph that contains + all operators from the input graph. This means all operators should be tagged with + the same delegation tag, indicating they will all be executed by the CUDA backend. 
""" def _get_partition_result( @@ -179,6 +178,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: for node in partition_result.tagged_exported_program.graph.nodes: if node.op == "placeholder": # Check if this is a constant (param, buffer, or lifted tensor constant) + from torch._export.utils import ( + is_buffer, + is_lifted_tensor_constant, + is_param, + ) + is_constant = ( is_param(partition_result.tagged_exported_program, node) or is_buffer(partition_result.tagged_exported_program, node) @@ -211,9 +216,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: f"All constant placeholders should be tagged. Found untagged constants: {untagged_constants}", ) - # Verify all tagged constants share the (single) partition's tag. - self.assertEqual(len(partition_result.partition_tags), 1) - expected_tag = next(iter(partition_result.partition_tags)) + # Verify all tagged constants have the expected tag + expected_tag = "tag0" for node in constant_placeholders: actual_tag = node.meta.get("delegation_tag") self.assertEqual( @@ -316,143 +320,3 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: self.assertNotIn("delegation_tag", buffer_placeholder.meta) self.assertNotIn("delegation_tag", delegate.meta) self.assertIn("delegation_tag", aten_node.meta) - - def test_multiple_partitions_for_split_graph(self) -> None: - """Ops split by a non-delegated node must land in separate partitions. - - One tag over the disconnected islands would be non-convex and fail fusion. 
- """ - - class TwoAddModule(torch.nn.Module): - def forward(self, x: torch.Tensor) -> torch.Tensor: - a = x + 1.0 - return a + 2.0 - - exported_program = export(TwoAddModule(), (torch.randn(3, 4),), strict=True) - graph_module = exported_program.graph_module - graph = graph_module.graph - - add_nodes = [ - n - for n in graph.nodes - if n.op == "call_function" and n.target != operator.getitem - ] - first_add, second_add = add_nodes[0], add_nodes[1] - - # Splice an already-lowered region between the two adds so the second add - # depends on the first only through that non-delegated node. - graph_module.lowered_module_0 = torch.nn.Module() - with graph.inserting_before(second_add): - lowered = graph.get_attr("lowered_module_0") - delegate = graph.call_function( - executorch_call_delegate, (lowered, first_add) - ) - delegate_output = graph.call_function(operator.getitem, (delegate, 0)) - second_add.replace_input_with(first_add, delegate_output) - graph.lint() - - result = CudaPartitioner([]).partition(exported_program) - - # Separated by the delegate, the adds must land in different partitions. - self.assertEqual(len(result.partition_tags), 2) - self.assertIn("delegation_tag", first_add.meta) - self.assertIn("delegation_tag", second_add.meta) - self.assertNotEqual( - first_add.meta["delegation_tag"], second_add.meta["delegation_tag"] - ) - self.assertNotIn("delegation_tag", delegate.meta) - self.assertNotIn("delegation_tag", delegate_output.meta) - - # Each partition must be convex on its own so fusion does not cycle. - for tag in result.partition_tags: - tagged = [ - n - for n in exported_program.graph.nodes - if n.meta.get("delegation_tag") == tag - ] - self.assertTrue(validate_partition(tagged)) - - def test_control_flow_get_attr_shares_op_tag(self) -> None: - """A control-flow op's branch get_attrs must share the op's partition tag. 
- - They are not call_function nodes, so the capability partitioner does not - claim them; they must be lowered into the same submodule as the op. - """ - - class CondModule(torch.nn.Module): - def forward(self, x: torch.Tensor) -> torch.Tensor: - return torch.cond(x.sum() > 0, torch.sin, torch.cos, (x,)) - - exported_program = export(CondModule(), (torch.randn(3, 4),), strict=True) - result = CudaPartitioner([]).partition(exported_program) - - cond_node = next( - n - for n in exported_program.graph.nodes - if n.op == "call_function" and n.target is torch.ops.higher_order.cond - ) - branch_get_attrs = [ - arg - for arg in cond_node.args - if isinstance(arg, torch.fx.Node) and arg.op == "get_attr" - ] - - self.assertEqual(len(branch_get_attrs), 2) - self.assertIn(cond_node.meta["delegation_tag"], result.partition_tags) - for get_attr in branch_get_attrs: - self.assertEqual( - get_attr.meta.get("delegation_tag"), - cond_node.meta["delegation_tag"], - ) - - def test_shared_constant_across_partitions(self) -> None: - """A constant read by two partitions is claimed, not dropped. - - tag_constant_data assigns it one partition's tag; backend lowering later - duplicates it per consumer, so partitioning must not crash or drop it. - """ - - class SharedWeightModule(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.register_buffer("w", torch.randn(3, 4)) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return (x + self.w) + self.w - - exported_program = export( - SharedWeightModule(), (torch.randn(3, 4),), strict=True - ) - graph_module = exported_program.graph_module - graph = graph_module.graph - - add_nodes = [ - n - for n in graph.nodes - if n.op == "call_function" and n.target != operator.getitem - ] - first_add, second_add = add_nodes[0], add_nodes[1] - - # Split the two adds (both reading w) with an already-lowered region. 
- graph_module.lowered_module_0 = torch.nn.Module() - with graph.inserting_before(second_add): - lowered = graph.get_attr("lowered_module_0") - delegate = graph.call_function( - executorch_call_delegate, (lowered, first_add) - ) - delegate_output = graph.call_function(operator.getitem, (delegate, 0)) - second_add.replace_input_with(first_add, delegate_output) - graph.lint() - - result = CudaPartitioner([]).partition(exported_program) - - # Two islands, and the shared buffer is claimed by one of them, not dropped. - self.assertEqual(len(result.partition_tags), 2) - buffer_placeholder = next( - n - for n in graph.nodes - if n.op == "placeholder" and is_buffer(exported_program, n) - ) - self.assertIn( - buffer_placeholder.meta.get("delegation_tag"), result.partition_tags - ) diff --git a/backends/cuda/tests/test_tq4_sdpa.py b/backends/cuda/tests/test_tq4_sdpa.py index f9543b1ff18..9cf1e9e2d57 100644 --- a/backends/cuda/tests/test_tq4_sdpa.py +++ b/backends/cuda/tests/test_tq4_sdpa.py @@ -20,6 +20,7 @@ import numpy as np import torch import torch.nn.functional as F + from executorch.backends.cuda.cuda_backend import CudaBackend from executorch.backends.cuda.cuda_partitioner import CudaPartitioner from executorch.backends.cuda.triton.kernels.tq4_sdpa import tq4_sdpa @@ -252,7 +253,7 @@ def test_gqa_prefill(self): self._run_test(1, H_q, H_kv, 64, 64, 128, is_causal=True) def test_gqa_8x_head_dim_256(self): - """GQA 8:1 with head_dim=256.""" + """GQA 8:1 with head_dim=256 — matches Qwen 3.5 MoE config.""" self._run_test(1, 16, 2, 1, 128, 256) L = 64 mask = torch.tril(torch.ones(1, 1, L, L, dtype=torch.bool, device="cuda")) @@ -374,8 +375,8 @@ def test_float_mask_rejected(self): float_mask, ) - def test_config_hd256_gqa_16_2(self): - """head_dim=256, GQA 16:2, decode + prefill.""" + def test_qwen35_moe_config(self): + """Qwen 3.5 MoE: head_dim=256, GQA 16:2, decode + prefill.""" self._run_test(1, 16, 2, 1, 256, 256) self._run_test(1, 16, 2, 128, 128, 256, is_causal=True) 
@@ -437,437 +438,6 @@ def test_output_shape_and_dtype(self): self.assertEqual(out.shape, (1, H_q, Lq, D)) self.assertEqual(out.dtype, torch.bfloat16) - # ------------------------------------------------------------------ - # 128k code path: kv_len clamp (decode) + mask_is_causal (prefill) - # - # Every test above calls tq4_sdpa WITHOUT kv_len and WITHOUT - # mask_is_causal, so they only exercise the kv_len=None fallback - # (full-Lk loop) at short KV. The cases below drive the actual - # long-context paths at two representative GQA shapes (head_dim=512 - # GQA 8:4, and head_dim=256 GQA 16:2): - # * the on-device kv_len scalar that bounds the KV loop to the - # filled context (decode), and - # * the mask_is_causal per-tile causal block-skip (prefill). - # - # "GARBAGE TAIL": in production the KV cache is a fixed buffer - # pre-allocated to max_seq_len (e.g. 131072). At any step only the - # first kv_len positions hold real K/V; the rest is stale / - # uninitialized memory that attention must ignore. We simulate that - # tail by writing large-magnitude (x1000) values into [kv_len:]. If - # the clamp / block-skip works the kernel never reads the tail and - # the output matches a reference built from [0, kv_len) only; if it - # is broken the huge tail values dominate the softmax and the cosine - # collapses to ~0. So the garbage tail is a built-in negative control - # (verified: dropping kv_len drives the cosine to ~-0.01 and fails). - # - # CAUSAL ALIGNMENT (top-left vs bottom-right): when L_q < L_kv (a - # chunked prefill / decode, where the Lq new queries sit at the END - # of a kv_len-long context) there are two ways to place the causal - # triangle. PyTorch F.sdpa(is_causal=True) uses TOP-LEFT alignment - # (query row i attends to keys [0, i]) -- wrong for a KV cache. This - # kernel (and a KV-cache decoder's mask builder) use BOTTOM-RIGHT - # alignment: query row i is absolute position (kv_len - Lq + i) and - # attends to keys [0, kv_len - Lq + i]. 
So the reference below builds - # an explicit bottom-right mask (q_pos >= cache_pos) rather than - # passing is_causal=True, which would otherwise mismatch the kernel. - # ------------------------------------------------------------------ - - def _run_long_kv_test( - self, - *, - H_q, - H_kv, - D, - Lq, - kv_len, - buffer_len, - causal=False, - garbage=True, - pass_kv_len=True, - min_cosine=0.99, - seed=42, - ): - """Drive tq4_sdpa over a buffer whose first ``kv_len`` positions are - real and whose ``[kv_len:]`` tail is large-magnitude garbage, then - compare against an fp32 reference built from the first ``kv_len`` - positions only. - - The kernel sees the full (garbage-tailed) compressed buffer; the - on-device ``kv_len`` scalar (and, for prefill, the bottom-right - causal mask) must confine attention to ``[0, kv_len)``. - - ``causal=True`` builds a bottom-right-aligned mask (the Lq queries - are the last Lq positions of a kv_len-long context), mirroring a - KV-cache decoder's ``q_pos >= cache_pos`` mask and the kernel's - ``(kv_len - Lq) + seq_pos`` block bound. We deliberately do NOT use - ``F.sdpa(is_causal=True)`` for the reference: PyTorch aligns - is_causal top-left when L_q < L_kv, while this kernel (and such a - decoder) align bottom-right. 
- """ - torch.manual_seed(seed) - centroids, boundaries, rotation = _make_codebook_and_rotation(D) - centroids = centroids.cuda() - boundaries = boundaries.cuda() - rotation = rotation.cuda() - - B = 1 - k = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda") - v = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda") - if garbage and buffer_len > kv_len: - g = buffer_len - kv_len - k[:, :, kv_len:, :] = ( - torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0 - ) - v[:, :, kv_len:, :] = ( - torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0 - ) - - q = torch.randn(B, H_q, Lq, D, dtype=torch.bfloat16, device="cuda") - - k_packed, k_norms = _compress(k, boundaries, rotation) - v_packed, v_norms = _compress(v, boundaries, rotation) - - attn_mask = None - if causal: - cache_pos = torch.arange(buffer_len, device="cuda") - q_pos = torch.arange(kv_len - Lq, kv_len, device="cuda").unsqueeze(1) - attn_mask = (q_pos >= cache_pos.unsqueeze(0)).view(1, 1, Lq, buffer_len) - - kv_len_t = ( - torch.tensor([kv_len], dtype=torch.int32, device="cuda") - if pass_kv_len - else None - ) - - out = self.tq4_sdpa( - q, - k_packed, - k_norms, - v_packed, - v_norms, - centroids, - rotation, - attn_mask=attn_mask, - is_causal=False, - scale=None, - kv_len=kv_len_t, - mask_is_causal=causal, - ) - - # Reference: the same decompress-then-fp32-SDPA path the other tests - # use (_reference_tq4_sdpa), but over ONLY the first kv_len positions - # so the garbage tail can never influence it. _compress is per-row, - # so compressing the sliced K/V here is bit-identical to the kernel's - # view of the full buffer sliced to [:, :, :kv_len]; the helper also - # handles the GQA repeat_interleave and mask broadcast internally. 
- ref_mask = attn_mask[:, :, :, :kv_len] if attn_mask is not None else None - ref, *_ = _reference_tq4_sdpa( - q, - k[:, :, :kv_len], - v[:, :, :kv_len], - centroids, - boundaries, - rotation, - attn_mask=ref_mask, - ) - - self.assertFalse(torch.isnan(out).any(), "NaN in output") - cos = _cosine_sim(out, ref) - self.assertGreater( - cos, - min_cosine, - f"Cosine {cos:.5f} < {min_cosine} " - f"(H_q={H_q} H_kv={H_kv} D={D} Lq={Lq} kv_len={kv_len} " - f"buffer={buffer_len} causal={causal} kv_len_passed={pass_kv_len})", - ) - return cos - - def _run_splitk_vs_fused_test( - self, - *, - H_q, - H_kv, - D, - Lq, - kv_len, - buffer_len, - B=1, - seed=42, - ): - """Verify split-K output matches fused kernel output for same inputs. - - Runs tq4_sdpa twice: once with kv_len (triggers split-K for Lq=1, kv_len>=256), - and once without kv_len (forces fused kernel path). Both outputs must match - within fp tolerance, proving split-K computes the same result. - """ - torch.manual_seed(seed) - centroids, boundaries, rotation = _make_codebook_and_rotation(D) - centroids = centroids.cuda() - boundaries = boundaries.cuda() - rotation = rotation.cuda() - - k = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda") - v = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda") - # Add garbage tail to ensure split-K respects kv_len bound - if buffer_len > kv_len: - g = buffer_len - kv_len - k[:, :, kv_len:, :] = ( - torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0 - ) - v[:, :, kv_len:, :] = ( - torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0 - ) - - q = torch.randn(B, H_q, Lq, D, dtype=torch.bfloat16, device="cuda") - - k_packed, k_norms = _compress(k, boundaries, rotation) - v_packed, v_norms = _compress(v, boundaries, rotation) - - # Split-K path: with kv_len (triggers split-K for Lq=1, kv_len>=256) - kv_len_t = torch.tensor([kv_len], dtype=torch.int32, device="cuda") - out_splitk = self.tq4_sdpa( - q, 
- k_packed, - k_norms, - v_packed, - v_norms, - centroids, - rotation, - attn_mask=None, - is_causal=False, - scale=None, - kv_len=kv_len_t, - mask_is_causal=False, - ) - - # Fused kernel path: without kv_len (forces fused kernel) - # But we need to slice the buffer to kv_len to avoid garbage - k_packed_sliced = k_packed[:, :, :kv_len, :] - k_norms_sliced = k_norms[:, :, :kv_len, :] - v_packed_sliced = v_packed[:, :, :kv_len, :] - v_norms_sliced = v_norms[:, :, :kv_len, :] - - out_fused = self.tq4_sdpa( - q, - k_packed_sliced, - k_norms_sliced, - v_packed_sliced, - v_norms_sliced, - centroids, - rotation, - attn_mask=None, - is_causal=False, - scale=None, - kv_len=None, - mask_is_causal=False, - ) - - # Both outputs must match (split-K computes same result as fused) - self.assertFalse(torch.isnan(out_splitk).any(), "NaN in split-K output") - self.assertFalse(torch.isnan(out_fused).any(), "NaN in fused output") - cos = _cosine_sim(out_splitk, out_fused) - self.assertGreater( - cos, - 0.99, - f"Split-K vs Fused cosine {cos:.5f} < 0.99 " - f"(B={B} H_q={H_q} H_kv={H_kv} D={D} kv_len={kv_len})", - ) - - def test_splitk_batch2(self): - """Split-K decode (Lq=1) with batch size B=2. - - Exercises the per-batch indexing in the split-K and reduce kernels - (b = pid_bh // H_grid). Split-K output must match the fused-kernel - path for the same inputs.""" - self._run_splitk_vs_fused_test( - H_q=16, H_kv=2, D=256, Lq=1, kv_len=512, buffer_len=1024, B=2 - ) - - def test_splitk_noncontiguous_query(self): - """Split-K decode (Lq=1, B=2) with a non-contiguous query. - - The host wrapper rotates Q (Q @ Pi^T) before launching the kernel, - so a strided query must yield the same result as its contiguous - copy. 
Builds a query whose last-dim stride is 2 by slicing a padded - buffer, then checks it matches the contiguous query.""" - H_q, H_kv, D, kv_len, B = 16, 2, 256, 512, 2 - torch.manual_seed(42) - centroids, boundaries, rotation = _make_codebook_and_rotation(D) - centroids = centroids.cuda() - boundaries = boundaries.cuda() - rotation = rotation.cuda() - - k = torch.randn(B, H_kv, kv_len, D, dtype=torch.bfloat16, device="cuda") - v = torch.randn(B, H_kv, kv_len, D, dtype=torch.bfloat16, device="cuda") - k_packed, k_norms = _compress(k, boundaries, rotation) - v_packed, v_norms = _compress(v, boundaries, rotation) - - q = torch.randn(B, H_q, 1, D, dtype=torch.bfloat16, device="cuda") - # Non-contiguous alias with identical values (last-dim stride 2). - q_pad = torch.empty(B, H_q, 1, D, 2, dtype=torch.bfloat16, device="cuda") - q_pad[..., 0] = q - q_nc = q_pad[..., 0] - self.assertFalse(q_nc.is_contiguous(), "query should be non-contiguous") - - kv_len_t = torch.tensor([kv_len], dtype=torch.int32, device="cuda") - - def _run(query): - return self.tq4_sdpa( - query, - k_packed, - k_norms, - v_packed, - v_norms, - centroids, - rotation, - attn_mask=None, - is_causal=False, - scale=None, - kv_len=kv_len_t, - mask_is_causal=False, - ) - - out_contig = _run(q) - out_nc = _run(q_nc) - - self.assertFalse(torch.isnan(out_nc).any(), "NaN in non-contiguous output") - cos = _cosine_sim(out_nc, out_contig) - self.assertGreater( - cos, 0.999, f"non-contiguous vs contiguous query cosine {cos:.5f}" - ) - - def test_kv_len_clamp_decode_hd512_gqa_8_4(self): - """Decode (Lq=1) kv_len clamp at a head_dim=512, GQA 8:4 shape. 
- N=8192 leaves a 24k garbage tail in a 32k buffer (clamp guard); - N=32768 fills the buffer (full 32k loop).""" - for N in (8192, 32768): - with self.subTest(N=N): - self._run_long_kv_test( - H_q=8, H_kv=4, D=512, Lq=1, kv_len=N, buffer_len=32768 - ) - - def test_kv_len_clamp_decode_hd512_gqa_8_4_splitk(self): - """Split-K decode (Lq=1) at a head_dim=512, GQA 8:4 shape with long - KV. Verifies split-K output matches BOTH (a) fp32 reference over first - kv_len positions AND (b) existing fused-kernel output (byte-identical - within fp tolerance). Uses garbage tail as negative control.""" - for N in (8192, 32768): - with self.subTest(N=N): - # Run with split-K (kv_len >= 256 triggers split-K) - _ = self._run_long_kv_test( - H_q=8, - H_kv=4, - D=512, - Lq=1, - kv_len=N, - buffer_len=32768, - min_cosine=0.99, - ) - # Also verify split-K matches fused kernel by running without kv_len - # (which forces fused kernel path) and comparing outputs - self._run_splitk_vs_fused_test( - H_q=8, H_kv=4, D=512, Lq=1, kv_len=N, buffer_len=32768 - ) - - def test_kv_len_clamp_decode_hd256_gqa_16_2(self): - """Decode (Lq=1) kv_len clamp at a head_dim=256, GQA 16:2 shape.""" - for N in (8192, 32768): - with self.subTest(N=N): - self._run_long_kv_test( - H_q=16, H_kv=2, D=256, Lq=1, kv_len=N, buffer_len=32768 - ) - - def test_kv_len_clamp_decode_hd256_gqa_16_2_splitk(self): - """Split-K decode (Lq=1) at a head_dim=256, GQA 16:2 shape with long - KV. Verifies split-K output matches BOTH fp32 reference AND fused - kernel.""" - for N in (8192, 32768): - with self.subTest(N=N): - _ = self._run_long_kv_test( - H_q=16, - H_kv=2, - D=256, - Lq=1, - kv_len=N, - buffer_len=32768, - min_cosine=0.99, - ) - self._run_splitk_vs_fused_test( - H_q=16, H_kv=2, D=256, Lq=1, kv_len=N, buffer_len=32768 - ) - - def test_mask_is_causal_prefill_hd512_gqa_8_4(self): - """Chunked prefill (Lq>1) with mask_is_causal at a head_dim=512, - GQA 8:4 shape. 
The Lq queries are the last Lq of a kv_len-long - context; the per-tile causal block-skip plus bottom-right mask must - match the fp32 causal reference over the first kv_len positions. A - garbage tail beyond kv_len also exercises the clamp.""" - for Lq, kv_len, buf in ((256, 4096, 8192), (2048, 8192, 16384)): - with self.subTest(Lq=Lq, kv_len=kv_len): - self._run_long_kv_test( - H_q=8, - H_kv=4, - D=512, - Lq=Lq, - kv_len=kv_len, - buffer_len=buf, - causal=True, - ) - - def test_mask_is_causal_prefill_hd256_gqa_16_2(self): - """Chunked prefill (Lq>1) with mask_is_causal at a head_dim=256, - GQA 16:2 shape.""" - for Lq, kv_len, buf in ((256, 4096, 8192), (2048, 8192, 16384)): - with self.subTest(Lq=Lq, kv_len=kv_len): - self._run_long_kv_test( - H_q=16, - H_kv=2, - D=256, - Lq=Lq, - kv_len=kv_len, - buffer_len=buf, - causal=True, - ) - - def test_kv_len_none_fallback_hd256_gqa_16_2(self): - """Regression: the kv_len=None fallback (HAS_KV_LEN False, full-Lk - loop) still matches the fp32 reference. This guards the original - behavior the kv_len feature must preserve for callers that pass - neither kv_len nor mask_is_causal.""" - self._run_long_kv_test( - H_q=16, - H_kv=2, - D=256, - Lq=1, - kv_len=256, - buffer_len=256, - garbage=False, - pass_kv_len=False, - ) - - @unittest.skipUnless( - os.environ.get("TQ4_RUN_128K") == "1", - "128k case is heavy for the 24GB CI runner; set TQ4_RUN_128K=1 to run", - ) - def test_kv_len_clamp_128k(self): - """Full 131072-entry buffer (head_dim=256, GQA 16:2). (a) kv_len=8192 - with a ~123k garbage tail — the clamp keeps decode O(context) and - never touches the tail; (b) kv_len=131072 — correctness at true 128k - scale. 
Gated behind TQ4_RUN_128K because the fp32 reference for (b) - needs >~6GB and CI runs on a 24GB A10G.""" - self._run_long_kv_test( - H_q=16, H_kv=2, D=256, Lq=1, kv_len=8192, buffer_len=131072 - ) - self._run_long_kv_test( - H_q=16, - H_kv=2, - D=256, - Lq=1, - kv_len=131072, - buffer_len=131072, - garbage=False, - ) - # ------------------------------------------------------------------ # Validation errors # ------------------------------------------------------------------ diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py index ff8cbb660cb..6ec8ee80688 100644 --- a/backends/nxp/backend/edge_program_converter.py +++ b/backends/nxp/backend/edge_program_converter.py @@ -39,7 +39,6 @@ exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter, # noqa F405 exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter, # noqa F405 exir_ops.edge.aten.convolution.default: ConvolutionConverter, # noqa F405 - exir_ops.edge.aten.exp.default: ExpConverter, # noqa F405 exir_ops.edge.aten.hardtanh.default: HardTanhConverter, # noqa F405 exir_ops.edge.aten.leaky_relu.default: LeakyReluConverter, # noqa F405 exir_ops.edge.aten.log.default: LogConverter, # noqa F405 diff --git a/backends/nxp/backend/graph_utils.py b/backends/nxp/backend/graph_utils.py index f5d8e16475c..88cd996d6fd 100644 --- a/backends/nxp/backend/graph_utils.py +++ b/backends/nxp/backend/graph_utils.py @@ -56,7 +56,7 @@ def get_output_shape(node: Node) -> tuple[torch.Size] | torch.Size | None: def is_clamp_preserved_under_quantization( - node: Node, min_val: float = 0, max_val: float | None = None + node: Node, min_val: int = 0, max_val: int | None = None ) -> bool: """ Checks if Clamp/ReLU/HardTanh is preserved under quantization and did diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py index 93ba24e61bd..5f19b2e48dc 
100755 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py @@ -31,9 +31,6 @@ from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.convolution_converter import ( ConvolutionConverter, ) -from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.exp_converter import ( - ExpConverter, -) from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.getitem_converter import ( GetItemConverter, ) @@ -114,7 +111,6 @@ "CloneConverter", "ConstantPadNDConverter", "ConvolutionConverter", - "ExpConverter", "GetItemConverter", "HardTanhConverter", "LeakyReluConverter", diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py index a1e8c19e9bd..25cf6074701 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py @@ -42,6 +42,17 @@ from torch.nn import Parameter +def _is_convertible_to_relu(node): + bounds = ClampConverter._get_clamp_bounds(node) + bounds = tuple(v if v is not None and math.isfinite(v) else None for v in bounds) + + # Some specific bounds can be replaced with single op ReLU. 
+ if bounds not in ClampConverter.RELU_COMPATIBLE_BOUNDS.values(): + return False + + return True + + class ClampConverter(NodeConverter): RELU_COMPATIBLE_BOUNDS = { "ReluN1To1": (-1, 1), @@ -59,25 +70,12 @@ class ClampConverter(NodeConverter): # noinspection PyShadowingBuiltins @staticmethod - def _get_bounds(node: Node) -> tuple[float | None, float | None]: + def _get_clamp_bounds(clamp_node: Node) -> tuple[float | None, float | None]: """Extract min and max bounds from `aten.clamp.default` node.""" - min = try_get_arg(node, 1) - max = try_get_arg(node, 2) + min = try_get_arg(clamp_node, 1) + max = try_get_arg(clamp_node, 2) return min, max - @classmethod - def _is_convertible_to_relu(cls, node): - bounds = cls._get_bounds(node) - bounds = tuple( - v if v is not None and math.isfinite(v) else None for v in bounds - ) - - # Some specific bounds can be replaced with single op ReLU. - if bounds not in cls.RELU_COMPATIBLE_BOUNDS.values(): - return False - - return True - @staticmethod def _is_supported_in_IR( node: Node, @@ -102,21 +100,20 @@ def _io_quant_is_same(node: Node): dq_params = dequant.args[1:] return all(q == dq for q, dq in zip(q_params, dq_params)) - @classmethod + @staticmethod def _is_supported_on_target( - cls, node: Node, neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - relu_compatible = cls._is_convertible_to_relu(node) - bounds = cls._get_bounds(node) + relu_compatible = _is_convertible_to_relu(node) + bounds = ClampConverter._get_clamp_bounds(node) if all(b is None or math.isinf(b) for b in bounds): return False - io_quant_consistent = cls._io_quant_is_same(node) + io_quant_consistent = ClampConverter._io_quant_is_same(node) quant_supported = NodeConverter.uses_quantization_type_for_io( node, supported_types=[torch.int8, torch.uint8], @@ -141,20 +138,19 @@ def supports_partitioning_result( neutron_target_spec: NeutronTargetSpec, parameters_mapping: 
dict[str, Parameter], ) -> bool: - bounds = cls._get_bounds(node) + bounds = cls._get_clamp_bounds(node) # Neutron cannot delegate a partition where ReLU or ReLU6 is the only operator # and at the same time the node does not satisfy delegation requirements. - # In contrast, ReLUN1To1 and ReLU0To1 are supported and delegated successfully. + # In contrast, ReLUN1To1 and ReLU0To1 are supported and delegated successfuly. if bounds in cls.RELU_COMPATIBLE_BOUNDS.values(): is_alone_in_partition = cls.is_node_alone_in_partition( node, partition_list, filter_fn=is_not_qdq_node ) if is_alone_in_partition: - # noinspection PyTypeChecker return is_clamp_preserved_under_quantization( node, - min_val=bounds[0] if bounds[0] is not None else 0, + min_val=bounds[0], max_val=bounds[1], ) @@ -171,9 +167,9 @@ def convert(self, node: Node): ) -> Tensor """ self.assert_convertible(node) - to_relu = self._is_convertible_to_relu(node) + to_relu = _is_convertible_to_relu(node) - bounds = self._get_bounds(node) + bounds = self._get_clamp_bounds(node) bounds = tuple( v if v is not None and math.isfinite(v) else None for v in bounds ) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py index 0159143c5f7..f67851895c2 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py @@ -3,16 +3,43 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
- -from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.clamp_converter import ( - ClampConverter, +from executorch.backends.nxp.backend.ir.converter.node_converter import ( + CustomDelegationOptions, + is_not_qdq_node, + NodeConverter, + Partition, +) +from executorch.backends.nxp.backend.ir.lib.tflite.BuiltinOperator import ( + BuiltinOperator, ) +from executorch.backends.nxp.backend.neutron_operator_support import ( + activation_supported_on_target, +) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node +from torch.nn import Parameter + +class HardTanhConverter(NodeConverter): + + # Maps possible input parameters of HardTanh to equivalent ReLU-based operators supported by TFLite. + SUPPORTED_MODES_MAP = { + (0.0, 6.0): BuiltinOperator.RELU6, + (-1.0, 1.0): BuiltinOperator.RELU_N1_TO_1, + (0.0, 1.0): BuiltinOperator.RELU_0_TO_1, + (0.0, float("inf")): BuiltinOperator.RELU, + } + + # Maps possible modes of HardTanh to equivalent ReLU bounds. 
+ SUPPORTED_BOUNDS_MAP = { + "ReluN1To1": (-1.0, 1.0), + "Relu0To1": (0.0, 1.0), + "Relu6": (0.0, 6.0), + "Relu": (0.0, float("inf")), + } -class HardTanhConverter(ClampConverter): @staticmethod - def _get_bounds(node: Node) -> tuple[float | None, float | None]: + def _get_hardtanh_bounds(node: Node) -> tuple[float, float]: args = node.args match len(args): @@ -35,3 +62,51 @@ def _get_bounds(node: Node) -> tuple[float | None, float | None]: ) return min_val, max_val + + @staticmethod + def _is_supported_in_IR( + node: Node, + parameters_mapping: dict[str, Parameter], + custom_delegation_options: CustomDelegationOptions, + ) -> bool: + bounds = HardTanhConverter._get_hardtanh_bounds(node) + return bounds in HardTanhConverter.SUPPORTED_MODES_MAP + + @classmethod + def supports_partitioning_result( + cls, + node: Node, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + neutron_target_spec: NeutronTargetSpec, + parameters_mapping: dict[str, Parameter], + ) -> bool: + bounds = HardTanhConverter._get_hardtanh_bounds(node) + + # Neutron cannot delegate a partition where ReLU or ReLU6 is the only operator + # and at the same time the node does not satisfy delegation requirements. + # In contrast, ReLUN1To1 and ReLU0To1 are supported and delegated successfuly. 
+ if bounds in [ + cls.SUPPORTED_BOUNDS_MAP["Relu"], + cls.SUPPORTED_BOUNDS_MAP["Relu6"], + ]: + is_alone_in_partition = cls.is_node_alone_in_partition( + node, partition_list, filter_fn=is_not_qdq_node + ) + if is_alone_in_partition: + return activation_supported_on_target(node) + + return True + + def convert(self, node: Node): + """Convert 'aten::hardtanh' to its supported ReLU equivalent.""" + self.assert_convertible(node) + + t_op = self._create_tflite_op_with_io_tensors(node) + + bounds = HardTanhConverter._get_hardtanh_bounds(node) + + op = self.SUPPORTED_MODES_MAP[bounds] + t_op.opcode_index = self.builder.op_code_index_for_op_type(op) + + self.builder.append_operators([t_op]) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py index 8674bf697c7..a76abfbef91 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py @@ -5,9 +5,6 @@ import torch -from executorch.backends.nxp.backend.data_format import DataFormat -from executorch.backends.nxp.backend.ir.converter.conversion import translator -from executorch.backends.nxp.backend.ir.converter.conversion.common import OpsList from executorch.backends.nxp.backend.ir.converter.conversion.translator import ( create_channels_last_to_channels_first_permutation, ) @@ -92,15 +89,10 @@ def _is_supported_in_IR( def _to_pos_dim(d: int, rank: int): return d + rank if d < 0 else d - @staticmethod - def _normalize_dim(dim: list[int], rank: int) -> list[int]: - # convert negative index to positive - return [MeanDimConverter._to_pos_dim(d, rank) for d in dim] - @staticmethod def _normalize_and_to_channel_last_dim(dim: list[int], rank: int) -> list[int]: # convert negative index to positive - dim = MeanDimConverter._normalize_dim(dim, rank) + dim = 
[MeanDimConverter._to_pos_dim(d, rank) for d in dim] perm = create_channels_last_to_channels_first_permutation(rank, True) dim = [perm[d] for d in dim] @@ -114,114 +106,6 @@ def _get_attrs(node: Node) -> tuple[list[int], bool]: keepdim = node.args[2] if len(node.args) >= 3 else False return dim, keepdim - def _get_dim_and_handle_io_formats( - self, ops: OpsList, dim: list[int], keep_dim: bool - ): - t_op = ops.middle_op - x = t_op.tmp_inputs[0] - y = t_op.tmp_outputs[0] - - channels_last_input = x.tensor_format.is_channels_last() - channels_last_output = y.tensor_format.is_channels_last() - formatless_input = not channels_last_input - formatless_output = not channels_last_output - - dim = self._normalize_dim(dim, x.rank) - - if keep_dim: - # The rank is preserved and the io formats should always be equal. - assert ( - x.tensor_format == y.tensor_format - ), "NXP backend: There is a bug in `mean.dim` format inference." - - # Just adjust the dim to match the input format. - if channels_last_input: - dim = self._normalize_and_to_channel_last_dim(dim, x.rank) - - else: - # `keep_dim = False`, so the output rank != input rank, and the operator changes the tensor format. - - if channels_last_input and formatless_output: - if 1 in dim: - # If we are reducing over the channels, the channels dimension gets removed and the output ends up - # exactly equal in channels last and channels first, regardless of which other dimensions are - # removed. Therefore, we can just adjust the `dim` and we don't need to insert any `Transpose` ops. - dim = self._normalize_and_to_channel_last_dim(dim, x.rank) - elif all(spatial_dim in dim for spatial_dim in range(2, x.rank)): - # All spatial dims are reduced, leaving only batch and channels (both optionally). So the result is - # equal in channels first and channels last as long as we adjust the `dim` to match a channels last - # input (similarly to the case above). 
- dim = self._normalize_and_to_channel_last_dim(dim, x.rank) - else: - # If the channels dimension is preserved, we must transpose the input to channels first (to match - # the edge model) and we must keep the `dim` unchanged (referencing channels first dimensions). - # Otherwise, the output would not match the input. - to_channels_first_perm = ( - translator.create_channels_last_to_channels_first_permutation( - x.rank - ) - ) - ops.add_pre( - self.builder.create_transpose_operator_before( - t_op, 0, to_channels_first_perm - ) - ) - t_op.tmp_inputs[0].tensor_format = DataFormat.CHANNELS_FIRST - - elif formatless_input and channels_last_output: - # We need apply the `mean` with the original `dim`, which will produce a channels first output. Then, - # we need to append a `Transpose` operator to make the output channels last. - to_channels_last_perm = ( - translator.create_channels_first_to_channels_last_permutation( - y.rank, True - ) - ) - ops.add_post( - self.builder.create_transpose_operator_after( - t_op, 0, to_channels_last_perm - ) - ) - t_op.tmp_outputs[0].tensor_format = DataFormat.CHANNELS_FIRST - - elif formatless_input and formatless_output: - # No action needed. - pass - - else: # channels_last_input and channels_last_output - # This case cannot currently occur, as it would require the case: - # channels last 4D -> mean -> channels_last 3D - # which cannot currently happen as the 3D conv/pooling/... is supported by adding `view_copy` nodes in - # the edge dialect and converting the node to 4D, and the `view_copy` nodes prevent the propagation of - # the format to the `mean.dim` output. - # Therefore, the implementation cannot be tested. But from experience with other operators, it should - # work correctly. We just need to add 2 `Transpose` ops to make the IO channels first, and keep the - # `dim` unchanged. 
- to_channels_first_perm = ( - translator.create_channels_last_to_channels_first_permutation( - x.rank - ) - ) - ops.add_pre( - self.builder.create_transpose_operator_before( - t_op, 0, to_channels_first_perm - ) - ) - t_op.tmp_inputs[0].tensor_format = DataFormat.CHANNELS_FIRST - - to_channels_last_perm = ( - translator.create_channels_first_to_channels_last_permutation( - y.rank, True - ) - ) - ops.add_post( - self.builder.create_transpose_operator_after( - t_op, 0, to_channels_last_perm - ) - ) - t_op.tmp_outputs[0].tensor_format = DataFormat.CHANNELS_FIRST - - return dim - def convert(self, node: Node): """Convert the 'mean.dim' operator to NeutronIR 'Mean'. The ExecuTorch schema is: @@ -239,9 +123,10 @@ def convert(self, node: Node): t_op = self._create_tflite_op_with_io_tensors(node) t_op.builtin_options = mean_options.Mean(keepdim) + x = t_op.tmp_inputs[0] - ops = OpsList(middle_op=t_op) - dim = self._get_dim_and_handle_io_formats(ops, dim, keepdim) + if x.tensor_format.is_channels_last(): + dim = self._normalize_and_to_channel_last_dim(dim, x.rank) convert_axes_from_attribute(t_op, self.builder, dim) - self.builder.append_operators(ops.flatten()) + self.builder.append_operators([t_op]) diff --git a/backends/nxp/backend/ir/converter/quantization_utils.py b/backends/nxp/backend/ir/converter/quantization_utils.py index f3fe868ae83..ba4ad14222b 100755 --- a/backends/nxp/backend/ir/converter/quantization_utils.py +++ b/backends/nxp/backend/ir/converter/quantization_utils.py @@ -1,4 +1,4 @@ -# Copyright 2023-2026 NXP +# Copyright 2023-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -135,12 +135,11 @@ def set_quantization_parameters_to_tensor( def quantize_int8( data: np.ndarray, scale: List[float], zero_point: List[int] ) -> np.ndarray: - # noinspection PyTypeChecker return quantize(data, zero_point=zero_point, scale=scale) def quantize( - value: np.ndarray | float, + value: np.ndarray | int, zero_point: List[int] | int, scale: List[float] | float, quant_min: int = -128, diff --git a/backends/nxp/backend/node_format_inference.py b/backends/nxp/backend/node_format_inference.py index 030873c88ab..65e34b7fbde 100644 --- a/backends/nxp/backend/node_format_inference.py +++ b/backends/nxp/backend/node_format_inference.py @@ -9,27 +9,10 @@ import torch from executorch.backends.nxp.backend.data_format import DataFormat, NXP_NODE_FORMAT -from executorch.backends.nxp.backend.edge_helper import ( - is_channels_last_dim_order, - try_get_arg, -) + +from executorch.backends.nxp.backend.edge_helper import is_channels_last_dim_order from executorch.backends.nxp.backend.edge_program_converter import functions_converters -from executorch.backends.nxp.tests.ops_aliases import ( - AdaptiveAvgPool2D, - AvgPool2D, - Convolution, - DequantizePerChannel, - DequantizePerTensor, - GetItem, - MaxPool2D, - MaxPool2DWithIndices, - MeanDim, - PermuteCopy, - QuantizePerTensor, - UpsampleBilinear2D, - UpsampleNearest2D, - ViewCopy, -) +from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload from torch.export import ExportedProgram from torch.fx import Node @@ -42,22 +25,21 @@ class NodeFormatInference: # The op in the dictionary is mapped to a dictionary, which holds indices to input nodes # that are always channels first. 
ops_with_channels_first_nodes = { - AdaptiveAvgPool2D: {"inputs": [0]}, + exir_ops.edge.aten._adaptive_avg_pool2d.default: {"inputs": [0]}, torch.ops.aten.adaptive_avg_pool2d.default: {"inputs": [0]}, - AvgPool2D: {"inputs": [0]}, - Convolution: {"inputs": [0, 1]}, - MaxPool2DWithIndices: {"inputs": [0]}, - MaxPool2D: {"inputs": [0]}, - UpsampleBilinear2D: {"inputs": [0]}, - UpsampleNearest2D: {"inputs": [0]}, + exir_ops.edge.aten.avg_pool2d.default: {"inputs": [0]}, + exir_ops.edge.aten.convolution.default: {"inputs": [0, 1]}, + exir_ops.edge.aten.max_pool2d_with_indices.default: {"inputs": [0]}, + exir_ops.edge.aten.max_pool2d.default: {"inputs": [0]}, + exir_ops.edge.aten.upsample_bilinear2d.vec: {"inputs": [0]}, + exir_ops.edge.aten.upsample_nearest2d.vec: {"inputs": [0]}, } # A set of Edge Aten ops, which have the ability to change the format (for example - input nodes # are channels first but output is formatless). ops_that_can_change_tensor_format = { - ViewCopy, - PermuteCopy, - MeanDim, + exir_ops.edge.aten.view_copy.default, + exir_ops.edge.aten.permute_copy.default, } _type_changed_during_last_run: bool @@ -89,10 +71,10 @@ def __init__(self, edge_program: ExportedProgram, only_for_op_support_check=Fals self._type_changed_during_last_run = False self._known_targets = list(functions_converters) + [ - DequantizePerTensor, - DequantizePerChannel, - QuantizePerTensor, - GetItem, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + operator.getitem, ] def identify_node_formats(self): @@ -122,7 +104,10 @@ def _infer_format_of_nodes(self, node: Node): self._handle_node_which_uses_channels_first_format(node) elif op_type in self.ops_that_can_change_tensor_format: - if op_type in [ViewCopy, PermuteCopy]: + if op_type in [ + exir_ops.edge.aten.view_copy.default, + exir_ops.edge.aten.permute_copy.default, + ]: # Try 
to assign the `formatless` format to the input and output. The converter will then handle the # transition. # Note: If the format for the input/output has already been assigned as channels first, it will NOT be @@ -134,28 +119,10 @@ def _infer_format_of_nodes(self, node: Node): self._node_inputs[node][0], DataFormat.FORMATLESS ) - elif op_type == MeanDim: - # The operator schema is: - # mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - keep_dim = try_get_arg(node, 2) or False - if keep_dim: - # The operator preserves the rank, so we can handle it as an operator that can use any node format. - self._handle_node_which_can_use_any_node_format(node) - else: - # The operator removes dimensions, so the IO must be marked as `formatless` (unless overridden by - # channels first of course). - self._assign_format_to_node( - self._node_outputs[node][0], DataFormat.FORMATLESS - ) - self._assign_format_to_node( - self._node_inputs[node][0], DataFormat.FORMATLESS - ) - else: logger.error( f"Node format inference for node type: {op_type} not found!" 
) - elif node.op != "call_function" or ( hasattr(node, "target") and node.target in self._known_targets ): diff --git a/backends/nxp/neutron_partitioner.py b/backends/nxp/neutron_partitioner.py index 9cc174b97e0..d4262b3a9f6 100644 --- a/backends/nxp/neutron_partitioner.py +++ b/backends/nxp/neutron_partitioner.py @@ -212,7 +212,6 @@ def tag_qdq_clusters(self, nodes: list[torch.fx.Node]): exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter, # noqa F405 exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter, # noqa F405 exir_ops.edge.aten.convolution.default: ConvolutionConverter, # noqa F405 - exir_ops.edge.aten.exp.default: ExpConverter, # noqa F405 exir_ops.edge.aten.hardtanh.default: HardTanhConverter, # noqa F405 exir_ops.edge.aten.leaky_relu.default: LeakyReluConverter, # noqa F405 exir_ops.edge.aten.log.default: LogConverter, # noqa F405 @@ -437,7 +436,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: graph_module.recompile() - operators_not_to_delegate = self.delegation_spec[1][4].value.decode().split(",") + operators_not_to_delegate = self.delegation_spec[1][3].value.decode().split(",") logging.info(f"Operators not to delegate: {operators_not_to_delegate}") parameters_mapping = EdgeProgramToIRConverter.map_inputs_to_parameters( diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py index 1a84a418e92..f28eb34064c 100644 --- a/backends/nxp/nxp_backend.py +++ b/backends/nxp/nxp_backend.py @@ -9,9 +9,8 @@ # import logging -import os import struct -from typing import final +from typing import final, List, Optional import numpy as np import torch @@ -46,11 +45,10 @@ class NeutronCompileSpecBuilder: config: NeutronTargetSpec def __init__(self): - self.compile_spec: list[CompileSpec] = [] + self.compile_spec: List[CompileSpec] = [] self.compiler_flags = [] self.output_format = None - self.intermediates_dir = None - self.operators_not_to_delegate: list[str] = [] + 
self.operators_not_to_delegate: List[str] = [] self.use_neutron_for_format_conversion = True self.fetch_constants_to_sram = False self.dump_kernel_selection_code = False @@ -64,9 +62,8 @@ def _replace_colons(self, operator: str) -> str: def neutron_compile_spec( self, config: str, - intermediates_dir: str | None = None, - extra_flags: str | None = None, - operators_not_to_delegate: list[str] | None = None, + extra_flags: Optional[str] = None, + operators_not_to_delegate: Optional[List[str]] = None, use_neutron_for_format_conversion: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, @@ -74,7 +71,6 @@ def neutron_compile_spec( """Generate compile spec for Neutron NPU :param config: Neutron accelerator configuration, e.g. "imxrt700" - :param intermediates_dir: Directory to store intermediate artifact files. :param extra_flags: Extra flags for the Neutron compiler :param operators_not_to_delegate: List of operators that should not be delegated :param use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to @@ -87,7 +83,6 @@ def neutron_compile_spec( """ self.config = NeutronTargetSpec(config) - self.intermediates_dir = intermediates_dir assert ( self.output_format is None @@ -118,7 +113,6 @@ def build(self): CompileSpec("output_format", "tflite".encode()), CompileSpec("compile_flags", " ".join(self.compiler_flags).encode()), CompileSpec("target", self.config.get_name().encode()), - CompileSpec("intermediates_dir", f"{self.intermediates_dir}".encode()), CompileSpec( "operators_not_to_delegate", ",".join(self.operators_not_to_delegate).encode(), @@ -142,19 +136,17 @@ def build(self): def generate_neutron_compile_spec( config: str, # The target platform. For example "imxrt700". 
- system_config: str | None = None, - extra_flags: str | None = None, - intermediates_dir: str | None = None, - operators_not_to_delegate: list[str] | None = None, + system_config: Optional[str] = None, + extra_flags: Optional[str] = None, + operators_not_to_delegate: Optional[List[str]] = None, use_neutron_for_format_conversion: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, -) -> list[CompileSpec]: +) -> List[CompileSpec]: return ( NeutronCompileSpecBuilder() .neutron_compile_spec( config, - intermediates_dir=intermediates_dir, extra_flags=extra_flags, operators_not_to_delegate=operators_not_to_delegate, use_neutron_for_format_conversion=use_neutron_for_format_conversion, @@ -171,7 +163,7 @@ class NeutronBackend(BackendDetails): @staticmethod def preprocess( # noqa C901 edge_program: ExportedProgram, - compile_spec: list[CompileSpec], + compile_spec: List[CompileSpec], ) -> PreprocessResult: logging.info("NeutronBackend::preprocess") @@ -181,7 +173,6 @@ def preprocess( # noqa C901 compile_flags = [] binary = bytes() target = "" - intermediates_dir = "None" use_neutron_for_format_conversion = None fetch_constants_to_sram = False dump_kernel_selection_code = None @@ -190,8 +181,6 @@ def preprocess( # noqa C901 output_format = spec.value.decode() if spec.key == "target": target = spec.value.decode() - if spec.key == "intermediates_dir": - intermediates_dir = spec.value.decode() if spec.key == "compile_flags": compile_flags.append(spec.value.decode()) if spec.key == "use_neutron_for_format_conversion": @@ -205,10 +194,6 @@ def preprocess( # noqa C901 if not output_format: raise RuntimeError("output format is required") - # Check if provided intermediates_dir is a correct path (None is decoded to str) - if intermediates_dir != "None" and not os.path.isdir(intermediates_dir): - raise ValueError("intermediates_dir is not a directory path.") - for node in edge_program.graph.nodes: if node.op == "call_function": 
logging.debug(f"Operator to be processed: {node.target}") @@ -243,22 +228,16 @@ def preprocess( # noqa C901 fetch_constants_to_sram, ) - # Dump the tflite file if intermediates_dir is set - if intermediates_dir != "None": + # Dump the tflite file if logging level is enabled + if logging.root.isEnabledFor(logging.DEBUG): + import os + logging.debug( - f"Serializing converted graph with tag {delegation_tag} to {intermediates_dir}" + f"Serializing converted graph with tag {delegation_tag} to {os.getcwd()}" ) - with open( - os.path.join(intermediates_dir, f"{delegation_tag}_pure.et.tflite"), - "wb", - ) as f: + with open(f"{delegation_tag}_pure.et.tflite", "wb") as f: f.write(bytes(tflite_model)) - with open( - os.path.join( - intermediates_dir, f"{delegation_tag}_neutron.et.tflite" - ), - "wb", - ) as f: + with open(f"{delegation_tag}_neutron.et.tflite", "wb") as f: f.write(bytes(neutron_model)) binary = PayloadComposer().get_binary_payload(io_formats, neutron_model) diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py index 94ee8e8656a..048172ea212 100644 --- a/backends/nxp/quantizer/neutron_quantizer.py +++ b/backends/nxp/quantizer/neutron_quantizer.py @@ -25,7 +25,6 @@ Conv2dPattern, ConvTranspose2dPattern, DropoutPattern, - ExpPattern, FlattenPattern, HardTanhInPlacePattern, HardTanhPattern, @@ -271,7 +270,6 @@ def __init__(self, neutron_target_spec: NeutronTargetSpec, is_qat: bool = False) ConvTranspose2dPattern(self, is_qat=is_qat), static_qconfig ), OpQuantizer(DropoutPattern(is_qat=is_qat), static_qconfig), - OpQuantizer(ExpPattern(is_qat=is_qat), static_qconfig), OpQuantizer(FlattenPattern(is_qat=is_qat), static_qconfig), OpQuantizer(HardTanhPattern(is_qat=is_qat), static_qconfig), OpQuantizer(HardTanhInPlacePattern(is_qat=is_qat), static_qconfig), diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py index d6cf1d7e063..9e21e4f1660 100644 --- a/backends/nxp/quantizer/patterns.py 
+++ b/backends/nxp/quantizer/patterns.py @@ -11,10 +11,7 @@ import torch from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.clamp_converter import ( - ClampConverter, -) -from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.hardtanh_converter import ( - HardTanhConverter, + _is_convertible_to_relu, ) from executorch.backends.nxp.quantizer.utils import ( get_bias_qparams, @@ -441,7 +438,7 @@ def get_anchors( ) -> PartitionAnchors | None: node = fused_partition[0].nodes[-1] - if not ClampConverter._is_convertible_to_relu(node): + if not _is_convertible_to_relu(node): return SharedSpecPattern.get_shared_spec_anchors(gm, fused_partition) else: return SingleInputBasicPattern.get_single_input_anchors(gm, fused_partition) @@ -712,15 +709,6 @@ def partition_types(self): return [torch.ops.aten.dropout.default] -class ExpPattern(SharedSpecPattern): - """ - Quantizer for Exp operator. - """ - - def partition_types(self): - return [torch.ops.aten.exp.default] - - class FlattenPattern(SharedSpecPattern): """ Quantizer for Flatten operator. @@ -738,21 +726,11 @@ class HardTanhPattern(SingleInputBasicPattern): def partition_types(self): return [torch.ops.aten.hardtanh.default] - def get_anchors( - self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] - ) -> PartitionAnchors | None: - node = fused_partition[0].nodes[-1] - - if not HardTanhConverter._is_convertible_to_relu(node): - return SharedSpecPattern.get_shared_spec_anchors(gm, fused_partition) - else: - return SingleInputBasicPattern.get_single_input_anchors(gm, fused_partition) - def replacement_op(self): raise AssertionError() -class HardTanhInPlacePattern(HardTanhPattern): +class HardTanhInPlacePattern(SingleInputBasicPattern): """ Quantizer for HardTanh operator with param inplace=True. 
""" @@ -760,6 +738,21 @@ class HardTanhInPlacePattern(HardTanhPattern): def partition_types(self): return [torch.ops.aten.hardtanh_.default] + def get_anchors( + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] + ) -> PartitionAnchors | None: + node = fused_partition[0].nodes[-1] + + return PartitionAnchors( + inputs=[(node, NodeArgsIdx(0))], + weights=[], + biases=[], + output=[(node,)], + ) + + def replacement_op(self): + raise AssertionError() + class LeakyReluPattern(SingleInputBasicPattern): """Quantizer for the `aten.leaky_relu.default` operator.""" diff --git a/backends/nxp/run_unittests.sh b/backends/nxp/run_unittests.sh index 66e51c39a1d..78e35d2617a 100755 --- a/backends/nxp/run_unittests.sh +++ b/backends/nxp/run_unittests.sh @@ -11,6 +11,6 @@ EXECUTORCH_DIR=$(dirname $(dirname $SCRIPT_DIR)) cd $EXECUTORCH_DIR # '-c /dev/null' is used to ignore root level pytest.ini. -pytest -c /dev/null -n "logical" backends/nxp/tests/ +pytest -c /dev/null backends/nxp/tests/ python -m unittest discover -s backends/nxp/tests/ -v diff --git a/backends/nxp/tests/conftest.py b/backends/nxp/tests/conftest.py index af2011a8000..34fe343ca6a 100644 --- a/backends/nxp/tests/conftest.py +++ b/backends/nxp/tests/conftest.py @@ -35,4 +35,4 @@ def pytest_sessionstart(session): # Remove all cached test files shutil.rmtree(outputs_dir.OUTPUTS_DIR, ignore_errors=True) - os.makedirs(outputs_dir.OUTPUTS_DIR, exist_ok=True) + os.mkdir(outputs_dir.OUTPUTS_DIR) diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py index 44a96010593..5cfcb37c8a8 100644 --- a/backends/nxp/tests/executorch_pipeline.py +++ b/backends/nxp/tests/executorch_pipeline.py @@ -180,7 +180,6 @@ def to_quantized_edge_program( operators_not_to_delegate: list[str] = None, get_calibration_inputs_fn: GetCalibrationInputsFn = get_random_calibration_inputs, target: str = "imxrt700", - intermediates_dir: str | None = None, use_qat: bool = False, train_fn: 
Callable[[torch.fx.GraphModule], None] | None = None, remove_quant_io_ops: bool = False, @@ -218,7 +217,6 @@ def to_quantized_edge_program( preserve_ops = [torch.ops.aten.prelu.default] compile_spec = generate_neutron_compile_spec( target, - intermediates_dir=intermediates_dir, operators_not_to_delegate=operators_not_to_delegate, use_neutron_for_format_conversion=use_neutron_for_format_conversion, fetch_constants_to_sram=fetch_constants_to_sram, @@ -268,7 +266,6 @@ def to_quantized_edge_program( def to_quantized_executorch_program( model: torch.nn.Module, input_spec: Iterable[ModelInputSpec] | tuple[int, ...] | list[tuple[int, ...]], - intermediates_dir: str | None = None, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, use_neutron_for_format_conversion: bool = True, @@ -290,7 +287,6 @@ def to_quantized_executorch_program( edge_program_manager = to_quantized_edge_program( model, input_spec, - intermediates_dir=intermediates_dir, use_qat=use_qat, train_fn=train_fn, use_neutron_for_format_conversion=use_neutron_for_format_conversion, diff --git a/backends/nxp/tests/generic_tests/test_cifarnet.py b/backends/nxp/tests/generic_tests/test_cifarnet.py index c874ba24e47..1d795c938fe 100644 --- a/backends/nxp/tests/generic_tests/test_cifarnet.py +++ b/backends/nxp/tests/generic_tests/test_cifarnet.py @@ -34,7 +34,7 @@ def cifar_test_files(tmp_path_factory): @pytest.mark.parametrize("channels_last", [False, True]) -def test_cifarnet(mocker, request, cifar_test_files, channels_last): +def test_cifarnet(mocker, cifar_test_files, channels_last): model = ( CifarNet( pth_file=os.path.join( @@ -64,10 +64,9 @@ def test_cifarnet(mocker, request, cifar_test_files, channels_last): lower_run_compare( model, [input_spec], - BaseGraphVerifier(1, non_dlg_nodes), - request, dataset_creator=CopyDatasetCreator(cifar_test_files), output_comparator=comparator, + dlg_model_verifier=BaseGraphVerifier(1, non_dlg_nodes), mocker=mocker, # Run the channels last 
reference in PyTorch as the ExecuTorch CPU model contains incorrectly # lowered channels last convolution weights, which cause incorrect inference results. The issue @@ -80,7 +79,7 @@ def test_cifarnet(mocker, request, cifar_test_files, channels_last): ) -def test_cifarnet_qat(mocker, request, cifar_test_files): +def test_cifarnet_qat(mocker, cifar_test_files): model = CifarNet().get_eager_model().eval() input_shape = (1, 3, 32, 32) @@ -95,10 +94,9 @@ def test_cifarnet_qat(mocker, request, cifar_test_files): lower_run_compare( model, input_shape, - BaseGraphVerifier(1, non_dlg_nodes), - request, dataset_creator=CopyDatasetCreator(cifar_test_files), output_comparator=comparator, + dlg_model_verifier=BaseGraphVerifier(1, non_dlg_nodes), mocker=mocker, use_qat=True, ) diff --git a/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py b/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py index 3415b79a39d..fcd0aae2130 100644 --- a/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py +++ b/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py @@ -208,7 +208,7 @@ class TestConvertDivToMul: ids=lambda is_scalar: "scalar" if is_scalar else "tensor", ) def test__static__full_pipeline( - self, mocker, request, input_shape: tuple[int, ...], is_scalar: bool + self, mocker, input_shape: tuple[int, ...], is_scalar: bool ): if is_scalar: divisor = np.random.uniform(0.01, 15) @@ -231,6 +231,5 @@ def test__static__full_pipeline( model, input_shape, graph_verifier, - request, dataset_creator, ) diff --git a/backends/nxp/tests/generic_tests/test_integration.py b/backends/nxp/tests/generic_tests/test_integration.py index edefd905dbf..fe157b44c48 100644 --- a/backends/nxp/tests/generic_tests/test_integration.py +++ b/backends/nxp/tests/generic_tests/test_integration.py @@ -19,7 +19,7 @@ def test_conv_fc_softmax__to_executorch_program(use_qat): model = ConvFCSoftmaxModule() input_shape = (1, 4, 5, 5) - exec_prog = to_quantized_executorch_program(model, 
input_shape, use_qat=use_qat) + exec_prog = to_quantized_executorch_program(model, input_shape, use_qat) program = exec_prog.exported_program() assert ( diff --git a/backends/nxp/tests/generic_tests/test_quantized_input_data.py b/backends/nxp/tests/generic_tests/test_quantized_input_data.py index a9f9f3e47e6..8b2f6823e8d 100644 --- a/backends/nxp/tests/generic_tests/test_quantized_input_data.py +++ b/backends/nxp/tests/generic_tests/test_quantized_input_data.py @@ -17,7 +17,7 @@ from executorch.backends.nxp.tests.ops_aliases import AvgPool2D, MulTensor -def test__single_quantized_inputs(mocker, request): +def test__single_quantized_inputs(mocker): input_spec = ModelInputSpec((2, 4, 6, 7)) model = AvgPool2dModule(False, 0) graph_verifier = DetailedGraphVerifier( @@ -29,19 +29,19 @@ def test__single_quantized_inputs(mocker, request): model, [input_spec], graph_verifier, - request, remove_quant_io_ops=True, ) - test_name = nsys_testing.get_test_name(request) - assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000.bin").exists() + assert ( + OUTPUTS_DIR / "test__single_quantized_inputs" / "dataset_quant" / "0000.bin" + ).exists() # Check outputs are in quantized int8 format output_tensor_spec = output_tensor_spec_spy.spy_return assert output_tensor_spec[0].dtype == torch.int8 -def test__single_quantized_inputs_edge_python_reference(mocker, request): +def test__single_quantized_inputs_edge_python_reference(mocker): input_spec = ModelInputSpec((2, 4, 6, 7)) model = AvgPool2dModule(False, 0) graph_verifier = DetailedGraphVerifier( @@ -53,20 +53,23 @@ def test__single_quantized_inputs_edge_python_reference(mocker, request): model, [input_spec], graph_verifier, - request, reference_model=ReferenceModel.QUANTIZED_EDGE_PYTHON, remove_quant_io_ops=True, ) - test_name = nsys_testing.get_test_name(request) - assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000.bin").exists() + assert ( + OUTPUTS_DIR + / "test__single_quantized_inputs_edge_python_reference" + / 
"dataset_quant" + / "0000.bin" + ).exists() # Check outputs are in quantized int8 format output_tensor_spec = output_tensor_spec_spy.spy_return assert output_tensor_spec[0].dtype == torch.int8 -def test__multiple_quantized_inputs(mocker, request): +def test__multiple_quantized_inputs(mocker): x_input_spec = ModelInputSpec((1, 4, 8, 8)) model = MulTensorModule() graph_verifier = DetailedGraphVerifier( @@ -78,19 +81,23 @@ def test__multiple_quantized_inputs(mocker, request): model, [x_input_spec, x_input_spec], graph_verifier, - request, remove_quant_io_ops=True, ) - test_name = nsys_testing.get_test_name(request) - assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000" / "00.bin").exists() + assert ( + OUTPUTS_DIR + / "test__multiple_quantized_inputs" + / "dataset_quant" + / "0000" + / "00.bin" + ).exists() # Check outputs are in quantized int8 format output_tensor_spec = output_tensor_spec_spy.spy_return assert output_tensor_spec[0].dtype == torch.int8 -def test__multiple_quantized_inputs_edge_python_reference(mocker, request): +def test__multiple_quantized_inputs_edge_python_reference(mocker): x_input_spec = ModelInputSpec((1, 4, 8, 8)) model = MulTensorModule() graph_verifier = DetailedGraphVerifier( @@ -102,13 +109,17 @@ def test__multiple_quantized_inputs_edge_python_reference(mocker, request): model, [x_input_spec, x_input_spec], graph_verifier, - request, reference_model=ReferenceModel.QUANTIZED_EDGE_PYTHON, remove_quant_io_ops=True, ) - test_name = nsys_testing.get_test_name(request) - assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000" / "00.bin").exists() + assert ( + OUTPUTS_DIR + / "test__multiple_quantized_inputs_edge_python_reference" + / "dataset_quant" + / "0000" + / "00.bin" + ).exists() # Check outputs are in quantized int8 format output_tensor_spec = output_tensor_spec_spy.spy_return diff --git a/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py 
index d42ef4c6e7d..ebe782c5a98 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py @@ -68,7 +68,7 @@ def _get_dataset_creator(): dataset = RandomDatasetCreator(low=low, high=high) return dataset - def test__basic_nsys_inference(self, mocker, request): + def test__basic_nsys_inference(self, mocker): input_shape = (2, 3, 6, 7) model = AbsModule() graph_verifier = DetailedGraphVerifier( @@ -80,11 +80,10 @@ def test__basic_nsys_inference(self, mocker, request): model, input_shape, graph_verifier, - request, dataset_creator, ) - def test__basic_nsys_inference__big(self, mocker, request): + def test__basic_nsys_inference__big(self, mocker): # some operators have delegation requirement that size must be < 4096 input_shape = (4097, 1) model = AbsModule() @@ -97,6 +96,5 @@ def test__basic_nsys_inference__big(self, mocker, request): model, input_shape, graph_verifier, - request, dataset_creator, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py index 9646c04a3f2..8b8f2da8c4e 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py @@ -44,9 +44,7 @@ class TestAdaptiveAvgPool2D: ), ], ) - def test__basic_nsys_inference( - self, mocker, request, use_qat, input_shape, output_size - ): + def test__basic_nsys_inference(self, mocker, use_qat, input_shape, output_size): model = AdaptiveAvgPool2dModule(output_size) graph_verifier = DetailedGraphVerifier( mocker, @@ -62,7 +60,6 @@ def test__basic_nsys_inference( model, input_shape, graph_verifier, - request, RandomDatasetCreator(low=-1, high=1), output_comparator=output_comparator, use_qat=use_qat, @@ -72,7 +69,7 @@ def test__basic_nsys_inference( strict=True, 
reason="Known Neutron bad compute issue. Will be fixed in Neutron SW 3.1.2.", ) - def test__know_neutron_issue(self, mocker, request): + def test__know_neutron_issue(self, mocker): input_shape = (2, 3, 10, 15) output_size = (5, 5) model = AdaptiveAvgPool2dModule(output_size) @@ -89,12 +86,11 @@ def test__know_neutron_issue(self, mocker, request): model, input_shape, graph_verifier, - request, RandomDatasetCreator(low=-1, high=1), output_comparator=output_comparator, ) - def test__kernel_size_and_stride_limit(self, mocker, request): + def test__kernel_size_and_stride_limit(self, mocker): input_shape = (1, 3, 4, 4096) # input_size = (1, 4096) output_size = ( 2, @@ -118,7 +114,6 @@ def test__kernel_size_and_stride_limit(self, mocker, request): model, input_shape, graph_verifier, - request, RandomDatasetCreator(low=-1, high=1), output_comparator=output_comparator, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py index 6ac96e41cd1..3ede2cfaadd 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py @@ -16,9 +16,6 @@ ) from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier -from executorch.backends.nxp.tests.model_output_comparator import ( - AllCloseOutputComparator, -) from executorch.backends.nxp.tests.models import AddTensorConvModule, AddTensorModule from executorch.backends.nxp.tests.nsys_testing import lower_run_compare from executorch.backends.nxp.tests.ops_aliases import ( @@ -41,49 +38,67 @@ class TestAddTensor: [ pytest.param((1,), id="1D."), pytest.param((6, 5), id="2D."), - pytest.param((6, 82), id="2D alt."), pytest.param((1, 4, 7), id="3D."), - pytest.param((1, 68, 7), id="3D alt."), pytest.param((2, 4, 3, 15), id="4D."), - 
pytest.param((1, 4, 9, 11, 4), id="5D."), + pytest.param( + (6, 82), + id="2D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + (1, 68, 7), + id="3D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + (1, 4, 9, 11, 4), + id="5D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), ], ) - def test__basic_nsys_inference(self, mocker, request, x_input_shape): + def test__basic_nsys_inference(self, x_input_shape, mocker): x_input_spec = ModelInputSpec(x_input_shape) model = AddTensorModule() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={} ) dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) - comparator = AllCloseOutputComparator(atol=1) lower_run_compare( model, [x_input_spec, x_input_spec], graph_verifier, - request, dataset_creator, - comparator, - remove_quant_io_ops=True, ) - def test__basic_nsys_inference_qat(self, mocker, request): - x_input_spec = ModelInputSpec((1, 4, 7)) + @pytest.mark.parametrize( + "x_input_shape", + [ + pytest.param((1,), id="1D."), + pytest.param((6, 5), id="2D."), + pytest.param((1, 4, 7), id="3D."), + pytest.param((2, 4, 3, 15), id="4D."), + pytest.param( + (1, 4, 9, 11, 4), + id="5D.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + ], + ) + def test__basic_nsys_inference_qat(self, x_input_shape, mocker): + x_input_spec = ModelInputSpec(x_input_shape) model = AddTensorModule() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={} ) dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) - comparator = AllCloseOutputComparator(atol=1) lower_run_compare( model, [x_input_spec, x_input_spec], graph_verifier, - request, dataset_creator, - comparator, - remove_quant_io_ops=True, use_qat=True, ) @@ -93,10 +108,6 @@ def 
test__basic_nsys_inference_qat(self, mocker, request): pytest.param( [ModelInputSpec((4, 6)), ModelInputSpec((1, 6))], id="2 inputs 2D." ), - pytest.param( - [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))], - id="2 inputs 2D alt.", - ), pytest.param( [ModelInputSpec((5, 3, 4)), ModelInputSpec((1, 3, 1))], id="2 inputs 3D.", @@ -104,24 +115,25 @@ def test__basic_nsys_inference_qat(self, mocker, request): pytest.param( [ModelInputSpec((4,)), ModelInputSpec((4, 4))], id="2 inputs 1D + 2D." ), + pytest.param( + [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))], + id="2 inputs 2D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), ], ) - def test__broadcast(self, mocker, request, input_spec): + def test__broadcast(self, input_spec, mocker): model = AddTensorModule() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={} ) dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) - comparator = AllCloseOutputComparator(atol=1) lower_run_compare( model, input_spec, graph_verifier, - request, dataset_creator, - comparator, - remove_quant_io_ops=True, ) @pytest.mark.parametrize( @@ -160,7 +172,7 @@ def test__broadcast_unsupported(self, input_spec): ), ], ) - def test__w_conv(self, mocker, request, x_input_shape): + def test__w_conv(self, x_input_shape, mocker): model = AddTensorConvModule() n, c, h, w = x_input_shape @@ -175,11 +187,7 @@ def test__w_conv(self, mocker, request, x_input_shape): dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) lower_run_compare( - model, - [x_input_spec, y_input_spec], - graph_verifier, - request, - dataset_creator, + model, [x_input_spec, y_input_spec], graph_verifier, dataset_creator ) @pytest.mark.parametrize( @@ -190,12 +198,13 @@ def test__w_conv(self, mocker, request, x_input_shape): id="2 inputs 4D + 4D.", ), pytest.param( - [ModelInputSpec((1, 4, 1, 67)), ModelInputSpec((1, 8, 5, 67))], - id="2 inputs 4D + 4D same width.", + 
[ModelInputSpec((1, 4, 5, 67)), ModelInputSpec((1, 8, 5, 1))], + id="2 inputs 4D + 4D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), ), ], ) - def test__w_conv_broadcast(self, mocker, request, input_spec): + def test__w_conv_broadcast(self, input_spec, mocker): model = AddTensorConvModule() graph_verifier = DetailedGraphVerifier( @@ -204,16 +213,12 @@ def test__w_conv_broadcast(self, mocker, request, input_spec): expected_non_delegated_ops={}, ) dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) - comparator = AllCloseOutputComparator(atol=1) lower_run_compare( model, input_spec, graph_verifier, - request, dataset_creator, - comparator, - remove_quant_io_ops=True, ) @pytest.mark.parametrize( diff --git a/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py index 3db1158d637..120c3899ed4 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py @@ -41,16 +41,16 @@ def forward(self, x): class TestAvgPool2D: - def test__basic_nsys_inference(self, mocker, request): + def test__basic_nsys_inference(self, mocker): input_shape = (2, 4, 6, 7) model = AvgPool2dModule(False, 0) graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={} ) - lower_run_compare(model, input_shape, graph_verifier, request) + lower_run_compare(model, input_shape, graph_verifier) - def test__basic_nsys_inference_qat(self, mocker, request): + def test__basic_nsys_inference_qat(self, mocker): input_shape = (2, 9, 6, 15) model = AvgPool2dModule(False, 0) graph_verifier = DetailedGraphVerifier( @@ -61,11 +61,10 @@ def test__basic_nsys_inference_qat(self, mocker, request): model, input_shape, graph_verifier, - request, use_qat=True, ) - def test__kernel_size_limit(self, mocker, request): + def 
test__kernel_size_limit(self, mocker): kernel_size = (1, 4096) input_shape = (1, 4) + kernel_size model = AvgPool2dModule(False, 0, kernel_size) @@ -73,7 +72,7 @@ def test__kernel_size_limit(self, mocker, request): mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={} ) - lower_run_compare(model, input_shape, graph_verifier, request) + lower_run_compare(model, input_shape, graph_verifier) def test__kernel_size_limit_exceeded(self): kernel_size = (1, 4097) # Exceeds the kernel size limit. @@ -88,7 +87,7 @@ def test__kernel_size_limit_exceeded(self): ) assert graph_contains_any_of_ops(delegated_ep.graph, [AvgPool2D]) - def test__stride_limit(self, mocker, request): + def test__stride_limit(self, mocker): stride = 4096 input_shape = (1, 4, 1, 4096) model = AvgPool2dModule(False, 0, 1, stride) @@ -96,7 +95,7 @@ def test__stride_limit(self, mocker, request): mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={} ) - lower_run_compare(model, input_shape, graph_verifier, request) + lower_run_compare(model, input_shape, graph_verifier) def test__stride_limit_exceeded(self): stride = 4097 # Exceeds the stride limit. @@ -115,7 +114,7 @@ def test__stride_limit_exceeded(self): class TestAvgPool1D: # Just a basic test to verify that the operator gets extended to the 2D variant correctly. - def test__basic_nsys_inference(self, mocker, request): + def test__basic_nsys_inference(self, mocker): input_shape = (2, 4, 6) # The old flow limited the batch size to 1. 
model = AvgPool1DModule() graph_verifier = DetailedGraphVerifier( @@ -124,4 +123,4 @@ def test__basic_nsys_inference(self, mocker, request): expected_non_delegated_ops={}, ) - lower_run_compare(model, input_shape, graph_verifier, request) + lower_run_compare(model, input_shape, graph_verifier) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py index b28a431e3ca..9bb1f30ee60 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py @@ -56,7 +56,7 @@ def forward(self, *inputs: torch.Tensor): class TestCat: - def test__qat(self, mocker, request, use_qat): + def test__qat(self, mocker, use_qat): input_shape = (2, 3, 5) num_inputs = 2 @@ -66,11 +66,11 @@ def test__qat(self, mocker, request, use_qat): mocker, expected_delegated_ops={Cat: 1}, expected_non_delegated_ops={} ) - lower_run_compare(model, input_shapes, graph_verifier, request, use_qat=use_qat) + lower_run_compare(model, input_shapes, graph_verifier, use_qat=use_qat) @pytest.mark.parametrize("dim", list(range(-3, 3)), ids=lambda dim: f"dim={dim}") @pytest.mark.parametrize("num_inputs", [2, 5], ids=lambda n: f"n={n}") - def test__same_shapes(self, mocker, request, dim, num_inputs): + def test__same_shapes(self, mocker, dim, num_inputs): input_shape = (2, 3, 5) input_shapes = [ModelInputSpec(input_shape) for _ in range(num_inputs)] @@ -79,11 +79,11 @@ def test__same_shapes(self, mocker, request, dim, num_inputs): mocker, expected_delegated_ops={Cat: 1}, expected_non_delegated_ops={} ) - lower_run_compare(model, input_shapes, graph_verifier, request) + lower_run_compare(model, input_shapes, graph_verifier) @pytest.mark.parametrize("dim", [0, -3, 2, -1], ids=lambda dim: f"dim={dim}") @pytest.mark.parametrize("num_inputs", [2, 5], ids=lambda n: f"n={n}") - def test__same_shapes__channels_first(self, mocker, request, 
dim, num_inputs): + def test__same_shapes__channels_first(self, mocker, dim, num_inputs): input_shape = (2, 3, 4, 5) input_shapes = [ModelInputSpec(input_shape) for _ in range(num_inputs)] @@ -94,12 +94,12 @@ def test__same_shapes__channels_first(self, mocker, request, dim, num_inputs): expected_non_delegated_ops={}, ) - lower_run_compare(model, input_shapes, graph_verifier, request) + lower_run_compare(model, input_shapes, graph_verifier) @pytest.mark.parametrize("dim", [0, -1], ids=lambda dim: f"dim={dim}") @pytest.mark.parametrize("rank", [2, 3, 4], ids=lambda rank: f"rank={rank}") @pytest.mark.parametrize("num_inputs", [2, 3], ids=lambda n: f"n={n}") - def test__different_shapes(self, mocker, request, dim, rank, num_inputs): + def test__different_shapes(self, mocker, dim, rank, num_inputs): # The input shapes can only differ in the `dim` dimension. So we can just assign a different one for each input. # e.g. [(2, 3, 4), (3, 3, 4), (4, 3, 4), (5, 3, 4), (6, 3, 4)] base_shape = [i + 2 for i in range(rank)] @@ -113,11 +113,11 @@ def test__different_shapes(self, mocker, request, dim, rank, num_inputs): mocker, expected_delegated_ops={Cat: 1}, expected_non_delegated_ops={} ) - lower_run_compare(model, input_shapes, graph_verifier, request) + lower_run_compare(model, input_shapes, graph_verifier) @pytest.mark.parametrize("dim", [1, -1], ids=lambda dim: f"dim={dim}") @pytest.mark.parametrize("num_inputs", [2, 5], ids=lambda n: f"n={n}") - def test__different_shapes__channels_first(self, mocker, request, dim, num_inputs): + def test__different_shapes__channels_first(self, mocker, dim, num_inputs): # The input shapes can only differ in the `dim` dimension. So we can just assign a different one for each input. # e.g. 
[(1, 3, 4, 5), (2, 3, 4, 5)] base_shape = (2, 3, 4, 5) @@ -133,7 +133,7 @@ def test__different_shapes__channels_first(self, mocker, request, dim, num_input expected_non_delegated_ops={}, ) - lower_run_compare(model, input_shapes, graph_verifier, request) + lower_run_compare(model, input_shapes, graph_verifier) def test__single_input__alone_in_partition__not_delegated(self): # The operator is a noop, and there is no other op in the model. The Neutron Converter would produce an empty @@ -149,7 +149,7 @@ def test__single_input__alone_in_partition__not_delegated(self): ) assert graph_contains_any_of_ops(delegated_ep.graph, [Cat]) - def test__single_input__not_alone_in_partition__delegated(self, mocker, request): + def test__single_input__not_alone_in_partition__delegated(self, mocker): # The operator is a noop, but there is another op in the model, so they are both delegated. input_shape = [ModelInputSpec((2, 3, 4, 5))] @@ -160,4 +160,4 @@ def test__single_input__not_alone_in_partition__delegated(self, mocker, request) expected_non_delegated_ops={}, ) - lower_run_compare(model, input_shape, graph_verifier, request) + lower_run_compare(model, input_shape, graph_verifier) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py index bd296bb856f..e0ae44b61f8 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py @@ -24,6 +24,9 @@ ) from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier +from executorch.backends.nxp.tests.model_output_comparator import ( + NumericalStatsOutputComparator, +) from executorch.backends.nxp.tests.nsys_testing import lower_run_compare from executorch.backends.nxp.tests.ops_aliases import ( AddTensor, @@ -65,35 +68,6 @@ def forward(self, x): 
class TestClamp: - - @pytest.mark.parametrize( - "min, max", - [ - pytest.param(-1, 2, id="min = -1, max = 2 (Max/Min)"), - pytest.param(0.0, None, id="min = 0, max = None (Relu)"), - ], - ) - def test__qat(self, mocker, request, min, max, use_qat): - input_shape = (2, 7, 2) # Indivisible by num_macs - model = AddClampModule(min, max) - - x_input_spec = ModelInputSpec(input_shape) - graph_verifier = DetailedGraphVerifier( - mocker, - expected_delegated_ops={ - AddTensor: 1, - Clamp: 1, - }, - expected_non_delegated_ops={}, - ) - - lower_run_compare( - model=model, - input_spec=[x_input_spec], - request=request, - dlg_model_verifier=graph_verifier, - ) - @pytest.mark.parametrize( "min, max", [ @@ -116,11 +90,12 @@ def test__qat(self, mocker, request, min, max, use_qat): pytest.param(0.0, None, id="min = 0, max = None (Relu)"), ], ) - def test_convert_clamp__full_pipeline(self, mocker, request, min, max): + def test_convert_clamp__full_pipeline(self, mocker, min, max, use_qat): input_shape = (2, 7, 2) # Indivisible by num_macs model = AddClampModule(min, max) x_input_spec = ModelInputSpec(input_shape) + comparator = NumericalStatsOutputComparator() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={ @@ -134,7 +109,8 @@ def test_convert_clamp__full_pipeline(self, mocker, request, min, max): model=model, input_spec=[x_input_spec], dlg_model_verifier=graph_verifier, - request=request, + output_comparator=comparator, + use_qat=use_qat, ) @pytest.mark.parametrize( diff --git a/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py index 32bbf93fae4..9ffa69139f6 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py @@ -34,7 +34,7 @@ class TestConstantPadND: """ # noinspection PyMethodMayBeStatic - def 
assert_delegated(self, model, input_shape, mocker, request, use_qat=False): + def assert_delegated(self, model, input_shape, mocker, use_qat=False): graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={ConstantPadND: 1}, @@ -45,16 +45,15 @@ def assert_delegated(self, model, input_shape, mocker, request, use_qat=False): model, input_shape, graph_verifier, - request, use_qat=use_qat, ) def assert_delegated_and_output_shape_equals( - self, model, input_shape, expected_output_shape, mocker, request + self, model, input_shape, expected_output_shape, mocker ): model_builder_spy = mocker.spy(ModelBuilder, "finish") - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) neutron_ir_subgraph = model_builder_spy.call_args[0][0].get_sub_graph() assert neutron_ir_subgraph.outputs.tmp_outputs[0].shape.vector == list( @@ -75,14 +74,12 @@ def assert_delegated_and_output_shape_equals( pytest.param((1, 2, 3, 4, 5), tuple(range(4)), id="5D, padding W, D"), ], ) - def test__basic_nsys_inference( - self, mocker, request, input_shape, paddings, use_qat - ): + def test__basic_nsys_inference(self, mocker, input_shape, paddings, use_qat): # These test cases are also supported by the old flow. model = ConstantPadNDModule(paddings) - self.assert_delegated(model, input_shape, mocker, request, use_qat) + self.assert_delegated(model, input_shape, mocker, use_qat) - def test__channels_padding(self, mocker, request): + def test__channels_padding(self, mocker): input_shape = (2, 4, 6) # These paddings will be applied to the last dimension, which is the channels as the input is formatless. 
paddings = (1, 1) @@ -90,25 +87,25 @@ def test__channels_padding(self, mocker, request): model = ConstantPadNDModule(paddings) self.assert_delegated_and_output_shape_equals( - model, input_shape, expected_output_shape, mocker, request + model, input_shape, expected_output_shape, mocker ) - def test__batch_padding(self, mocker, request): + def test__batch_padding(self, mocker): input_shape = (2, 4, 6) paddings = (0, 0, 0, 0, 1, 1) # Padding applied to the batch dimension. expected_output_shape = (4, 4, 6) # Padded batch. model = ConstantPadNDModule(paddings) self.assert_delegated_and_output_shape_equals( - model, input_shape, expected_output_shape, mocker, request + model, input_shape, expected_output_shape, mocker ) @pytest.mark.parametrize("constant", [0.0, -13.37]) - def test__specific_constant(self, mocker, request, constant): + def test__specific_constant(self, mocker, constant): input_shape = (2, 4, 6) paddings = (1, 1) model = ConstantPadNDModule(paddings, constant) - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) @pytest.mark.parametrize( "input_shape, paddings", @@ -118,7 +115,7 @@ def test__specific_constant(self, mocker, request, constant): pytest.param((1, 2, 6, 8), (0, 1, 2, 3, 1, 1), id="4D, padding H, W"), ], ) - def test__channels_first(self, mocker, request, input_shape, paddings): + def test__channels_first(self, mocker, input_shape, paddings): model = ConstantPadNDConvModule(paddings) graph_verifier = DetailedGraphVerifier( mocker, @@ -126,4 +123,4 @@ def test__channels_first(self, mocker, request, input_shape, paddings): expected_non_delegated_ops={}, ) - lower_run_compare(model, input_shape, graph_verifier, request) + lower_run_compare(model, input_shape, graph_verifier) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py index 3799aa91623..67d3add978c 100644 --- 
a/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py @@ -4,31 +4,22 @@ # LICENSE file in the root directory of this source tree. import numpy as np - -# noinspection PyUnusedImports import pytest import torch from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) -from executorch.backends.nxp.backend.ir.converter.builder.aten_model_builder_director import ( - AtenModelBuilderDirector, -) -from executorch.backends.nxp.backend.ir.lib.tflite.BuiltinOperator import ( - BuiltinOperator as Ops, -) -from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program -from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops -from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier -from executorch.backends.nxp.tests.models import Conv2dWithActivation, HardTanhModule -from executorch.backends.nxp.tests.nsys_testing import lower_run_compare -from executorch.backends.nxp.tests.ops_aliases import ( - Convolution, - ExecutorchDelegateCall, - HardTanh, +from executorch.backends.nxp.tests.executors import ( + convert_run_compare, + graph_contains_any_of_ops, + ToChannelFirstPreprocess, + ToChannelLastPreprocess, ) +from executorch.backends.nxp.tests.models import Conv2dWithActivation +from executorch.exir.dialects._ops import ops as exir_ops +from torch.export import ExportedProgram from executorch.backends.nxp.tests.use_qat import * # noqa F403 @@ -38,237 +29,91 @@ def reseed_model_per_test_run(): np.random.seed(23) -class AddHardTanhModule(HardTanhModule): - def forward(self, x): - x = x + x - x = super().forward(x) - return x - - -class TestHardTanh: - # noinspection PyMethodMayBeStatic - def assert_delegated( - self, - model, - input_shape, - mocker, - request, - use_qat=False, - 
expected_delegated_ops=None, - ): - graph_verifier = DetailedGraphVerifier( - mocker, - expected_delegated_ops=( - expected_delegated_ops - if expected_delegated_ops is not None - else {HardTanh: 1} - ), - expected_non_delegated_ops={}, - ) +ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate +HardTanh = exir_ops.edge.aten.hardtanh.default +HardTanh_ = exir_ops.edge.aten.hardtanh_.default - # Create a RandomDatasetCreator that covers also negative numbers to properly test the operator. - dataset_creator = RandomDatasetCreator(low=-2, high=2) - lower_run_compare( - model, - input_shape, - graph_verifier, - request, - dataset_creator, - use_qat=use_qat, - ) - - @pytest.mark.parametrize( - "activation_range", - [ - (-1, 3), - (0, float("inf")), - ], - ) - @pytest.mark.parametrize( - "inplace", [True, False], ids=lambda ip: "Inplace" if ip else "Not inplace" +@pytest.mark.parametrize("input_shape", [(1, 3, 128, 128)]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_relu6_quant(mocker, input_shape: tuple[int], inplace: bool, use_qat: bool): + # The torch.nn.Relu6 inherits from torch.nn.Hardtanh, and hence represented as HardTanh in ATen. + # Testing the hardtanh originated from torch.nn.Relu6 op. + model = Conv2dWithActivation( + activation=torch.nn.ReLU6(inplace=inplace), in_channels=input_shape[1] ) - def test__qat( - self, mocker, request, activation_range: tuple[float, float], use_qat, inplace - ): - input_shape = (23,) - model = HardTanhModule(*activation_range, inplace) - self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat) + converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") - @pytest.mark.parametrize( - "inplace", [True, False], ids=lambda ip: "Inplace" if ip else "Not inplace" - ) - def test__from_relu6__after_conv(self, mocker, request, inplace: bool): - # The torch.nn.Relu6 inherits from torch.nn.Hardtanh, and hence represented as HardTanh in ATen. 
- # Testing the hardtanh originated from torch.nn.Relu6 op. - input_shape = (1, 3, 4, 5) - model = Conv2dWithActivation( - activation=torch.nn.ReLU6(inplace=inplace), - in_channels=input_shape[1], - out_channels=2, - ) + quantized_program = to_quantized_edge_program( + model, input_shape, use_qat=use_qat, use_neutron_for_format_conversion=False + ).exported_program() - self.assert_delegated( - model, - input_shape, - mocker, - request, - expected_delegated_ops={HardTanh: 1, Convolution: 1}, - ) + tflite_flatbuffers_model, io_formats = converter_spy.spy_return + exported_program: ExportedProgram = converter_spy.call_args.args[1] - @pytest.mark.parametrize( - "activation_range", - [ - (0.0, 6.0), - (-1.0, 1), - (0, 1), - (0.0, float("inf")), - ], - ) - @pytest.mark.parametrize( - "inplace", [True, False], ids=lambda ip: "Inplace" if ip else "Not inplace" - ) - def test__hardtanh__mappable_to_relu__after_conv( - self, - mocker, - request, - activation_range: tuple[float, float], - inplace: bool, - ): - input_shape = (1, 3, 4, 5) - model = Conv2dWithActivation( - activation=torch.nn.Hardtanh(*activation_range, inplace), - in_channels=input_shape[1], - out_channels=2, - ) + assert not graph_contains_any_of_ops(quantized_program.graph, [HardTanh, HardTanh_]) + assert graph_contains_any_of_ops(quantized_program.graph, [ExecutorchDelegateCall]) - self.assert_delegated( - model, - input_shape, - mocker, - request, - expected_delegated_ops={HardTanh: 1, Convolution: 1}, - ) - - @pytest.mark.parametrize( - "activation_range", - [ - (-1, 3), - (2.27, 3.14), - (-0.1, 0), - (float("-inf"), 1.23), - ], + input_data = (np.random.random(input_shape) * 50).astype(np.int8) + convert_run_compare( + exported_program, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + tflite_output_preprocess=ToChannelFirstPreprocess(), + input_data=input_data, + atol=2.0, ) - def test__hardtanh__not_mappable_to_relu( - self, - mocker, - request, - 
activation_range: tuple[float, float], - ): - input_shape = (23,) - model = HardTanhModule(*activation_range) - - self.assert_delegated(model, input_shape, mocker, request) - - def test__unsupported_bounds(self): - # TODO ONLY WHEN ALONE IN PARTITION - input_shape = (2, 7, 2) - min_value, max_value = float("-inf"), float("inf") - model = HardTanhModule(min_value, max_value) - delegated_ep = to_quantized_edge_program(model, input_shape).exported_program() - # Make sure the `hardtanh` was NOT delegated. - assert graph_contains_any_of_ops(delegated_ep.graph, [HardTanh]) - - @pytest.mark.parametrize( - "activation_range", - [ - pytest.param((None, float("inf")), id="min = None, max = inf"), - pytest.param((float("inf"), None), id="min = inf, max = None"), - ], - ) - def test__invalid_bounds(self, activation_range): - # PyTorch doesn't allow these cases, so we cannot test our handling of this edge case. - with pytest.raises(TypeError, match="'<=' not supported between instances of"): - _ = HardTanhModule(*activation_range) - - @pytest.mark.parametrize( - "min, max, expected_neutron_ir_ops", - [ - pytest.param( - 0.1, - 0.5, - [Ops.ADD, Ops.MAXIMUM, Ops.MINIMUM], - id="min = 0.1, max = 0.5 (Max/Min)", - ), - pytest.param( - 0.0, 1.0, [Ops.ADD, Ops.RELU_0_TO_1], id="min = 0, max = 1 (Relu0To1)" - ), - pytest.param( - -1.0, - 1.0, - [Ops.ADD, Ops.RELU_N1_TO_1], - id="min = -1, max = 1 (ReluN1To1)", - ), - pytest.param( - 0.0, - float("inf"), - [Ops.ADD, Ops.RELU], - id="min = 0, max = infinity (Relu)", - ), - pytest.param( - 0, - 1.0, - [Ops.ADD, Ops.RELU_0_TO_1], - id="min = 0, max = 1 (Relu0To1)", - ), - pytest.param( - 0, - 6.0, - [Ops.ADD, Ops.RELU6], - id="min = 0, max = 6 (Relu6)", - ), - ], +@pytest.mark.parametrize("input_shape", [(1, 3, 16, 16), (1, 3, 32, 32)]) +@pytest.mark.parametrize( + "activation_range", + [ + (0.0, 6.0), + (-1.0, 1.0), + (0.0, 1.0), + (0.0, float("inf")), + (0, 6), + (-1, 1), + (0, 1), + (0, float("inf")), + ], +) 
+@pytest.mark.parametrize("inplace", [True, False]) +def test_custom_hardtanh_quant( + mocker, + input_shape: tuple[int], + activation_range: tuple[float, float], + inplace: bool, + use_qat: bool, +): + # TODO(13063): This test suffers from non-ideal testing random quantization, because we always use range <0,1>. + # We should update (decrease atol) when the Conv/Linear + Activation fuse at quantization is in place. + min_val, max_val = activation_range + model = Conv2dWithActivation( + activation=torch.nn.Hardtanh(min_val=min_val, max_val=max_val, inplace=inplace), + in_channels=input_shape[1], ) - def test_convert_clamp__relu_vs_maxmin( - self, mocker, min, max, expected_neutron_ir_ops - ): - input_shape = (23,) - model = AddHardTanhModule(min, max) - converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") - neutron_ir_spy = mocker.spy(AtenModelBuilderDirector, "finish") + converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") - delegated_ep = to_quantized_edge_program( - model, - input_shape, - ).exported_program() + quantized_program = to_quantized_edge_program( + model, input_shape, use_qat=use_qat, use_neutron_for_format_conversion=False + ).exported_program() - # Make sure the `clamp` was delegated. 
- assert graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall]) - assert not graph_contains_any_of_ops(delegated_ep.graph, [HardTanh]) + tflite_flatbuffers_model, io_formats = converter_spy.spy_return + exported_program: ExportedProgram = converter_spy.call_args.args[1] - intermediate_ep = converter_spy.call_args.args[1] - quant_node = list(intermediate_ep.graph.nodes)[-2] - dequant_node = list(intermediate_ep.graph.nodes)[-4] - neutron_ir_internal_ops = [ - op.builtin_code for op in neutron_ir_spy.spy_return.operator_codes.vector - ] + assert not graph_contains_any_of_ops(quantized_program.graph, [HardTanh, HardTanh_]) + assert graph_contains_any_of_ops(quantized_program.graph, [ExecutorchDelegateCall]) - assert graph_contains_any_of_ops(intermediate_ep.graph, [HardTanh]) - assert ( - len(neutron_ir_internal_ops) == len(expected_neutron_ir_ops) + 1 - ) # Transpose - assert all(op in neutron_ir_internal_ops for op in expected_neutron_ir_ops) - - if len(expected_neutron_ir_ops) == 3: - # Min/Max variant should have same input and output quantization - assert all( - q == dq for q, dq in zip(quant_node.args[1:], dequant_node.args[1:]) - ) - else: - assert not all( - q == dq for q, dq in zip(quant_node.args[1:], dequant_node.args[1:]) - ) + input_data = (np.random.random(input_shape) * 50).astype(np.int8) + convert_run_compare( + exported_program, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + tflite_output_preprocess=ToChannelFirstPreprocess(), + input_data=input_data, + atol=2.0, + ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py index 567cf85ebe5..81dbe9aa0fb 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py @@ -34,7 +34,7 @@ def forward(self, x): class TestLeakyRelu: 
# noinspection PyMethodMayBeStatic - def assert_delegated(self, model, input_shape, mocker, request, use_qat=False): + def assert_delegated(self, model, input_shape, mocker, use_qat=False): graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={LeakyRelu: 1}, @@ -48,7 +48,6 @@ def assert_delegated(self, model, input_shape, mocker, request, use_qat=False): model, input_shape, graph_verifier, - request, dataset_creator, use_qat=use_qat, ) @@ -64,29 +63,28 @@ def assert_delegated(self, model, input_shape, mocker, request, use_qat=False): ], ids=lambda shape: f"{len(shape)}D", ) - def test__default_alpha__input_shapes(self, mocker, request, input_shape): + def test__default_alpha__input_shapes(self, mocker, input_shape): model = LeakyReluModule() - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) - def test__default_alpha__qat(self, mocker, request, use_qat): + def test__default_alpha__qat(self, mocker, use_qat): model = LeakyReluModule() input_shape = (23,) - self.assert_delegated(model, input_shape, mocker, request, use_qat) + self.assert_delegated(model, input_shape, mocker, use_qat) @pytest.mark.parametrize( "alpha", [0.01, 3.14159, 0, 1, float("inf")], ids=lambda alpha: f"alpha = {alpha}", ) - def test__specific_alpha(self, mocker, request, alpha): + def test__specific_alpha(self, mocker, alpha): model = LeakyReluModule(negative_slope=alpha) - self.assert_delegated(model, (23,), mocker, request) + self.assert_delegated(model, (23,), mocker) - def test__inplace(self, mocker, request): + def test__inplace(self, mocker): model = LeakyReluModule(inplace=True) self.assert_delegated( model, (23,), mocker, - request, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py index 0b7fe88cffc..3e1d066103a 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py +++ 
b/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py @@ -35,7 +35,7 @@ def forward(self, x): class TestLog: - def test__basic_nsys_inference(self, mocker, request): + def test__basic_nsys_inference(self, mocker): # Use 256 elements so that, after quantization to int8, the input can # cover the full discrete range [-128, 127]. # The dataset is generated as a linear float ramp and later quantized, @@ -49,7 +49,6 @@ def test__basic_nsys_inference(self, mocker, request): model, input_shape, graph_verifier, - request, dataset_creator=LinearRampDatasetCreator(low=0.0, high=1.0), ) @@ -61,7 +60,7 @@ def test__basic_nsys_inference(self, mocker, request): pytest.param((1, 3, 16, 16), id="4D"), ], ) - def test__basic_nsys_inference__qat(self, mocker, request, input_shape, use_qat): + def test__basic_nsys_inference__qat(self, mocker, input_shape, use_qat): model = LogModule() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={Log: 1}, expected_non_delegated_ops={} @@ -70,7 +69,6 @@ def test__basic_nsys_inference__qat(self, mocker, request, input_shape, use_qat) model, input_shape, graph_verifier, - request, dataset_creator=RandomDatasetCreator(low=1.0, high=10.0), use_qat=use_qat, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py index 55a47146bfc..c95b3cd3b8d 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py @@ -51,14 +51,14 @@ def reseed_model_per_test_run(): class TestMaxPool2D: # noinspection PyMethodMayBeStatic - def assert_delegated(self, model, input_shape, mocker, request): + def assert_delegated(self, model, input_shape, mocker): graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={MaxPool2DWithIndices: 1, GetItem: 1}, expected_non_delegated_ops={}, ) - 
lower_run_compare(model, input_shape, graph_verifier, request) + lower_run_compare(model, input_shape, graph_verifier) # noinspection PyMethodMayBeStatic def assert_not_delegated(self, model, input_shape): @@ -70,12 +70,12 @@ def assert_not_delegated(self, model, input_shape): ) assert graph_contains_any_of_ops(delegated_ep.graph, [MaxPool2DWithIndices]) - def test__basic_nsys_inference(self, mocker, request): + def test__basic_nsys_inference(self, mocker): input_shape = (2, 4, 6, 7) # The old flow limited the batch size to 1. model = MaxPool2dModule() - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) - def test__basic_nsys_inference_qat(self, mocker, request): + def test__basic_nsys_inference_qat(self, mocker): input_shape = (2, 11, 7, 16) # The old flow limited the batch size to 1. model = MaxPool2dModule() graph_verifier = DetailedGraphVerifier( @@ -88,21 +88,20 @@ def test__basic_nsys_inference_qat(self, mocker, request): model, input_shape, graph_verifier, - request, use_qat=True, ) - def test__large_kernel_size(self, mocker, request): + def test__large_kernel_size(self, mocker): kernel_size = (1, 5000) input_shape = (1, 4) + kernel_size model = MaxPool2dModule(kernel_size, stride=1) - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) - def test__stride_limit__no_padding(self, mocker, request): + def test__stride_limit__no_padding(self, mocker): stride = 4096 input_shape = (1, 4, 1, 4096) model = MaxPool2dModule(1, stride=stride) - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) def test__stride_limit_exceeded__no_padding(self): stride = 4097 # Exceeds the stride limit. 
@@ -110,12 +109,12 @@ def test__stride_limit_exceeded__no_padding(self): model = MaxPool2dModule(1, stride=stride) self.assert_not_delegated(model, input_shape) - def test__stride_limit__padding(self, mocker, request): + def test__stride_limit__padding(self, mocker): padding = 1 stride = 4096 input_shape = (1, 2, 3, stride) model = MaxPool2dModule(3, stride=stride, padding=padding) - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) def test__stride_limit_exceeded__padding(self): padding = 1 @@ -127,7 +126,7 @@ def test__stride_limit_exceeded__padding(self): @pytest.mark.skip( reason="Large padding requires large kernel size which results in an extremely slow test." ) - def test__padding_limit(self, mocker, request): + def test__padding_limit(self, mocker): # As the padding is added wia a `Pad` operator (not the `MaxPool` arguments), there is no limit to the padded # value. But as padding can be at most half of the kernel size (PyTorch requirement) and kernel size is limited # to 4096, padding of 2048 is the limit. @@ -135,16 +134,16 @@ def test__padding_limit(self, mocker, request): kernel_size = padding * 2 input_shape = (1, 1, 2, 3) model = MaxPool2dModule(kernel_size, padding=padding) - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) - def test__padding__max_pool_limit_exceeded(self, mocker, request): + def test__padding__max_pool_limit_exceeded(self, mocker): # NeutronIR `MaxPool` padding is limited to 32. But as it is added by the `Pad` operator instead, there is no # limit. This tests ensures the `MaxPool` padding limit is not a problem. 
padding = 33 kernel_size = padding * 2 input_shape = (1, 2, 3, 4) model = MaxPool2dModule(kernel_size, padding=padding) - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) def test__padding_to_kernel_ratio_exceeded(self): # Both PyTorch and Neutron require the padding to be at most half of the kernel size. @@ -161,7 +160,7 @@ def test__padding_to_kernel_ratio_exceeded(self): class TestMaxPool1D: # Just a basic test to verify that the operator gets extended to the 2D variant correctly. - def test__basic_nsys_inference__view_not_delegated(self, mocker, request): + def test__basic_nsys_inference__view_not_delegated(self, mocker): input_shape = (2, 4, 6) # The old flow limited the batch size to 1. model = MaxPool1DModule() @@ -171,4 +170,4 @@ def test__basic_nsys_inference__view_not_delegated(self, mocker, request): expected_non_delegated_ops={}, ) - lower_run_compare(model, input_shape, graph_verifier, request) + lower_run_compare(model, input_shape, graph_verifier) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py index f84471169ea..8195581c0f6 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py @@ -9,18 +9,6 @@ import pytest import torch -from executorch.backends.nxp.backend.ir.converter.builder.model_builder import ( - ModelBuilder, -) -from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.max_pool_2d_options import ( - MaxPool2D, -) -from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.mean_options import ( - Mean, -) -from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.transpose_options import ( - Transpose, -) from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator from 
executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops @@ -62,81 +50,71 @@ def forward(self, x): class MaxPoolMeanDimModule(torch.nn.Module): - @staticmethod - def noop_max_pool_2d(x): - """Call `torch.max_pool2d` that is a NoOp, but it enforces the ChannelsFirst format in the `NodeFormatInference`.""" - return torch.max_pool2d(x, kernel_size=1) - def __init__(self, dim, keepdim): super().__init__() self.dim, self.keepdim = dim, keepdim def forward(self, x): - x = self.noop_max_pool_2d(x) - x = torch.mean(x, dim=self.dim, keepdim=self.keepdim) - return x - - -class MeanDimMaxPoolModule(MaxPoolMeanDimModule): - def forward(self, x): - x = torch.mean(x, dim=self.dim, keepdim=self.keepdim) - x = self.noop_max_pool_2d(x) - return x - - -def assert_delegated( - model, - input_shape, - mocker, - request, - use_qat=False, - expected_delegated_ops=None, -): - if expected_delegated_ops is None: - expected_delegated_ops = {MeanDim: 1} - - graph_verifier = DetailedGraphVerifier( - mocker, - expected_delegated_ops=expected_delegated_ops, - expected_non_delegated_ops={}, - ) + x = torch.max_pool2d( + x, kernel_size=1 + ) # NoOp, but it enforces the channels first format. + return torch.mean(x, dim=self.dim, keepdim=self.keepdim) - # Cover also negative values to thoroughly test the operator. - dataset_creator = RandomDatasetCreator(low=-2, high=2) - remove_quant_io_ops = True # Use quantized dataset. - output_comparator = AllCloseOutputComparator(atol=1) # Allow single bit error. 
+class TestMeanDim: - lower_run_compare( + # noinspection PyMethodMayBeStatic + def assert_delegated( + self, model, input_shape, - graph_verifier, - request, - dataset_creator, - output_comparator, - use_qat=use_qat, - remove_quant_io_ops=remove_quant_io_ops, - ) + mocker, + use_qat=False, + atol=None, + expected_delegated_ops=None, + ): + if expected_delegated_ops is None: + expected_delegated_ops = {MeanDim: 1} + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops=expected_delegated_ops, + expected_non_delegated_ops={}, + ) -def assert_not_delegated(model, input_shape): - delegated_ep = to_quantized_edge_program(model, input_shape).exported_program() + # Cover also negative values to thoroughly test the operator. + dataset_creator = RandomDatasetCreator(low=-2, high=2) - # Make sure the `mean` was NOT delegated. - assert not graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall]) - assert graph_contains_any_of_ops(delegated_ep.graph, [MeanDim]) + kwargs = {"atol": atol} if atol is not None else {} + output_comparator = AllCloseOutputComparator(**kwargs) + lower_run_compare( + model, + input_shape, + graph_verifier, + dataset_creator, + output_comparator, + use_qat=use_qat, + ) -class TestMeanDim: + # noinspection PyMethodMayBeStatic + def assert_not_delegated(self, model, input_shape): + delegated_ep = to_quantized_edge_program(model, input_shape).exported_program() + + # Make sure the `mean` was NOT delegated. 
+ assert not graph_contains_any_of_ops( + delegated_ep.graph, [ExecutorchDelegateCall] + ) + assert graph_contains_any_of_ops(delegated_ep.graph, [MeanDim]) @pytest.fixture(params=[True, False], ids=lambda keep_dim: f"keep_dim = {keep_dim}") def keep_dim(self, request): return request.param - def test__basic_nsys_inference__qat(self, mocker, request, use_qat, keep_dim): + def test__basic_nsys_inference__qat(self, mocker, use_qat, keep_dim): input_shape = (23,) model = MeanDimModule(0, keep_dim) - assert_delegated(model, input_shape, mocker, request, use_qat=use_qat) + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat) @pytest.mark.parametrize( "input_shape, dim", @@ -150,9 +128,12 @@ def test__basic_nsys_inference__qat(self, mocker, request, use_qat, keep_dim): pytest.param((3, 1, 4, 1, 5), 0, id="5D, dim = 0."), ], ) - def test__single_dims(self, mocker, request, input_shape, dim, keep_dim): + def test__single_dims(self, mocker, input_shape, dim, keep_dim): model = MeanDimModule(dim, keep_dim) - assert_delegated(model, input_shape, mocker, request) + # Relatively large error, but it is actually equal to the output scale, so it is a single bit error. + # TODO Replace with quantized dataset testing and `atol = 1`. + atol = 0.014 + self.assert_delegated(model, input_shape, mocker, atol=atol) @pytest.mark.parametrize( "input_shape, dim", @@ -164,9 +145,12 @@ def test__single_dims(self, mocker, request, input_shape, dim, keep_dim): pytest.param((3, 1, 4, 1, 5), (3, -5, -4), id="5D, dim = (3, -5 ,-4)."), ], ) - def test__tuple_dims(self, mocker, request, input_shape, dim, keep_dim): + def test__tuple_dims(self, mocker, input_shape, dim, keep_dim): model = MeanDimModule(dim, keep_dim) - assert_delegated(model, input_shape, mocker, request) + # Relatively large error, but it is actually equal to the output scale, so it is a single bit error. + # TODO Replace with quantized dataset testing and `atol = 1`. 
+ atol = 0.015 + self.assert_delegated(model, input_shape, mocker, atol=atol) @pytest.mark.parametrize( "input_shape, dim", @@ -178,7 +162,7 @@ def test__tuple_dims(self, mocker, request, input_shape, dim, keep_dim): def test__noop__only_node__not_delegated(self, input_shape, dim): keep_dim = True # Reduction over a dimension of size `1` with `keep_dim=True` is a no-op. model = MeanDimModule(dim, keep_dim) - assert_not_delegated(model, input_shape) + self.assert_not_delegated(model, input_shape) @pytest.mark.parametrize( "input_shape, dim", @@ -187,14 +171,13 @@ def test__noop__only_node__not_delegated(self, input_shape, dim): pytest.param((3, 1, 4, 1, 5), -2, id="5D, dim = -2."), ], ) - def test__noop__not_only_node__delegated(self, mocker, request, input_shape, dim): + def test__noop__not_only_node__delegated(self, mocker, input_shape, dim): keep_dim = True # Reduction over a dimension of size `1` with `keep_dim=True` is a no-op. model = MeanDimAddModule(dim, keep_dim) - assert_delegated( + self.assert_delegated( model, input_shape, mocker, - request, expected_delegated_ops={MeanDim: 1, AddTensor: 1}, ) @@ -203,207 +186,44 @@ def test__noop__not_only_node__delegated(self, mocker, request, input_shape, dim [ pytest.param((3, 1, 4), 1, id="3D, dim = 1."), pytest.param((3, 1, 4, 1, 5), -2, id="5D, dim = -2."), - pytest.param((1, 7, 3, 3), [0], id="4D, dim = [0]."), ], ) - def test__no_reduction__keepdim_false__delegated( - self, mocker, request, input_shape, dim - ): + def test__no_reduction__keepdim_false__delegated(self, mocker, input_shape, dim): # These cases reduce over a dimension of size 1. # When `keep_dim=True` the node is a noop, and it's not delegated (see `test__noop__only_node__not_delegated`), # but with `keep_dim=False` it changes the shape so it's not a noop and is therefore delegated successfully. 
keep_dim = False model = MeanDimModule(dim, keep_dim) - assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) - def test__channels_first__keep_dim__true(self, mocker, request): + @pytest.mark.parametrize( + "input_shape, dim", + [((1, 7, 3, 3), 1)], + ids=lambda val: f"shape={val}" if isinstance(val, tuple) else f"dim={val}", + ) + @pytest.mark.parametrize( + "keep_dim", + [ + pytest.param(True), + pytest.param( + False, + marks=pytest.mark.xfail( + strict=True, reason="Known format inference bug (EIEX-937)." + ), + ), + ], + ids=lambda kd: f"keep_dim={kd}", + ) + def test__channels_first__keep_dim__true(self, mocker, input_shape, dim, keep_dim): # Just 1 test case to verify correct handling of the `dim`. # Most cases fall into the single bit error case, and since this test uses 2 operators, the error accumulates # and the final error is larger. We cannot with 100% certainty say that the error is only caused by the single # bit errors and not related to the format. That's why only this 1 case with no errors is used. - input_shape, dim = (1, 7, 3, 3), 1 - model = MaxPoolMeanDimModule(dim, True) - assert_delegated( + + model = MaxPoolMeanDimModule(dim, keep_dim) + self.assert_delegated( model, input_shape, mocker, - request, expected_delegated_ops={MaxPool2DWithIndices: 1, GetItem: 1, MeanDim: 1}, ) - - class TestKeepDimFalseFormatHandling: - """When `keep_dim = False`, the `mean.dim` operator changes the rank, so the format have to be explicitly - handled. The tests in this class focus on the related edge cases. - """ - - def _assert_neutron_ir_model_has_ops( - self, model_builder_finish_spy, expected_ops - ): - assert ( - model_builder_finish_spy.call_count == 1 - ), "Conversion to Neutron IR happened multiple times." 
- - neutron_ir_ops = model_builder_finish_spy.spy_return.sub_graphs[ - 0 - ].operators.vector - assert len(neutron_ir_ops) == len( - expected_ops - ), "Neutron IR model doesn't have the expected number of ops." - - for op, expected_op in zip(neutron_ir_ops, expected_ops, strict=True): - assert isinstance( - op.builtin_options, expected_op - ), f"Expected {expected_op}, got {op}." - - @pytest.mark.parametrize( - "dim", - [ - 1, - [0, -3], - (-4, 1, 2), - [-3, 3], - [1, 2, 3], - ], - ids=lambda dim: f"dim={dim}", - ) - def test__channels_first_input__reducing_channels(self, mocker, request, dim): - # If the channels dimension is reduced (removed), the `mean` output will always be equal in channels first - # and channels last, so no `Transpose` ops are added. - input_shape = (1, 7, 3, 3) - model = MaxPoolMeanDimModule(dim, False) - - model_builder_finish_spy = mocker.spy(ModelBuilder, "finish") - assert_delegated( - model, - input_shape, - mocker, - request, - expected_delegated_ops={ - MaxPool2DWithIndices: 1, - GetItem: 1, - MeanDim: 1, - }, - ) - self._assert_neutron_ir_model_has_ops( - model_builder_finish_spy, - expected_ops=[ - Transpose, - MaxPool2D, - Mean, - ], - ) - - @pytest.mark.parametrize( - "dim", - [ - (2, 3), - [1, -2, 3], - [-1, -2, 0], - ], - ids=lambda dim: f"dim={dim}", - ) - def test__channels_first_input__reducing_all_spatial_dims( - self, mocker, request, dim - ): - # If tall he spatial dimensions are reduced (removed), the `mean` output will always be equal in channels - # first and channels last, so no `Transpose` ops are added. 
- input_shape = (1, 7, 3, 3) - model = MaxPoolMeanDimModule(dim, False) - - model_builder_finish_spy = mocker.spy(ModelBuilder, "finish") - assert_delegated( - model, - input_shape, - mocker, - request, - expected_delegated_ops={ - MaxPool2DWithIndices: 1, - GetItem: 1, - MeanDim: 1, - }, - ) - self._assert_neutron_ir_model_has_ops( - model_builder_finish_spy, - expected_ops=[ - Transpose, - MaxPool2D, - Mean, - ], - ) - - @pytest.mark.xfail(strict=True, reason="Known Neutron bug (AIR-14726).") - @pytest.mark.parametrize( - "dim", - [ - 0, - (2,), - [-1, 0], - ], - ids=lambda dim: f"dim={dim}", - ) - def test__channels_first_input__not_reducing_channels_or_all_spatial_dims( - self, mocker, request, dim - ): - # If the channels dimension is not reduced, a `Transpose` operator must be added to make the input channels - # first in Neutron IR. - - input_shape = (1, 7, 3, 3) - model = MaxPoolMeanDimModule(dim, False) - - model_builder_finish_spy = mocker.spy(ModelBuilder, "finish") - assert_delegated( - model, - input_shape, - mocker, - request, - expected_delegated_ops={ - MaxPool2DWithIndices: 1, - GetItem: 1, - MeanDim: 1, - }, - ) - - self._assert_neutron_ir_model_has_ops( - model_builder_finish_spy, - expected_ops=[ - Transpose, - MaxPool2D, - Transpose, # The necessary `Transpose` operator. 
- Mean, - ], - ) - - @pytest.mark.parametrize( - "input_shape, dim", - [ - pytest.param((2, 3, 4, 5, 6), 0, id="dim=0, 5D->4D"), - pytest.param((2, 3, 4, 5, 6), [-3], id="dim=[-3], 5D->4D"), - pytest.param((1, 2, 3, 4, 5, 6), (1, -1), id="dim=(1, -1), 6D->4D"), - ], - ids=lambda dim: f"dim={dim}", - ) - def test__channels_first_output(self, mocker, request, input_shape, dim): - model = MeanDimMaxPoolModule(dim, False) - - model_builder_finish_spy = mocker.spy(ModelBuilder, "finish") - assert_delegated( - model, - input_shape, - mocker, - request, - expected_delegated_ops={ - MaxPool2DWithIndices: 1, - GetItem: 1, - MeanDim: 1, - }, - ) - - self._assert_neutron_ir_model_has_ops( - model_builder_finish_spy, - expected_ops=[ - Mean, - Transpose, # The necessary `Transpose` operator. - MaxPool2D, - Transpose, - ], - ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py index d112ff1e1ac..897c3efd850 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py @@ -41,7 +41,7 @@ class TestMulTensor: pytest.param((1, 4, 8, 8), id="4D."), ], ) - def test__basic_nsys_inference(self, mocker, request, x_input_shape): + def test__basic_nsys_inference(self, x_input_shape, mocker): x_input_spec = ModelInputSpec(x_input_shape) model = MulTensorModule() graph_verifier = DetailedGraphVerifier( @@ -52,7 +52,6 @@ def test__basic_nsys_inference(self, mocker, request, x_input_shape): model, [x_input_spec, x_input_spec], graph_verifier, - request, ) @pytest.mark.parametrize( @@ -62,7 +61,7 @@ def test__basic_nsys_inference(self, mocker, request, x_input_shape): pytest.param((1, 4, 8, 8), id="4D."), ], ) - def test__basic_nsys_inference_qat(self, mocker, request, x_input_shape): + def test__basic_nsys_inference_qat(self, x_input_shape, mocker): x_input_spec = 
ModelInputSpec(x_input_shape) model = MulTensorModule() graph_verifier = DetailedGraphVerifier( @@ -73,7 +72,6 @@ def test__basic_nsys_inference_qat(self, mocker, request, x_input_shape): model, [x_input_spec, x_input_spec], graph_verifier, - request, use_qat=True, ) @@ -92,13 +90,13 @@ def test__basic_nsys_inference_qat(self, mocker, request, x_input_shape): ), ], ) - def test__correct_broadcast(self, input_spec, mocker, request): + def test__correct_broadcast(self, input_spec, mocker): model = MulTensorModule() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={MulTensor: 1}, expected_non_delegated_ops={} ) - lower_run_compare(model, input_spec, graph_verifier, request) + lower_run_compare(model, input_spec, graph_verifier) @pytest.mark.parametrize( "input_spec", @@ -136,7 +134,7 @@ def test__incorrect_broadcast(self, input_spec): ), ], ) - def test__w_conv(self, mocker, request, x_input_shape): + def test__w_conv(self, x_input_shape, mocker): model = MulTensorConvModule() n, c, h, w = x_input_shape @@ -153,7 +151,6 @@ def test__w_conv(self, mocker, request, x_input_shape): model, [x_input_spec, y_input_spec], graph_verifier, - request, ) @pytest.mark.parametrize( diff --git a/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py index bdfd1e9da25..31436a3f200 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py @@ -77,13 +77,7 @@ def forward(self, x): class TestPermuteCopy: # noinspection PyMethodMayBeStatic def assert_delegated( - self, - model, - input_shape, - mocker, - request, - expected_delegated_ops=None, - use_qat=False, + self, model, input_shape, mocker, expected_delegated_ops=None, use_qat=False ): graph_verifier = DetailedGraphVerifier( mocker, @@ -95,7 +89,6 @@ def assert_delegated( model, input_shape, 
graph_verifier, - request, use_qat=use_qat, ) @@ -122,18 +115,18 @@ def _special_4d_permutations() -> list[ParameterSet]: pytest.param((3, 2, 1, 0), id="reverse"), ] - def test__qat(self, mocker, request, use_qat): + def test__qat(self, mocker, use_qat): input_shape = (2, 3, 5, 7) permutation = (0, 2, 3, 1) # NCHW -> NHWC model = PermuteModule(permutation) - self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat) + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat) @pytest.mark.parametrize( "permutation", _all_permutations_for_rank(3), ids=lambda perm: f"permutation = {perm}", ) - def test__all_permutations__3d(self, mocker, request, permutation: tuple[int]): + def test__all_permutations__3d(self, mocker, permutation: tuple[int]): # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test. input_shape = (2, 3, 5) model = PermuteModule(permutation) @@ -142,14 +135,14 @@ def test__all_permutations__3d(self, mocker, request, permutation: tuple[int]): # would result in an empty graph, which is not allowed. Therefore, it's not delegated. self.assert_not_delegated(model, input_shape) else: - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) @pytest.mark.parametrize( "permutation", _all_permutations_for_rank(4), ids=lambda perm: f"permutation = {perm}", ) - def test__all_permutations__4d(self, mocker, request, permutation: tuple[int]): + def test__all_permutations__4d(self, mocker, permutation: tuple[int]): # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test. input_shape = (2, 3, 5, 7) model = PermuteModule(permutation) @@ -158,55 +151,43 @@ def test__all_permutations__4d(self, mocker, request, permutation: tuple[int]): # would result in an empty graph, which is not allowed. Therefore, it's not delegated. 
self.assert_not_delegated(model, input_shape) else: - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) @pytest.mark.parametrize("permutation", _special_4d_permutations()) def test__all_permutations__4d__channels_first_input( - self, mocker, request, permutation: tuple[int] + self, mocker, permutation: tuple[int] ): # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test. input_shape = (2, 3, 5, 7) model = MaxPoolPermuteModule(permutation) expected_delegated_ops = {MaxPool2DWithIndices: 1, GetItem: 1, PermuteCopy: 1} self.assert_delegated( - model, - input_shape, - mocker, - request, - expected_delegated_ops=expected_delegated_ops, + model, input_shape, mocker, expected_delegated_ops=expected_delegated_ops ) @pytest.mark.parametrize("permutation", _special_4d_permutations()) def test__all_permutations__4d__channels_first_output( - self, mocker, request, permutation: tuple[int] + self, mocker, permutation: tuple[int] ): # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test. input_shape = (2, 3, 5, 7) model = PermuteMaxPoolModule(permutation) expected_delegated_ops = {MaxPool2DWithIndices: 1, GetItem: 1, PermuteCopy: 1} self.assert_delegated( - model, - input_shape, - mocker, - request, - expected_delegated_ops=expected_delegated_ops, + model, input_shape, mocker, expected_delegated_ops=expected_delegated_ops ) @pytest.mark.parametrize("perm1", _special_4d_permutations()) @pytest.mark.parametrize("perm2", _special_4d_permutations()) def test__all_permutations__4d__channels_first_io( - self, mocker, request, perm1: tuple[int], perm2: tuple[int] + self, mocker, perm1: tuple[int], perm2: tuple[int] ): # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test. 
input_shape = (2, 3, 5, 7) model = PermuteMaxPoolPermuteModule(perm1, perm2) expected_delegated_ops = {MaxPool2DWithIndices: 1, GetItem: 1, PermuteCopy: 2} self.assert_delegated( - model, - input_shape, - mocker, - request, - expected_delegated_ops=expected_delegated_ops, + model, input_shape, mocker, expected_delegated_ops=expected_delegated_ops ) @pytest.mark.parametrize( @@ -219,7 +200,7 @@ def test__all_permutations__4d__channels_first_io( pytest.param((4, 2, 3, 0, 1), id="perm = (4, 2, 3, 0, 1)"), ], ) - def test__5d(self, mocker, request, permutation): + def test__5d(self, mocker, permutation): # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test. input_shape = (2, 3, 5, 3, 5) model = PermuteModule(permutation) @@ -228,4 +209,4 @@ def test__5d(self, mocker, request, permutation): # would result in an empty graph, which is not allowed. Therefore, it's not delegated. self.assert_not_delegated(model, input_shape) else: - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py index ca2abd18f32..ab42560f075 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py @@ -62,7 +62,7 @@ def forward(self, x): return self.relu(x) -class TestReLU: +class TestReLUNewNeutronFlow: @pytest.mark.parametrize( ["model", "input_shape"], [ @@ -98,7 +98,7 @@ class TestReLU: ), ], ) - def test_relu_conversion__full_pipeline(self, mocker, request, model, input_shape): + def test_relu_conversion__full_pipeline(self, mocker, model, input_shape): model = model() # Avoid model creation at import time is_conv_module = not hasattr(model, "linear") @@ -108,20 +108,19 @@ def test_relu_conversion__full_pipeline(self, mocker, request, model, input_shap 
{Convolution: 1, Relu: 1} if is_conv_module else {AddMm: 1, Relu: 1} ), expected_non_delegated_ops={}, - ops_to_ignore={ + ops_to_ignore=[ PermuteCopy, ViewCopy, QuantizePerTensor, DequantizePerTensor, DequantizePerChannel, - }, + ], ) lower_run_compare( model, input_shape, graph_verifier, - request, ) @pytest.mark.parametrize( @@ -137,9 +136,7 @@ def test_relu_conversion__full_pipeline(self, mocker, request, model, input_shap ), ], ) - def test_relu_conversion__non_delegated_with_old_flow( - self, mocker, request, input_shape - ): + def test_relu_conversion__non_delegated_with_old_flow(self, mocker, input_shape): verifier = DetailedGraphVerifier( mocker=mocker, expected_delegated_ops={Relu: 1}, @@ -149,9 +146,8 @@ def test_relu_conversion__non_delegated_with_old_flow( lower_run_compare( ReLUModule(), input_shape, - verifier, - request, - RandomDatasetCreator(low=-1, high=1), + dlg_model_verifier=verifier, + dataset_creator=RandomDatasetCreator(low=-1, high=1), ) @pytest.mark.parametrize( diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py index bdd41d1eab0..75a32254a1d 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py @@ -30,9 +30,7 @@ def reseed_model_per_test_run(): class TestSigmoid: # noinspection PyMethodMayBeStatic - def assert_delegated( - self, model, input_shape, mocker, request, use_qat=False, atol=None - ): + def assert_delegated(self, model, input_shape, mocker, use_qat=False, atol=None): graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={Sigmoid: 1}, @@ -49,16 +47,15 @@ def assert_delegated( model, input_shape, graph_verifier, - request, dataset_creator, output_comparator, use_qat=use_qat, ) - def test__basic_nsys_inference__qat(self, mocker, request, use_qat): + def test__basic_nsys_inference__qat(self, 
mocker, use_qat): input_shape = (23,) model = nn.Sigmoid() - self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat) + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat) @pytest.mark.parametrize( "input_shape", @@ -71,13 +68,13 @@ def test__basic_nsys_inference__qat(self, mocker, request, use_qat): ], ids=lambda shape: f"{len(shape)}D", ) - def test__input_shapes(self, mocker, request, input_shape): + def test__input_shapes(self, mocker, input_shape): model = nn.Sigmoid() output_scale = 1.0 / 256.0 lowering_spy = mocker.spy(NeutronPartitioner, "partition") self.assert_delegated( - model, input_shape, mocker, request, atol=output_scale + model, input_shape, mocker, atol=output_scale ) # Allow single bit error. # Verify that the `atol` is indeed equal to the output scale. diff --git a/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py index 98cc924ee85..cb0ec09bcce 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py @@ -41,9 +41,7 @@ def _slice_id(prefix, input_shape, dims, starts, ends): return f"{prefix}rank={len(input_shape)}_dims={str(dims)}_starts={str(starts)}_ends={str(ends)}" @staticmethod - def assert_delegated_and_correct( - model, input_shape, num_slices, mocker, request, use_qat - ): + def assert_delegated_and_correct(model, input_shape, num_slices, mocker, use_qat): graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={SliceCopy: num_slices}, @@ -56,7 +54,6 @@ def assert_delegated_and_correct( model, input_shape, graph_verifier, - request, dataset, comparator, use_qat=use_qat, @@ -185,14 +182,12 @@ def assert_not_delegated(model, input_shape): ), ], ) - def test_nsys_inference__basic( - self, input_shape, dims, starts, ends, mocker, request - ): + def 
test_nsys_inference__basic(self, input_shape, dims, starts, ends, mocker): model = SliceTensorModule(dims, starts, ends) num_slices = len(dims) self.assert_delegated_and_correct( - model, input_shape, num_slices, mocker, request, use_qat=False + model, input_shape, num_slices, mocker, use_qat=False ) @pytest.mark.parametrize( @@ -214,9 +209,7 @@ def test_nsys_inference__basic( ), ], ) - def test_nsys_inference__reduction( - self, input_shape, dims, starts, ends, mocker, request - ): + def test_nsys_inference__reduction(self, input_shape, dims, starts, ends, mocker): model = SliceTensorModule(dims, starts, ends) slice_lengths = [e - s for s, e in zip(starts, ends)] @@ -226,7 +219,7 @@ def test_nsys_inference__reduction( else: num_slices = len(dims) self.assert_delegated_and_correct( - model, input_shape, num_slices, mocker, request, use_qat=False + model, input_shape, num_slices, mocker, use_qat=False ) @pytest.mark.parametrize( @@ -248,14 +241,12 @@ def test_nsys_inference__reduction( ), ], ) - def test_nsys_inference__clipped( - self, input_shape, dims, starts, ends, mocker, request - ): + def test_nsys_inference__clipped(self, input_shape, dims, starts, ends, mocker): model = SliceTensorModule(dims, starts, ends) num_slices = len(dims) self.assert_delegated_and_correct( - model, input_shape, num_slices, mocker, request, use_qat=False + model, input_shape, num_slices, mocker, use_qat=False ) @pytest.mark.parametrize( @@ -278,13 +269,13 @@ def test_nsys_inference__clipped( ], ) def test_nsys_inference__normalization( - self, input_shape, dims, starts, ends, mocker, request + self, input_shape, dims, starts, ends, mocker ): model = SliceTensorModule(dims, starts, ends) num_slices = len(dims) self.assert_delegated_and_correct( - model, input_shape, num_slices, mocker, request, use_qat=False + model, input_shape, num_slices, mocker, use_qat=False ) @pytest.mark.parametrize( @@ -313,14 +304,12 @@ def test_nsys_inference__normalization( ), ], ) - def 
test_nsys_inference__big( - self, input_shape, dims, starts, ends, mocker, request - ): + def test_nsys_inference__big(self, input_shape, dims, starts, ends, mocker): model = SliceTensorModule(dims, starts, ends) num_slices = len(dims) self.assert_delegated_and_correct( - model, input_shape, num_slices, mocker, request, use_qat=False + model, input_shape, num_slices, mocker, use_qat=False ) @pytest.mark.parametrize( @@ -347,7 +336,7 @@ def test_nsys_inference__identity(self, input_shape, dims, starts, ends): self.assert_model_without_slices(model, input_shape) - def test_nsys_inference__with_conv(self, mocker, request): + def test_nsys_inference__with_conv(self, mocker): input_shape = (11, 13, 5, 7) in_channels = input_shape[1] out_channels = 19 @@ -371,13 +360,12 @@ def test_nsys_inference__with_conv(self, mocker, request): model, input_shape, graph_verifier, - request, dataset, comparator, use_qat=False, ) - def test_nsys_inference__qat(self, mocker, request): + def test_nsys_inference__qat(self, mocker): input_shape = (7, 13, 7, 9) dims = (0, 1, 2, 3) starts = (1, 2, 3, 2) @@ -387,5 +375,5 @@ def test_nsys_inference__qat(self, mocker, request): num_slices = len(dims) self.assert_delegated_and_correct( - model, input_shape, num_slices, mocker, request, use_qat=True + model, input_shape, num_slices, mocker, use_qat=True ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py index e71ff7e8af5..9638f8fe0ec 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py @@ -16,9 +16,6 @@ ) from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier -from executorch.backends.nxp.tests.model_output_comparator import ( - AllCloseOutputComparator, -) from 
executorch.backends.nxp.tests.models import SubTensorConvModule, SubTensorModule from executorch.backends.nxp.tests.nsys_testing import lower_run_compare from executorch.backends.nxp.tests.ops_aliases import ( @@ -41,50 +38,76 @@ class TestSubTensor: [ pytest.param((1,), id="1D."), pytest.param((6, 5), id="2D."), - pytest.param((6, 82), id="2D alt."), pytest.param((1, 4, 7), id="3D."), - pytest.param((1, 68, 7), id="3D alt."), - pytest.param((2, 4, 3, 15), id="4D."), - pytest.param((1, 4, 9, 11, 4), id="5D."), + pytest.param( + (6, 82), + id="2D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + (1, 68, 7), + id="3D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + (2, 4, 3, 15), + id="4D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + (1, 4, 9, 11, 4), + id="5D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), ], ) - def test__basic_nsys_inference(self, mocker, request, x_input_shape): + def test__basic_nsys_inference(self, x_input_shape, mocker): x_input_spec = ModelInputSpec(x_input_shape) model = SubTensorModule() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={} ) dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) - comparator = AllCloseOutputComparator(atol=1) lower_run_compare( model, [x_input_spec, x_input_spec], graph_verifier, - request, dataset_creator, - comparator, - remove_quant_io_ops=True, ) - def test__basic_nsys_inference_qat(self, mocker, request): - x_input_spec = ModelInputSpec((2, 4, 3, 15)) + @pytest.mark.parametrize( + "x_input_shape", + [ + pytest.param((1,), id="1D."), + pytest.param((6, 5), id="2D."), + pytest.param((2, 4, 3, 15), id="4D."), + pytest.param( + (1, 4, 7), + id="3D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( 
+ (1, 4, 9, 11, 4), + id="5D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + ], + ) + def test__basic_nsys_inference_qat(self, x_input_shape, mocker): + x_input_spec = ModelInputSpec(x_input_shape) model = SubTensorModule() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={} ) dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) - comparator = AllCloseOutputComparator(atol=1) lower_run_compare( model, [x_input_spec, x_input_spec], graph_verifier, - request, dataset_creator, - comparator, use_qat=True, - remove_quant_io_ops=True, ) @pytest.mark.parametrize( @@ -93,35 +116,33 @@ def test__basic_nsys_inference_qat(self, mocker, request): pytest.param( [ModelInputSpec((4, 6)), ModelInputSpec((1, 6))], id="2 inputs 2D." ), - pytest.param( - [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))], - id="2 inputs 2D alt.", - ), pytest.param( [ModelInputSpec((4,)), ModelInputSpec((4, 4))], id="2 inputs 1D + 2D." 
), pytest.param( [ModelInputSpec((5, 3, 4)), ModelInputSpec((1, 3, 1))], - id="2 inputs 3D.", + id="2 inputs 3D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))], + id="2 inputs 2D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), ), ], ) - def test__broadcast(self, mocker, request, input_spec): + def test__broadcast(self, input_spec, mocker): model = SubTensorModule() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={} ) dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) - comparator = AllCloseOutputComparator(atol=1) lower_run_compare( model, input_spec, graph_verifier, - request, dataset_creator, - comparator, - remove_quant_io_ops=True, ) @pytest.mark.parametrize( @@ -160,7 +181,7 @@ def test__broadcast_unsupported(self, input_spec): ), ], ) - def test__w_conv(self, mocker, request, x_input_shape): + def test__w_conv(self, x_input_shape, mocker): model = SubTensorConvModule() n, c, h, w = x_input_shape @@ -178,7 +199,6 @@ def test__w_conv(self, mocker, request, x_input_shape): model, [x_input_spec, y_input_spec], graph_verifier, - request, dataset_creator, ) @@ -191,11 +211,12 @@ def test__w_conv(self, mocker, request, x_input_shape): ), pytest.param( [ModelInputSpec((1, 4, 5, 5)), ModelInputSpec((1, 8, 5, 1))], - id="2 inputs 4D + 4D same height.", + id="2 inputs 4D + 4D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), ), ], ) - def test__w_conv_broadcast(self, mocker, request, input_spec): + def test__w_conv_broadcast(self, input_spec, mocker): model = SubTensorConvModule() graph_verifier = DetailedGraphVerifier( mocker, @@ -203,16 +224,12 @@ def test__w_conv_broadcast(self, mocker, request, input_spec): expected_non_delegated_ops={}, ) dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) - comparator = 
AllCloseOutputComparator(atol=1) lower_run_compare( model, input_spec, graph_verifier, - request, dataset_creator, - comparator, - remove_quant_io_ops=True, ) @pytest.mark.parametrize( diff --git a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py index 51b7ee484a7..6336308e40b 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py @@ -36,7 +36,6 @@ def assert_delegated( model, input_shape, mocker, - request, use_qat=False, expected_delegated_ops=None, ): @@ -56,7 +55,6 @@ def assert_delegated( model, input_shape, graph_verifier, - request, dataset_creator, use_qat=use_qat, ) @@ -65,10 +63,10 @@ def assert_delegated( def inplace(self, request): return request.param - def test__qat__inplace(self, mocker, request, use_qat, inplace): + def test__qat__inplace(self, mocker, use_qat, inplace): shape = (23,) model = TanhModule(inplace) - self.assert_delegated(model, shape, mocker, request, use_qat=use_qat) + self.assert_delegated(model, shape, mocker, use_qat=use_qat) @pytest.mark.parametrize( "shape", @@ -81,20 +79,16 @@ def test__qat__inplace(self, mocker, request, use_qat, inplace): ], ids=lambda shape: f"{len(shape)}D", ) - def test__shapes(self, mocker, request, shape): + def test__shapes(self, mocker, shape): model = TanhModule() - self.assert_delegated(model, shape, mocker, request) + self.assert_delegated(model, shape, mocker) - def test__with_convolution(self, mocker, request): + def test__with_convolution(self, mocker): input_shape = (1, 3, 12, 16) channels = input_shape[1] model = Conv2dWithActivation( activation=torch.tanh, in_channels=channels, out_channels=channels ) self.assert_delegated( - model, - input_shape, - mocker, - request, - expected_delegated_ops={Tanh: 1, Convolution: 1}, + model, input_shape, mocker, expected_delegated_ops={Tanh: 1, Convolution: 1} ) 
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py b/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py index f9b2269751f..c4a698f4bfb 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py @@ -59,7 +59,6 @@ def assert_delegated( model, input_shape, mocker, - request, use_qat=False, atol=None, expected_delegated_ops=None, @@ -83,7 +82,6 @@ def assert_delegated( model, input_shape, graph_verifier, - request, dataset_creator, output_comparator, use_qat=use_qat, @@ -98,25 +96,21 @@ def assert_not_delegated(self, model, input_shape): ) assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleBilinear2D]) - def test__qat__align_corners(self, mocker, request, use_qat): + def test__qat__align_corners(self, mocker, use_qat): align_corners = True input_shape = (1, 2, 3, 4) output_size = (5, 7) model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) atol = 0.015 # ~= output scale -> single bit error. - self.assert_delegated( - model, input_shape, mocker, request, use_qat=use_qat, atol=atol - ) + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol) - def test__qat__not_align_corners(self, mocker, request, use_qat): + def test__qat__not_align_corners(self, mocker, use_qat): align_corners = False input_shape = (1, 2, 3, 4) output_size = (6, 8) model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) atol = 0.015 # ~= output scale -> single bit error. 
- self.assert_delegated( - model, input_shape, mocker, request, use_qat=use_qat, atol=atol - ) + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol) @pytest.mark.parametrize( "input_shape, output_size", @@ -131,13 +125,11 @@ def test__qat__not_align_corners(self, mocker, request, use_qat): pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"), ], ) - def test__not_align_corners__output_size( - self, mocker, request, input_shape, output_size - ): + def test__not_align_corners__output_size(self, mocker, input_shape, output_size): align_corners = False model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) atol = 0.016 # ~= output scale -> single bit error. - self.assert_delegated(model, input_shape, mocker, request, atol=atol) + self.assert_delegated(model, input_shape, mocker, atol=atol) def test__not_align_corners__output_size__unsupported(self): align_corners = False @@ -159,11 +151,11 @@ def test__not_align_corners__output_size__unsupported(self): pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"), ], ) - def test__not_align_corners__scales(self, mocker, request, input_shape, scale): + def test__not_align_corners__scales(self, mocker, input_shape, scale): align_corners = False model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) atol = 0.016 # ~= output scale -> single bit error. 
- self.assert_delegated(model, input_shape, mocker, request, atol=atol) + self.assert_delegated(model, input_shape, mocker, atol=atol) def test__not_align_corners__scales__unsupported(self): align_corners = False @@ -191,13 +183,11 @@ def test__not_align_corners__scales__unsupported(self): ), ], ) - def test__align_corners__output_size( - self, mocker, request, input_shape, output_size - ): + def test__align_corners__output_size(self, mocker, input_shape, output_size): align_corners = True model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) atol = 0.016 # ~= output scale -> single bit error. - self.assert_delegated(model, input_shape, mocker, request, atol=atol) + self.assert_delegated(model, input_shape, mocker, atol=atol) def test__align_corners__output_size__unsupported(self): align_corners = True @@ -250,11 +240,11 @@ def test__align_corners__output_size__input_size_equal_to_one(self): ), ], ) - def test__align_corners__scales(self, mocker, request, input_shape, scale): + def test__align_corners__scales(self, mocker, input_shape, scale): align_corners = True model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) atol = 0.016 # ~= output scale -> single bit error. 
- self.assert_delegated(model, input_shape, mocker, request, atol=atol) + self.assert_delegated(model, input_shape, mocker, atol=atol) def test__align_corners__scales__unsupported(self): align_corners = True @@ -269,7 +259,7 @@ def test__noop__alone_in_partition__not_delegated(self): model = UpsampleBilinearModule(scale=scale) self.assert_not_delegated(model, input_shape) - def test__noop__not_alone_in_partition__delegated(self, mocker, request): + def test__noop__not_alone_in_partition__delegated(self, mocker): input_shape = (1, 2, 3, 4) scale = 1 model = UpsampleBilinearAddModule(scale=scale) @@ -277,6 +267,5 @@ def test__noop__not_alone_in_partition__delegated(self, mocker, request): model, input_shape, mocker, - request, expected_delegated_ops={UpsampleBilinear2D: 1, AddTensor: 1}, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py b/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py index b3e28a7b2f8..438a580f6e8 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py @@ -53,7 +53,6 @@ def assert_delegated( model, input_shape, mocker, - request, use_qat=False, expected_delegated_ops=None, ): @@ -73,7 +72,6 @@ def assert_delegated( model, input_shape, graph_verifier, - request, dataset_creator, use_qat=use_qat, ) @@ -87,11 +85,11 @@ def assert_not_delegated(self, model, input_shape): ) assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleNearest2D]) - def test__qat(self, mocker, request, use_qat): + def test__qat(self, mocker, use_qat): input_shape = (1, 2, 3, 4) output_size = (6, 8) model = UpsampleNearestModule(size=output_size) - self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat) + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat) @pytest.mark.parametrize( "input_shape, output_size", @@ -107,9 +105,9 @@ def test__qat(self, mocker, request, 
use_qat): pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"), ], ) - def test__output_size(self, mocker, request, input_shape, output_size): + def test__output_size(self, mocker, input_shape, output_size): model = UpsampleNearestModule(size=output_size) - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) def test__output_size__unsupported(self): input_shape = (1, 2, 3, 4) @@ -133,9 +131,9 @@ def test__output_size__unsupported(self): pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"), ], ) - def test__scales(self, mocker, request, input_shape, scale): + def test__scales(self, mocker, input_shape, scale): model = UpsampleNearestModule(scale=scale) - self.assert_delegated(model, input_shape, mocker, request) + self.assert_delegated(model, input_shape, mocker) def test__scales__unsupported(self): input_shape = (1, 2, 3, 4) @@ -149,7 +147,7 @@ def test__noop__alone_in_partition__not_delegated(self): model = UpsampleNearestModule(scale=scale) self.assert_not_delegated(model, input_shape) - def test__noop__not_alone_in_partition__delegated(self, mocker, request): + def test__noop__not_alone_in_partition__delegated(self, mocker): input_shape = (1, 2, 3, 4) scale = 1 model = UpsampleNearestAddModule(scale=scale) @@ -157,6 +155,5 @@ def test__noop__not_alone_in_partition__delegated(self, mocker, request): model, input_shape, mocker, - request, expected_delegated_ops={UpsampleNearest2D: 1, AddTensor: 1}, ) diff --git a/backends/nxp/tests/model_output_comparator.py b/backends/nxp/tests/model_output_comparator.py index 5563703ae20..f0dd7cd2d60 100644 --- a/backends/nxp/tests/model_output_comparator.py +++ b/backends/nxp/tests/model_output_comparator.py @@ -4,7 +4,6 @@ # LICENSE file in the root directory of this source tree. 
import abc -import logging import os from abc import abstractmethod from pathlib import Path @@ -16,7 +15,6 @@ from executorch.backends.nxp.backend.ir.converter.conversion.translator import ( torch_type_to_numpy_type, ) -from executorch.backends.nxp.tests.utils import archive_test_dir, store_txt_input_tensor class BaseOutputComparator(abc.ABC): @@ -37,11 +35,6 @@ def compare_results(self, cpu_results_dir, npu_results_dir, output_tensor_spec): :param npu_results_dir: Path to directory with NPU (delegated) results. :param output_tensor_spec: List of output tensor specifications. """ - if logging.root.isEnabledFor(logging.DEBUG): - diff_cpu_npu_results_dir = os.path.join( - os.path.dirname(cpu_results_dir), "diff_cpu_npu_results" - ) - sample_dirs = [ os.path.join(cpu_results_dir, file) for file in os.listdir(cpu_results_dir) ] @@ -72,28 +65,7 @@ def compare_results(self, cpu_results_dir, npu_results_dir, output_tensor_spec): ) npu_output_tensors.append((output_tensor_name, npu_tensor)) - if logging.root.isEnabledFor(logging.DEBUG): - # Store diff results if logging level is enabled - diff_cpu_npu_tensor = np.abs(cpu_tensor - npu_tensor) - os.makedirs( - os.path.join(diff_cpu_npu_results_dir, sample_dir), - exist_ok=True, - ) - diff_cpu_npu_tensor_path = os.path.join( - diff_cpu_npu_results_dir, sample_dir, output_tensor_name - ) - diff_cpu_npu_tensor.tofile(diff_cpu_npu_tensor_path) - - # Store text tensor results - store_txt_input_tensor(cpu_tensor_path, tensor_spec) - store_txt_input_tensor(npu_tensor_path, tensor_spec) - store_txt_input_tensor(diff_cpu_npu_tensor_path, tensor_spec) - - # We need to archive the test_dir before comparison, as comparison can cause AssertionError exception - test_dir = os.path.dirname(cpu_results_dir) - if logging.root.isEnabledFor(logging.DEBUG): - archive_test_dir(test_dir) - self.compare_sample(sample_dir, cpu_output_tensors, npu_output_tensors) + self.compare_sample(sample_dir, cpu_output_tensors, npu_output_tensors) 
@abstractmethod def compare_sample( diff --git a/backends/nxp/tests/nsys_testing.py b/backends/nxp/tests/nsys_testing.py index d5ff3680f38..7631ee20ca1 100644 --- a/backends/nxp/tests/nsys_testing.py +++ b/backends/nxp/tests/nsys_testing.py @@ -3,22 +3,19 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import datetime import functools +import inspect import logging import os.path -import re import shutil import subprocess from copy import deepcopy from enum import Enum -from importlib.metadata import version from os import environ, mkdir from typing import Callable, Iterable import numpy as np import torch -import yaml from executorch.backends.nxp.backend.edge_helper import is_channels_last_dim_order from executorch.backends.nxp.backend.ir.converter.conversion import translator from executorch.backends.nxp.backend.ir.converter.conversion.translator import ( @@ -43,11 +40,10 @@ AllCloseOutputComparator, ) from executorch.backends.nxp.tests.outputs_dir_importer import outputs_dir -from executorch.backends.nxp.tests.utils import save_pte_program, store_txt_input_tensor +from executorch.backends.nxp.tests.utils import save_pte_program from executorch.devtools.visualization.visualization_utils import ( visualize_with_clusters, ) -from pytest import FixtureRequest from pytest_mock import MockerFixture from torch.export import ExportedProgram from torch.fx import GraphModule @@ -59,7 +55,6 @@ NSYS_CONFIG_PATH = test_config.NSYS_CONFIG_PATH NSYS_FIRMWARE_PATH = test_config.NSYS_FIRMWARE_PATH NEUTRON_TEST_PATH = test_config.NEUTRON_TEST_PATH -PROJECT_DIR = test_config.PROJECT_DIR class ReferenceModel(Enum): @@ -124,7 +119,6 @@ def wrapper(*args, **kwargs): delegated_program = to_quantized_executorch_program( model, input_spec, - intermediates_dir=test_dir, dataset_dir=calibration_dataset_dir, delegate_to_npu=True, use_qat=use_qat, @@ -132,7 +126,6 @@ def wrapper(*args, **kwargs): 
operators_not_to_delegate=operators_not_to_delegate, remove_quant_io_ops=remove_quant_io_ops, ) - except RuntimeError as e: if "Model converted with neutron-converter has" in str(e) and hasattr( dlg_model_verifier, "check_num_delegated_nodes" @@ -398,7 +391,6 @@ def lower_run_compare( model: torch.nn.Module, input_spec: Iterable[ModelInputSpec] | tuple[int, ...], dlg_model_verifier: GraphVerifier, - request: FixtureRequest, dataset_creator=None, output_comparator=None, mocker: MockerFixture = None, @@ -416,12 +408,11 @@ def lower_run_compare( :param model: Executed PyTorch model. :param input_spec: Model input specification. Can be either tuple of ints - single float32 input model - or Iterable of ModelInputSpec. - :param dlg_model_verifier: Graph verifier instance. - :param request: PyTest request needed for correct test name extraction. :param dataset_creator: Creator that should fill provided `dataset_dir` with model input samples. :param output_comparator: Comparator of results produced by NPU and CPU runs of the program. - :param mocker: Mocker instance used by visualizer. + :param dlg_model_verifier: Graph verifier instance. :param reference_model: Version of the model which will be run to obtain reference output data. + :param mocker: Mocker instance used by visualizer. :param use_qat: If True, applies quantization-aware training before conversion (without the QAT training). :param train_fn: Train/finetune function for QAT training. Is used only when `use_qat=True`. :param operators_not_to_delegate: list of operators not to delegate. 
@@ -439,7 +430,7 @@ def lower_run_compare( model_to_delegate = model model_to_not_delegate = deepcopy(model) - test_name = get_test_name(request) + test_name = _get_caller_name() test_dir = os.path.join(OUTPUTS_DIR, test_name) shutil.rmtree(test_dir, ignore_errors=True) @@ -547,11 +538,6 @@ def lower_run_compare( output_tensor_spec = _get_program_output_spec(delegated_program) - if logging.root.isEnabledFor(logging.DEBUG): - _generate_txt_test_data( - calibration_dataset_dir, testing_dataset_dir, list(input_spec) - ) - dump_debug_test_summary(test_name, test_dir) npu_results_dir = os.path.join(test_dir, "results_npu") cpu_results_dir = os.path.join(test_dir, "results_cpu") output_comparator.compare_results( @@ -563,12 +549,10 @@ def lower_run_compare_ptq_qat( model: torch.nn.Module, input_spec: list[ModelInputSpec] | tuple, dlg_model_verifier: GraphVerifier, - request: FixtureRequest, train_fn: Callable[[torch.fx.GraphModule], None], dataset_creator=None, output_comparator=None, mocker: MockerFixture = None, - operators_not_to_delegate: list[str] = None, ): """ Run provided program twice and compare it's results. @@ -578,12 +562,10 @@ def lower_run_compare_ptq_qat( :param input_spec: Model input specification. Can be either tuple - single float32 input model - or list of ModelInputSpec. :param dlg_model_verifier: Graph verifier instance. - :param request: PyTest request needed for correct test name extraction. :param train_fn: Train/finetune function for QAT training. :param dataset_creator: Creator that should fill provided `dataset_dir` with model input samples. :param output_comparator: Comparator of results produced by NPU and CPU runs of the program. :param mocker: Mocker instance used by visualizer. - :param operators_not_to_delegate: list of operators not to delegate. 
""" assert_NSYS() @@ -595,7 +577,7 @@ def lower_run_compare_ptq_qat( model_ptq = model model_qat = deepcopy(model) - test_name = get_test_name(request) + test_name = _get_caller_name() test_dir = os.path.join(OUTPUTS_DIR, test_name) shutil.rmtree(test_dir, ignore_errors=True) @@ -624,7 +606,6 @@ def lower_run_compare_ptq_qat( ptq_results_dir, mocker, use_qat=False, - operators_not_to_delegate=operators_not_to_delegate, ) _ = _run_delegated_executorch_program( @@ -639,14 +620,10 @@ def lower_run_compare_ptq_qat( mocker, use_qat=True, train_fn=train_fn, - operators_not_to_delegate=operators_not_to_delegate, ) output_tensor_spec = _get_program_output_spec(delegated_program_ptq) - if logging.root.isEnabledFor(logging.DEBUG): - dump_debug_test_summary(test_name, test_dir) - shutil.make_archive(test_dir, "zip", test_dir) ptq_results_dir = os.path.join(test_dir, "results_ptq") qat_results_dir = os.path.join(test_dir, "results_qat") output_comparator.compare_results( @@ -680,13 +657,13 @@ def _parse_input_quant_params( return q_params -def get_test_name(request): - # PyTest request is available, extract correct name including test class and params - test_name = request.node.nodeid.lstrip(":") - # Escape unacceptable characters from test name to make sure it is a valid filesystem directory name - test_name = re.sub(r'[<>:"/\\|?* ,()`]', "_", test_name) - test_name = test_name.strip(" .") - return test_name +def _get_caller_name(): + test_function_names = ["lower_run_compare", "lower_run_compare_ptq_qat"] + for idx, frame in enumerate(inspect.stack()): + if frame.function in test_function_names: + # Look one index above to get caller + return inspect.stack()[idx + 1].function + return None def execute_cmd(cmd, cwd="."): @@ -748,60 +725,3 @@ def _get_program_output_spec(exported_program) -> list[torch.Tensor]: output_tensors_spec = list(exported_program.graph.output_node().meta["val"]) return output_tensors_spec - - -def get_executorch_git_info() -> dict[str, str]: - 
git_branch_cmd = f"git -C {PROJECT_DIR} branch --show-current" - git_branch, _, _ = execute_cmd(git_branch_cmd) - git_commit_cmd = f"git -C {PROJECT_DIR} rev-parse --short HEAD" - git_commit, _, _ = execute_cmd(git_commit_cmd) - return {"git_branch": git_branch, "git_commit": git_commit} - - -def dump_debug_test_summary(test_name: str, test_dir: str): - git_info = get_executorch_git_info() - - summary = { - "test_name": test_name, - "date_time": datetime.datetime.now().isoformat(), - "git_branch": git_info["git_branch"], - "git_commit": git_info["git_commit"], - "eiq_neutron_sdk_version": version("eiq_neutron_sdk"), - "eiq_nsys_version": version("eiq_nsys"), - } - with open(os.path.join(test_dir, "summary.yaml"), "w") as f: - yaml.dump(summary, f) - - -def _generate_txt_test_data( - calibration_dataset_dir: str, - testing_dataset_dir: str, - input_tensor_spec: list[ModelInputSpec], -): - # Generates txt tensor variants for input datasets - # Testing dataset can point to calibration dataset - dataset_paths = ( - [calibration_dataset_dir, testing_dataset_dir] - if calibration_dataset_dir != testing_dataset_dir - else [testing_dataset_dir] - ) - for d_path in dataset_paths: - quant_dataset = d_path.endswith("dataset_quant") - - # For multiple input tests, list each sample dir, for single input tests the input files are in d_path - sample_dirs = [os.path.join(d_path, file) for file in os.listdir(d_path)] - sample_dirs = [file for file in sample_dirs if os.path.isdir(file)] - # Single input dataset has tensor directly in dataset path - if len(sample_dirs) == 0: - for input_tensor_name in sorted(os.listdir(d_path)): - input_tensor_path = os.path.join(d_path, input_tensor_name) - tensor_spec = input_tensor_spec[0] - store_txt_input_tensor(input_tensor_path, tensor_spec, quant_dataset) - else: - for sample_dir in sample_dirs: - for idx, input_tensor_name in enumerate(os.listdir(sample_dir)): - input_tensor_path = os.path.join(sample_dir, input_tensor_name) - tensor_spec = 
input_tensor_spec[idx] - store_txt_input_tensor( - input_tensor_path, tensor_spec, quant_dataset - ) diff --git a/backends/nxp/tests/ops_aliases.py b/backends/nxp/tests/ops_aliases.py index da50d4dc0d9..46002ba8883 100644 --- a/backends/nxp/tests/ops_aliases.py +++ b/backends/nxp/tests/ops_aliases.py @@ -26,13 +26,11 @@ DequantizePerChannel = exir_ops.edge.quantized_decomposed.dequantize_per_channel.default DequantizePerTensor = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate -Exp = exir_ops.edge.aten.exp.default GetItem = operator.getitem HardTanh = exir_ops.edge.aten.hardtanh.default HardTanh_ = exir_ops.edge.aten.hardtanh_.default LeakyRelu = exir_ops.edge.aten.leaky_relu.default Log = exir_ops.edge.aten.log.default -MaxPool2D = exir_ops.edge.aten.max_pool2d.default MaxPool2DWithIndices = exir_ops.edge.aten.max_pool2d_with_indices.default MeanDim = exir_ops.edge.aten.mean.dim MulTensor = exir_ops.edge.aten.mul.Tensor diff --git a/backends/nxp/tests/utils.py b/backends/nxp/tests/utils.py index 00b7c364a31..c210d9db8bc 100644 --- a/backends/nxp/tests/utils.py +++ b/backends/nxp/tests/utils.py @@ -7,19 +7,11 @@ import logging import os -import shutil -import numpy as np - -from executorch.backends.nxp.backend.ir.converter.conversion.translator import ( - torch_type_to_numpy_type, -) -from executorch.backends.nxp.tests.executorch_pipeline import ModelInputSpec from executorch.devtools.visualization.visualization_utils import ( visualize_with_clusters, ) from executorch.exir import ExecutorchProgramManager -from torch._subclasses import FakeTensor def save_pte_program( @@ -40,27 +32,3 @@ def save_pte_program( visualize_with_clusters(prog.exported_program(), visualize_file_name, False) return filename - - -def change_filepath_extension(path: str, extension: str) -> str: - base, _ = os.path.splitext(path) - return base + "." 
+ extension - - -def store_txt_input_tensor( - input_tensor_path: str, - tensor_spec: ModelInputSpec | FakeTensor, - quant_dataset: bool = False, -): - dtype = np.int8 if quant_dataset else torch_type_to_numpy_type(tensor_spec.dtype) - input_tensor = np.fromfile(input_tensor_path, dtype=dtype) - int__max = np.iinfo(np.int32).max - - with open(change_filepath_extension(input_tensor_path, "txt"), "w") as f: - f.write("Flattened tensor shape:" + str(input_tensor.shape)) - f.write("\nOriginal tensor shape:" + str(list(tensor_spec.shape)) + "\n") - f.write(np.array2string(input_tensor, threshold=int__max)) - - -def archive_test_dir(test_dir: str): - shutil.make_archive(test_dir, "zip", test_dir) diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py index 3336a394510..ca853de6f86 100644 --- a/backends/qualcomm/_passes/__init__.py +++ b/backends/qualcomm/_passes/__init__.py @@ -14,7 +14,6 @@ from .convert_mha_to_sha import ConvertMhaToSha from .convert_square_to_pow import ConvertSquareToPow from .decompose_acos import DecomposeAcos -from .decompose_addmm import DecomposeAddmm from .decompose_any import DecomposeAny from .decompose_atan2 import DecomposeAtan2 from .decompose_binary_alpha import DecomposeBinaryAlpha @@ -27,7 +26,6 @@ from .decompose_floor_divide import DecomposeFloorDivide from .decompose_glu import DecomposeGlu from .decompose_hardsigmoid import DecomposeHardsigmoid -from .decompose_hyperbolic_variants import DecomposeHyperbolicVariants from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm from .decompose_log_variants import DecomposeLogVariants from .decompose_maxpool3d import DecomposeMaxPool3d @@ -78,7 +76,6 @@ ConvertMhaToSha, ConvertSquareToPow, DecomposeAcos, - DecomposeAddmm, DecomposeAny, DecomposeAtan2, DecomposeBinaryAlpha, @@ -90,7 +87,6 @@ DecomposeFill, DecomposeFloorDivide, DecomposeGlu, - DecomposeHyperbolicVariants, DecomposeHardsigmoid, DecomposeLinalgVectorNorm, DecomposeLogVariants, 
diff --git a/backends/qualcomm/_passes/decompose_acos.py b/backends/qualcomm/_passes/decompose_acos.py index d546cf6d92d..f83b18f11fc 100644 --- a/backends/qualcomm/_passes/decompose_acos.py +++ b/backends/qualcomm/_passes/decompose_acos.py @@ -9,7 +9,7 @@ from executorch.exir.dialects.edge._ops import EdgeOpOverload from executorch.exir.pass_base import ExportPass, PassResult -from .utils import copy_meta, create_const_node +from .utils import copy_meta, get_const_node class DecomposeAcos(ExportPass): @@ -52,7 +52,7 @@ def call(self, graph_module: torch.fx.GraphModule): ) if is_edge and pi_half_node is None: - pi_half_node = create_const_node( + pi_half_node = get_const_node( graph, graph_module, "_pi_half_constant", pi_half, node ) diff --git a/backends/qualcomm/_passes/decompose_atan2.py b/backends/qualcomm/_passes/decompose_atan2.py index a411f997b61..0f54e555e03 100644 --- a/backends/qualcomm/_passes/decompose_atan2.py +++ b/backends/qualcomm/_passes/decompose_atan2.py @@ -9,7 +9,7 @@ from executorch.exir.dialects.edge._ops import EdgeOpOverload from executorch.exir.pass_base import ExportPass, PassResult -from .utils import copy_meta, create_const_node, create_node +from .utils import copy_meta, create_node, get_const_node class DecomposeAtan2(ExportPass): @@ -68,7 +68,7 @@ def _get_constants(self, graph, graph_module, node, is_edge, const_cache): def make_const(name, val): if name not in const_cache: - const_cache[name] = create_const_node( + const_cache[name] = get_const_node( graph, graph_module, name, val, node ) return const_cache[name] diff --git a/backends/qualcomm/_passes/decompose_log_variants.py b/backends/qualcomm/_passes/decompose_log_variants.py index 904900dd205..2b394806b68 100644 --- a/backends/qualcomm/_passes/decompose_log_variants.py +++ b/backends/qualcomm/_passes/decompose_log_variants.py @@ -11,7 +11,7 @@ from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -from .utils 
import copy_meta, create_const_node +from .utils import copy_meta, get_const_node class DecomposeLogVariants(ExportPass): @@ -50,7 +50,7 @@ def _decompose_log_n(self, node, graph, graph_module, const_cache, n): div_op = exir_ops.edge.aten.div.Tensor attr_name = f"_log_base_{n}_constant" if attr_name not in const_cache: - const_cache[attr_name] = create_const_node( + const_cache[attr_name] = get_const_node( graph, graph_module, attr_name, math.log(n), node ) div_arg = const_cache[attr_name] @@ -81,7 +81,7 @@ def _decompose_log_p(self, node, graph, graph_module, const_cache, p): log_op = exir_ops.edge.aten.log.default attr_name = f"_log1p_addend_{p}_constant" if attr_name not in const_cache: - const_cache[attr_name] = create_const_node( + const_cache[attr_name] = get_const_node( graph, graph_module, attr_name, p, node ) add_arg = const_cache[attr_name] diff --git a/backends/qualcomm/_passes/decompose_remainder.py b/backends/qualcomm/_passes/decompose_remainder.py index a6c260d217b..4e5ea739856 100644 --- a/backends/qualcomm/_passes/decompose_remainder.py +++ b/backends/qualcomm/_passes/decompose_remainder.py @@ -10,7 +10,7 @@ from executorch.exir.pass_base import ExportPass, PassResult from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix -from .utils import copy_meta, create_const_node +from .utils import copy_meta, get_const_node class DecomposeRemainder(ExportPass): @@ -69,7 +69,7 @@ def call(self, graph_module: torch.fx.GraphModule): attr_name = get_new_attr_name_with_prefix("_remainder_const_")( graph_module ) - const_cache[x_arg] = create_const_node( + const_cache[x_arg] = get_const_node( graph, graph_module, attr_name, x_arg, node ) x_node = const_cache[x_arg] @@ -82,7 +82,7 @@ def call(self, graph_module: torch.fx.GraphModule): attr_name = get_new_attr_name_with_prefix("_remainder_const_")( graph_module ) - const_cache[y_arg] = create_const_node( + const_cache[y_arg] = get_const_node( graph, graph_module, attr_name, y_arg, node ) y_node = 
const_cache[y_arg] diff --git a/backends/qualcomm/_passes/decompose_var.py b/backends/qualcomm/_passes/decompose_var.py index c89929fa50e..923fae4977f 100644 --- a/backends/qualcomm/_passes/decompose_var.py +++ b/backends/qualcomm/_passes/decompose_var.py @@ -10,7 +10,7 @@ from executorch.exir.pass_base import ExportPass, PassResult from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix -from .utils import copy_meta, create_const_node +from .utils import copy_meta, get_const_node class DecomposeVar(ExportPass): @@ -155,7 +155,7 @@ def call(self, graph_module: torch.fx.GraphModule): attr_name = get_new_attr_name_with_prefix( "_var_scale_const_" )(graph_module) - const_cache[cache_key] = create_const_node( + const_cache[cache_key] = get_const_node( graph, graph_module, attr_name, scale, node ) scale_node = const_cache[cache_key] diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py index 7efb4a293e1..e3e4b8c8e51 100644 --- a/backends/qualcomm/_passes/qnn_pass_manager.py +++ b/backends/qualcomm/_passes/qnn_pass_manager.py @@ -20,7 +20,6 @@ ConvertMhaToSha, ConvertSquareToPow, DecomposeAcos, - DecomposeAddmm, DecomposeAny, DecomposeAtan2, DecomposeBinaryAlpha, @@ -32,7 +31,6 @@ DecomposeFill, DecomposeFloorDivide, DecomposeGlu, - DecomposeHyperbolicVariants, DecomposeLinalgVectorNorm, DecomposeLogVariants, DecomposeMaxPool3d, @@ -124,14 +122,12 @@ def get_default_pass_activations(cls): (AnnotateUnbind, True), (ConvertBmmToMatmul, False), (DecomposeAcos, True), - (DecomposeAddmm, True), (DecomposeAny, True), (DecomposeAtan2, True), (DecomposeColIm, True), (DecomposeCDist, True), (DecomposeDivMode, True), (DecomposeFill, True), - (DecomposeHyperbolicVariants, True), (DecomposeLogVariants, True), (DecomposeMaxPool3d, True), (DecomposeMinMaxDim, True), @@ -164,7 +160,6 @@ def get_annotation_passes(cls): RecomposeRmsNorm, ReplaceArangeArgs, DecomposeAcos, - DecomposeAddmm, DecomposeAtan2, 
DecomposeBinaryAlpha, DecomposeCDist, @@ -184,7 +179,6 @@ def get_annotation_passes(cls): DecomposeExpM1, DecomposeFill, DecomposeGlu, - DecomposeHyperbolicVariants, DecomposeRemainder, DecomposeSelectScatter, DecomposeLinalgVectorNorm, @@ -281,14 +275,12 @@ def get_passes_dependency_for_capture_program(cls): AnnotateUnbind: [RemoveRedundancy], ConvertBmmToMatmul: [RecomposePixelUnshuffle], DecomposeAcos: [RemoveRedundancy], - DecomposeAddmm: [RemoveRedundancy], DecomposeAny: [RemoveRedundancy], DecomposeAtan2: [RemoveRedundancy], DecomposeColIm: [FoldQDQ], DecomposeCDist: [RemoveRedundancy], DecomposeDivMode: [RemoveRedundancy], DecomposeFill: [RemoveRedundancy], - DecomposeHyperbolicVariants: [RemoveRedundancy], DecomposeLinalgVectorNorm: [RemoveRedundancy], DecomposeLogVariants: [RemoveRedundancy], DecomposeMaxPool3d: [RemoveRedundancy], diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py index 2a580ab11a4..92a75703bbd 100755 --- a/backends/qualcomm/_passes/utils.py +++ b/backends/qualcomm/_passes/utils.py @@ -343,7 +343,7 @@ def append_qdq( return dq_node -def create_const_node( +def get_const_node( graph: torch.fx.Graph, graph_module: torch.fx.GraphModule, attr_name: str, diff --git a/backends/qualcomm/aot/wrappers/targets.bzl b/backends/qualcomm/aot/wrappers/targets.bzl index 1ea7e6679d5..89f8efdea3e 100644 --- a/backends/qualcomm/aot/wrappers/targets.bzl +++ b/backends/qualcomm/aot/wrappers/targets.bzl @@ -1,7 +1,6 @@ load( "@fbsource//tools/build_defs:default_platform_defs.bzl", "ANDROID", - "CXX", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/backends/qualcomm/third-party:third_party_libs.bzl", "qnn_third_party_dep") @@ -21,7 +20,7 @@ def define_common_targets(): "*.h", ]), define_static_target = True, - platforms = [ANDROID, CXX], + platforms = [ANDROID], visibility = ["PUBLIC"], deps = [ qnn_third_party_dep("api"), diff --git 
a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md index b8d86b9d6da..7b7f4ef8139 100644 --- a/backends/qualcomm/builders/README.md +++ b/backends/qualcomm/builders/README.md @@ -498,16 +498,11 @@ The following PyTorch operators are supported through decomposition or annotatio | PyTorch Op | Decomposition Pass | |---|---| | `aten.acos` | `DecomposeAcos` | -| `aten.acosh` | `DecomposeHyperbolicVariants` | -| `aten.addmm` | `DecomposeAddmm` | | `aten.adaptive_avg_pool1d`, `aten.avg_pool1d` | `AnnotateAvgPool1D` | | `aten.any` | `DecomposeAny` | -| `aten.asinh` | `DecomposeHyperbolicVariants` | | `aten.atan2.default`, `aten.atan2.out` | `DecomposeAtan2` | -| `aten.atanh` | `DecomposeHyperbolicVariants` | | `aten.add` (with alpha), `aten.sub` (with alpha) | `DecomposeBinaryAlpha` | | `aten.cdist`, `aten._cdist_forward` | `DecomposeCDist` | -| `aten.cosh` | `DecomposeHyperbolicVariants` | | `aten.div.Tensor_mode` | `DecomposeDivMode` | | `aten.div.Scalar_mode` | `LiftConstantScalarOperands` → `DecomposeDivMode` | | `aten.im2col`, `aten.col2im` | `DecomposeColIm` | @@ -527,7 +522,6 @@ The following PyTorch operators are supported through decomposition or annotatio | `aten.roll` | `DecomposeRoll` | | `aten.select_scatter` | `DecomposeSelectScatter` | | `aten.silu` | `DecomposeSilu` | -| `aten.sinh` | `DecomposeHyperbolicVariants` | | `aten.tan` | `DecomposeTan` | | `aten.threshold` | `DecomposeThreshold` | | `aten.triu` | `DecomposeTriu` | diff --git a/backends/qualcomm/debugger/README.md b/backends/qualcomm/debugger/README.md index 09b4c1918df..8300920d1d5 100644 --- a/backends/qualcomm/debugger/README.md +++ b/backends/qualcomm/debugger/README.md @@ -156,8 +156,6 @@ After `build_executorch_binary()`, the debugger holds: Ensure `dump_intermediate_outputs` is enabled in your `QnnConfig` (or pass `--dump_intermediate_outputs` via CLI). Only run **one inference** for debugging — multiple executions are not supported. 
-**Note:** Intermediate tensor dumping is not currently supported in direct mode on HTP/LPAI backends. - ```python from executorch.examples.qualcomm.utils import SimpleADB @@ -268,7 +266,7 @@ python -m examples.qualcomm.util_scripts.qnn_intermediate_debugger_demo -b build 3. Does not support graphs with partitions (partial delegation). 4. Does not support LLM models. 5. Does not support graphs with multiple methods. -6. Intermediate tensor dumping is not currently supported in direct mode on HTP/LPAI backends. + ## ExecuTorch QNN HTP Heap Profiling diff --git a/backends/qualcomm/export_utils.py b/backends/qualcomm/export_utils.py index bcba08ecc5a..28b7952ef33 100644 --- a/backends/qualcomm/export_utils.py +++ b/backends/qualcomm/export_utils.py @@ -276,10 +276,6 @@ def __init__( self.skip_push = qnn_config.skip_push self.backend_library_paths = {} - if self.direct_build_folder and self.dump_intermediate_outputs: - raise ValueError( - "Per-tensor dumping is currently not supported in direct mode." 
- ) if self.direct_build_folder: direct_general_artifacts = [ f"{self.build_path}/examples/qualcomm/direct_executor_runner/libqnn_executorch_stub.so", @@ -441,8 +437,9 @@ def execute( f"--input_list_path {self.input_list_filename}", f"--etdump_path {self.etdump_path}", "--shared_buffer" if self.shared_buffer else "", + f"--debug_output_path {self.debug_output_path}", ( - f"--debug_output_path {self.debug_output_path} --dump_intermediate_outputs" + "--dump_intermediate_outputs" if self.dump_intermediate_outputs else "" ), diff --git a/backends/qualcomm/quantizer/annotators/htp_rules.py b/backends/qualcomm/quantizer/annotators/htp_rules.py index ca8abb246bf..0c5be07fcdc 100644 --- a/backends/qualcomm/quantizer/annotators/htp_rules.py +++ b/backends/qualcomm/quantizer/annotators/htp_rules.py @@ -1077,11 +1077,7 @@ def annotate(node: Node, quantization_config: QuantizationConfig) -> None: @register_annotator( - [ - torch.ops.aten.bmm.default, - torch.ops.aten.matmul.default, - torch.ops.aten.mm.default, - ], + [torch.ops.aten.bmm.default, torch.ops.aten.matmul.default], QnnConstants.OpMatMul.op_name, ) class MatMul(GeneralOpDef): diff --git a/backends/qualcomm/quantizer/annotators/lpai_rules.py b/backends/qualcomm/quantizer/annotators/lpai_rules.py index 6e5b343c5c7..2623e4a6524 100644 --- a/backends/qualcomm/quantizer/annotators/lpai_rules.py +++ b/backends/qualcomm/quantizer/annotators/lpai_rules.py @@ -601,11 +601,7 @@ def annotate(node: Node, quantization_config: QuantizationConfig) -> None: @register_annotator( - [ - torch.ops.aten.bmm.default, - torch.ops.aten.matmul.default, - torch.ops.aten.mm.default, - ], + [torch.ops.aten.bmm.default, torch.ops.aten.matmul.default], QnnConstants.OpMatMul.op_name, ) class MatMul(GeneralOpDef): diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl index 5ad312020be..335f4a5c4cb 100644 --- a/backends/qualcomm/runtime/targets.bzl +++ b/backends/qualcomm/runtime/targets.bzl @@ -1,7 +1,6 @@ 
load( "@fbsource//tools/build_defs:default_platform_defs.bzl", "ANDROID", - "CXX", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/backends/qualcomm/third-party:third_party_libs.bzl", "qnn_third_party_dep") @@ -22,7 +21,7 @@ def define_common_targets(): "Logging.h", ], define_static_target = True, - platforms = [ANDROID, CXX], + platforms = [ANDROID], visibility = ["PUBLIC"], deps = [ qnn_third_party_dep("api"), @@ -92,7 +91,7 @@ def define_common_targets(): ), define_static_target = True, link_whole = True, # needed for executorch/examples/models/llama:main to register QnnBackend - platforms = [ANDROID, CXX], + platforms = [ANDROID], visibility = ["PUBLIC"], resources = ({ "qnn_lib": qnn_third_party_dep("qnn_offline_compile_libs"), diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl index c98a8bc83ac..a53e5823aff 100644 --- a/backends/qualcomm/targets.bzl +++ b/backends/qualcomm/targets.bzl @@ -1,7 +1,6 @@ load( "@fbsource//tools/build_defs:default_platform_defs.bzl", "ANDROID", - "CXX", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/backends/qualcomm/third-party:third_party_libs.bzl", "qnn_third_party_dep") @@ -70,7 +69,7 @@ def define_common_targets(): }, exported_external_deps = ["flatbuffers-api"], define_static_target = True, - platforms = [ANDROID, CXX], + platforms = [ANDROID], ) runtime.cxx_library( @@ -88,32 +87,5 @@ def define_common_targets(): exported_deps = [ ":schema", ], - platforms = [ANDROID, CXX], - ) - - # Host-side AOT variant of qnn_executorch_backend. Pulls in the QNN - # offline-compile libraries as a Buck resource (via :runtime, which - # itself depends on qnn_third_party_dep("qnn_offline_compile_libs")), - # so a host-side gtest or runner can dlopen the QNN libraries - # without a manual path setup. 
- # - # Mirrors qnn_executorch_backend's structure but swaps the on-device - # runtime_android_build dep for the host runtime which bundles the - # x86 simulator libraries as a Buck resource. - runtime.cxx_library( - name = "qnn_executorch_backend_aot", - srcs = [], - headers = [], - define_static_target = True, - visibility = ["PUBLIC"], - deps = [ - qnn_third_party_dep("api"), - "//executorch/runtime/backend:interface", - "//executorch/runtime/core:core", - "//executorch/backends/qualcomm/runtime:runtime", - ], - exported_deps = [ - ":schema", - ], - platforms = [ANDROID, CXX], + platforms = [ANDROID], ) diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 0201edb6dee..9f043ea56a9 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -49,14 +49,6 @@ def forward(self, x): return torch.acos(x) -class Acosh(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.acosh(x) - - class AcosMultiNode(torch.nn.Module): def __init__(self): super().__init__() @@ -152,16 +144,6 @@ def forward(self, x): return 10 + x -class AddMM(torch.nn.Module): - def __init__(self, alpha=1, beta=1): - super().__init__() - self.alpha = alpha - self.beta = beta - - def forward(self, bias, input, mat2): - return torch.addmm(bias, input, mat2, alpha=self.alpha, beta=self.beta) - - class Any(torch.nn.Module): def __init__(self, dim=None, keepdim=False): super().__init__() @@ -265,14 +247,6 @@ def forward(self, x, y): return squeeze_out, conv_out -class Asinh(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.asinh(x) - - class Asin(torch.nn.Module): def __init__(self): super().__init__() @@ -305,14 +279,6 @@ def forward(self, x1, y1, x2, y2): return torch.atan2(x1, y1), torch.atan2(x2, y2) -class Atanh(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.atanh(x) - - class 
AvgPool1D(torch.nn.Module): def __init__(self): super().__init__() @@ -1023,14 +989,6 @@ def forward(self, x): return torch.cos(x) -class Cosh(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.cosh(x) - - class CumSum(torch.nn.Module): def __init__(self): super().__init__() @@ -2332,14 +2290,6 @@ def forward(self, x): return torch.sin(x) -class Sinh(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.sinh(x) - - class SimpleModel(torch.nn.Module): def __init__(self, kernel_size=3): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index fcb365292ee..914afa077e4 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -150,11 +150,6 @@ def test_qnn_backend_acos(self): index += 1 self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_acosh(self): - module = Acosh() # noqa: F405 - sample_input = (torch.tensor([1.0, 1.5, 2.0, 3.0, 5.0, 10.0]).reshape(2, 3),) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_adaptive_avg_pool1d(self): module = AdaptiveAvgPool1D() # noqa: F405 sample_input = (torch.randn(1, 512, 7),) @@ -195,30 +190,6 @@ def test_qnn_backend_adaptive_max_pool2d(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_addmm(self): - test_comb = [ - { - QCOM_MODULE: [AddMM()], # noqa: F405 - QCOM_SAMPLE_INPUTS: [ - (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)), - ], - }, - { - QCOM_MODULE: [AddMM(alpha=2, beta=3)], # noqa: F405 - QCOM_SAMPLE_INPUTS: [ - (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)), - ], - }, - ] - - index = 0 - for comb in test_comb: - for module in comb[QCOM_MODULE]: - for sample_input in comb[QCOM_SAMPLE_INPUTS]: - with self.subTest(i=index): - index += 1 - 
self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_alias(self): module = Alias() # noqa: F405 sample_input = (torch.randn(1, 10),) @@ -329,11 +300,6 @@ def test_qnn_backend_argmin(self): case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS] ) - def test_qnn_backend_asinh(self): - module = Asinh() # noqa: F405 - sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),) - self.lower_module_and_test_output(module, sample_input) - @unittest.expectedFailure def test_qnn_backend_asin(self): sample_input = (torch.rand(3, 4) * 2 - 1,) @@ -385,11 +351,6 @@ def test_qnn_backend_atan2(self): index += 1 self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_atanh(self): - module = Atanh() # noqa: F405 - sample_input = (torch.tensor([-0.9, -0.5, -0.1, 0.1, 0.5, 0.9]).reshape(2, 3),) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_avg_pool1d(self): module = AvgPool1D() # noqa: F405 sample_input = (torch.randn(1, 512, 7),) @@ -652,11 +613,6 @@ def test_qnn_backend_cos(self): sample_input = (torch.randn(2, 5, 1, 3),) self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_cosh(self): - module = Cosh() # noqa: F405 - sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_cumsum(self): sample_input = () test_comb = [ @@ -2191,11 +2147,6 @@ def test_qnn_backend_sin(self): sample_input = (torch.randn(2, 5, 1, 3),) self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_sinh(self): - module = Sinh() # noqa: F405 - sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_select_copy(self): module = SelectCopy() # noqa: F405 sample_input = (torch.randn([1, 3, 3, 3]),) @@ -2974,12 +2925,6 @@ def test_qnn_backend_acos(self): module = 
self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_acosh(self): - module = Acosh() # noqa: F405 - sample_input = (torch.tensor([1.0, 1.5, 2.0, 3.0, 5.0, 10.0]).reshape(2, 3),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_adaptive_avg_pool1d(self): module = AdaptiveAvgPool1D() # noqa: F405 sample_input = (torch.randn(1, 512, 7),) @@ -3024,31 +2969,6 @@ def test_qnn_backend_adaptive_max_pool2d(self): module_one = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module_one, sample_input) - def test_qnn_backend_addmm(self): - test_comb = [ - { - QCOM_MODULE: [AddMM()], # noqa: F405 - QCOM_SAMPLE_INPUTS: [ - (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)), - ], - }, - { - QCOM_MODULE: [AddMM(alpha=2, beta=3)], # noqa: F405 - QCOM_SAMPLE_INPUTS: [ - (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)), - ], - }, - ] - - index = 0 - for comb in test_comb: - for module in comb[QCOM_MODULE]: - for sample_input in comb[QCOM_SAMPLE_INPUTS]: - with self.subTest(i=index): - index += 1 - qdq_module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(qdq_module, sample_input) - def test_qnn_backend_alias(self): module = Alias() # noqa: F405 sample_input = (torch.randn(1, 10),) @@ -3173,12 +3093,6 @@ def test_qnn_backend_asin(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_asinh(self): - module = Asinh() # noqa: F405 - sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_atan(self): sample_input = (torch.randn(3, 4),) module = Atan() # noqa: F405 @@ -3218,12 +3132,6 @@ def test_qnn_backend_atan2(self): qdq_module = 
self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(qdq_module, sample_input) - def test_qnn_backend_atanh(self): - module = Atanh() # noqa: F405 - sample_input = (torch.tensor([-0.9, -0.5, -0.1, 0.1, 0.5, 0.9]).reshape(2, 3),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_avg_pool1d(self): module = AvgPool1D() # noqa: F405 sample_input = (torch.randn(1, 512, 7),) @@ -3570,12 +3478,6 @@ def test_qnn_backend_cos(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_cosh(self): - module = Cosh() # noqa: F405 - sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_cumsum(self): module = CumSum() # noqa: F405 sample_input = (torch.randn(4),) @@ -5360,12 +5262,6 @@ def test_qnn_backend_sin(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_sinh(self): - module = Sinh() # noqa: F405 - sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_slice_copy(self): modules = [ SliceCopyDefaultParameter(), # noqa: F405 @@ -6222,10 +6118,6 @@ def test_qnn_backend_dump_intermediate_outputs_topk(self): ) def test_qnn_backend_dump_intermediate_outputs_simple_model(self): - if self.direct_build_folder: - self.skipTest( - "Direct mode does not support per-tensor dumping (HTP/LPAI backends)." 
- ) backend_options = generate_htp_compiler_spec(use_fp16=True) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.soc_model], @@ -6956,38 +6848,20 @@ def output_callback(log_msg): ) def test_qnn_backend_dump_intermediate_outputs_simple_model(self): - # TODO: LPAI direct mode support per-tensor dumping. - if self.direct_build_folder: - self.skipTest( - "Direct mode does not support per-tensor dumping (HTP/LPAI backends)." - ) - match get_backend_type(self.backend): - case QnnExecuTorchBackendType.kHtpBackend: - backend_options = generate_htp_compiler_spec(use_fp16=False) - expected_compared_events = 14 - case QnnExecuTorchBackendType.kLpaiBackend: - backend_options = generate_lpai_compiler_spec( - target_env=self.get_lpai_target_env() - ) - # I/O q/dq nodes fall back to CPU via FoldQDQ LPAI workaround - # and are excluded from QNN etdump; update after first LPAI run - expected_compared_events = 17 - case _: - raise ValueError("Backend is not implemented yet") + backend_options = generate_htp_compiler_spec(use_fp16=False) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.soc_model], backend_options=backend_options, dump_intermediate_outputs=True, ) module = SimpleModel() # noqa: F405 - torch.manual_seed(8) sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) - qdq_module = self.get_qdq_module(module, sample_input) + module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output( - qdq_module, + module, sample_input, expected_partitions=1, - expected_compared_events=expected_compared_events, + expected_compared_events=14, ) def test_qnn_backend_dump_intermediate_outputs_topk(self): @@ -8199,6 +8073,7 @@ def test_static_llm_model(self): # noqa: C901 "1024", "--max_context_len", "1024", + "--skip_user_prompt_calibration", ] match self.static_llm_eval_method: @@ -8248,17 +8123,10 @@ def test_static_llm_model(self): # noqa: C901 ] ) 
case _: + cmds.remove("--skip_user_prompt_calibration") logging.warning( "No llm eval method chosen. Only generate model output." ) - cmds.extend( - [ - "--calib_tasks", - "wikitext", - "--calib_limit", - "1", - ] - ) if is_llama_model: cmds.extend( @@ -8431,10 +8299,6 @@ def test_codegen2_1b(self): "128", "--max_context_len", "128", - "--calib_tasks", - "wikitext", - "--calib_limit", - "1", ] self.add_default_cmds(cmds) @@ -8496,10 +8360,6 @@ def test_llama_stories_260k(self): "128", "--max_context_len", "128", - "--calib_tasks", - "wikitext", - "--calib_limit", - "1", ] self.add_default_cmds(cmds) @@ -8563,10 +8423,6 @@ def test_llama_stories_110m(self): "128", "--max_context_len", "128", - "--calib_tasks", - "wikitext", - "--calib_limit", - "1", ] if self.use_fp16: cmds.append("--use_fp16") @@ -8720,7 +8576,7 @@ class VLMSpecs(MLLMSpecs): def setUp(self): self.alm_specs = { "granite_speech_3_3-2b": TestExampleMultimodalityScript.ALMSpecs( - max_seq_len=1024, + max_seq_len=512, sm8650_token_rate=5, sm8750_token_rate=8, encoder_pte_size=900_000_000, # 900MB @@ -8732,7 +8588,7 @@ def setUp(self): } self.vlm_specs = { "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs( - max_seq_len=1024, + max_seq_len=128, sm8650_token_rate=50, sm8750_token_rate=55, encoder_pte_size=110_000_000, # 110MB @@ -8742,7 +8598,7 @@ def setUp(self): golden_image_feature="city", ), "internvl3_1b": TestExampleMultimodalityScript.VLMSpecs( - max_seq_len=1024, + max_seq_len=320, sm8650_token_rate=11, sm8750_token_rate=13, encoder_pte_size=425_000_000, # 425MB @@ -8794,8 +8650,6 @@ def test_static_asr(self): "kv", "--max_seq_len", f"{alm_specs.max_seq_len}", - "--calib_samples", - "./examples/qualcomm/oss_scripts/llama/assets/samples/audio.json", ] if self.compile_only: cmds.extend(["--compile_only"]) @@ -8879,8 +8733,6 @@ def test_static_vlm(self): "kv", "--max_seq_len", f"{vlm_specs.max_seq_len}", - "--calib_samples", - 
"./examples/qualcomm/oss_scripts/llama/assets/samples/vision.json", ] if self.compile_only: cmds.extend(["--compile_only"]) diff --git a/backends/transforms/postpone_permute_below_squeeze_view.py b/backends/transforms/postpone_permute_below_squeeze_view.py index e0e9a3ec198..f676e19fb65 100644 --- a/backends/transforms/postpone_permute_below_squeeze_view.py +++ b/backends/transforms/postpone_permute_below_squeeze_view.py @@ -1,12 +1,12 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # pyre-unsafe +import copy from typing import cast, List import torch @@ -108,7 +108,7 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: # view_node_shape is almost same as permute_node_shape # except it has one more dim somewhere # and the extra dim has value of 1. - new_view_shape = list(pred_shape) + new_view_shape = copy.deepcopy(pred_shape) new_view_shape.insert(index, 1) new_permute_dims = [x + 1 if x >= index else x for x in permute_dims] new_permute_dims.insert(index, index) @@ -132,7 +132,7 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: # and the extra dim has value of 1. # Convert permute_dims to list of ints index_to_remove = permute_dims[index] - new_view_shape = list(pred_shape) + new_view_shape = copy.deepcopy(pred_shape) del new_view_shape[index_to_remove] new_permute_dims = [ x - 1 if x > index_to_remove else x for x in permute_dims diff --git a/backends/transforms/test/test_permute_optimization_passes.py b/backends/transforms/test/test_permute_optimization_passes.py index 550446da562..dd356aad8a2 100644 --- a/backends/transforms/test/test_permute_optimization_passes.py +++ b/backends/transforms/test/test_permute_optimization_passes.py @@ -1,6 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. 
# All rights reserved. -# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -26,8 +25,6 @@ from executorch.backends.transforms.replace_nop_transpose_or_permute_with_view import ( ReplaceNopTransposeOrPermuteWithViewPass, ) - -from executorch.exir import EdgeCompileConfig, to_edge from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassResult from torch.utils import _pytree as pytree @@ -480,38 +477,6 @@ def test_permute4_view3_chains(self) -> None: "PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView", ) - def test_postpone_permute_with_symbolic_shapes(self) -> None: - class DynamicPermuteViewModule(torch.nn.Module): - def forward(self, x: torch.Tensor) -> torch.Tensor: - y = x.view(x.shape[0], 12, 64) - y = y.permute(1, 0, 2) - y = y.view(1, 12, x.shape[0], 64) - return y.permute(0, 1, 3, 2) - - exported_program = torch.export.export( - DynamicPermuteViewModule(), - (torch.randn(3, 1, 768),), - dynamic_shapes={"x": {0: torch.export.Dim("batch", min=1, max=8)}}, - ) - edge_program = to_edge( - exported_program, - compile_config=EdgeCompileConfig(_check_ir_validity=False), - ) - graph_module = edge_program.exported_program().graph_module - - result = cast( - PassResult, - PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView().call(graph_module), - ) - - self.assertTrue(result.modified) - self.assertEqual( - count_node(result.graph_module, exir_ops.edge.aten.view_copy.default), 2 - ) - self.assertEqual( - count_node(result.graph_module, exir_ops.edge.aten.permute_copy.default), 2 - ) - def test_negative_not_squeeze_like(self) -> None: """View that reshapes (not just squeeze/unsqueeze) should NOT be reordered.""" builder = GraphBuilder() diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp index 44fbc4bc8f6..05bdd9431c8 100644 --- 
a/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp @@ -9,36 +9,10 @@ #include #include -#include #include namespace vkcompute { -// -// Resize -// - -// resize_args = { block_config_ref } (unused here) -// -// Elementwise binary with broadcasting: output = broadcast(in_a, in_b). Without -// this the DynamicDispatchNode freezes the output at the build-time upper -// bound. Mirrors the fp32 resize_binary_op_node (same arg-group layout: inputs -// are args[1].refs[0] and [1]). -void resize_q8ta_binary_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in_a = args.at(1).refs.at(0); - const ValueRef in_b = args.at(1).refs.at(1); - - const std::vector a_sizes = graph->sizes_of(in_a); - const std::vector b_sizes = graph->sizes_of(in_b); - graph->virtual_resize( - out, calculate_broadcasted_output_size(a_sizes, b_sizes)); -} - // // Dispatch nodes // @@ -137,7 +111,7 @@ void add_q8ta_binary_node( // Resize args {block_config_ref}, // Resizing Logic - resize_q8ta_binary_node)); + nullptr)); } // diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp index b9f17021ea0..f6e89bef03d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include namespace vkcompute { @@ -219,51 +218,6 @@ ValueRef prepack_quantized_conv2d_weight( return packed_weight; } -// -// Resize -// - -// resize_args = { input, kernel_size, stride, padding, dilation } -// -// The q8ta_conv2d output is statically allocated at the build-time upper-bound -// shape. 
Without this resize function the DynamicDispatchNode would never -// virtual_resize the output on trigger_resize(), so a dynamic-shape graph would -// freeze the conv output at its upper bound — feeding e.g. a 238-row input into -// a 241-row buffer leaves garbage rows that GroupNorm's global statistics then -// smear across the whole tensor. Recompute H/W from the current input (N and C -// are shape-independent and stay as currently allocated). -void resize_q8ta_conv2d_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = resize_args.at(0); - const ValueRef kernel_size = resize_args.at(1); - const ValueRef stride = resize_args.at(2); - const ValueRef padding = resize_args.at(3); - const ValueRef dilation = resize_args.at(4); - - const std::vector in_sizes = graph->sizes_of(in); - - // H/W from the current input via the shared conv-output helper. kernel dims - // come from the kernel_size IntList (kernel_size_only=true); the args[3] slot - // is consulted only as an optional ceil_mode and dilation (non-bool) resolves - // it to false. transposed=false. - const std::vector out_hw = calc_out_sizes_hw( - *graph, - in_sizes, - kernel_size, - /*kernel_size_only=*/true, - {stride, padding, dilation, dilation}, - /*transposed=*/false); - - std::vector new_sizes = graph->sizes_of(out); - const size_t ndim = new_sizes.size(); - new_sizes.at(ndim - 2) = out_hw.at(0); - new_sizes.at(ndim - 1) = out_hw.at(1); - graph->virtual_resize(out, new_sizes); -} - // // Dispatch nodes // @@ -373,10 +327,8 @@ void add_q8ta_conv2d_node( push_constants, // Specialization Constants spec_constants, - // Resize args: { input, kernel_size, stride, padding, dilation } - {packed_int8_input, kernel_size, stride, padding, dilation}, - // Resize function: propagate dynamic H/W to the output. 
- resize_q8ta_conv2d_node)); + // Resize args + {})); } // diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h index 5d16cb3b78c..f463589c50a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h @@ -123,12 +123,7 @@ void add_q8ta_conv2d_pw_node( const ValueRef packed_bias, const uint32_t activation_type, const ValueRef packed_int8_output, - const int32_t groups = 1, - const ValueRef conv_input = kDummyValueRef, - const ValueRef kernel_size = kDummyValueRef, - const ValueRef stride = kDummyValueRef, - const ValueRef padding = kDummyValueRef, - const ValueRef dilation = kDummyValueRef); + const int32_t groups = 1); std::vector calculate_q8ta_im2col_sizes( ComputeGraph* graph, diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp index 914ca1a23ef..e690ff435a8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include namespace vkcompute { @@ -173,45 +172,6 @@ ValueRef prepack_quantized_conv2d_dw_weight( return packed_weight; } -// -// Resize -// - -// resize_args = { input, kernel_size, stride, padding, dilation } -// -// Depthwise conv output H/W follows the same formula as a regular conv (channel -// count is unchanged: groups == in_channels == out_channels). Without this the -// DynamicDispatchNode freezes the output at the build-time upper bound. N/C are -// shape-independent and stay as currently allocated. Mirrors the regular q8ta -// conv resize (resize_q8ta_conv2d_node). 
-void resize_q8ta_conv2d_dw_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = resize_args.at(0); - const ValueRef kernel_size = resize_args.at(1); - const ValueRef stride = resize_args.at(2); - const ValueRef padding = resize_args.at(3); - const ValueRef dilation = resize_args.at(4); - - const std::vector in_sizes = graph->sizes_of(in); - - const std::vector out_hw = calc_out_sizes_hw( - *graph, - in_sizes, - kernel_size, - /*kernel_size_only=*/true, - {stride, padding, dilation, dilation}, - /*transposed=*/false); - - std::vector new_sizes = graph->sizes_of(out); - const size_t ndim = new_sizes.size(); - new_sizes.at(ndim - 2) = out_hw.at(0); - new_sizes.at(ndim - 1) = out_hw.at(1); - graph->virtual_resize(out, new_sizes); -} - // // Dispatch nodes // @@ -298,10 +258,10 @@ void add_conv2d_dw_q8ta_q8csw_q8to_4w4c_node( push_constants, // Specialization Constants spec_constants, - // Resize args: { input, kernel_size, stride, padding, dilation } - {packed_int8_input, kernel_size, stride, padding, dilation}, + // Resize args + {}, // Resizing Logic - resize_q8ta_conv2d_dw_node)); + nullptr)); } void add_q8ta_conv2d_dw_node( @@ -403,10 +363,8 @@ void add_q8ta_conv2d_dw_node( push_constants, // Specialization Constants spec_constants, - // Resize args: { input, kernel_size, stride, padding, dilation } - {packed_int8_input, kernel_size, stride, padding, dilation}, - // Resizing Logic - resize_q8ta_conv2d_dw_node)); + // Resize args + {})); } // diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp index 9aa6e7b05d1..b43fe9eacc6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include namespace vkcompute { @@ -96,59 +95,6 @@ std::vector 
calculate_q8ta_im2col_sizes( return {K, H, W}; } -// -// Resize -// - -// resize_args = { input, kernel_size, stride, padding, dilation, groups } -// -// The im2col scratch tensor is [K, H_out, align_up_4(W_out)] where K (the -// flattened conv window, channel/kernel-derived) is shape-independent and -// H_out/W_out are the conv output spatial dims. The downstream PW GEMM that -// consumes this scratch is resized separately (it preserves H/W). Without this, -// the scratch freezes at the build-time upper bound and feeds garbage rows into -// the GEMM. Recompute H_out/W_out from the CURRENT input (NOT the conv output -// tensor, which may itself still be frozen at this point in the resize order). -void resize_q8ta_im2col_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef im2col_out = args.at(0).refs.at(0); - const ValueRef in = resize_args.at(0); - const ValueRef kernel_size = resize_args.at(1); - const ValueRef stride = resize_args.at(2); - const ValueRef padding = resize_args.at(3); - const ValueRef dilation = resize_args.at(4); - const ValueRef groups = resize_args.at(5); - - const std::vector in_sizes = graph->sizes_of(in); - - // Conv output H/W from the current input. - const std::vector out_hw = calc_out_sizes_hw( - *graph, - in_sizes, - kernel_size, - /*kernel_size_only=*/true, - {stride, padding, dilation, dilation}, - /*transposed=*/false); - const int64_t out_height = out_hw.at(0); - const int64_t out_width = out_hw.at(1); - - // K (flattened conv window) is shape-independent — recompute from channels + - // kernel exactly as calculate_q8ta_im2col_sizes does. 
- const int64_t in_channels = utils::val_at(-3, in_sizes); - const int64_t groups_val = graph->extract_scalar(groups); - const int64_t in_channels_per_group = in_channels / groups_val; - const auto kernel_size_list = graph->get_int_list(kernel_size); - const int64_t flattened_kernel_len = utils::align_up_4( - in_channels_per_group * kernel_size_list->at(0) * - kernel_size_list->at(1)); - const int64_t K = flattened_kernel_len * groups_val; - const int64_t W = utils::align_up_4(out_width); - - graph->virtual_resize(im2col_out, {K, out_height, W}); -} - // // Dispatch nodes // @@ -222,11 +168,10 @@ void add_q8ta_im2col_node( push_constants, // Specialization Constants spec_constants, - // Resize args: { input, kernel_size, stride, padding, dilation, groups } - {packed_int8_input, kernel_size, stride, padding, dilation, groups}, - // Resizing Logic: recompute the im2col scratch dims from the current - // input - resize_q8ta_im2col_node)); + // Resize args + {}, + // Resizing Logic + nullptr)); } // @@ -327,14 +272,7 @@ void q8ta_conv2d_im2col( packed_bias, activation_type_val, packed_int8_output, - groups_val, - // Original activation + conv geometry so the PW output H/W is recomputed - // from the true conv result, not the width-padded im2col scratch. 
- packed_int8_input, - kernel_size, - stride, - padding, - dilation); + groups_val); } REGISTER_OPERATORS { diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp index 4fb7f0fa775..7a2380f728a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include namespace vkcompute { @@ -182,69 +181,6 @@ ValueRef prepack_quantized_conv2d_pw_weight( return packed_weight; } -// -// Resize -// - -// resize_args = { input } -// -// Standalone 1x1 pointwise conv: stride 1, padding 0, dilation 1, so the output -// H/W equals the input activation H/W. Without this resize the output would -// freeze at the build-time upper bound. N/C are shape-independent and stay as -// currently allocated. -void resize_q8ta_conv2d_pw_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = resize_args.at(0); - - const std::vector in_sizes = graph->sizes_of(in); - std::vector new_sizes = graph->sizes_of(out); - const size_t out_ndim = new_sizes.size(); - const size_t in_ndim = in_sizes.size(); - // Copy H (dim -2) and W (dim -1) from the input; keep output N/C. - new_sizes.at(out_ndim - 2) = in_sizes.at(in_ndim - 2); - new_sizes.at(out_ndim - 1) = in_sizes.at(in_ndim - 1); - graph->virtual_resize(out, new_sizes); -} - -// resize_args = { conv_input, kernel_size, stride, padding, dilation } -// -// im2col-path PW conv. Here the PW node's bound input is the im2col scratch -// tensor sized {K, H_out, align_up_4(W_out)} — its width is rounded up to a -// multiple of 4 for texel alignment, so it must NOT be used to size the output. -// Recompute the TRUE conv H_out/W_out from the ORIGINAL activation + conv -// geometry, exactly as resize_q8ta_conv2d_node does. 
N/C are shape-independent -// and stay as currently allocated. -void resize_q8ta_conv2d_pw_im2col_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef conv_input = resize_args.at(0); - const ValueRef kernel_size = resize_args.at(1); - const ValueRef stride = resize_args.at(2); - const ValueRef padding = resize_args.at(3); - const ValueRef dilation = resize_args.at(4); - - const std::vector in_sizes = graph->sizes_of(conv_input); - - const std::vector out_hw = calc_out_sizes_hw( - *graph, - in_sizes, - kernel_size, - /*kernel_size_only=*/true, - {stride, padding, dilation, dilation}, - /*transposed=*/false); - - std::vector new_sizes = graph->sizes_of(out); - const size_t ndim = new_sizes.size(); - new_sizes.at(ndim - 2) = out_hw.at(0); - new_sizes.at(ndim - 1) = out_hw.at(1); - graph->virtual_resize(out, new_sizes); -} - // // Dispatch nodes // @@ -263,12 +199,7 @@ void add_q8ta_conv2d_pw_node( const ValueRef packed_bias, const uint32_t activation_type, const ValueRef packed_int8_output, - const int32_t groups, - const ValueRef conv_input, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation) { + const int32_t groups) { VK_CHECK_COND(q8ta_conv2d_check_4w4c_packed_dim_info( graph.packed_dim_info_of(packed_int8_input))); VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info( @@ -320,21 +251,6 @@ void add_q8ta_conv2d_pw_node( graph.hashed_layout_of(packed_int8_input), }; - // The im2col path passes the original activation + conv geometry so the - // output H/W can be recomputed from the true conv result (the bound input is - // the width-padded im2col scratch and must not size the output). The - // standalone 1x1 PW conv passes only its real activation input, whose H/W the - // output matches directly. 
- std::vector resize_args; - ExecuteNode::ResizeFunction resize_fn; - if (conv_input == kDummyValueRef) { - resize_args = {packed_int8_input}; - resize_fn = resize_q8ta_conv2d_pw_node; - } else { - resize_args = {conv_input, kernel_size, stride, padding, dilation}; - resize_fn = resize_q8ta_conv2d_pw_im2col_node; - } - graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), @@ -350,8 +266,7 @@ void add_q8ta_conv2d_pw_node( param_buffers, push_constants, spec_constants, - resize_args, - resize_fn)); + {})); } // diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp index 7e3c4166e3c..bdbdaa14fec 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp @@ -13,50 +13,10 @@ #include #include #include -#include #include namespace vkcompute { -// resize_args = { input, kernel_size, stride, padding, dilation, output_padding -// } -// -// Transposed conv output H/W uses the transposed formula -// out = (in - 1) * stride - 2 * padding + dilation * (kernel - 1) -// + output_padding + 1 -// (computed by calc_out_sizes_hw's transposed=true path, where the 4th args -// slot is output_padding). Channels stay as allocated. Without this the -// DynamicDispatchNode freezes the output at the build-time upper bound. Mirrors -// the fp32 transposed path of resize_conv2d_node. 
-void resize_q8ta_conv2d_transposed_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = resize_args.at(0); - const ValueRef kernel_size = resize_args.at(1); - const ValueRef stride = resize_args.at(2); - const ValueRef padding = resize_args.at(3); - const ValueRef dilation = resize_args.at(4); - const ValueRef output_padding = resize_args.at(5); - - const std::vector in_sizes = graph->sizes_of(in); - - const std::vector out_hw = calc_out_sizes_hw( - *graph, - in_sizes, - kernel_size, - /*kernel_size_only=*/true, - {stride, padding, dilation, output_padding}, - /*transposed=*/true); - - std::vector new_sizes = graph->sizes_of(out); - const size_t ndim = new_sizes.size(); - new_sizes.at(ndim - 2) = out_hw.at(0); - new_sizes.at(ndim - 1) = out_hw.at(1); - graph->virtual_resize(out, new_sizes); -} - // Dedicated workgroup size functions for transposed convolution. // Unlike regular conv2d, transposed conv with stride > 1 causes branch // divergence along the height dimension (different rows have different @@ -123,7 +83,6 @@ void add_q8ta_conv2d_transposed_node( const ValueRef stride, const ValueRef padding, const ValueRef dilation, - const ValueRef output_padding, const ValueRef groups, const uint32_t activation_type, const ValueRef packed_int8_output) { @@ -216,16 +175,8 @@ void add_q8ta_conv2d_transposed_node( push_constants, // Specialization Constants spec_constants, - // Resize args: { input, kernel_size, stride, padding, dilation, - // output_padding } - {packed_int8_input, - kernel_size, - stride, - padding, - dilation, - output_padding}, - // Resizing Logic - resize_q8ta_conv2d_transposed_node)); + // Resize args + {})); } void q8ta_conv2d_transposed( @@ -244,9 +195,7 @@ void q8ta_conv2d_transposed( const ValueRef kernel_size = args.at(idx++); const ValueRef stride = args.at(idx++); const ValueRef padding = args.at(idx++); - // output_padding does not 
affect the shader, but it IS needed to compute the - // transposed-conv output H/W on resize (dynamic shapes). - const ValueRef output_padding = args.at(idx++); + args.at(idx++); // output_padding: only affects output size, not shader const ValueRef dilation = args.at(idx++); const ValueRef groups = args.at(idx++); const ValueRef activation = args.at(idx++); @@ -306,7 +255,6 @@ void q8ta_conv2d_transposed( stride, padding, dilation, - output_padding, groups, activation_type_val, packed_int8_output); diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp index 92daf9d8ac5..210bd0cd78b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp @@ -63,34 +63,6 @@ utils::uvec3 q8ta_linear_local_wg_size( graph, shader, global_workgroup_size, args, resize_args); } -// -// Resize -// - -// resize_args = {} -// -// Quantized linear/matmul: output = [*input.shape[:-1], out_features]. The -// leading/M dims follow the input; out_features (the last dim) is -// weight-derived and shape-independent, so it stays as currently allocated. -// Without this the DynamicDispatchNode freezes the output (incl. the M dim) at -// the build-time upper bound. Mirrors the fp32 resize_linear_qw_node shape -// logic, generalized to arbitrary input rank. -void resize_q8ta_linear_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - std::vector new_sizes = graph->sizes_of(in); - const std::vector out_sizes = graph->sizes_of(out); - // Keep out_features (last dim, weight-derived); take all leading dims from - // in. 
- new_sizes.at(new_sizes.size() - 1) = out_sizes.at(out_sizes.size() - 1); - graph->virtual_resize(out, new_sizes); -} - // // Dispatch node // @@ -163,7 +135,7 @@ void add_q8ta_linear_node( // Resize args {}, // Resizing Logic - resize_q8ta_linear_node)); + nullptr)); } // diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp index fb0ffcab14c..bca36444725 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp @@ -13,21 +13,6 @@ namespace vkcompute { -// quantize / dequantize are elementwise: output shape == input shape. Without a -// resize function the DynamicDispatchNode freezes the output at the build-time -// upper bound, so on a dynamic-shape graph (e.g. a 238-row input fed to a -// 241-allocated graph) the FIRST quantize_per_tensor freezes everything -// downstream at 241. Propagate the input's current sizes to the output. -void resize_q8ta_qdq_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - graph->virtual_resize(out, graph->sizes_of(in)); -} - void add_q8ta_quantize_node( ComputeGraph& graph, const ValueRef fp_input, @@ -95,9 +80,7 @@ void add_q8ta_quantize_node( inp_block_config.as_packed_int(), outp_block_config.as_packed_int()}, // Resize args - {block_config_ref}, - // Resize function: output shape == input shape (elementwise). - resize_q8ta_qdq_node)); + {block_config_ref})); } void add_q8ta_dequantize_node( @@ -167,9 +150,7 @@ void add_q8ta_dequantize_node( outp_block_config.as_packed_int(), inp_block_config.as_packed_int()}, // Resize args - {block_config_ref}, - // Resize function: output shape == input shape (elementwise). 
- resize_q8ta_qdq_node)); + {block_config_ref})); } } // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp index 8709e4bdc2c..f7454b6b93a 100644 --- a/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp +++ b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp @@ -53,13 +53,7 @@ static std::string pick_conv2d_dw_shader_with_selector( if (is_3x3) { kernel_name += "_output_tile_3x3"; if (impl_selector == "b1x1") { - // The _b1x1 batch-tile variant exists only for the non-sned family; - // sned (stride != dilation) shaders are not batch-tiled. Match - // pick_conv2d_dw_shader and only append it when stride == dilation, - // otherwise fall back to the un-suffixed sned shader. - if (stride_equals_dilation) { - kernel_name += "_b1x1"; - } + kernel_name += "_b1x1"; } else if (impl_selector == "b4x2") { // b4x2 is the default (no suffix) } else { diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp index a282ebfb0ff..12d4ed61b76 100644 --- a/backends/vulkan/test/custom_ops/utils.cpp +++ b/backends/vulkan/test/custom_ops/utils.cpp @@ -1366,10 +1366,6 @@ ComputeGraph setup_compute_graph( int op_invocations_per_execute) { GraphConfig config; config.enable_querypool = true; - // Default-on (opt-out via TestCase::set_force_resize(false)): force every - // DynamicDispatchNode to run its resize function on each execute(), - // exercising the op's resize formula even when input shapes are unchanged. 
- config.force_resize = test_case.get_force_resize(); ComputeGraph graph(config); std::vector input_values; diff --git a/backends/vulkan/test/custom_ops/utils.h b/backends/vulkan/test/custom_ops/utils.h index 81bad5e9df0..d8fc36a5142 100644 --- a/backends/vulkan/test/custom_ops/utils.h +++ b/backends/vulkan/test/custom_ops/utils.h @@ -603,22 +603,6 @@ class TestCase { return target_execute_time_us_; } - // When true, the ComputeGraph built for this test case sets - // GraphConfig::force_resize, so every DynamicDispatchNode runs its resize - // function on each execute() even when no input shape changed. Because the - // output is already allocated at the swept shape, the resize must recompute - // the same shape from the current input — a wrong resize formula resizes the - // output to a mismatched shape and surfaces as a test failure. Default true - // (opt-out): every custom_ops test exercises its resize formulas across the - // swept shapes. Call set_force_resize(false) for the rare op whose resize fn - // is intentionally not shape-preserving under a fixed output allocation. 
- void set_force_resize(bool force_resize) { - force_resize_ = force_resize; - } - bool get_force_resize() const { - return force_resize_; - } - void add_input_spec(const ValueSpec& spec) { inputs_.push_back(spec); } @@ -664,7 +648,6 @@ class TestCase { shader_filter_ = kDefaultShaderFilter; op_invocations_per_execute_ = 0; target_execute_time_us_ = kDefaultTargetExecuteTimeUs; - force_resize_ = true; } private: @@ -677,7 +660,6 @@ class TestCase { std::vector shader_filter_; int op_invocations_per_execute_ = 0; // 0 = adaptive int target_execute_time_us_ = kDefaultTargetExecuteTimeUs; - bool force_resize_ = true; }; // diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index f7cd85f9758..957862935a4 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -38,10 +38,6 @@ set(WEBGPU_SRCS runtime/ops/sdpa/Sdpa.cpp runtime/ops/select_as_symint/SelectAsSymint.cpp runtime/ops/quantized_linear/QuantizedLinear.cpp - runtime/ops/mul/BinaryOp.cpp - runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp - runtime/ops/rope/RotaryEmbedding.cpp - runtime/ops/prepack/Prepack.cpp ) add_library(webgpu_backend ${WEBGPU_SRCS}) @@ -142,6 +138,7 @@ endfunction() if(EXECUTORCH_BUILD_WEBGPU_TEST) add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp) + add_webgpu_native_test(webgpu_rms_norm_test test/native/test_rms_norm.cpp) add_webgpu_native_test( webgpu_dispatch_order_test test/native/test_dispatch_order.cpp ) @@ -151,38 +148,4 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST) add_webgpu_native_test( webgpu_update_cache_test test/native/test_update_cache.cpp ) - - # Manifest-driven op-test framework: a generic gtest driver (webgpu_op_test) + - # its device-free util unit test. GTest needs -DEXECUTORCH_BUILD_TESTS=ON. 
- if(NOT TARGET GTest::gtest) - find_package(GTest QUIET) - endif() - if(TARGET GTest::gtest) - # Reuse add_webgpu_native_test for the backend link + frameworks + flags; - # add only driver_util, GTest, and the header-only nlohmann/json include. - add_webgpu_native_test(webgpu_op_test test/op_tests/op_test_driver.cpp) - target_sources(webgpu_op_test PRIVATE test/op_tests/driver_util.cpp) - target_link_libraries(webgpu_op_test PRIVATE GTest::gtest) - target_include_directories( - webgpu_op_test - PRIVATE "${EXECUTORCH_ROOT}/third-party/json/single_include" - ) - - # Device-free util unit test: no backend/Dawn link (pure manifest/tolerance - # helpers), so it does NOT use the native-test helper. - add_executable( - webgpu_op_test_util_test test/op_tests/test_driver_util.cpp - test/op_tests/driver_util.cpp - ) - target_include_directories( - webgpu_op_test_util_test - PRIVATE $ - "${EXECUTORCH_ROOT}/third-party/json/single_include" - ) - target_link_libraries( - webgpu_op_test_util_test PRIVATE GTest::gtest GTest::gtest_main - ) - target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions) - set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17) - endif() endif() diff --git a/backends/webgpu/runtime/WebGPUBackend.cpp b/backends/webgpu/runtime/WebGPUBackend.cpp index ceca89d1710..aed769da4a4 100644 --- a/backends/webgpu/runtime/WebGPUBackend.cpp +++ b/backends/webgpu/runtime/WebGPUBackend.cpp @@ -98,21 +98,20 @@ Error WebGPUBackend::execute( const size_t num_outputs = graph->output_ids().size(); // Copy inputs from EValue tensors to GPU buffers - std::vector inputs; + std::vector> inputs; inputs.reserve(num_inputs); for (size_t i = 0; i < num_inputs; i++) { const auto& tensor = args[i]->toTensor(); - const bool host_is_int64 = - tensor.scalar_type() == executorch::aten::ScalarType::Long; - inputs.push_back({tensor.const_data_ptr(), tensor.nbytes(), host_is_int64}); + inputs.emplace_back(tensor.const_data_ptr(), tensor.nbytes()); } + 
graph->copy_inputs(inputs); + // Fail loud as a runtime Error so a throw never crosses the backend boundary. try { - graph->copy_inputs(inputs); graph->update_symints_from_inputs(inputs); graph->propagate_resize(); } catch (const std::exception& e) { - ET_LOG(Error, "WebGPU input copy / symint refresh failed: %s", e.what()); + ET_LOG(Error, "WebGPU symint refresh/resize failed: %s", e.what()); return Error::Internal; } diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index b7fb4313400..1c977d130dd 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -26,10 +26,6 @@ namespace executorch::backends::webgpu { namespace { -// Op name the AOT exporter emits for a prepacked constant (must match the -// serialized schema); compared in the prepack pre-scan below. -constexpr const char* kPrepackOpName = "et_vk.prepack.default"; - size_t vk_datatype_size(vkgraph::VkDataType dtype) { switch (dtype) { case vkgraph::VkDataType::BOOL: @@ -49,19 +45,6 @@ size_t vk_datatype_size(vkgraph::VkDataType dtype) { } } -bool vk_datatype_is_int(vkgraph::VkDataType dtype) { - switch (dtype) { - case vkgraph::VkDataType::BOOL: - case vkgraph::VkDataType::UINT8: - case vkgraph::VkDataType::INT8: - case vkgraph::VkDataType::INT32: - case vkgraph::VkDataType::INT64: - return true; - default: - return false; - } -} - } // namespace WebGPUGraph::WebGPUGraph() = default; @@ -78,7 +61,7 @@ WGPUBuffer WebGPUGraph::create_scratch_buffer(size_t nbytes) { } void WebGPUGraph::update_symints_from_inputs( - const std::vector& inputs) { + const std::vector>& inputs) { for (const auto& src : symint_sources_) { int pos = -1; for (size_t i = 0; i < input_ids_.size(); i++) { @@ -117,8 +100,8 @@ void WebGPUGraph::update_symints_from_inputs( // Reads the [0,..,index,..,0] element; symint sources are scalar-ish. 
const int64_t offset = static_cast(index) * stride; // elem_size back-derived from build-time numel (sources are static-shaped). - const void* host = inputs[pos].data; - const size_t elem_size = inputs[pos].nbytes / static_cast(numel); + const void* host = inputs[pos].first; + const size_t elem_size = inputs[pos].second / static_cast(numel); int32_t val; if (elem_size == sizeof(int64_t)) { val = static_cast(static_cast(host)[offset]); @@ -234,10 +217,6 @@ void WebGPUGraph::build( const auto* graph = vkgraph::GetVkGraph(flatbuffer_data); - // .pte byte sources for prepack-time constant materialization (build-only). - constant_data_ = constant_data; - named_data_map_ = named_data_map; - // Phase 1: Create all values const auto* values = graph->values(); const int num_vals = values ? values->size() : 0; @@ -247,42 +226,6 @@ void WebGPUGraph::build( ints_.resize(num_vals, 0); doubles_.resize(num_vals, 0.0); bools_.resize(num_vals, false); - value_lists_.resize(num_vals); - - // Pre-scan the op chain: a constant may be DEFERRED (no eager GPU buffer; the - // prepack node materializes it once) only if it is a prepack source AND never - // a direct arg of a non-prepack op. ValueList args are expanded so a constant - // reached through a list still counts as a direct use. - std::unordered_set prepack_src_ids; - std::unordered_set direct_use_ids; - const auto* chain_prescan = graph->chain(); - if (chain_prescan) { - for (unsigned ci = 0; ci < chain_prescan->size(); ci++) { - const auto* oc = chain_prescan->Get(ci); - const bool is_prepack = oc->name()->str() == kPrepackOpName; - const auto* a = oc->args(); - if (!a) { - continue; - } - for (unsigned j = 0; j < a->size(); j++) { - int id = static_cast(a->Get(j)); - if (is_prepack && j == 0) { - prepack_src_ids.insert(id); - } else if (!is_prepack) { - direct_use_ids.insert(id); - const auto* v = values ? 
values->Get(id) : nullptr; - if (v && v->value_type() == vkgraph::GraphTypes::ValueList) { - const auto* items = v->value_as_ValueList()->items(); - if (items) { - for (unsigned k = 0; k < items->size(); k++) { - direct_use_ids.insert(static_cast(items->Get(k))); - } - } - } - } - } - } - } for (int i = 0; i < num_vals; i++) { const auto* val = values->Get(i); @@ -305,57 +248,56 @@ void WebGPUGraph::build( numel *= dims->Get(j); } } - tensor.elem_size = vk_datatype_size(vk_tensor->datatype()); - tensor.is_int = vk_datatype_is_int(vk_tensor->datatype()); - tensor.nbytes = numel * tensor.elem_size; + tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype()); int constant_id = vk_tensor->constant_id(); int mem_obj_id = vk_tensor->mem_obj_id(); - // Constants are dedicated. Every constant is recorded as a - // ConstantSource and materialized via materialize_constant (one - // CPU->GPU write); a constant consumed ONLY via prepack is deferred - // (no eager buffer -- its prepack node performs that one write). 
+ // Constants always get dedicated buffers regardless of mem_obj_id if (constant_id >= 0 || mem_obj_id < 0) { tensor_mem_obj_ids_[i] = -1; - - if (constant_id >= 0) { + WGPUBufferDescriptor buf_desc = {}; + buf_desc.size = std::max(tensor.nbytes, size_t(4)); + buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc; + buf_desc.mappedAtCreation = false; + tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc); + + if (constant_id >= 0 && constant_data && tensor.nbytes > 0) { const auto* constants = graph->constants(); - if (!constants || - constant_id >= static_cast(constants->size())) { - throw std::runtime_error( - "WebGPU: constant_id set but the constants table is missing " - "or the id is out of range"); - } - const auto* vk_bytes = constants->Get(constant_id); - ConstantSource cs; - cs.nbytes = tensor.nbytes; - if (vk_bytes->offset() != UINT64_MAX) { - cs.inline_offset = vk_bytes->offset(); - } else if (vk_bytes->named_key() != nullptr) { - cs.named_key = vk_bytes->named_key()->str(); - } else { - throw std::runtime_error( - "WebGPU: constant has no inline offset and no named-data key"); - } - constant_sources_[i] = std::move(cs); - } - - // Defer constants consumed solely via prepack: skip the eager buffer. - const bool defer = constant_id >= 0 && - prepack_src_ids.count(i) != 0 && direct_use_ids.count(i) == 0; - if (!defer) { - WGPUBufferDescriptor buf_desc = {}; - buf_desc.size = std::max(tensor.nbytes, size_t(4)); - buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | - WGPUBufferUsage_CopySrc; - buf_desc.mappedAtCreation = false; - tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc); - - // Same single CPU->GPU write the prepack node uses (no - // duplication). 
- if (constant_id >= 0) { - materialize_constant(i, tensor.buffer); + if (constants && + constant_id < static_cast(constants->size())) { + const auto* vk_bytes = constants->Get(constant_id); + if (vk_bytes->offset() != UINT64_MAX) { + const uint8_t* src = constant_data + vk_bytes->offset(); + wgpuQueueWriteBuffer( + queue_, tensor.buffer, 0, src, tensor.nbytes); + } else if ( + vk_bytes->named_key() != nullptr && + named_data_map != nullptr) { + // Constant stored in the PTE named-data map. + auto buf = + named_data_map->get_data(vk_bytes->named_key()->c_str()); + if (!buf.ok()) { + throw std::runtime_error( + std::string("WebGPU: named constant '") + + vk_bytes->named_key()->c_str() + + "' not found in NamedDataMap"); + } + if (buf->size() < tensor.nbytes) { + throw std::runtime_error( + std::string("WebGPU: named constant '") + + vk_bytes->named_key()->c_str() + "' undersized: have " + + std::to_string(buf->size()) + " bytes, need " + + std::to_string(tensor.nbytes)); + } + wgpuQueueWriteBuffer( + queue_, tensor.buffer, 0, buf->data(), tensor.nbytes); + buf->Free(); + } else { + throw std::runtime_error( + "WebGPU: constant has no inline offset and no named-data key"); + } } } } else { @@ -406,16 +348,6 @@ void WebGPUGraph::build( add_uniform_buffer_bytes(kSymIntUniformBytes); break; } - case vkgraph::GraphTypes::ValueList: { - value_types_[i] = ValueType::ValueList; - const auto* items = val->value_as_ValueList()->items(); - if (items) { - for (unsigned j = 0; j < items->size(); j++) { - value_lists_[i].push_back(static_cast(items->Get(j))); - } - } - break; - } default: value_types_[i] = ValueType::Null; break; @@ -492,47 +424,6 @@ void WebGPUGraph::build( webgpu_operator_registry().get_op_fn(op_name)(*this, args); } } - - // Prepack nodes (Phase 3) materialized their constants directly into the - // consumer buffers via materialize_constant; no separate copy pass needed. 
- // The .pte bytes are freed right after build() returns (WebGPUBackend - // processed->Free()), so clear the build-only source pointers. - constant_data_ = nullptr; - named_data_map_ = nullptr; -} - -void WebGPUGraph::materialize_constant(int const_value_id, WGPUBuffer dst) { - auto it = constant_sources_.find(const_value_id); - if (it == constant_sources_.end()) { - throw std::runtime_error( - "WebGPU: no source recorded for constant id " + - std::to_string(const_value_id)); - } - const ConstantSource& cs = it->second; - if (cs.nbytes == 0) { - return; - } - if (cs.inline_offset != UINT64_MAX) { - if (constant_data_ == nullptr) { - throw std::runtime_error("WebGPU: inline constant data is null"); - } - wgpuQueueWriteBuffer( - queue_, dst, 0, constant_data_ + cs.inline_offset, cs.nbytes); - } else if (!cs.named_key.empty() && named_data_map_ != nullptr) { - auto buf = named_data_map_->get_data(cs.named_key.c_str()); - if (!buf.ok()) { - throw std::runtime_error( - "WebGPU: named constant '" + cs.named_key + "' not found"); - } - if (buf->size() < cs.nbytes) { - throw std::runtime_error( - "WebGPU: named constant '" + cs.named_key + "' undersized"); - } - wgpuQueueWriteBuffer(queue_, dst, 0, buf->data(), cs.nbytes); - buf->Free(); - } else { - throw std::runtime_error("WebGPU: constant has no source"); - } } WGPUShaderModule WebGPUGraph::get_or_create_shader( @@ -593,47 +484,16 @@ WGPUBindGroupLayout WebGPUGraph::get_or_create_bgl( return bgl; } -void WebGPUGraph::copy_inputs(const std::vector& inputs) { +void WebGPUGraph::copy_inputs( + const std::vector>& inputs) { for (size_t i = 0; i < inputs.size() && i < input_ids_.size(); i++) { - const InputData& in = inputs[i]; - if (in.nbytes == 0) { + if (inputs[i].second == 0) { continue; } int tid = input_ids_[i]; const auto& tensor = tensors_[tid]; - - // Fast path: host and GPU element types match byte-for-byte. 
- if (in.nbytes == tensor.nbytes) { - wgpuQueueWriteBuffer(queue_, tensor.buffer, 0, in.data, tensor.nbytes); - continue; - } - - // Narrow int64 host indices into the int32 buffer (mirrors Vulkan). - const bool buffer_is_int32 = tensor.is_int && tensor.elem_size == 4; - if (in.host_is_int64 && buffer_is_int32 && in.nbytes == tensor.nbytes * 2) { - const size_t numel = tensor.nbytes / 4; - const int64_t* src = static_cast(in.data); - std::vector narrowed(numel); - for (size_t e = 0; e < numel; e++) { -#ifndef NDEBUG - // Index tensors (tokens/positions) are far below int32 range in - // practice; assert in debug that the narrowing is lossless. - if (static_cast(src[e]) != src[e]) { - throw std::runtime_error("WebGPU: int64 index overflows int32"); - } -#endif - narrowed[e] = static_cast(src[e]); - } - wgpuQueueWriteBuffer( - queue_, tensor.buffer, 0, narrowed.data(), tensor.nbytes); - continue; - } - - throw std::runtime_error( - "WebGPU: unsupported input copy for input " + std::to_string(i) + - " (host " + std::to_string(in.nbytes) + " bytes" + - (in.host_is_int64 ? " int64" : "") + " vs buffer " + - std::to_string(tensor.nbytes) + " bytes)"); + wgpuQueueWriteBuffer( + queue_, tensor.buffer, 0, inputs[i].first, inputs[i].second); } } @@ -855,11 +715,10 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const { for (size_t i = 0; i < value_types_.size(); i++) { if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) { stats.num_tensors++; - // Shared tensors are tracked via shared_buffer_sizes_; a deferred - // prepack-routed constant has no buffer (no GPU memory) -> not counted. 
+ // Shared tensors are tracked via shared_buffer_sizes_ bool is_shared = i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0; - if (!is_shared && tensors_[i].buffer != nullptr) { + if (!is_shared) { stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes; } } diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h index 3572f751a06..3cff09ecb6d 100644 --- a/backends/webgpu/runtime/WebGPUGraph.h +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -25,16 +25,6 @@ struct WebGPUTensor { WGPUBuffer buffer = nullptr; std::vector dims; size_t nbytes = 0; - // Serialized (GPU-side) element type, used to narrow wider host inputs. - size_t elem_size = 0; - bool is_int = false; -}; - -// Host-side view of one graph input, passed to copy_inputs. -struct InputData { - const void* data = nullptr; - size_t nbytes = 0; - bool host_is_int64 = false; }; struct WebGPUDispatch { @@ -50,15 +40,6 @@ struct OutputCopy { size_t nbytes = 0; }; -// CPU-side record for a prepack-routed constant; mirrors Vulkan's TensorRef -// (sizes + a data reference, not a live GPU tensor). The prepack node is the -// sole materialization, so the constant needs no eager GPU buffer. -struct ConstantSource { - uint64_t inline_offset = UINT64_MAX; // offset into constant_data_; else key - std::string named_key; // non-empty => fetch from named_data_map_ - size_t nbytes = 0; -}; - struct ExecuteConfig { size_t chunk_size = 0; size_t initial_chunk_size = 0; @@ -94,7 +75,7 @@ class WebGPUGraph { const executorch::runtime::NamedDataMap* named_data_map = nullptr); // Copy input tensor data from host pointers into GPU buffers. - void copy_inputs(const std::vector& inputs); + void copy_inputs(const std::vector>& inputs); // Execute all recorded dispatches. void execute(); @@ -128,10 +109,6 @@ class WebGPUGraph { bool get_bool(int id) const { return bools_[id]; } - // Member value ids of a serialized ValueList (op multi-output list). 
- const std::vector& get_value_list(int id) const { - return value_lists_[id]; - } // Live-scalar (SymInt) API; mirrors the Vulkan SymInt/ParamsBuffer UBO. // set_symint writes the buffer + marks dirty only if the value changed. @@ -161,7 +138,8 @@ class WebGPUGraph { } // Execute-time select_as_symint read; mirrors Vulkan select_as_symint_impl. - void update_symints_from_inputs(const std::vector& inputs); + void update_symints_from_inputs( + const std::vector>& inputs); // Per-SymInt resize hook; mirrors Vulkan DynamicDispatchNode::trigger_resize. void add_resize_hook(int symint_id, std::function fn) { @@ -189,11 +167,6 @@ class WebGPUGraph { dispatches_.push_back(dispatch); } - // Materialize a recorded prepack-routed constant into dst via one CPU->GPU - // transfer. Build-time only (the .pte bytes are freed after build()). - // Mirrors Vulkan prepack_standard. - void materialize_constant(int const_value_id, WGPUBuffer dst); - void add_uniform_buffer_bytes(size_t bytes) { uniform_buffer_bytes_ += bytes; } @@ -233,16 +206,7 @@ class WebGPUGraph { return static_cast(value_types_.size()); } - enum class ValueType { - Tensor, - Int, - Double, - Bool, - Null, - String, - SymInt, - ValueList - }; + enum class ValueType { Tensor, Int, Double, Bool, Null, String, SymInt }; ValueType get_value_type(int id) const { return value_types_[id]; @@ -260,7 +224,6 @@ class WebGPUGraph { std::vector ints_; std::vector doubles_; std::vector bools_; - std::vector> value_lists_; // SymInt (live scalar): id -> {live Uniform buffer, current value}, sparse. struct SymIntSlot { @@ -300,13 +263,6 @@ class WebGPUGraph { std::vector dispatches_; - // Prepack-routed constant sources (offset/named-key + size); the prepack node - // materializes these once. constant_data_/named_data_map_ point at the .pte - // bytes and are valid only during build(). 
- const uint8_t* constant_data_ = nullptr; - const executorch::runtime::NamedDataMap* named_data_map_ = nullptr; - std::unordered_map constant_sources_; - ExecuteConfig execute_config_; // Caches for reusing GPU objects across dispatches. diff --git a/backends/webgpu/runtime/WebGPUUtils.h b/backends/webgpu/runtime/WebGPUUtils.h index 39eb3caa28b..690ea72ebf7 100644 --- a/backends/webgpu/runtime/WebGPUUtils.h +++ b/backends/webgpu/runtime/WebGPUUtils.h @@ -12,7 +12,6 @@ #include #include -#include #include #include @@ -49,25 +48,4 @@ inline uint32_t compute_1d_workgroup_count( return count; } -// Create a uniform buffer mapped-at-creation, copy `size` bytes in, and unmap. -inline WGPUBuffer -make_uniform(WGPUDevice device, const void* data, size_t size) { - WGPUBufferDescriptor desc = {}; - desc.size = size; - desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst; - desc.mappedAtCreation = true; - WGPUBuffer buf = wgpuDeviceCreateBuffer(device, &desc); - if (!buf) { - throw std::runtime_error("make_uniform: buffer creation failed"); - } - void* ptr = wgpuBufferGetMappedRange(buf, 0, size); - if (!ptr) { - wgpuBufferRelease(buf); - throw std::runtime_error("make_uniform: mapped range is null"); - } - std::memcpy(ptr, data, size); - wgpuBufferUnmap(buf); - return buf; -} - } // namespace executorch::backends::webgpu::utils diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh index 84b5349ef2d..28d4e8fef91 100644 --- a/backends/webgpu/scripts/test_webgpu_native_ci.sh +++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh @@ -8,9 +8,8 @@ # Build + run the WebGPU native test executables on Dawn (Tint) + SwiftShader. # This is the substantive op-coverage gate: unlike the python operators suite # (which only delegates add.Tensor to WebGPU, the rest CPU-fallback), these -# executables run quantized_linear / SDPA / update_cache / multi-dispatch -# ordering / scratch through the real WebGPU backend on Dawn. 
(Simple ops — -# add / rms_norm / the misc ops — run through the cases.py op-test framework.) +# executables run rms_norm / multi-dispatch ordering / scratch through the real +# WebGPU backend on Dawn. # # Assumes the Dawn env is already sourced (Dawn_DIR + VK_ICD_FILENAMES + # LD_LIBRARY_PATH) via .ci/scripts/setup-webgpu-linux-deps.sh. For local runs: @@ -18,9 +17,9 @@ # bash backends/webgpu/scripts/test_webgpu_native_ci.sh # # Builds whatever native test targets are present in the landed tree (NOT a fixed -# list): webgpu_native_test (base) + webgpu_dispatch_order_test, -# webgpu_scratch_buffer_test (D107576199) + webgpu_update_cache_test -# (D107547307). SDPA executables join once they land. +# list). This stack lands: webgpu_native_test, webgpu_rms_norm_test (base) + +# webgpu_dispatch_order_test, webgpu_scratch_buffer_test (D107576199) + +# webgpu_update_cache_test (D107547307). SDPA executables join once they land. set -e @@ -38,31 +37,22 @@ fi cd "${EXECUTORCH_ROOT}" # ── Exports for the model-driven executables (best-effort) ─────────────────── -# native_test (quantized_linear/SDPA/update_cache) + dispatch_order read .pte/ -# golden inputs via env/dir and self-skip if absent; scratch is standalone. -# native_test itself is gated below on the executorch wheel being importable. +# native_test + rms_norm + dispatch_order read .pte/golden inputs via env/dir and +# self-skip if absent; scratch is standalone (generates its own inputs). 
+PTE_MODEL="/tmp/webgpu_add_test.pte" +PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte" +RMS_NORM_DIR="/tmp/rmsn" +RMS_NORM_OK=1 DISPATCH_ORDER_DIR="/tmp/dispatch_order" DISPATCH_ORDER_OK=1 UPDATE_CACHE_DIR="/tmp/update_cache" UPDATE_CACHE_OK=1 -EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte" -EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin" -EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin" -EMBEDDING_LLAMA1B_MODEL="/tmp/webgpu_embedding_q4gsw_llama1b.pte" -EMBEDDING_LLAMA1B_INDICES="/tmp/webgpu_embedding_q4gsw_llama1b_indices.bin" -EMBEDDING_LLAMA1B_GOLDEN="/tmp/webgpu_embedding_q4gsw_llama1b_golden.bin" -ROPE_MODEL="/tmp/webgpu_rope.pte" -ROPE_XQ_GOLDEN="/tmp/webgpu_rope_xq_golden.bin" -ROPE_XK_GOLDEN="/tmp/webgpu_rope_xk_golden.bin" -ROPE_DECODE_MODEL="/tmp/webgpu_rope_decode.pte" -ROPE_DECODE_XQ_GOLDEN="/tmp/webgpu_rope_decode_xq_golden.bin" -ROPE_DECODE_XK_GOLDEN="/tmp/webgpu_rope_decode_xk_golden.bin" -PREPACK_MODEL="/tmp/webgpu_prepack.pte" -PREPACK_GOLDEN="/tmp/webgpu_prepack_golden.bin" -PREPACK2_MODEL="/tmp/webgpu_prepack_two_const.pte" -PREPACK2_GOLDEN="/tmp/webgpu_prepack_two_const_golden.bin" -PREPACK_TIED_MODEL="/tmp/webgpu_prepack_tied_const.pte" -PREPACK_TIED_GOLDEN="/tmp/webgpu_prepack_tied_const_golden.bin" + +$PYTHON_EXECUTABLE -c " +from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model +export_add_model('${PTE_MODEL}') +export_chained_add_model('${PTE_CHAINED_MODEL}') +" || echo "WARN: add export failed; webgpu_native_test self-skips models whose .pte is absent" $PYTHON_EXECUTABLE -c " from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import export_all_quantized_linear_models @@ -70,23 +60,9 @@ export_all_quantized_linear_models('/tmp') " || echo "WARN: q4gsw export failed; required configs will FAIL in webgpu_native_test" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.embedding_q4gsw.test_embedding_q4gsw import 
export_embedding_q4gsw_model -export_embedding_q4gsw_model('${EMBEDDING_MODEL}', '${EMBEDDING_GOLDEN}', '${EMBEDDING_INDICES}') -export_embedding_q4gsw_model('${EMBEDDING_LLAMA1B_MODEL}', '${EMBEDDING_LLAMA1B_GOLDEN}', '${EMBEDDING_LLAMA1B_INDICES}', 'llama1b') -" || echo "WARN: embedding_q4gsw export failed; embedding configs will FAIL in webgpu_native_test" - -$PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.rope.test_rope import export_rope_model -export_rope_model('${ROPE_MODEL}', '${ROPE_XQ_GOLDEN}', '${ROPE_XK_GOLDEN}') -export_rope_model('${ROPE_DECODE_MODEL}', '${ROPE_DECODE_XQ_GOLDEN}', '${ROPE_DECODE_XK_GOLDEN}', 'decode') -" || echo "WARN: rope export failed; apply_rotary_emb configs will FAIL in webgpu_native_test" - -$PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.prepack.test_prepack import export_prepack_model, export_prepack_two_const_model, export_prepack_tied_const_model -export_prepack_model('${PREPACK_MODEL}', '${PREPACK_GOLDEN}') -export_prepack_two_const_model('${PREPACK2_MODEL}', '${PREPACK2_GOLDEN}') -export_prepack_tied_const_model('${PREPACK_TIED_MODEL}', '${PREPACK_TIED_GOLDEN}') -" || echo "WARN: prepack export failed; prepack configs will FAIL in webgpu_native_test" +from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases +export_rms_norm_cases('${RMS_NORM_DIR}') +" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; } $PYTHON_EXECUTABLE -c " from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases @@ -136,7 +112,7 @@ cmake \ "${EXECUTORCH_ROOT}" # ── Build + run every native test target that exists in this tree ──────────── -TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test) +TARGETS=(webgpu_native_test webgpu_rms_norm_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test) 
BIN_DIR="${BUILD_DIR}/backends/webgpu" # Which targets are defined depends on which diffs are landed (native_test + @@ -165,35 +141,20 @@ for t in "${TARGETS[@]}"; do done echo "=== Run native tests on Dawn + SwiftShader ===" -# webgpu_native_test hosts the quantized_linear / SDPA / update_cache / symint -# sweeps. Gate on the executorch wheel being importable (the proxy for "the -# exports above ran"): CI has the wheel so they ran; a bare local run without it -# skips here rather than hard-failing the required-config guards. -if [[ -x "${BIN_DIR}/webgpu_native_test" ]] && - "${PYTHON_EXECUTABLE}" -c "import executorch" 2>/dev/null; then - env WEBGPU_TEST_SDPA_DIR=/tmp/ \ +# native_test is model-driven; only run it if the export produced its .pte +# (CI's setup-linux.sh provides the executorch wheel so exports succeed; a bare +# local run without the wheel self-skips here rather than hard-failing on load). +if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then + env WEBGPU_TEST_MODEL="${PTE_MODEL}" \ + WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \ + WEBGPU_TEST_SDPA_DIR=/tmp/ \ WEBGPU_TEST_QUANTIZED_LINEAR_DIR=/tmp/ \ - WEBGPU_TEST_EMBEDDING_Q4GSW_MODEL="${EMBEDDING_MODEL}" \ - WEBGPU_TEST_EMBEDDING_Q4GSW_INDICES="${EMBEDDING_INDICES}" \ - WEBGPU_TEST_EMBEDDING_Q4GSW_GOLDEN="${EMBEDDING_GOLDEN}" \ - WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_MODEL="${EMBEDDING_LLAMA1B_MODEL}" \ - WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_INDICES="${EMBEDDING_LLAMA1B_INDICES}" \ - WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_GOLDEN="${EMBEDDING_LLAMA1B_GOLDEN}" \ - WEBGPU_TEST_ROPE_MODEL="${ROPE_MODEL}" \ - WEBGPU_TEST_ROPE_XQ_GOLDEN="${ROPE_XQ_GOLDEN}" \ - WEBGPU_TEST_ROPE_XK_GOLDEN="${ROPE_XK_GOLDEN}" \ - WEBGPU_TEST_ROPE_DECODE_MODEL="${ROPE_DECODE_MODEL}" \ - WEBGPU_TEST_ROPE_DECODE_XQ_GOLDEN="${ROPE_DECODE_XQ_GOLDEN}" \ - WEBGPU_TEST_ROPE_DECODE_XK_GOLDEN="${ROPE_DECODE_XK_GOLDEN}" \ - WEBGPU_TEST_PREPACK_MODEL="${PREPACK_MODEL}" \ - 
WEBGPU_TEST_PREPACK_GOLDEN="${PREPACK_GOLDEN}" \ - WEBGPU_TEST_PREPACK2_MODEL="${PREPACK2_MODEL}" \ - WEBGPU_TEST_PREPACK2_GOLDEN="${PREPACK2_GOLDEN}" \ - WEBGPU_TEST_PREPACK_TIED_MODEL="${PREPACK_TIED_MODEL}" \ - WEBGPU_TEST_PREPACK_TIED_GOLDEN="${PREPACK_TIED_GOLDEN}" \ "${BIN_DIR}/webgpu_native_test" else - echo "(skipping webgpu_native_test: executorch wheel absent — exports did not run)" + echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)" +fi +if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then + "${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}" fi if [[ "${UPDATE_CACHE_OK}" == "1" && -x "${BIN_DIR}/webgpu_update_cache_test" ]]; then "${BIN_DIR}/webgpu_update_cache_test" "${UPDATE_CACHE_DIR}" @@ -204,25 +165,3 @@ fi [[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test" echo "=== WebGPU native tests on Dawn: all run targets passed ===" - -# ── Op-test codegen framework: generate manifest → build → run (Dawn+SwiftShader) ── -# Reconfigure the SAME build dir adding GTest (EXECUTORCH_BUILD_TESTS=ON), then run -# every op in cases.py against its torch golden. Self-skips if the generator can't run. 
-OP_TEST_DIR="/tmp/webgpu_op_tests" -if $PYTHON_EXECUTABLE -m executorch.backends.webgpu.test.op_tests.generate_op_tests \ - --output "${OP_TEST_DIR}"; then - echo "=== Reconfigure with GTest + build/run op-test framework ===" - cmake -DEXECUTORCH_BUILD_TESTS=ON -B "${BUILD_DIR}" "${EXECUTORCH_ROOT}" - OP_DEFINED="$(cmake --build "${BUILD_DIR}" --target help 2>/dev/null || true)" - if printf '%s\n' "${OP_DEFINED}" | grep -qw webgpu_op_test_util_test; then - cmake --build "${BUILD_DIR}" --target webgpu_op_test_util_test -j"${NPROC}" - "${BIN_DIR}/webgpu_op_test_util_test" - fi - if printf '%s\n' "${OP_DEFINED}" | grep -qw webgpu_op_test; then - cmake --build "${BUILD_DIR}" --target webgpu_op_test -j"${NPROC}" - "${BIN_DIR}/webgpu_op_test" --manifest "${OP_TEST_DIR}/manifest.json" - fi - echo "=== WebGPU op-test framework on Dawn: passed ===" -else - echo "WARN: op-test manifest generation failed (needs the executorch wheel); skipping" -fi diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh index 5ea465e853b..6681499c055 100755 --- a/backends/webgpu/test/test_build_webgpu.sh +++ b/backends/webgpu/test/test_build_webgpu.sh @@ -26,18 +26,36 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/test_wgsl_codegen.py" -v echo "=== Step 1: Run Python export tests ===" $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v -$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v +# Non-fatal: a rms_norm pytest failure skips the rms_norm native test below +# rather than aborting the whole run. 
+RMS_NORM_PYTEST_OK=1 +$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v \ + || RMS_NORM_PYTEST_OK=0 # ── Step 2: Export .pte model ───────────────────────────────────────────────── echo "=== Step 2: Export test models ===" +PTE_MODEL="/tmp/webgpu_add_test.pte" +PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte" +RMS_NORM_DIR="/tmp/rmsn" DISPATCH_ORDER_DIR="/tmp/dispatch_order" PTE_UPDATE_CACHE_MODEL="/tmp/webgpu_update_cache_test.pte" cd "${EXECUTORCH_ROOT}" $PYTHON_EXECUTABLE -c " +from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model +export_add_model('${PTE_MODEL}') +export_chained_add_model('${PTE_CHAINED_MODEL}') +" +$PYTHON_EXECUTABLE -c " from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases export_dispatch_order_cases('${DISPATCH_ORDER_DIR}') " +if [[ "${RMS_NORM_PYTEST_OK}" == "1" ]]; then + $PYTHON_EXECUTABLE -c " +from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases +export_rms_norm_cases('${RMS_NORM_DIR}') +" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_PYTEST_OK=0; } +fi echo "=== Export update_cache model ===" UPDATE_CACHE_OK=1 @@ -95,6 +113,7 @@ cmake \ "${EXECUTORCH_ROOT}" cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC} +cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_rms_norm_test -j${NPROC} cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_dispatch_order_test -j${NPROC} cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_scratch_buffer_test -j${NPROC} @@ -106,10 +125,18 @@ else echo "(skipping update_cache native test: export did not complete)" fi env \ + WEBGPU_TEST_MODEL="${PTE_MODEL}" \ + WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \ ${UPDATE_CACHE_ENV_VAR} \ WEBGPU_TEST_SDPA_DIR=/tmp/ \ "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test" +if [[ "${RMS_NORM_PYTEST_OK}" == "1" ]]; then + 
"${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_rms_norm_test" "${RMS_NORM_DIR}" +else + echo "(skipping rms_norm native test: pytest or export did not complete)" +fi + "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}" "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_scratch_buffer_test" diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index ad7ad2f2fc2..ef643d33482 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -24,6 +24,118 @@ using namespace executorch::backends::webgpu; using namespace executorch::extension; using namespace executorch::runtime; +static bool test_single_add(const std::string& model_path) { + printf("\n--- Test: single add (1024x1024) ---\n"); + + Module module(model_path); + auto err = module.load_forward(); + if (err != Error::Ok) { + printf("FAIL: could not load forward method (error %d)\n", (int)err); + return false; + } + printf("Model loaded: %s\n", model_path.c_str()); + + constexpr int dim = 1024; + constexpr int size = dim * dim; + + std::vector a_data(size); + std::vector b_data(size); + for (int i = 0; i < size; i++) { + a_data[i] = static_cast(i) * 1.0f; + b_data[i] = static_cast(i) * 2.0f; + } + + auto a = make_tensor_ptr({dim, dim}, std::vector(a_data)); + auto b = make_tensor_ptr({dim, dim}, std::vector(b_data)); + + auto result = module.forward({EValue(a), EValue(b)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + + const auto& outputs = result.get(); + if (outputs.empty() || !outputs[0].isTensor()) { + printf("FAIL: no tensor output\n"); + return false; + } + + const auto& out_tensor = outputs[0].toTensor(); + const float* out_data = out_tensor.const_data_ptr(); + + float max_error = 0.0f; + int check_count = std::min(size, 1024); + for (int i = 0; i < check_count; i++) { + float expected = a_data[i] + b_data[i]; + float 
error = std::abs(out_data[i] - expected); + max_error = std::max(max_error, error); + } + + printf("Max error: %e (checked %d elements)\n", max_error, check_count); + if (max_error > 1e-3f) { + printf("FAIL: max error exceeds tolerance 1e-3\n"); + return false; + } + printf("PASS: single add test\n"); + return true; +} + +static bool test_chained_add(const std::string& model_path) { + printf("\n--- Test: chained add (1024x1024, 5 ops) ---\n"); + + Module module(model_path); + auto err = module.load_forward(); + if (err != Error::Ok) { + printf("FAIL: could not load forward method (error %d)\n", (int)err); + return false; + } + printf("Model loaded: %s\n", model_path.c_str()); + + constexpr int dim = 1024; + constexpr int size = dim * dim; + + std::vector x_data(size); + std::vector y_data(size); + for (int i = 0; i < size; i++) { + x_data[i] = static_cast(i % 100) * 0.01f; + y_data[i] = static_cast(i % 50) * 0.02f; + } + + auto x = make_tensor_ptr({dim, dim}, std::vector(x_data)); + auto y = make_tensor_ptr({dim, dim}, std::vector(y_data)); + + auto result = module.forward({EValue(x), EValue(y)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + + const auto& outputs = result.get(); + if (outputs.empty() || !outputs[0].isTensor()) { + printf("FAIL: no tensor output\n"); + return false; + } + + // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y + const auto& out_tensor = outputs[0].toTensor(); + const float* out_data = out_tensor.const_data_ptr(); + + float max_error = 0.0f; + for (int i = 0; i < size; i++) { + float expected = 3.0f * x_data[i] + 3.0f * y_data[i]; + float error = std::abs(out_data[i] - expected); + max_error = std::max(max_error, error); + } + + printf("Max error: %e (checked %d elements)\n", max_error, size); + if (max_error > 1e-3f) { + printf("FAIL: max error exceeds tolerance 1e-3\n"); + return false; + } + printf("PASS: chained add test\n"); + return true; +} + #ifdef 
WGPU_BACKEND_ENABLE_PROFILING // Capacity-overrun must throw; runs without a device or TimestampQuery. static bool test_query_pool_overrun_throws() { @@ -295,112 +407,7 @@ static float q4gsw_ramp(int i) { return static_cast((i % 17) - 8) / 16.0f; } -// Fwd decl of the per-element abs-OR-rel tolerance helper (defined below). -static bool quant_within_tol( - const float* out, - const float* golden, - int n, - float atol, - float rtol, - float* ma, - float* mr); - -static std::vector load_indices( - const std::string& path, - size_t numel) { - // Load raw little-endian int32 indices written by the export .py. - std::vector g(numel); - FILE* f = std::fopen(path.c_str(), "rb"); - if (!f) { - return {}; - } - size_t n = std::fread(g.data(), sizeof(int32_t), numel, f); - std::fclose(f); - if (n != numel) { - return {}; - } - return g; -} - -static bool test_embedding_q4gsw( - const std::string& model_path, - const std::string& indices_path, - const std::string& golden_path, - int num_indices, - int embed, - const char* label) { - // q4gsw embedding-gather vs torch golden; shapes per test_embedding_q4gsw.py. - const int out_numel = num_indices * embed; - printf( - "\n--- Test: embedding_q4gsw (%s: indices=%d, embed=%d) ---\n", - label, - num_indices, - embed); - - Module module(model_path); - auto err = module.load_forward(); - if (err != Error::Ok) { - printf("FAIL: could not load forward method (error %d)\n", (int)err); - return false; - } - printf("Model loaded: %s\n", model_path.c_str()); - - std::vector idx32 = load_indices(indices_path, num_indices); - std::vector golden = load_golden(golden_path, out_numel); - if (idx32.empty() || golden.empty()) { - printf( - "FAIL: could not load indices %s / golden %s\n", - indices_path.c_str(), - golden_path.c_str()); - return false; - } - - // int64 at the program boundary; copy_inputs narrows to the int32 buffer. 
- std::vector idx64(idx32.begin(), idx32.end()); - auto idx = make_tensor_ptr({num_indices}, std::move(idx64)); - - auto result = module.forward({EValue(idx)}); - if (!result.ok()) { - printf("FAIL: forward failed (error %d)\n", (int)result.error()); - return false; - } - const auto& outputs = result.get(); - if (outputs.empty() || !outputs[0].isTensor()) { - printf("FAIL: no tensor output\n"); - return false; - } - const auto& out_tensor = outputs[0].toTensor(); - if (out_tensor.numel() != out_numel) { - printf( - "FAIL: output numel %zu != expected %d\n", - (size_t)out_tensor.numel(), - out_numel); - return false; - } - const float* out_data = out_tensor.const_data_ptr(); - - float max_abs_err = 0.0f, max_rel_err = 0.0f; - const bool pass = quant_within_tol( - out_data, - golden.data(), - out_numel, - 1e-3f, - 1e-3f, - &max_abs_err, - &max_rel_err); - printf( - "Max abs error: %e Max rel error: %e (checked %d elements)\n", - max_abs_err, - max_rel_err, - out_numel); - if (!pass) { - printf("FAIL: embedding_q4gsw exceeds tolerance 1e-3 (abs AND rel)\n"); - return false; - } - printf("PASS: embedding_q4gsw test\n"); - return true; -} - +// Per-element dual tolerance (abs OR rel), parameterized like sdpa_within_tol. static bool quant_within_tol( const float* out, const float* golden, @@ -425,185 +432,6 @@ static bool quant_within_tol( return ok; } -static bool test_rope( - const std::string& model_path, - const std::string& xq_golden_path, - const std::string& xk_golden_path, - int S, - int NH, - int NKV, - int HD, - const char* label) { - // Llama interleaved RoPE vs torch goldens; shapes/ramps per test_rope.py. 
- const int xq_numel = S * NH * HD; - const int xk_numel = S * NKV * HD; - const int freqs_numel = S * (HD / 2); - printf( - "\n--- Test: apply_rotary_emb (%s: S=%d,NH=%d,NKV=%d,HD=%d) ---\n", - label, - S, - NH, - NKV, - HD); - - Module module(model_path); - auto err = module.load_forward(); - if (err != Error::Ok) { - printf("FAIL: could not load forward method (error %d)\n", (int)err); - return false; - } - printf("Model loaded: %s\n", model_path.c_str()); - - // ((i % mod) - off) / 16: exact in fp32, matches test_rope.py::_ramp. - auto ramp = [](int i, int mod, int off) { - return static_cast((i % mod) - off) / 16.0f; - }; - std::vector xq(xq_numel), xk(xk_numel), fc(freqs_numel), - fs(freqs_numel); - for (int i = 0; i < xq_numel; i++) { - xq[i] = ramp(i, 17, 8); - } - for (int i = 0; i < xk_numel; i++) { - xk[i] = ramp(i, 13, 6); - } - for (int i = 0; i < freqs_numel; i++) { - fc[i] = ramp(i, 11, 5); - fs[i] = ramp(i, 7, 3); - } - - auto xqt = make_tensor_ptr({1, S, NH, HD}, std::vector(xq)); - auto xkt = make_tensor_ptr({1, S, NKV, HD}, std::vector(xk)); - auto fct = make_tensor_ptr({S, HD / 2}, std::vector(fc)); - auto fst = make_tensor_ptr({S, HD / 2}, std::vector(fs)); - - auto result = - module.forward({EValue(xqt), EValue(xkt), EValue(fct), EValue(fst)}); - if (!result.ok()) { - printf("FAIL: forward failed (error %d)\n", (int)result.error()); - return false; - } - const auto& outputs = result.get(); - - // Outputs in graph order [0]=xq_out, [1]=xk_out (positional; the numel check - // below guards a swap, since NH != NKV under GQA). 
- if (outputs.size() < 2 || !outputs[0].isTensor() || !outputs[1].isTensor()) { - printf("FAIL: expected 2 tensor outputs, got %zu\n", outputs.size()); - return false; - } - const auto& xq_t = outputs[0].toTensor(); - const auto& xk_t = outputs[1].toTensor(); - if (xq_t.numel() != xq_numel || xk_t.numel() != xk_numel) { - printf( - "FAIL: output shapes [%zu,%zu] != expected [%d,%d]\n", - (size_t)xq_t.numel(), - (size_t)xk_t.numel(), - xq_numel, - xk_numel); - return false; - } - const float* xq_out = xq_t.const_data_ptr(); - const float* xk_out = xk_t.const_data_ptr(); - - std::vector gq = load_golden(xq_golden_path, xq_numel); - std::vector gk = load_golden(xk_golden_path, xk_numel); - if (gq.empty() || gk.empty()) { - printf( - "FAIL: could not load goldens %s / %s\n", - xq_golden_path.c_str(), - xk_golden_path.c_str()); - return false; - } - - // Per-element abs-OR-rel on xq and xk (shared helper, defined above). - float maq = 0.0f, mrq = 0.0f, mak = 0.0f, mrk = 0.0f; - const bool pass_q = - quant_within_tol(xq_out, gq.data(), xq_numel, 1e-3f, 1e-3f, &maq, &mrq); - const bool pass_k = - quant_within_tol(xk_out, gk.data(), xk_numel, 1e-3f, 1e-3f, &mak, &mrk); - const float max_abs_err = std::max(maq, mak); - const float max_rel_err = std::max(mrq, mrk); - - printf( - "Max abs error: %e Max rel error: %e (checked %d elements)\n", - max_abs_err, - max_rel_err, - xq_numel + xk_numel); - if (!(pass_q && pass_k)) { - printf("FAIL: apply_rotary_emb exceeds tolerance 1e-3 (abs AND rel)\n"); - return false; - } - printf("PASS: apply_rotary_emb test\n"); - return true; -} - -static bool test_prepack( - const std::string& model_path, - const std::string& golden_path, - const std::string& label = "x + const w") { - // et_vk.prepack copy vs golden; unrun copy leaves zeros. See test_prepack.py. 
- constexpr int n = 4; - constexpr int numel = n * n; - printf("\n--- Test: prepack (%s, %dx%d) ---\n", label.c_str(), n, n); - - Module module(model_path); - auto err = module.load_forward(); - if (err != Error::Ok) { - printf("FAIL: could not load forward method (error %d)\n", (int)err); - return false; - } - printf("Model loaded: %s\n", model_path.c_str()); - - std::vector golden = load_golden(golden_path, numel); - if (golden.empty()) { - printf("FAIL: could not load golden %s\n", golden_path.c_str()); - return false; - } - - // ((i % 13) - 6) / 16: exact in fp32, matches test_prepack.py::_inputs. - std::vector x_data(numel); - for (int i = 0; i < numel; i++) { - x_data[i] = static_cast((i % 13) - 6) / 16.0f; - } - auto x = make_tensor_ptr({n, n}, std::vector(x_data)); - - auto result = module.forward({EValue(x)}); - if (!result.ok()) { - printf("FAIL: forward failed (error %d)\n", (int)result.error()); - return false; - } - const auto& outputs = result.get(); - if (outputs.empty() || !outputs[0].isTensor()) { - printf("FAIL: no tensor output\n"); - return false; - } - const auto& out_tensor = outputs[0].toTensor(); - if (out_tensor.numel() != numel) { - printf( - "FAIL: output numel %zu != expected %d\n", - (size_t)out_tensor.numel(), - numel); - return false; - } - const float* out_data = out_tensor.const_data_ptr(); - - float max_abs_err = 0.0f, max_rel_err = 0.0f; - // Per-element abs-OR-rel (quant_within_tol): a global rel gate spuriously - // fails near-zero outputs where rel error explodes. - const bool within = quant_within_tol( - out_data, golden.data(), numel, 1e-3f, 1e-3f, &max_abs_err, &max_rel_err); - printf( - "Max abs error: %e Max rel error: %e (checked %d elements)\n", - max_abs_err, - max_rel_err, - numel); - if (!within) { - printf("FAIL: prepack exceeds tolerance 1e-3\n"); - return false; - } - printf("PASS: prepack test\n"); - return true; -} - // Reconstruct _ramp_input bit-for-bit, run the op, compare to the fp64 golden. 
static bool test_q4gsw_config( const Q4gswConfig& cfg, @@ -1612,6 +1440,19 @@ static bool test_resize_hook(const std::string& blob_path) { } int main(int argc, char** argv) { + std::string model_path = "webgpu_add_test.pte"; + if (argc > 1) { + model_path = argv[1]; + } + if (const char* env = std::getenv("WEBGPU_TEST_MODEL")) { + model_path = env; + } + + std::string chained_model_path; + if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) { + chained_model_path = env; + } + std::string update_cache_model_path; if (const char* env = std::getenv("WEBGPU_TEST_UPDATE_CACHE_MODEL")) { update_cache_model_path = env; @@ -1626,86 +1467,6 @@ int main(int argc, char** argv) { } } - // embedding_q4gsw on-GPU configs: small + llama1b (env-gated, - // run-if-present). - struct EmbConfig { - const char* name; - const char* model_env; - const char* indices_env; - const char* golden_env; - int num_indices; - int embed; - }; - const EmbConfig emb_configs[] = { - {"small", - "WEBGPU_TEST_EMBEDDING_Q4GSW_MODEL", - "WEBGPU_TEST_EMBEDDING_Q4GSW_INDICES", - "WEBGPU_TEST_EMBEDDING_Q4GSW_GOLDEN", - 4, - 64}, - {"llama1b", - "WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_MODEL", - "WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_INDICES", - "WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_GOLDEN", - 4, - 2048}, - }; - - // apply_rotary_emb on-GPU configs: multi + decode (env-gated, - // run-if-present). 
- struct RopeConfig { - const char* name; - const char* model_env; - const char* xq_env; - const char* xk_env; - int S; - int NH; - int NKV; - int HD; - }; - const RopeConfig rope_configs[] = { - {"multi", - "WEBGPU_TEST_ROPE_MODEL", - "WEBGPU_TEST_ROPE_XQ_GOLDEN", - "WEBGPU_TEST_ROPE_XK_GOLDEN", - 5, - 8, - 2, - 64}, - {"decode", - "WEBGPU_TEST_ROPE_DECODE_MODEL", - "WEBGPU_TEST_ROPE_DECODE_XQ_GOLDEN", - "WEBGPU_TEST_ROPE_DECODE_XK_GOLDEN", - 1, - 32, - 8, - 64}, - }; - - std::string prepack_model_path, prepack_golden_path; - if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_MODEL")) { - prepack_model_path = env; - } - if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_GOLDEN")) { - prepack_golden_path = env; - } - - std::string prepack2_model_path, prepack2_golden_path; - if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_MODEL")) { - prepack2_model_path = env; - } - if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_GOLDEN")) { - prepack2_golden_path = env; - } - - std::string prepack_tied_model_path, prepack_tied_golden_path; - if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_TIED_MODEL")) { - prepack_tied_model_path = env; - } - if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_TIED_GOLDEN")) { - prepack_tied_golden_path = env; - } - // SDPA sweep: configs self-discover their sdpa_.pte/.golden.bin under // this directory (default "" = the embedded-file root / cwd). Set // WEBGPU_TEST_SDPA_DIR to point at the exported .pte directory (e.g. /tmp/). 
@@ -1733,6 +1494,12 @@ int main(int argc, char** argv) { ok = test_query_pool_overrun_throws() && ok; ok = test_query_pool_roundtrip(ctx) && ok; #endif // WGPU_BACKEND_ENABLE_PROFILING + ok = test_single_add(model_path) && ok; + + if (!chained_model_path.empty()) { + ok = test_chained_add(chained_model_path) && ok; + } + if (!update_cache_model_path.empty()) { ok = test_update_cache(update_cache_model_path) && ok; } @@ -1753,42 +1520,6 @@ int main(int argc, char** argv) { ok = false; } - for (const auto& c : emb_configs) { - const char* m = std::getenv(c.model_env); - const char* ip = std::getenv(c.indices_env); - const char* g = std::getenv(c.golden_env); - if (m && ip && g && *m && *ip && *g) { - ok = test_embedding_q4gsw(m, ip, g, c.num_indices, c.embed, c.name) && ok; - } - } - - for (const auto& c : rope_configs) { - const char* m = std::getenv(c.model_env); - const char* xq = std::getenv(c.xq_env); - const char* xk = std::getenv(c.xk_env); - if (m && xq && xk && *m && *xq && *xk) { - ok = test_rope(m, xq, xk, c.S, c.NH, c.NKV, c.HD, c.name) && ok; - } - } - - if (!prepack_model_path.empty() && !prepack_golden_path.empty()) { - ok = test_prepack(prepack_model_path, prepack_golden_path) && ok; - } - - if (!prepack2_model_path.empty() && !prepack2_golden_path.empty()) { - ok = test_prepack( - prepack2_model_path, prepack2_golden_path, "x + w1 + w2") && - ok; - } - - if (!prepack_tied_model_path.empty() && !prepack_tied_golden_path.empty()) { - ok = test_prepack( - prepack_tied_model_path, - prepack_tied_golden_path, - "x + w + w (tied weights, shared key)") && - ok; - } - bool sdpa_ran = false; bool sdpa_ok = test_sdpa_sweep(sdpa_dir, &sdpa_ran); if (sdpa_ran) { diff --git a/codegen/api/et_cpp.py b/codegen/api/et_cpp.py index a144128368c..88f1eb83fe0 100644 --- a/codegen/api/et_cpp.py +++ b/codegen/api/et_cpp.py @@ -40,6 +40,7 @@ tensorT, ) + if TYPE_CHECKING: from collections.abc import Sequence @@ -277,7 +278,7 @@ def default_expr(d: str, t: Type) -> str: if 
isinstance(t, OptionalType): if d == "None": - return "std::nullopt" + return "torch::executor::nullopt" return default_expr(d, t.elem) diff --git a/codegen/api/types/types.py b/codegen/api/types/types.py index dd80daebb33..712d7e5e341 100644 --- a/codegen/api/types/types.py +++ b/codegen/api/types/types.py @@ -16,6 +16,7 @@ ) from torchgen.model import BaseTy + halfT = BaseCppType("torch::executor", "Half") bfloat16T = BaseCppType("torch::executor", "BFloat16") stringT = BaseCppType("torch::executor", "string_view") @@ -58,7 +59,7 @@ class OptionalCType(CType): def cpp_type(self, *, strip_ref: bool = False) -> str: # Do not pass `strip_ref` recursively. - return f"std::optional<{self.elem.cpp_type()}>" + return f"torch::executor::optional<{self.elem.cpp_type()}>" def remove_const_ref(self) -> CType: return OptionalCType(self.elem.remove_const_ref()) diff --git a/devtools/bundled_program/schema/README.md b/devtools/bundled_program/schema/README.md index c161958f189..096ab10fb57 100644 --- a/devtools/bundled_program/schema/README.md +++ b/devtools/bundled_program/schema/README.md @@ -4,13 +4,3 @@ and other useful info together for verifying the correctness of ExecuTorch progr ## Rules to ensure forward/backward compatibility Please check the rules in [here](../../../schema/README.md) for more info. - - -## Regenerating generated code - -Schema changes require regenerating the Python bindings in -`devtools/bundled_program/serialize/generated` and committing the updated files. 
From the repo root: - -```sh -python devtools/bundled_program/serialize/generate_bundled_program.py -``` \ No newline at end of file diff --git a/devtools/bundled_program/serialize/BUCK b/devtools/bundled_program/serialize/BUCK index 89a8122503c..ae920d1e4c2 100644 --- a/devtools/bundled_program/serialize/BUCK +++ b/devtools/bundled_program/serialize/BUCK @@ -9,7 +9,7 @@ fbcode_target(_kind = runtime.python_library, name = "lib", srcs = [ "__init__.py", - ] + glob(["generated/**/*.py"]), + ], resources = { "//executorch/devtools/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs", "//executorch/devtools/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs", @@ -19,7 +19,6 @@ fbcode_target(_kind = runtime.python_library, # Please ask before changing this. visibility = ["PUBLIC"], deps = [ - "fbsource//third-party/pypi/flatbuffers:flatbuffers", "fbsource//third-party/pypi/setuptools:setuptools", "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", "//executorch/exir/_serialize:lib", diff --git a/devtools/bundled_program/serialize/__init__.py b/devtools/bundled_program/serialize/__init__.py index 50c6b5768ce..ceba7670910 100644 --- a/devtools/bundled_program/serialize/__init__.py +++ b/devtools/bundled_program/serialize/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# Copyright 2025-2026 Arm Limited and/or its affiliates. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -9,62 +9,23 @@ # TODO(T138924864): Refactor to unify the serialization for bundled program and executorch program. 
-import functools import importlib.resources as _resources import json import os -import re import tempfile -from typing import Any import executorch.devtools.bundled_program.schema as bp_schema import executorch.devtools.bundled_program.serialize as serialization_package - -import flatbuffers # pyre-ignore[21] from executorch.devtools.bundled_program.core import BundledProgram -from executorch.devtools.bundled_program.serialize.generated.bundled_program_flatbuffer import ( - Bool as _Bool, - BundledMethodTestCase as _BundledMethodTestCase, - BundledMethodTestSuite as _BundledMethodTestSuite, - BundledProgram as _BundledProgram, - Double as _Double, - Int as _Int, - Tensor as _Tensor, - Value as _Value, - ValueUnion as _ValueUnion, -) from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile -from executorch.exir._serialize._flatbuffer_program import ( - _coerce_bytes, - _create_aligned_byte_vector, -) # The prefix of schema files used for bundled program BUNDLED_PROGRAM_SCHEMA_NAME = "bundled_program_schema" SCALAR_TYPE_SCHEMA_NAME = "scalar_type" -@functools.lru_cache(maxsize=1) -def _bundled_program_file_identifier() -> bytes: - schema = _resources.read_binary( - serialization_package, f"{BUNDLED_PROGRAM_SCHEMA_NAME}.fbs" - ) - match = re.search(rb'file_identifier\s+"([^"]+)"', schema) - if match is None: - raise ValueError( - f"Missing file_identifier in {BUNDLED_PROGRAM_SCHEMA_NAME}.fbs" - ) - file_identifier = match.group(1) - if len(file_identifier) != 4: - raise ValueError( - f"Invalid file_identifier length {len(file_identifier)} " - f"in {BUNDLED_PROGRAM_SCHEMA_NAME}.fbs" - ) - return file_identifier - - def write_schema(d: str, schema_name: str) -> None: schema_path = os.path.join(d, "{}.fbs".format(schema_name)) with open(schema_path, "wb") as schema_file: @@ -117,145 +78,6 @@ def convert_from_flatbuffer(program_flatbuffer: bytes) -> bytes: 
return output_file.read() -def _pack_tensor(self: Any, builder: Any) -> int: - if self.sizes is not None: - _Tensor.TensorStartSizesVector(builder, len(self.sizes)) - for i in reversed(range(len(self.sizes))): - builder.PrependInt32(self.sizes[i]) - sizes = builder.EndVector() - if self.data is not None: - data = _create_aligned_byte_vector(builder, _coerce_bytes(self.data), 16) - if self.dimOrder is not None: - dim_order = _create_aligned_byte_vector( - builder, _coerce_bytes(self.dimOrder), 1 - ) - - _Tensor.TensorStart(builder) - _Tensor.TensorAddScalarType(builder, self.scalarType) - if self.sizes is not None: - _Tensor.TensorAddSizes(builder, sizes) - if self.data is not None: - _Tensor.TensorAddData(builder, data) - if self.dimOrder is not None: - _Tensor.TensorAddDimOrder(builder, dim_order) - return _Tensor.TensorEnd(builder) - - -def _pack_bundled_program(self: Any, builder: Any) -> int: - if self.methodTestSuites is not None: - method_test_suites_list = [ - method_test_suite.Pack(builder) - for method_test_suite in self.methodTestSuites - ] - _BundledProgram.BundledProgramStartMethodTestSuitesVector( - builder, len(self.methodTestSuites) - ) - for i in reversed(range(len(self.methodTestSuites))): - builder.PrependUOffsetTRelative(method_test_suites_list[i]) - method_test_suites = builder.EndVector() - if self.program is not None: - program = _create_aligned_byte_vector(builder, _coerce_bytes(self.program), 32) - - _BundledProgram.BundledProgramStart(builder) - _BundledProgram.BundledProgramAddVersion(builder, self.version) - if self.methodTestSuites is not None: - _BundledProgram.BundledProgramAddMethodTestSuites(builder, method_test_suites) - if self.program is not None: - _BundledProgram.BundledProgramAddProgram(builder, program) - return _BundledProgram.BundledProgramEnd(builder) - - -@functools.lru_cache(maxsize=1) -def _install_fast_packers() -> None: - _Tensor.TensorT.Pack = _pack_tensor - _BundledProgram.BundledProgramT.Pack = _pack_bundled_program 
- - -def _convert_tensor(val: bp_schema.Tensor) -> Any: - result = _Tensor.TensorT() - result.scalarType = int(val.scalar_type) - result.sizes = list(val.sizes) - result.data = _coerce_bytes(val.data) - result.dimOrder = _coerce_bytes(val.dim_order) - return result - - -def _convert_int(val: bp_schema.Int) -> Any: - result = _Int.IntT() - result.intVal = val.int_val - return result - - -def _convert_bool(val: bp_schema.Bool) -> Any: - result = _Bool.BoolT() - result.boolVal = val.bool_val - return result - - -def _convert_double(val: bp_schema.Double) -> Any: - result = _Double.DoubleT() - result.doubleVal = val.double_val - return result - - -def _convert_value_union(val: bp_schema.ValueUnion) -> tuple[int, Any]: - if isinstance(val, bp_schema.Tensor): - return _ValueUnion.ValueUnion.Tensor, _convert_tensor(val) - if isinstance(val, bp_schema.Int): - return _ValueUnion.ValueUnion.Int, _convert_int(val) - if isinstance(val, bp_schema.Bool): - return _ValueUnion.ValueUnion.Bool, _convert_bool(val) - if isinstance(val, bp_schema.Double): - return _ValueUnion.ValueUnion.Double, _convert_double(val) - return _ValueUnion.ValueUnion.NONE, None - - -def _convert_value(val: bp_schema.Value) -> Any: - result = _Value.ValueT() - result.valType, result.val = _convert_value_union(val.val) - return result - - -def _convert_method_test_case(val: bp_schema.BundledMethodTestCase) -> Any: - result = _BundledMethodTestCase.BundledMethodTestCaseT() - result.inputs = [_convert_value(value) for value in val.inputs] - result.expectedOutputs = [_convert_value(value) for value in val.expected_outputs] - return result - - -def _convert_method_test_suite(val: bp_schema.BundledMethodTestSuite) -> Any: - result = _BundledMethodTestSuite.BundledMethodTestSuiteT() - result.methodName = val.method_name - result.testCases = [ - _convert_method_test_case(test_case) for test_case in val.test_cases - ] - return result - - -def _convert_bundled_program(val: bp_schema.BundledProgram) -> Any: - result 
= _BundledProgram.BundledProgramT() - result.version = val.version - result.methodTestSuites = [ - _convert_method_test_suite(suite) for suite in val.method_test_suites - ] - result.program = _coerce_bytes(val.program) - return result - - -def _bundled_program_schema_to_flatbuffer( - bundled_program: bp_schema.BundledProgram, -) -> bytes: - _install_fast_packers() - bundled_program_t = _convert_bundled_program(bundled_program) - builder = flatbuffers.Builder() - bundled_program_offset = bundled_program_t.Pack(builder) - builder.Finish( - bundled_program_offset, - file_identifier=_bundled_program_file_identifier(), - ) - return bytes(builder.Output()) - - # from bundled program to flatbuffer def serialize_from_bundled_program_to_flatbuffer( bundled_program: BundledProgram, @@ -272,7 +94,9 @@ def serialize_from_bundled_program_to_flatbuffer( bundled_program_in_schema = bundled_program.serialize_to_schema() - return _bundled_program_schema_to_flatbuffer(bundled_program_in_schema) + return convert_to_flatbuffer( + serialize_from_bundled_program_to_json(bundled_program_in_schema) + ) # From flatbuffer to bundled program in schema. 
diff --git a/docs/source/backends/nxp/op-support.csv b/docs/source/backends/nxp/op-support.csv index fb67f47bf62..8a250dce88d 100644 --- a/docs/source/backends/nxp/op-support.csv +++ b/docs/source/backends/nxp/op-support.csv @@ -13,7 +13,6 @@ aten.constant_pad_nd.default,int8,static int8,"H or W padding only" aten.convolution.default,int8,static int8,"1D or 2D convolution, constant weights, groups=1 or groups=channels_count (depthwise)" aten.dim_order_ops._clone_dim_order.default,,, "See aten.clone.default" aten.div.Tensor,int8,static int8,"divisor - static tensor or scalar value, one dimension must satisfy %8 = 0 or scalar division (all dims = 1)" -aten.exp.default,int8,static int8, aten.hardtanh.default,int8,static int8,"supported ranges: <0,6>, <-1, 1>, <0,1>, <0,inf>" aten.leaky_relu.default,int8,static int8, aten.log.default,int8,static int8, diff --git a/examples/arm/executor_runner/arm_memory_allocator.cpp b/examples/arm/executor_runner/arm_memory_allocator.cpp index d3337b6005e..de670df29ae 100644 --- a/examples/arm/executor_runner/arm_memory_allocator.cpp +++ b/examples/arm/executor_runner/arm_memory_allocator.cpp @@ -26,7 +26,7 @@ static void asan_unpoison_buffer(void* base, size_t size) { #endif ArmMemoryAllocator::ArmMemoryAllocator(uint32_t size, uint8_t* base_address) - : MemoryAllocator(size, base_address) { + : MemoryAllocator(size, base_address), used_(0) { #if defined(EXECUTORCH_ENABLE_ADDRESS_SANITIZER) asan_poison_buffer(base_address, size); #endif @@ -34,16 +34,35 @@ ArmMemoryAllocator::ArmMemoryAllocator(uint32_t size, uint8_t* base_address) void* ArmMemoryAllocator::allocate(size_t size, size_t alignment) { void* ret = executorch::runtime::MemoryAllocator::allocate(size, alignment); -#if defined(EXECUTORCH_ENABLE_ADDRESS_SANITIZER) if (ret != nullptr) { +#if defined(EXECUTORCH_ENABLE_ADDRESS_SANITIZER) asan_unpoison_buffer(ret, size); - } #endif + // Align with the same code as in MemoryAllocator::allocate() to keep + // used_ "in sync" As 
alignment is expected to be a power of 2 (checked by + // MemoryAllocator::allocate()), we can check whether the lower bits + // (same as alignment - 1) are zero or not. + if ((size & (alignment - 1)) == 0) { + // Already aligned. + used_ += size; + } else { + used_ = (used_ | (alignment - 1)) + 1 + size; + } + } return ret; } +size_t ArmMemoryAllocator::used_size() const { + return used_; +} + +size_t ArmMemoryAllocator::free_size() const { + return executorch::runtime::MemoryAllocator::size() - used_; +} + void ArmMemoryAllocator::reset() { executorch::runtime::MemoryAllocator::reset(); + used_ = 0; #if defined(EXECUTORCH_ENABLE_ADDRESS_SANITIZER) asan_poison_buffer(base_address(), size()); #endif diff --git a/examples/arm/executor_runner/arm_memory_allocator.h b/examples/arm/executor_runner/arm_memory_allocator.h index 3c82f72c44b..1d7bbdecb4c 100644 --- a/examples/arm/executor_runner/arm_memory_allocator.h +++ b/examples/arm/executor_runner/arm_memory_allocator.h @@ -10,14 +10,21 @@ using executorch::runtime::MemoryAllocator; #pragma once -// Custom allocator that poisons/unpoisons its buffer for AddressSanitizer. The -// used and free byte counts are reported by the base MemoryAllocator's -// used_size() / free_size(). +// Set up our own allocator that can show some extra stuff like used and free +// memory info class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator { public: ArmMemoryAllocator(uint32_t size, uint8_t* base_address); void* allocate(size_t size, size_t alignment = kDefaultAlignment) override; - void reset() override; + // Returns the used size of the allocator's memory buffer. + size_t used_size() const; + + // Returns the free size of the allocator's memory buffer. 
+ size_t free_size() const; + void reset(); + + private: + size_t used_; }; diff --git a/examples/espressif/README.md b/examples/espressif/README.md index a76e794030c..025bdf94094 100644 --- a/examples/espressif/README.md +++ b/examples/espressif/README.md @@ -44,6 +44,8 @@ examples/espressif/ ├── executor_runner/ │ ├── CMakeLists.txt # Component/standalone CMake build │ ├── esp_executor_runner.cpp # Main executor runner +│ ├── esp_memory_allocator.h # Custom memory allocator +│ ├── esp_memory_allocator.cpp │ ├── esp_perf_monitor.h # Performance monitoring │ ├── esp_perf_monitor.cpp │ └── pte_to_header.py # Convert .pte to C header diff --git a/examples/espressif/executor_runner/CMakeLists.txt b/examples/espressif/executor_runner/CMakeLists.txt index 2a26c53d5e0..a103a1ddc8c 100644 --- a/examples/espressif/executor_runner/CMakeLists.txt +++ b/examples/espressif/executor_runner/CMakeLists.txt @@ -28,6 +28,7 @@ if(ESP_PLATFORM) SRCS "esp_executor_runner.cpp" "esp_pal.cpp" + "esp_memory_allocator.cpp" "esp_perf_monitor.cpp" INCLUDE_DIRS "." 
@@ -282,7 +283,7 @@ else() add_executable(esp_executor_runner) target_sources( esp_executor_runner PRIVATE esp_executor_runner.cpp esp_pal.cpp - esp_perf_monitor.cpp + esp_perf_monitor.cpp esp_memory_allocator.cpp ) target_link_libraries( diff --git a/examples/espressif/executor_runner/esp_executor_runner.cpp b/examples/espressif/executor_runner/esp_executor_runner.cpp index c2f1fa34dde..9260e6b88a0 100644 --- a/examples/espressif/executor_runner/esp_executor_runner.cpp +++ b/examples/espressif/executor_runner/esp_executor_runner.cpp @@ -73,6 +73,7 @@ #include #include "esp_executor_runner.h" +#include "esp_memory_allocator.h" #include "esp_perf_monitor.h" #if defined(ESP_PLATFORM) @@ -477,8 +478,8 @@ struct RunnerContext { bool bundle_io = false; Box loader; Box program; - Box method_allocator; - Box temp_allocator; + Box method_allocator; + Box temp_allocator; std::vector> planned_spans; Box planned_memory; Box memory_manager; @@ -1019,7 +1020,7 @@ bool et_runner_init(void) { return false; } #endif - MemoryAllocator file_allocator( + EspMemoryAllocator file_allocator( method_allocation_pool_size, method_allocation_pool); auto [buffer, buffer_size] = load_file_from_fs("/spiffs/model.pte", file_allocator); @@ -1246,4 +1247,4 @@ size_t et_runner_outputs_size(void) { ET_CHECK_MSG(model_ok == true, "Problem running model"); ET_LOG(Info, "Program complete."); -} +} \ No newline at end of file diff --git a/examples/models/BUCK b/examples/models/BUCK index ed72a16e05f..a2b6789a95e 100644 --- a/examples/models/BUCK +++ b/examples/models/BUCK @@ -33,9 +33,6 @@ fbcode_target(_kind = python_library, "//executorch/examples/models/phi_4_mini:phi_4_mini", # @manual "//executorch/examples/models/smollm2:smollm2", # @manual "//executorch/examples/models/smollm3:smollm3", # @manual - "//executorch/examples/models/smolvlm:smolvlm", # @manual - "//executorch/examples/models/whisper:whisper", # @manual - "//executorch/examples/models/yolo26:yolo26", # @manual ], ) diff --git 
a/examples/models/__init__.py b/examples/models/__init__.py index d50554006bd..241a5cc366e 100644 --- a/examples/models/__init__.py +++ b/examples/models/__init__.py @@ -45,10 +45,6 @@ class Model(str, Enum): MobileNetV1025 = "mobilenet_v1_025" ResNet8 = "resnet8" Sdpa = "sdpa" - Qwen3 = "qwen3" - SmolVLM = "smolvlm" - YOLO26 = "yolo26" - Whisper = "whisper" def __str__(self) -> str: return self.value @@ -109,10 +105,6 @@ def __str__(self) -> str: ), str(Model.ResNet8): ("mlperf_tiny.resnet8", "ResNet8Model"), str(Model.Sdpa): ("toy_model", "SdpaModule"), - str(Model.Qwen3): ("qwen3", "Qwen3Model"), - str(Model.SmolVLM): ("smolvlm", "SmolVLMModel"), - str(Model.YOLO26): ("yolo26", "YOLO26Model"), - str(Model.Whisper): ("whisper", "WhisperModel"), } __all__ = [ diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md index 482f64083a0..ae3bcb24c19 100644 --- a/examples/models/gemma4_31b/README.md +++ b/examples/models/gemma4_31b/README.md @@ -93,31 +93,14 @@ method with dynamic sequence length and host-side sampling. Writes `model.pte` (and optionally `model.ptd`) into `--output-dir`. -#### TurboQuant KV cache (long context, CUDA + MLX) +#### TurboQuant KV cache (long context, MLX only) For long-context inference, add `--turboquant` to swap the full-attention layers' KV cache for a TurboQuant TQ4 cache (4-bit codebook + nibble pack). This gives ~3.8× cache memory savings on the full-attention layers and lets -you fit context lengths that wouldn't fit in bf16. Sliding-window layers are -unaffected. Supported on both the CUDA and MLX backends. - -**Long context requires BOTH flags**: `--turboquant` *and* a larger -`--max-seq-len`. Raising `--max-seq-len` alone keeps a bf16 KV cache, which does -not fit at long context. On CUDA, `--turboquant` is what enables 128k: Gemma4-31B -at `--max-seq-len 131072` runs within ~27 GiB at runtime (fits a 32 GB card). 
- -```bash -# CUDA — 128k context (TQ4 KV) -python examples/models/gemma4_31b/export.py \ - --gguf ./gemma-4-31B-it-Q4_K_M.gguf \ - --output-dir ./gemma4_31b_exports_128k \ - --max-seq-len 131072 \ - --backend cuda \ - --turboquant -``` +you fit context lengths that wouldn't fit in bf16. Sliding-window layers are unaffected. ```bash -# MLX (Apple Silicon) python examples/models/gemma4_31b/export.py \ --prequantized ./gemma4_31b_int4 \ --output-dir ./gemma4_31b_exports_mlx_tq \ diff --git a/examples/models/gemma4_31b/cuda_source_transformations.py b/examples/models/gemma4_31b/cuda_source_transformations.py index 666d0c44e9d..f9b383cf224 100644 --- a/examples/models/gemma4_31b/cuda_source_transformations.py +++ b/examples/models/gemma4_31b/cuda_source_transformations.py @@ -77,15 +77,9 @@ def _turboquant_attention_forward( # uncompressed K/V is never materialized. k_packed, k_norms, v_packed, v_norms = self.kv_cache.update(input_pos, k, v) - # Number of valid (filled) KV positions = input_pos[0] + T. Passing this to - # tq4_sdpa bounds its KV loop to the actual context instead of the full - # pre-allocated buffer (max_seq_len for global layers), making attention - # O(context) instead of O(max_seq_len). Kept as a GPU scalar (no ``.item()``) - # so the bound is captured correctly by the decode CUDA graph. Decode: T=1 -> - # input_pos+1; prefill chunk: T -> chunk_end. - # NOTE: this call-site argument was dropped during a rebase, which silently - # disabled the O(context) bound and forced a full max_seq_len sweep every - # step (catastrophic at 128k: ~2.7 tok/s decode vs ~37+ when bounded). + # Number of valid (filled) KV positions = input_pos[0] + T. Bounds tq4_sdpa's + # KV loop to the actual context (O(context), not O(max_seq_len)) and enables + # the split-K decode path. GPU scalar (no .item()) so it's CUDA-graph-safe. 
kv_len = input_pos[0] + input_pos.shape[0] # ``scale=self.scaling`` (= 1.0 for Gemma 4) — overrides tq4_sdpa's @@ -100,7 +94,7 @@ def _turboquant_attention_forward( self.kv_cache.centroids, self.kv_cache.rotation, attn_mask, - False, # is_causal: attn_mask already encodes causal masking + False, # is_causal — attn_mask already encodes causal masking self.scaling, kv_len, True, # mask_is_causal: Gemma full-attention mask is standard causal diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py index d9e16bc34df..59be23020f2 100644 --- a/examples/models/gemma4_31b/export.py +++ b/examples/models/gemma4_31b/export.py @@ -171,7 +171,6 @@ def _export_cuda( ) from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.passes import MemoryPlanningPass - from executorch.exir.passes.propagate_device_pass import PropagateDeviceConfig from torch.export import Dim, export inductor_config.coordinate_descent_tuning = False @@ -271,14 +270,6 @@ def _export_cuda( alloc_graph_input=False, ), emit_mutable_buffer_names=True, - # Keep method inputs/outputs device-resident so the CUDA backend - # does not insert boundary H2D/D2H copies: the runner stages inputs - # in CUDA memory and reads the sampled token back with a single - # small D2H. CUDA-only (no effect on the MLX path). - propagate_device_config=PropagateDeviceConfig( - skip_h2d_for_method_inputs=True, - skip_d2h_for_method_outputs=True, - ), ), ) diff --git a/examples/models/gemma4_31b/gguf_loader.py b/examples/models/gemma4_31b/gguf_loader.py index 90839ea6f6a..e95581dc95d 100644 --- a/examples/models/gemma4_31b/gguf_loader.py +++ b/examples/models/gemma4_31b/gguf_loader.py @@ -17,12 +17,9 @@ linear and embedding. ``embed_tokens`` and ``lm_head`` stay tied -- they share the one quantized tensor. * **CUDA**: Q4_K -> ``Int4Tensor``, Q6_K -> ``CudaDp4aPlanarInt6Tensor`` (a genuine - 6-bit packed weight, lossless, symmetric). 
``embed_tokens`` and ``lm_head`` are - untied: ``lm_head`` keeps a packed (int6/int4) matmul weight, while the token - embedding becomes a gatherable ``IntxUnpackedToInt8Tensor`` (int8) -- the truly - packed int4/int6 tensors can't gather. For the Q6_K tied weight the decode is - done once and shared between the two, avoiding a whole-tensor bf16 dequant and - a second decode (see ``_untie_embed_lm_head``). + 6-bit packed weight, lossless, symmetric); ``lm_head`` keeps the quantized + tensor but the token embedding is dequantized to bf16 (the packed tensors can't + gather), so they are untied. Usage: model, config = load_gguf_model("model.gguf", backend="cuda") @@ -119,55 +116,6 @@ def _resolve_tied_lm_head(model, lm_head_weight, packers): ) -def _untie_embed_lm_head(model, gtensor, weight, backend): - """Untie the GGUF token-embed / lm_head weight, returning ``(embed, lm_head)``. - - GGUF ties ``embed_tokens`` and ``lm_head`` to one quantized weight. The - returned ``lm_head`` is packed into ``model.lm_head`` after the streaming loop - (``_resolve_tied_lm_head``), or is ``None`` when this function already - assigned it. - - * **MLX**: keep both tied on the raw ``ExportableGGUFTensor``. - * **CUDA** (Q6_K or Q4_K): untie so ``lm_head`` keeps a packed low-bit matmul - weight while the token embedding becomes a gatherable int8 - ``IntxUnpackedToInt8Tensor`` -- the truly packed int4/int6 tensors can't - gather. Instead of dequantizing the whole ~1.4 B-element weight to bf16 - (2 B/elem), decode it once to int8 (1 B/elem; the decode is lossless so the - result is numerically identical), halving the embedding's host + GPU-constant - footprint. The token embedding (Q4_K for the Gemma checkpoint) is the single - biggest weight, so this is the dominant saving vs the bf16 path. 
``lm_head``: - - Q6_K -> ``CudaDp4aPlanarInt6Tensor`` from the *same* int8 decode and - assigned here (``pack_linear_for_cuda`` would mis-route an int8 tensor to - the int8 path), so the post-loop resolve is a no-op. - - Q4_K -> kept as the native ``Int4Tensor`` and returned, so - ``_resolve_tied_lm_head`` packs it to ``CudaCoalescedInt4Tensor`` (same - as a regular Q4_K linear). - * **CUDA, other types**: fall back to the bf16 embedding. - """ - if backend == "mlx": - return weight, gtensor - - if gtensor.ggml_type in ("q6_k", "q4_k"): - intx = gtensor.to_intx_unpacked_to_int8_tensor() - if gtensor.ggml_type == "q6_k": - import torch.nn as nn - from executorch.backends.cuda.dp4a_planar_int6_tensor import ( - CudaDp4aPlanarInt6Tensor, - ) - - model.lm_head.weight = nn.Parameter( - CudaDp4aPlanarInt6Tensor._from_intx_int8(intx), requires_grad=False - ) - return intx, None - # Q4_K: ``weight`` is the native Int4Tensor; let _resolve_tied_lm_head - # pack it to CudaCoalescedInt4Tensor. Only the embedding switches to int8. 
- return intx, weight - - from executorch.examples.models.gemma4_31b.quant import dequantize_weight - - return dequantize_weight(weight, torch.bfloat16), weight - - def load_gguf_model( gguf_path: str, max_seq_len: int = 4096, @@ -192,7 +140,7 @@ def load_gguf_model( Gemma4_31BConfig, materialize_runtime_buffers, ) - from executorch.examples.models.gemma4_31b.quant import pack_one + from executorch.examples.models.gemma4_31b.quant import dequantize_weight, pack_one from executorch.extension.llm.export.gguf import ExportableGGUFTensor, iter_gguf if backend == "cuda": @@ -213,7 +161,7 @@ def load_gguf_model( with torch.device("meta"): model = Gemma4_31B(config) - lm_head_weight = None # tied weight resolved into lm_head after the loop + lm_head_weight = None # weight reused for a tied lm_head n_processed = 0 print(f"Streaming GGUF from {gguf_path}...") @@ -225,9 +173,11 @@ def load_gguf_model( if isinstance(value, ExportableGGUFTensor): weight = _convert_weight(model, model_key, value, backend) if model_key == "embed_tokens.weight": - weight, lm_head_weight = _untie_embed_lm_head( - model, value, weight, backend - ) + # Tied lm_head reuses the embedding weight: MLX wants the raw + # ExportableGGUFTensor (linear pattern), CUDA the quant tensor. + lm_head_weight = value if backend == "mlx" else weight + if backend == "cuda": + weight = dequantize_weight(weight, torch.bfloat16) value = weight elif value.dtype == torch.float32: value = value.to(torch.bfloat16) diff --git a/examples/models/gemma4_31b/main.cpp b/examples/models/gemma4_31b/main.cpp index 3d9970b1610..83d1f639e75 100644 --- a/examples/models/gemma4_31b/main.cpp +++ b/examples/models/gemma4_31b/main.cpp @@ -23,11 +23,8 @@ #include #include #include -#include #include #include -#include -#include #include #include @@ -79,29 +76,25 @@ DEFINE_bool( cuda_graph, false, "Enable CUDA graph capture for the decode method. 
CUDA only."); +DEFINE_bool( + ignore_eos, + false, + "Do not stop at EOS; always generate exactly max_new_tokens. For " + "benchmarking decode throughput at a fixed token count (mirrors " + "llama.cpp --ignore-eos)."); namespace llm = ::executorch::extension::llm; using ::executorch::extension::from_blob; -using ::executorch::extension::make_tensor_ptr; using ::executorch::extension::Module; -using ::executorch::extension::TensorPtr; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; -#ifdef EXECUTORCH_BUILD_CUDA -using ::executorch::extension::clone_tensor_ptr_to; -#endif using SizesType = executorch::aten::SizesType; -// Read a sampled token ID from a scalar int64 output (CUDA path). -// -// The model now emits the sampled token as int64 (see sampler.py), matching -// the decode method's int64 token input so the on-device output buffer can be -// aliased directly as the next step's input. We still copy the 8-byte scalar -// back to the host here for EOS detection and detokenization. +// Read a sampled token ID from a scalar float output (CUDA path). 
static uint64_t read_token(const executorch::aten::Tensor& output) { const void* ptr = output.const_data_ptr(); - int64_t val = 0; + float val = 0.0f; #ifdef EXECUTORCH_BUILD_CUDA cudaPointerAttributes attrs{}; @@ -109,7 +102,7 @@ static uint64_t read_token(const executorch::aten::Tensor& output) { attrs.type == cudaMemoryTypeDevice; if (on_device) { cudaError_t err = - cudaMemcpy(&val, ptr, sizeof(int64_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&val, ptr, sizeof(float), cudaMemcpyDeviceToHost); if (err != cudaSuccess) { ET_LOG( Error, @@ -118,13 +111,13 @@ static uint64_t read_token(const executorch::aten::Tensor& output) { return 0; } } else { - memcpy(&val, ptr, sizeof(int64_t)); + memcpy(&val, ptr, sizeof(float)); } #else - memcpy(&val, ptr, sizeof(int64_t)); + memcpy(&val, ptr, sizeof(float)); #endif - return static_cast(val); + return static_cast(llrintf(val)); } int main(int argc, char** argv) { @@ -194,8 +187,6 @@ int main(int argc, char** argv) { FLAGS_temperature <= 0.0 ? 1e-6f : static_cast(FLAGS_temperature); #ifdef EXECUTORCH_BUILD_CUDA - const auto cuda_device = - executorch::aten::Device(executorch::aten::DeviceType::CUDA, 0); if (FLAGS_cuda_graph) { executorch::runtime::BackendOptions<2> cuda_opts; cuda_opts.set_option("enable_cuda_graph_for_method", "decode"); @@ -232,9 +223,8 @@ int main(int argc, char** argv) { ET_LOG(Error, "Failed to load decode method"); return 1; } - auto temp_tensor = clone_tensor_ptr_to( - from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float), - cuda_device); + auto temp_tensor = + from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float); #else if (FLAGS_cuda_graph) { ET_LOG(Info, "--cuda_graph ignored on non-CUDA build"); @@ -302,12 +292,6 @@ int main(int argc, char** argv) { // --------------------------------------------------------------- uint64_t cur_token = 0; int64_t prefill_pos = 0; -#ifdef EXECUTORCH_BUILD_CUDA - // Alias of the most recent forward's on-device int64 output token. 
The last - // prefill chunk's output seeds the first decode step (no token H2D); each - // decode step then re-aliases its own output for the next step. - TensorPtr device_out_token; -#endif while (prefill_pos < num_prompt_tokens) { int64_t chunk_len = std::min(num_prompt_tokens - prefill_pos, max_prefill_chunk); @@ -326,12 +310,6 @@ int main(int argc, char** argv) { auto pos_tensor = from_blob( pos_data.data(), {S(chunk_len)}, executorch::aten::ScalarType::Long); -#ifdef EXECUTORCH_BUILD_CUDA - // skip_h2d: prefill/decode method inputs must already live in CUDA memory. - tokens_tensor = clone_tensor_ptr_to(tokens_tensor, cuda_device); - pos_tensor = clone_tensor_ptr_to(pos_tensor, cuda_device); -#endif - std::vector inputs; inputs.push_back(EValue(tokens_tensor)); inputs.push_back(EValue(pos_tensor)); @@ -350,11 +328,7 @@ int main(int argc, char** argv) { } #ifdef EXECUTORCH_BUILD_CUDA - const auto& out_tensor = result.get()[0].toTensor(); - cur_token = read_token(out_tensor); - // Keep the sampled token on device: alias the output buffer so it feeds - // straight into the next forward as the int64 token input (zero copy). - device_out_token = make_tensor_ptr(out_tensor); + cur_token = read_token(result.get()[0].toTensor()); #else cur_token = static_cast( llm::logits_to_token(result.get()[0].toTensor(), temp_val)); @@ -386,69 +360,22 @@ int main(int argc, char** argv) { // Decode loop // --------------------------------------------------------------- int64_t pos = num_prompt_tokens; - std::vector decode_pos_data = {pos}; - auto decode_pos_cpu = from_blob( - decode_pos_data.data(), {1}, executorch::aten::ScalarType::Long); -#ifdef EXECUTORCH_BUILD_CUDA - // Fixed device-resident position input slot: the decode method always reads - // the position from this same address every step (cuda-graph-safe). Seeded - // once here with a one-time H2D; refreshed each step by an on-device D2D. 
- auto decode_pos = clone_tensor_ptr_to(decode_pos_cpu, cuda_device); - // Upload the FULL decode position array to device ONCE (a single H2D - the - // one-time copy we keep). Each step copies its position from here into the - // fixed slot with a device-to-device copy, so there is NO per-round pos H2D. - std::vector pos_seq_data(FLAGS_max_new_tokens); - for (int32_t i = 0; i < FLAGS_max_new_tokens; i++) { - pos_seq_data[i] = num_prompt_tokens + i; - } - auto pos_seq_dev = clone_tensor_ptr_to( - from_blob( - pos_seq_data.data(), - {S(FLAGS_max_new_tokens)}, - executorch::aten::ScalarType::Long), - cuda_device); - auto* pos_seq_dev_ptr = - static_cast(pos_seq_dev->mutable_data_ptr()); - auto* decode_pos_slot_ptr = - static_cast(decode_pos->mutable_data_ptr()); -#else - // Non-CUDA (MLX) path: keep host token/pos buffers; the backend stages them - // and the host samples from the returned logits. std::vector decode_token_data = {static_cast(cur_token)}; + std::vector decode_pos_data = {pos}; auto decode_tokens = from_blob( decode_token_data.data(), {1, 1}, executorch::aten::ScalarType::Long); - auto decode_pos = decode_pos_cpu; -#endif + auto decode_pos = from_blob( + decode_pos_data.data(), {1}, executorch::aten::ScalarType::Long); uint64_t prev_token = cur_token; - bool hit_eos = eos_ids.find(cur_token) != eos_ids.end(); + bool hit_eos = + !FLAGS_ignore_eos && eos_ids.find(cur_token) != eos_ids.end(); for (int32_t step = 0; step < FLAGS_max_new_tokens && !hit_eos; step++) { -#ifdef EXECUTORCH_BUILD_CUDA - // No per-round H2D: copy this step's position from the pre-uploaded device - // position array into the fixed position slot with an on-device D2D. With - // the token aliased on device (Option A) and the position staged via D2D, - // the per-round HtoD count is zero (independent of decode length). 
- // cudaMemcpy D2D is host-synchronous, so the slot is updated before the - // decode kernels read it; with cuda graph enabled this becomes a captured - // cudaMemcpyAsync on the decode stream into this same fixed slot. - ET_CHECK_MSG( - cudaMemcpy( - decode_pos_slot_ptr, - pos_seq_dev_ptr + step, - sizeof(int64_t), - cudaMemcpyDeviceToDevice) == cudaSuccess, - "Failed to copy decode position D2D"); -#else - decode_pos_data[0] = pos; decode_token_data[0] = static_cast(cur_token); -#endif + decode_pos_data[0] = pos; std::vector inputs; -#ifdef EXECUTORCH_BUILD_CUDA - inputs.push_back(EValue(device_out_token)); -#else inputs.push_back(EValue(decode_tokens)); -#endif inputs.push_back(EValue(decode_pos)); #ifdef EXECUTORCH_BUILD_CUDA @@ -465,10 +392,7 @@ int main(int argc, char** argv) { prev_token = cur_token; #ifdef EXECUTORCH_BUILD_CUDA - const auto& out_tensor = result.get()[0].toTensor(); - cur_token = read_token(out_tensor); - // Alias this step's on-device output token as the next step's token input. - device_out_token = make_tensor_ptr(out_tensor); + cur_token = read_token(result.get()[0].toTensor()); #else cur_token = static_cast( llm::logits_to_token(result.get()[0].toTensor(), temp_val)); @@ -481,7 +405,7 @@ int main(int argc, char** argv) { fflush(stdout); } - hit_eos = eos_ids.find(cur_token) != eos_ids.end(); + hit_eos = !FLAGS_ignore_eos && eos_ids.find(cur_token) != eos_ids.end(); } printf("\n"); diff --git a/examples/models/gemma4_31b/model.py b/examples/models/gemma4_31b/model.py index d953541a244..bfaa73a754b 100644 --- a/examples/models/gemma4_31b/model.py +++ b/examples/models/gemma4_31b/model.py @@ -484,7 +484,7 @@ def forward( temperature: 1-D float tensor for Gumbel-max sampling. Returns: - (B, 1) sampled token IDs as int64. + (B, 1) sampled token IDs as float. 
""" x = self.embed_tokens(tokens) * self.embed_normalizer diff --git a/examples/models/gemma4_31b/sampler.py b/examples/models/gemma4_31b/sampler.py index 2ce428224a2..690344fd2e4 100644 --- a/examples/models/gemma4_31b/sampler.py +++ b/examples/models/gemma4_31b/sampler.py @@ -26,12 +26,9 @@ def sample( temperature still works ("near-greedy"). Returns: - ``[B, 1]`` int64 token IDs (``argmax(logits/T + gumbel_noise)``). - Emitting int64 (rather than casting to float) lets the runner alias the - on-device output token directly as the next decode step's int64 token - input — no D2H/H2D round-trip and no dtype cast. + ``[B, 1]`` float32 token IDs (``argmax(logits/T + gumbel_noise)``). """ logits = logits / temperature.clamp(min=1e-6) noise = torch.rand_like(logits) gumbel = -torch.log(-torch.log(noise + 1e-20) + 1e-20) - return (logits + gumbel).argmax(dim=-1, keepdim=True).to(torch.int64) + return (logits + gumbel).argmax(dim=-1, keepdim=True).float() diff --git a/examples/models/gemma4_31b/tests/test_cuda_pipeline.py b/examples/models/gemma4_31b/tests/test_cuda_pipeline.py index c346c1d2f82..caf0a44e03b 100644 --- a/examples/models/gemma4_31b/tests/test_cuda_pipeline.py +++ b/examples/models/gemma4_31b/tests/test_cuda_pipeline.py @@ -246,14 +246,13 @@ def _load(self, tmp): def test_load_converts_weights(self): """GGUF -> CUDA: Q4_K -> CudaCoalescedInt4Tensor, Q6_K -> CudaDp4aPlanarInt6Tensor, - embedding int8 (gatherable).""" + embedding bf16.""" from executorch.backends.cuda.coalesced_int4_tensor import ( CudaCoalescedInt4Tensor, ) from executorch.backends.cuda.dp4a_planar_int6_tensor import ( CudaDp4aPlanarInt6Tensor, ) - from torchao.quantization import IntxUnpackedToInt8Tensor with tempfile.TemporaryDirectory() as tmp: model, _ = self._load(tmp) @@ -264,49 +263,11 @@ def test_load_converts_weights(self): self.assertIsInstance( model.layers[0].mlp.down_proj.weight.data, CudaDp4aPlanarInt6Tensor ) - # Tied lm_head keeps a packed int6 matmul weight. 
+ # Tied lm_head is repacked to int6 by pack_cuda (it keeps quantization, + # unlike the token embedding which is dequantized for the gather). self.assertIsInstance(model.lm_head.weight.data, CudaDp4aPlanarInt6Tensor) - # Token embedding is decoded to a gatherable int8 tensor (not bf16): the - # Q6_K decode is lossless and shared with lm_head. Keeping it int8 (vs - # bf16) avoids a ~5.6 GB fp32 dequant transient and ~1.4 GB resident at - # export time. - self.assertIsInstance(model.embed_tokens.weight.data, IntxUnpackedToInt8Tensor) - - def test_int8_embedding_matches_bf16(self): - """Guard the bf16 -> int8 token-embedding switch. - - The embedding is now loaded as a gatherable int8 ``IntxUnpackedToInt8Tensor`` - instead of being dequantized to bf16. Its gathered rows must match the bf16 - dequant of the *source* GGUF token embedding -- i.e. exactly what the old - ``dequantize_weight(..., bf16)`` path returned. The GGUF decode is lossless, - so they agree to bf16 precision. - """ - from executorch.examples.models.gemma4_31b.gguf_loader import gguf_to_model_key - from executorch.extension.llm.export.gguf import ExportableGGUFTensor, iter_gguf - from torchao.quantization import IntxUnpackedToInt8Tensor - - with tempfile.TemporaryDirectory() as tmp: - path = os.path.join(tmp, "tiny.gguf") - build_gguf_checkpoint(path) - # Reference = bf16 dequant of the source GGUF token embedding (the - # tensor the previous bf16 embedding path materialized). 
- ref_bf16 = None - for name, val in iter_gguf(path): - if gguf_to_model_key(name) == "embed_tokens.weight": - self.assertIsInstance(val, ExportableGGUFTensor) - ref_bf16 = val.dequantize(torch.bfloat16) - break - self.assertIsNotNone(ref_bf16, "token_embd.weight not found in GGUF") - model, _ = load_gguf_model(path, backend="cuda", config=GGUF_CONFIG) - - self.assertIsInstance(model.embed_tokens.weight.data, IntxUnpackedToInt8Tensor) - - ids = torch.tensor([0, 1, 7, GGUF_CONFIG.vocab_size - 1]) - out = model.embed_tokens(ids) # int8 gather + dequant - ref = ref_bf16[ids] - self.assertEqual(out.shape, ref.shape) - rel_err = (out.float() - ref.float()).abs().mean() / ref.float().abs().mean() - self.assertLess(rel_err.item(), 0.02) + # Token embedding is dequantized to bf16 (Int4/packed-int6 can't gather). + self.assertEqual(model.embed_tokens.weight.dtype, torch.bfloat16) def test_generate(self): """GGUF -> CUDA -> eager generate produces valid tokens (inference.py).""" diff --git a/examples/models/parakeet/CMakeLists.txt b/examples/models/parakeet/CMakeLists.txt index a2b798de557..810f2815abd 100644 --- a/examples/models/parakeet/CMakeLists.txt +++ b/examples/models/parakeet/CMakeLists.txt @@ -109,49 +109,32 @@ if(EXECUTORCH_BUILD_VULKAN) executorch_target_link_options_shared_lib(vulkan_backend) endif() -set(parakeet_shared_sources parakeet_transcriber.cpp timestamp_utils.cpp - tokenizer_utils.cpp -) - -set(parakeet_common_include_directories - ${_common_include_directories} ${EXECUTORCH_ROOT}/third-party/json/include -) - -add_executable(parakeet_runner main.cpp ${parakeet_shared_sources}) -add_executable( - parakeet_helper parakeet_helper.cpp parakeet_helper_protocol.cpp - ${parakeet_shared_sources} -) - -foreach(parakeet_target parakeet_runner parakeet_helper) - if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") - target_link_options_gc_sections(${parakeet_target}) - if(NOT APPLE AND NOT MSVC) - target_link_options(${parakeet_target} PRIVATE "LINKER:-s") - endif() 
+add_executable(parakeet_runner main.cpp timestamp_utils.cpp tokenizer_utils.cpp) +if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + target_link_options_gc_sections(parakeet_runner) + if(NOT APPLE AND NOT MSVC) + target_link_options(parakeet_runner PRIVATE "LINKER:-s") endif() +endif() - if(TARGET mlxdelegate) - executorch_target_copy_mlx_metallib(${parakeet_target}) - endif() +# Copy MLX metallib for runtime if MLX delegate is enabled +if(TARGET mlxdelegate) + executorch_target_copy_mlx_metallib(parakeet_runner) +endif() - target_include_directories( - ${parakeet_target} PUBLIC ${parakeet_common_include_directories} - ) - target_link_libraries(${parakeet_target} PUBLIC ${link_libraries}) - target_compile_options(${parakeet_target} PUBLIC ${_common_compile_options}) -endforeach() +target_include_directories( + parakeet_runner PUBLIC ${_common_include_directories} +) +target_link_libraries(parakeet_runner PUBLIC ${link_libraries}) +target_compile_options(parakeet_runner PUBLIC ${_common_compile_options}) # On Windows, copy required DLLs to the executable directory if(MSVC AND EXECUTORCH_BUILD_CUDA) - foreach(parakeet_target parakeet_runner parakeet_helper) - add_custom_command( - TARGET ${parakeet_target} - POST_BUILD - COMMAND - ${CMAKE_COMMAND} -E copy_if_different $ - $ - COMMENT "Copying aoti_cuda_shims.dll to ${parakeet_target} directory" - ) - endforeach() + add_custom_command( + TARGET parakeet_runner + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different $ + $ + COMMENT "Copying aoti_cuda_shims.dll to parakeet_runner directory" + ) endif() diff --git a/examples/models/parakeet/CMakePresets.json b/examples/models/parakeet/CMakePresets.json index 90a90fbbdf5..87ace61e315 100644 --- a/examples/models/parakeet/CMakePresets.json +++ b/examples/models/parakeet/CMakePresets.json @@ -89,42 +89,42 @@ "displayName": "Build Parakeet runner (CPU)", "configurePreset": "parakeet-cpu", "configuration": "Release", - "targets": ["parakeet_runner", "parakeet_helper"] + 
"targets": ["parakeet_runner"] }, { "name": "parakeet-cuda", "displayName": "Build Parakeet runner (CUDA)", "configurePreset": "parakeet-cuda", "configuration": "Release", - "targets": ["parakeet_runner", "parakeet_helper"] + "targets": ["parakeet_runner"] }, { "name": "parakeet-cuda-debug", "displayName": "Build Parakeet runner (CUDA, Debug)", "configurePreset": "parakeet-cuda-debug", "configuration": "Debug", - "targets": ["parakeet_runner", "parakeet_helper"] + "targets": ["parakeet_runner"] }, { "name": "parakeet-metal", "displayName": "Build Parakeet runner (Metal)", "configurePreset": "parakeet-metal", "configuration": "Release", - "targets": ["parakeet_runner", "parakeet_helper"] + "targets": ["parakeet_runner"] }, { "name": "parakeet-mlx", "displayName": "Build Parakeet runner (MLX)", "configurePreset": "parakeet-mlx", "configuration": "Release", - "targets": ["parakeet_runner", "parakeet_helper"] + "targets": ["parakeet_runner"] }, { "name": "parakeet-vulkan", "displayName": "Build Parakeet runner (Vulkan)", "configurePreset": "parakeet-vulkan", "configuration": "Release", - "targets": ["parakeet_runner", "parakeet_helper"] + "targets": ["parakeet_runner"] } ], "workflowPresets": [ diff --git a/examples/models/parakeet/README.md b/examples/models/parakeet/README.md index e2f09f8aa99..62cec6a9cc4 100644 --- a/examples/models/parakeet/README.md +++ b/examples/models/parakeet/README.md @@ -242,11 +242,6 @@ make parakeet-cuda make parakeet-mlx ``` -Each Parakeet build now produces both: - -- `parakeet_runner` for one-shot CLI transcription from an audio file -- `parakeet_helper` for long-lived host integrations that keep the model warm and stream PCM requests over stdin/stdout - On Windows (PowerShell), use CMake workflow presets directly: ```powershell @@ -315,26 +310,6 @@ If your generator is single-config, the runner may be at `.\cmake-out\examples\m | `--data_path` | Path to data file (.ptd) for delegate data (required for CUDA/CUDA-Windows) | | 
`--timestamps` | Timestamp output mode: `none\|token\|word\|segment\|all` (default: `segment`) | -### Persistent Helper - -The helper binary uses the same Parakeet transcription stack as `parakeet_runner`, -but keeps the model loaded across multiple requests so host apps can avoid repeated -startup and model load overhead. - -Example: - -```bash -# Metal -DYLD_LIBRARY_PATH=/usr/lib ./cmake-out/examples/models/parakeet/parakeet_helper \ - --model_path examples/models/parakeet/parakeet_metal/model.pte \ - --tokenizer_path examples/models/parakeet/parakeet_metal/tokenizer.model -``` - -The helper accepts framed requests over stdin, validates 16 kHz mono float32 PCM -payloads, and returns status/result messages over stdout. It is intended for app -integrations such as the macOS `ExecuWhisper` frontend in the separate -`executorch-examples` repository. - ### Mobile App Check out a [demo Android app](https://github.com/meta-pytorch/executorch-examples/tree/main/parakeet/android/ParakeetApp) for Parakeet in the separate `executorch-examples` repository. diff --git a/examples/models/parakeet/main.cpp b/examples/models/parakeet/main.cpp index 410ba6cea62..b8a052004e4 100644 --- a/examples/models/parakeet/main.cpp +++ b/examples/models/parakeet/main.cpp @@ -6,14 +6,25 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include - +#include +#include #include #include #include +#include +#include -#include "parakeet_transcriber.h" +#include +#include "timestamp_utils.h" +#include "tokenizer_utils.h" +#include "types.h" + +#include +#include +#include +#include +#include #include #ifdef ET_BUILD_METAL #include @@ -33,17 +44,69 @@ DEFINE_string( timestamps, "segment", "Timestamp output mode: none|token|word|segment|all"); -DEFINE_bool( - runtime_profile, - false, - "Print a detailed runtime profile for preprocessor, encoder, and decode-loop execution."); + +using ::executorch::extension::from_blob; +using ::executorch::extension::Module; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; + +using ::parakeet::TextWithOffsets; +using ::parakeet::TokenWithTextInfo; + +namespace { +// TDT duration values for Parakeet models +const std::vector DURATIONS = {0, 1, 2, 3, 4}; + +struct TimestampOutputMode { + bool token = false; + bool word = false; + bool segment = false; + + bool enabled() const { + return token || word || segment; + } +}; + +std::string to_lower_ascii(std::string s) { + for (char& ch : s) { + ch = static_cast(std::tolower(static_cast(ch))); + } + return s; +} + +TimestampOutputMode parse_timestamp_output_mode(const std::string& raw_arg) { + if (raw_arg.empty()) { + throw std::invalid_argument( + "Invalid --timestamps value (empty). Expected: token, word, segment, all."); + } + const std::string mode = to_lower_ascii(raw_arg); + if (mode == "none") { + return {false, false, false}; + } + if (mode == "token") { + return {true, false, false}; + } + if (mode == "word") { + return {false, true, false}; + } + if (mode == "segment") { + return {false, false, true}; + } + if (mode == "all") { + return {true, true, true}; + } + throw std::invalid_argument( + "Invalid --timestamps value '" + raw_arg + + "'. 
Expected: token, word, segment, all."); +} +} // namespace int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); - parakeet::TimestampOutputMode timestamp_mode; + TimestampOutputMode timestamp_mode; try { - timestamp_mode = parakeet::parse_timestamp_output_mode(FLAGS_timestamps); + timestamp_mode = parse_timestamp_output_mode(FLAGS_timestamps); } catch (const std::invalid_argument& e) { ET_LOG(Error, "%s", e.what()); return 1; @@ -54,57 +117,162 @@ int main(int argc, char** argv) { return 1; } - try { - parakeet::ParakeetTranscriber transcriber( - FLAGS_model_path, FLAGS_tokenizer_path, FLAGS_data_path); - const auto result = transcriber.transcribe_wav_path( - FLAGS_audio_path, - parakeet::TranscribeConfig{timestamp_mode, FLAGS_runtime_profile}); - - std::cout << "Transcribed text: " << result.text << std::endl; - if (!result.stats_json.empty()) { - std::cout << "PyTorchObserver " << result.stats_json << std::endl; - } - if (result.runtime_profile_report.has_value()) { - std::cout << *result.runtime_profile_report; - } + // --- Build config and runner --- + executorch::extension::asr::TransducerConfig config; + config.durations = DURATIONS; -#ifdef ET_BUILD_METAL - executorch::backends::metal::print_metal_backend_stats(); -#endif + std::optional data_path_opt; + if (!FLAGS_data_path.empty()) { + data_path_opt = FLAGS_data_path; + } - if (timestamp_mode.segment) { - std::cout << "\nSegment timestamps:" << std::endl; - for (const auto& segment : result.segment_offsets) { - const double start = segment.start_offset * result.frame_to_seconds; - const double end = segment.end_offset * result.frame_to_seconds; - std::cout << start << "s - " << end << "s : " << segment.text - << std::endl; - } - } + executorch::extension::asr::TransducerRunner runner( + FLAGS_model_path, FLAGS_tokenizer_path, config, data_path_opt); - if (timestamp_mode.word) { - std::cout << "\nWord timestamps:" << std::endl; - for (const auto& word : result.word_offsets) { 
- const double start = word.start_offset * result.frame_to_seconds; - const double end = word.end_offset * result.frame_to_seconds; - std::cout << start << "s - " << end << "s : " << word.text << std::endl; - } - } + auto load_err = runner.load(); + if (load_err != Error::Ok) { + ET_LOG(Error, "Failed to load model."); + return 1; + } - if (timestamp_mode.token) { - std::cout << "\nToken timestamps:" << std::endl; - for (const auto& token : result.token_offsets) { - const double start = token.start_offset * result.frame_to_seconds; - const double end = token.end_offset * result.frame_to_seconds; - std::cout << start << "s - " << end << "s : " << token.decoded_text - << std::endl; - } - } + // --- Load and preprocess audio --- + ET_LOG(Info, "Loading audio from: %s", FLAGS_audio_path.c_str()); + std::vector audio_data = + ::executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path); + ET_LOG(Info, "Loaded %zu audio samples", audio_data.size()); + + auto audio_tensor = from_blob( + audio_data.data(), + {static_cast<::executorch::aten::SizesType>(audio_data.size())}, + ::executorch::aten::ScalarType::Float); + + ET_LOG(Info, "Running preprocessor..."); + auto preprocess_result = runner.preprocess(audio_tensor); + if (!preprocess_result.ok()) { + ET_LOG(Error, "Preprocessing failed."); + return 1; + } + auto preprocess_out = preprocess_result.get(); + // --- Transcribe --- + ET_LOG(Info, "Running TDT greedy decode..."); + auto result = runner.transcribe( + preprocess_out.features, + [](const std::string& piece) { std::cout << piece << std::flush; }, + preprocess_out.length); + + if (!result.ok()) { + ET_LOG(Error, "Transcription failed."); + return 1; + } + + auto& decoded_tokens = result.get(); + ET_LOG(Info, "Decoded %zu tokens", decoded_tokens.size()); + + // Use the runner's tokenizer for text decoding and timestamps + const auto* tokenizer = runner.tokenizer(); + if (!tokenizer || !tokenizer->is_loaded()) { + ET_LOG(Error, "Tokenizer not available."); + 
return 1; + } + + // Print full transcribed text + std::string text = parakeet::tokenizer_utils::decode_token_sequence( + decoded_tokens, *tokenizer); + std::cout << "\nTranscribed text: " << text << std::endl; + +#ifdef ET_BUILD_METAL + executorch::backends::metal::print_metal_backend_stats(); +#endif // ET_BUILD_METAL + + if (!timestamp_mode.enabled()) { return 0; + } + + // --- Timestamps --- + // Query timestamp-related metadata from the model. + // These are Parakeet-specific constants, not part of TransducerRunner. + std::unique_ptr meta_module; + if (data_path_opt) { + meta_module = std::make_unique( + FLAGS_model_path, *data_path_opt, Module::LoadMode::Mmap); + } else { + meta_module = + std::make_unique(FLAGS_model_path, Module::LoadMode::Mmap); + } + auto meta_load_err = meta_module->load(); + if (meta_load_err != Error::Ok) { + ET_LOG(Error, "Failed to load model for timestamp metadata."); + return 1; + } + + std::vector<::executorch::runtime::EValue> empty_inputs; + auto window_stride_result = + meta_module->execute("window_stride", empty_inputs); + auto encoder_subsampling_factor_result = + meta_module->execute("encoder_subsampling_factor", empty_inputs); + + if (!window_stride_result.ok() || !encoder_subsampling_factor_result.ok()) { + ET_LOG( + Error, + "Failed to query timestamp metadata (window_stride, encoder_subsampling_factor)."); + return 1; + } + + double window_stride = window_stride_result.get()[0].toDouble(); + int64_t encoder_subsampling_factor = + encoder_subsampling_factor_result.get()[0].toInt(); + meta_module.reset(); + + ET_LOG(Info, "Computing timestamps..."); + std::unordered_set supported_punctuation = + parakeet::tokenizer_utils::derive_supported_punctuation(*tokenizer); + + std::vector tokens_with_text_info; + try { + tokens_with_text_info = + parakeet::timestamp_utils::get_tokens_with_text_info( + decoded_tokens, *tokenizer, supported_punctuation); } catch (const std::exception& e) { - ET_LOG(Error, "%s", e.what()); + 
ET_LOG(Error, "Failed to get tokens with text info: %s", e.what()); return 1; } + const auto word_offsets = parakeet::timestamp_utils::get_words_offsets( + tokens_with_text_info, *tokenizer, supported_punctuation); + const auto segment_offsets = + parakeet::timestamp_utils::get_segment_offsets(word_offsets); + + const double frame_to_seconds = + window_stride * static_cast(encoder_subsampling_factor); + + if (timestamp_mode.segment) { + std::cout << "\nSegment timestamps:" << std::endl; + for (const auto& segment : segment_offsets) { + const double start = segment.start_offset * frame_to_seconds; + const double end = segment.end_offset * frame_to_seconds; + std::cout << start << "s - " << end << "s : " << segment.text + << std::endl; + } + } + + if (timestamp_mode.word) { + std::cout << "\nWord timestamps:" << std::endl; + for (const auto& word : word_offsets) { + const double start = word.start_offset * frame_to_seconds; + const double end = word.end_offset * frame_to_seconds; + std::cout << start << "s - " << end << "s : " << word.text << std::endl; + } + } + + if (timestamp_mode.token) { + std::cout << "\nToken timestamps:" << std::endl; + for (const auto& token : tokens_with_text_info) { + const double start = token.start_offset * frame_to_seconds; + const double end = token.end_offset * frame_to_seconds; + std::cout << start << "s - " << end << "s : " << token.decoded_text + << std::endl; + } + } + + return 0; } diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt index 726657a3779..e1b54d644b2 100644 --- a/examples/models/qwen3_5_moe/CMakeLists.txt +++ b/examples/models/qwen3_5_moe/CMakeLists.txt @@ -54,14 +54,9 @@ elseif(EXECUTORCH_BUILD_CUDA) list(APPEND link_libraries aoti_cuda_backend) executorch_target_link_options_shared_lib(aoti_cuda_backend) add_compile_definitions(EXECUTORCH_BUILD_CUDA) -elseif(TARGET mlxdelegate) - list(APPEND link_libraries mlxdelegate mlx) - 
executorch_target_link_options_shared_lib(mlxdelegate) - add_compile_definitions(EXECUTORCH_BUILD_MLX) else() message( - FATAL_ERROR - "Set EXECUTORCH_BUILD_CUDA=ON, EXECUTORCH_BUILD_METAL=ON, or EXECUTORCH_BUILD_MLX=ON" + FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_METAL=ON" ) endif() @@ -74,21 +69,9 @@ target_include_directories( ) target_link_libraries(qwen3_5_moe_runner PUBLIC ${link_libraries}) -add_executable(qwen3_5_moe_worker qwen35_moe_worker.cpp qwen35_moe_engine.cpp) -target_include_directories( - qwen3_5_moe_worker PUBLIC ${_common_include_directories} ${_json_include} -) -target_link_libraries(qwen3_5_moe_worker PUBLIC ${link_libraries}) - if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") target_link_options_gc_sections(qwen3_5_moe_runner) target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s") - target_link_options_gc_sections(qwen3_5_moe_worker) - target_link_options(qwen3_5_moe_worker PRIVATE "LINKER:-s") -endif() - -if(TARGET mlxdelegate) - executorch_target_copy_mlx_metallib(qwen3_5_moe_runner) endif() if(EXECUTORCH_BUILD_CUDA) diff --git a/examples/models/qwen3_5_moe/CMakePresets.json b/examples/models/qwen3_5_moe/CMakePresets.json index 276c2116148..0d6de7f60eb 100644 --- a/examples/models/qwen3_5_moe/CMakePresets.json +++ b/examples/models/qwen3_5_moe/CMakePresets.json @@ -36,42 +36,19 @@ "type": "equals", "rhs": "Darwin" } - }, - { - "name": "qwen3-5-moe-mlx", - "displayName": "Qwen3.5 MoE runner (MLX)", - "inherits": ["qwen3-5-moe-base"], - "cacheVariables": { - "EXECUTORCH_BUILD_MLX": "ON" - }, - "condition": { - "type": "equals", - "lhs": "${hostSystemName}", - "rhs": "Darwin" - } } ], "buildPresets": [ { "name": "qwen3-5-moe-cuda", - "displayName": "Build Qwen3.5 MoE runner, worker, and no-bleed test (CUDA)", + "displayName": "Build Qwen3.5 MoE runner + no-bleed test (CUDA)", "configurePreset": "qwen3-5-moe-cuda", - "targets": [ - "qwen3_5_moe_runner", - "qwen3_5_moe_worker", - "test_qwen35_moe_nobleed" - ] + "targets": 
["qwen3_5_moe_runner", "test_qwen35_moe_nobleed"] }, { "name": "qwen3-5-moe-metal", - "displayName": "Build Qwen3.5 MoE runner and worker (Metal)", + "displayName": "Build Qwen3.5 MoE runner (Metal)", "configurePreset": "qwen3-5-moe-metal", - "targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"] - }, - { - "name": "qwen3-5-moe-mlx", - "displayName": "Build Qwen3.5 MoE runner (MLX)", - "configurePreset": "qwen3-5-moe-mlx", "targets": ["qwen3_5_moe_runner"] } ], @@ -103,20 +80,6 @@ "name": "qwen3-5-moe-metal" } ] - }, - { - "name": "qwen3-5-moe-mlx", - "displayName": "Configure and build Qwen3.5 MoE runner (MLX)", - "steps": [ - { - "type": "configure", - "name": "qwen3-5-moe-mlx" - }, - { - "type": "build", - "name": "qwen3-5-moe-mlx" - } - ] } ] } diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md index c275641bfd7..e3f13cc77d6 100644 --- a/examples/models/qwen3_5_moe/README.md +++ b/examples/models/qwen3_5_moe/README.md @@ -147,56 +147,6 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \ `--cuda_graph` is intentionally single-session only. CUDA graph replay captures device pointers, so it is not combined with per-session mutable-state rebinding. -## OpenAI-compatible serving - -The CUDA build also produces `qwen3_5_moe_worker`, a C++ model-execution worker -used by the generic `examples/llm_server` control plane. The Qwen launcher wires -in the model's Hugging Face chat template and Qwen XML tool-call parser: - -```bash -python -m executorch.examples.models.qwen3_5_moe.serve \ - --model-path qwen35_moe_exports/model.pte \ - --data-path qwen35_moe_exports/aoti_cuda_blob.ptd \ - --tokenizer-path ~/models/Qwen3.5-35B-A3B/tokenizer.json \ - --hf-tokenizer ~/models/Qwen3.5-35B-A3B \ - --model-id qwen3.5-moe \ - --max-context 4096 \ - --max-sessions 4 \ - --no-think -``` - -`--max-sessions` controls how many isolated sessions the worker can host on one -weight load. 
One slot is reserved for anonymous requests; clients should send a -stable `session_id` (or session-affinity header) to get per-conversation -isolation and warm append-only resume. - -### Use from pi - -Point pi at the Qwen server via `~/.pi/agent/models.json`: - -```json -{ - "providers": { - "executorch": { - "baseUrl": "http://127.0.0.1:8000/v1", - "api": "openai-completions", - "apiKey": "x", - "models": [ - { - "id": "qwen3.5-moe", - "compat": { "sendSessionAffinityHeaders": true } - } - ] - } - } -} -``` - -The model id must match `--model-id`. `sendSessionAffinityHeaders` lets pi route -each conversation or subagent to a stable server session; without it, requests -use the anonymous scratch session and do not get per-conversation isolation or -warm resume. - ### CUDA no-bleed test The CUDA build also produces `test_qwen35_moe_nobleed`, which validates that two @@ -261,38 +211,7 @@ python export.py \ | `--qembedding` | (none) | Embedding quantization: `8w` | | `--tiny-test` | off | Build tiny model with random weights for CI testing | -### Build (MLX) - -Like the CUDA/Metal builds, the `make` target builds ExecuTorch core with the -MLX backend and the runner binary. Requires Apple Silicon (Darwin). - -```bash -make qwen3_5_moe-mlx -``` - -This builds ExecuTorch with MLX support, then the runner binary at -`cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner` (with `mlx.metallib` -copied next to it). Unlike CUDA, the MLX `.pte` is self-contained — no `.ptd` -data file is produced or needed. - -### Run (MLX, C++ runner) - -The C++ runner requires a local HuggingFace `tokenizer.json` (the MLX `.pte` and -a `tokenizer.json`; no `--data_path`): - -```bash -cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \ - --model_path ./qwen35_moe_mlx/model.pte \ - --tokenizer_path ~/models/Qwen3.5-35B-A3B/tokenizer.json \ - --prompt "What is the capital of France?" 
\ - --max_new_tokens 50 -``` - -The MLX export emits a single dynamic-seq `forward` method; the runner loads and -calls it for both prefill and decode (sampling on host), matching the Python -runner. See the [Run](#run) section above for the full flag list. - -### Run (MLX, Python) +### Run (MLX) ```bash python -m executorch.examples.models.qwen3_5_moe.run \ diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py index 566d61e6cfc..d7e7d9ca293 100644 --- a/examples/models/qwen3_5_moe/export.py +++ b/examples/models/qwen3_5_moe/export.py @@ -768,16 +768,10 @@ def _export_mlx(model, config, args): gc.collect() print("Lowering to ExecuTorch with MLX backend...") - # Largest prefill chunk the runner may submit in one forward call. The MLX - # runner chunks long prompts to cap peak memory; bound it by the compiled - # dynamic max (max_seq_len - 1) so a chunk can never exceed what `forward` - # was compiled for. - max_prefill_chunk = min(1024, config.max_seq_len - 1) metadata = { "get_max_seq_len": config.max_seq_len, "get_vocab_size": config.vocab_size, "get_n_layers": config.num_hidden_layers, - "get_max_prefill_chunk": max_prefill_chunk, "use_kv_cache": True, "use_sdpa_with_kv_cache": False, "enable_dynamic_shape": True, diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp index 713f6211330..3c5b2eec439 100644 --- a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp +++ b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp @@ -19,8 +19,6 @@ #include #include -#include - #ifdef EXECUTORCH_BUILD_CUDA #include #include @@ -41,22 +39,6 @@ using SizesType = executorch::aten::SizesType; namespace { -#ifdef EXECUTORCH_BUILD_MLX -// The MLX export emits a single dynamic-seq `forward` method that handles both -// prefill (T>=2) and decode (T=1). Mirror gemma4_31b's MLX runner, which loads -// and calls `forward` for both phases. 
-constexpr const char* kPrefillMethod = "forward"; -constexpr const char* kDecodeMethod = "forward"; -#else -// CUDA/Metal exports emit two separate methods. -constexpr const char* kPrefillMethod = "prefill"; -constexpr const char* kDecodeMethod = "decode"; -#endif - -// Constant method exported by the MLX .pte giving the largest prefill chunk the -// `forward` method was compiled for. Read into the metadata map in create(). -constexpr const char* kMaxPrefillChunk = "get_max_prefill_chunk"; - Result read_sampled_token( const executorch::aten::Tensor& output, float temperature) { @@ -116,10 +98,8 @@ Result> build_qwen_module( } #endif - ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kPrefillMethod)); - if (std::string(kDecodeMethod) != std::string(kPrefillMethod)) { - ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kDecodeMethod)); - } + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("prefill")); + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("decode")); return module; } @@ -260,63 +240,34 @@ class Qwen35MoESession : public LLMSession { } stop_.store(false, std::memory_order_relaxed); - - // On MLX, run prefill in fixed-size chunks (caps peak memory and the - // compiled prefill shape). Other backends prefill the whole prompt in one - // pass. Only the final chunk's sampled token is kept; the recurrence/KV - // state from earlier chunks persists via pos_ advancement. -#ifdef EXECUTORCH_BUILD_MLX - // Chunk size: default to the compiled max (kMaxSeqLen - 1), overridden by - // the exported get_max_prefill_chunk constant when present (mirrors - // gemma4_31b). Falls back to T (single pass) if no metadata is available at - // all. 
- int64_t chunk_size = T; - if (auto it = metadata_.find(kMaxSeqLen); - it != metadata_.end() && it->second > 1) { - chunk_size = it->second - 1; - } - if (auto it = metadata_.find(kMaxPrefillChunk); - it != metadata_.end() && it->second > 0) { - chunk_size = it->second; + std::vector token_data(tokens.begin(), tokens.end()); + std::vector pos_data(T); + for (int64_t i = 0; i < T; ++i) { + pos_data[i] = pos_ + i; } -#else - const int64_t chunk_size = T; -#endif - - uint64_t sampled_token = 0; - for (int64_t off = 0; off < T; off += chunk_size) { - const int64_t len = std::min(chunk_size, T - off); - std::vector token_data( - tokens.begin() + off, tokens.begin() + off + len); - std::vector pos_data(len); - for (int64_t i = 0; i < len; ++i) { - pos_data[i] = pos_ + i; - } - auto tokens_tensor = from_blob( - token_data.data(), - {1, static_cast(len)}, - executorch::aten::ScalarType::Long); - auto pos_tensor = from_blob( - pos_data.data(), - {static_cast(len)}, - executorch::aten::ScalarType::Long); - - const char* method = (len >= 2) ? kPrefillMethod : kDecodeMethod; - std::vector inputs; - inputs.push_back(tokens_tensor); - inputs.push_back(pos_tensor); + auto tokens_tensor = from_blob( + token_data.data(), + {1, static_cast(T)}, + executorch::aten::ScalarType::Long); + auto pos_tensor = from_blob( + pos_data.data(), + {static_cast(T)}, + executorch::aten::ScalarType::Long); + + const char* method = (T >= 2) ? 
"prefill" : "decode"; + std::vector inputs; + inputs.push_back(tokens_tensor); + inputs.push_back(pos_tensor); #ifdef EXECUTORCH_BUILD_CUDA - set_temp(first_token_temp); - inputs.push_back(EValue(temp_tensor_)); + set_temp(first_token_temp); + inputs.push_back(EValue(temp_tensor_)); #endif - auto sampled = - run_locked(method, inputs, first_token_temp, /*sync_after=*/true); - ET_CHECK_OK_OR_RETURN_ERROR(sampled.error()); - sampled_token = sampled.get(); - pos_ += len; - } - pending_ = sampled_token; + auto sampled = + run_locked(method, inputs, first_token_temp, /*sync_after=*/true); + ET_CHECK_OK_OR_RETURN_ERROR(sampled.error()); + pending_ = sampled.get(); prev_decode_token_.reset(); + pos_ += T; return Error::Ok; } @@ -383,7 +334,7 @@ class Qwen35MoESession : public LLMSession { inputs.push_back(EValue(temp_tensor_)); #endif auto sampled = - run_locked(kDecodeMethod, inputs, temperature_, /*sync_after=*/false); + run_locked("decode", inputs, temperature_, /*sync_after=*/false); ET_CHECK_OK_OR_RETURN_ERROR(sampled.error()); pending_ = sampled.get(); prev_decode_token_ = token; @@ -506,14 +457,6 @@ Result> Qwen35MoEEngine::create( ET_LOG(Error, "Qwen35MoEEngine: failed to read metadata"); return metadata_result.error(); } -#ifdef EXECUTORCH_BUILD_MLX - // Surface the compiled max prefill chunk (a constant method get_llm_metadata - // doesn't harvest) into the metadata map so the session can chunk long - // prompts within the shape `forward` was compiled for. 
- if (auto mpc = meta_module->get(kMaxPrefillChunk); mpc.ok()) { - metadata_result.get()[kMaxPrefillChunk] = mpc->toScalar().to(); - } -#endif auto eos_ids = get_eos_ids(tokenizer.get(), meta_module.get()); // This export's metadata doesn't carry the chat-turn EOS (config.json has no // eos_token_id and the .pte exports no get_eos_ids method), so get_eos_ids() diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md index 4a5d1fd023d..8e0dc70bbb5 100644 --- a/examples/qualcomm/oss_scripts/llama/README.md +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -130,12 +130,12 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL Default example using hybrid mode. ```bash python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_tasks wikitext --calib_limit 1 - +``` #### Codegen2 Default example using kv mode. ```bash -python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model codegen2_1b --model_mode kv --max_seq_len 1024 --prompt "def hello_world():" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/text.json +python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model codegen2_1b --model_mode kv --max_seq_len 1024 --prompt "def hello_world():" ``` #### Gemma 2B @@ -210,17 +210,7 @@ Default example using hybrid mode. 
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm3-3b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_tasks wikitext --calib_limit 1 ``` -#### Using custom calibration samples for LLMs - -Instead of `--calib_tasks`, you can supply your own conversation JSON files via `--calib_samples`. The samples are fed into the quantization calibration pass to collect activation observer statistics — they do not affect the inference prompt. This is useful when you want to calibrate on domain-specific or instruct-format data rather than a generic lm_eval task. - -```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm2_135m --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/text.json -``` - -You can also provide both `--calib_tasks` and `--calib_samples` at the same time; the pipeline concatenates both data sources for calibration. - - +## Multimodal Support ### Overview @@ -278,7 +268,7 @@ pip install soundfile Default example using hybrid mode. ```bash -python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model granite_speech_3_3-2b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "can you transcribe the speech into a written format?" 
--audio_path "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/audio.json +python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model granite_speech_3_3-2b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "can you transcribe the speech into a written format?" --audio_path "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true" ``` ### Specifying Custom Audio @@ -291,6 +281,9 @@ You can specify a custom audio file for ALM models using the `--audio_path` flag - **Local file paths**: Absolute or relative paths to `.wav` files on your system - Example: `"/path/to/your/audio.wav"` +**Default behavior:** +If `--audio_path` is not specified, the system will automatically use the default audio file defined in the model's configuration file (`encoder/encoder_config.py`). + #### Audio Preprocessing The audio encoder configuration is defined in `encoder/encoder_config.py`: @@ -301,6 +294,7 @@ The audio encoder configuration is defined in `encoder/encoder_config.py`: class GraniteSpeechEncoder(AudioModalityConfig): encoder_class = GraniteSpeechCTCEncoderWrapper audio_seq_len = 171 + audio_url = "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true" # Default audio (content: "After his nap, ...") quant_recipe = GraniteSpeechEncoderQuantRecipe ``` @@ -357,13 +351,13 @@ Vision-Language Models (VLMs) combine computer vision and natural language proce #### SmolVLM 500M Default example using hybrid mode. 
```bash -python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode hybrid --prefill_ar_len 16 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json +python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode hybrid --prefill_ar_len 16 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" ``` #### InternVL 1B Default example using hybrid mode. ```bash -python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model internvl3_1b --model_mode hybrid --prefill_ar_len 32 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json +python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model internvl3_1b --model_mode hybrid --prefill_ar_len 32 --max_seq_len 1024 --prompt "Can you describe this image?" 
--image_path "http://images.cocodataset.org/val2017/000000039769.jpg" ``` ### Specifying Custom Image @@ -376,6 +370,9 @@ Take a example image of Statue-of-Liberty in New York Bay - **Local file paths**: Absolute or relative paths to image files on your system - Example: [`./examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png`](assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png) +**Default behavior:** +If `--image_path` is not specified, the system will automatically use the default image URL defined in the model's configuration file (`encoder/encoder_config.py`). + #### Image Preprocessing Each VLM model has specific preprocessing requirements defined in its configuration: @@ -388,6 +385,7 @@ class SmolVLMEncoder(VisionModalityConfig): img_seq_len = 64 img_resized_h = 512 img_resized_w = 512 + img_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" # Default image quant_recipe = SmolVLMEncoderQuantRecipe ``` @@ -429,7 +427,7 @@ PROMPT2="Answer the question: What's the main object in first image?" PROMPT3="Caption this image." 
# Execute the multi-turn conversation -python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json +python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL" ``` **How it works:** @@ -455,19 +453,16 @@ The VLM inference pipeline consists of: - Special tokens (e.g., ``, `<|fake_token_around_image|>`, ``) mark modality boundaries (see [tokenizer.py](tokenizer.py)) ```python - # Token fields on each encoder config subclass (encoder/encoder_config.py) - @dataclass(init=False, frozen=True) - class SmolVLMEncoder(VisionModalityConfig): - img_token = "" - fake_wrap_start = "" - fake_wrap_end = "" - global_img_token = "" - - @dataclass(init=False, frozen=True) - class InternVL3Encoder(VisionModalityConfig): - img_token = "" - fake_wrap_start = "" - fake_wrap_end = "" + # Special tokens for Vision-Language Model + VLM_SPECIAL_TOKENS = { + "smolvlm_500m_instruct": { + "image_token": "", + "global_img": "", + "fake_wrap_start": "", + "fake_wrap_end": "", + }, + ... + } ``` - Final fused sequence: `[batch, img_seq_len + text_seq_len, hidden_dim]` @@ -550,13 +545,16 @@ From the example script above, 1 wikitext sample is used to evaluate all 3 phase Example: ```bash # 1st run to compile with --calib_limit 1 -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" 
--temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --calib_tasks wikitext --calib_limit 1 -a ${FOLDER_TO_PRE_GEN_PTE} --compile_only +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --calib_tasks wikitext --calib_limit 1 --compile_only ``` ```bash # 2nd run to perform QNN device execution with --eval_limit 3 -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --eval_tasks wikitext --eval_limit 3 --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --eval_tasks wikitext --eval_limit 3 --pre_gen_pte ${PATH_TO_ARTIFACT_IN_1ST_RUN} --quant_attrs_path ${PATH_TO_ARTIFACT_IN_1ST_RUN}/kv_llama_qnn_quant_attrs.json ``` +#### Tasks quantization calibration +If `--calib_tasks ${TASK}` is not provided, the program will use `--prompt ${PROMPT}` as the dataset for quantization calibration. +`--calib_tasks` and `--eval_tasks` are independent flags. `--calib_tasks` controls which tasks are used for quantization calibration, while `--eval_tasks` controls which tasks are used for perplexity evaluation. They can be set to different tasks or limits as needed. 
#### SQNR Evalution To evaluate QNN's output logits against the golden logits from `nn.Module`, users can provide the flag `--sqnr_eval`. Please note that SQNR evaluation will only compare the logits of the user's prompt and will not compare the new tokens generated by the model. @@ -565,52 +563,6 @@ Example: python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods sqnr_eval ``` - - -#### Quantization - -The calibration data is independent from the runtime evaluation set, and only affects quantization quality, not the inference output. - -Calibration data is required for compilation. There are two ways to supply it: - -1. **`--calib_tasks`** — calibrate on one or more lm_eval tasks (tune with `--calib_limit` and `--calib_num_fewshot`). LLM-only. -2. **`--calib_samples`** — calibrate on custom conversation samples provided as JSON files (see format below). Required for multimodal models (VLM/ALM). - -For LLMs, provide at least one of the two; for multimodal models, `--calib_samples` is mandatory. - -Calibration and runtime evaluation use separate flag sets and can target different tasks or limits as needed: - -| Purpose | Flags | -|---|---| -| Calibration data (lm_eval tasks) | `--calib_tasks`, `--calib_limit`, `--calib_num_fewshot` | -| Calibration data (custom samples) | `--calib_samples` (JSON files, HuggingFace message format) | - -##### Custom calibration samples (`--calib_samples`) - -`--calib_samples` accepts one or more JSON files. Each file is a flat list of sample objects. 
Each sample has a `messages` field following the HuggingFace chat template, and an optional `files` field for media inputs (local paths or URLs): - -```json -[ - { - "files": ["path/or/url/to/files"], - "messages": [ - {"role": "user", "content": "..." }, - {"role": "assistant", "content": "..."} - ] - } -] -``` - -`files` is only required for multimodal models (VLM: image paths/URLs, ALM: audio paths/URLs). For LLM-only models, `files` can be omitted. `content` can be a plain string or a list of HuggingFace content blocks (e.g. `[{"type": "image"}, {"type": "text", "text": "..."}]` for vision inputs). - -Ready-to-use examples for each model type are provided under `assets/samples/`: - -| Model type | Example file | -|---|---| -| LLM | [assets/samples/text.json](assets/samples/text.json) | -| ALM (audio) | [assets/samples/audio.json](assets/samples/audio.json) | -| VLM (vision) | [assets/samples/vision.json](assets/samples/vision.json) | - #### Quantization Guidance To automatically identify sensitive layers and generate a mixed-precision recipe suggestion, add the `--quant_recipe_suggestion` flag. 
During calibration, the analyzer compares FP32 and QDQ intermediate outputs layer-by-layer using SQNR, then writes two files to the working directory: diff --git a/examples/qualcomm/oss_scripts/llama/TARGETS b/examples/qualcomm/oss_scripts/llama/TARGETS index c00525d6fe7..30b86eabb01 100644 --- a/examples/qualcomm/oss_scripts/llama/TARGETS +++ b/examples/qualcomm/oss_scripts/llama/TARGETS @@ -23,6 +23,17 @@ runtime.python_library( ], ) +runtime.python_library( + name = "decoder_utils", + srcs = [ + "decoder_utils.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/examples/models/llama:eval_library", + ], +) + runtime.python_library( name = "masking_utils", srcs = [ @@ -70,112 +81,19 @@ runtime.python_library( srcs = [ "tokenizer.py", ], - deps = [ - ":decoder_constants", - ":static_llama", - "//caffe2:torch", - "fbsource//third-party/pypi/transformers:transformers", - ], -) - -runtime.python_library( - name = "utils", - srcs = [ - "utils.py", - ], deps = [ "//caffe2:torch", - "//executorch/exir:lib", - ], -) - -runtime.python_library( - name = "inference", - srcs = [ - "inference/__init__.py", - "inference/decoder.py", - "inference/encoder.py", - "inference/model.py", - ], - deps = [ - ":masking_utils", - "//caffe2:torch", ], ) runtime.python_library( name = "dataset", srcs = [ - "dataset/__init__.py", - "dataset/builders.py", - "dataset/collators.py", - "dataset/config.py", - "dataset/datasets.py", - "dataset/loaders.py", - "dataset/preprocessors.py", - "dataset/schema.py", - ], - deps = [ - ":decoder_constants", - ":encoder", - ":masking_utils", - ":tokenizer", - "//caffe2:torch", - "//executorch/examples/models/llama:eval_library", - "fbsource//third-party/pypi/lm-eval:lm-eval", - "fbsource//third-party/pypi/transformers:transformers", - ], -) - -runtime.python_library( - name = "quantize", - srcs = [ - "quantize/__init__.py", - "quantize/ptq.py", - "quantize/strategy.py", - ], - deps = [ - ":decoder_constants", - ":inference", - ":utils", - 
"//caffe2:torch", - "//executorch/backends/qualcomm/_passes:passes", - ], -) - -runtime.python_library( - name = "mix_precision_analyzer", - srcs = [ - "mix_precision_analyzer.py", - ], - deps = [ - ":inference", - "//caffe2:torch", - "//executorch/backends/qualcomm/quantizer:quantizer", - "//executorch/devtools:lib", - "//executorch/exir:lib", - "//pytorch/ao:torchao", - ], -) - -runtime.python_library( - name = "evaluator", - srcs = [ - "evaluator/__init__.py", - "evaluator/device_evaluator.py", - "evaluator/lm_eval_adapter.py", + "dataset.py", ], deps = [ - ":dataset", - ":decoder_constants", - ":inference", ":tokenizer", - ":utils", "//caffe2:torch", - "//executorch/backends/qualcomm:export_utils", - "//executorch/examples/models/llama:eval_library", - "//pytorch/ao:torchao", - "fbsource//third-party/pypi/lm-eval:lm-eval", ], ) @@ -188,16 +106,10 @@ runtime.python_library( "wrappers/llm_wrappers.py", ], deps = [ - ":dataset", ":decoder_constants", ":encoder", - ":evaluator", - ":inference", - ":mix_precision_analyzer", - ":quantize", ":static_llama", ":static_llm_quant_recipe", - ":tokenizer", "//caffe2:torch", "//executorch/backends/qualcomm:export_utils", "//executorch/backends/qualcomm/_passes:passes", @@ -217,11 +129,10 @@ runtime.python_library( deps = [ ":dataset", ":decoder_constants", + ":decoder_utils", ":encoder", - ":evaluator", ":masking_utils", ":static_llm_quant_recipe", - ":tokenizer", ":wrappers", "//executorch/examples/models/llama:source_transformation", "//caffe2:torch", @@ -269,6 +180,22 @@ python_binary( ], ) +python_binary( + name = "eval_llama_qnn", + srcs = ["eval_llama_qnn.py"], + main_function = "executorch.examples.qualcomm.oss_scripts.llama.eval_llama_qnn.main", + preload_deps = [ + "//executorch/extension/llm/custom_ops:model_sharding_py", + ], + deps = [ + ":llama_lib", + "//executorch/examples/models/llama:eval_library", + "//executorch/examples/qualcomm/oss_scripts/llama:range_setting_pt2e", + 
"fbsource//third-party/pypi/lm-eval:lm-eval", + ], + keep_gpu_sections = True, +) + runtime.command_alias( name = "llama_qnn", env = { diff --git a/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py b/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py index b53f4bda689..9ed44f6f3e0 100644 --- a/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py +++ b/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py @@ -49,10 +49,12 @@ class AudioModalityConfig(MultiModalityConfig): Attributes: audio_seq_len: Number of audio tokens in the sequence. + audio_url: Default audio URL for validation and calibration. """ audio_seq_len: int n_bins: int + audio_url: str def create_encoder(self, config): return self.encoder_class(config, n_bins=self.n_bins) @@ -69,11 +71,13 @@ class VisionModalityConfig(MultiModalityConfig): img_seq_len: Number of image tokens/patches in the sequence. img_resized_h: Target height for image resizing (pixels). img_resized_w: Target width for image resizing (pixels). + img_url: Default image URL for validation and calibration. 
""" img_seq_len: int img_resized_h: int img_resized_w: int + img_url: str def create_encoder(self, config): return self.encoder_class( @@ -90,6 +94,7 @@ class GraniteSpeechEncoder(AudioModalityConfig): encoder_class = GraniteSpeechCTCEncoderWrapper audio_seq_len = 171 n_bins = 844 + audio_url = "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true" quant_recipe = GraniteSpeechEncoderQuantRecipe num_sharding = 8 @@ -104,6 +109,7 @@ class SmolVLMEncoder(VisionModalityConfig): img_seq_len = 64 img_resized_h = 512 img_resized_w = 512 + img_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" quant_recipe = SmolVLMEncoderQuantRecipe @@ -117,4 +123,5 @@ class InternVL3Encoder(VisionModalityConfig): img_seq_len = 256 img_resized_h = 448 img_resized_w = 448 + img_url = "http://images.cocodataset.org/val2017/000000039769.jpg" quant_recipe = InternVL3EncoderQuantRecipe diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index d3d4a475288..ea09451a697 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -12,7 +12,7 @@ import os import sys from multiprocessing.connection import Client -from typing import Dict, List +from typing import Dict import torch from executorch.backends.qualcomm.export_utils import ( @@ -30,11 +30,7 @@ LLMModelConfig, SUPPORTED_LLM_MODELS, ) -from executorch.examples.qualcomm.oss_scripts.llama.dataset import ( - DataConfig, - DatasetBuilder, - MessageSample, -) +from executorch.examples.qualcomm.oss_scripts.llama.dataset import DatasetBuilder from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( ATTENTION_SINK_EVICTOR, AUDIO_ENCODER, @@ -50,7 +46,7 @@ TOK_EMBEDDING_GRAPH_NAMES, VISION_ENCODER, ) -from executorch.examples.qualcomm.oss_scripts.llama.evaluator.device_evaluator import ( +from 
executorch.examples.qualcomm.oss_scripts.llama.decoder_runtime_evaluator import ( DefaultEval, SqnrEval, TaskEval, @@ -100,9 +96,10 @@ def compile( args, decoder_model_config: LLMModelConfig, pte_filenames: Dict[str, str], - tokenizer_wrapper, + tokenizer, + calibration_data, is_multimodal, -) -> Dict[str, List]: +): os.makedirs(args.artifact, exist_ok=True) multi_modal_mgr = MultiModalManager(control_args=args, config=decoder_model_config) @@ -188,8 +185,9 @@ def compile( # perform ptq multi_modal_mgr.quantize( - tokenizer_wrapper=tokenizer_wrapper, + calibration_data=calibration_data, skip_quantize=skip_quantize, + tokenizer=tokenizer, backend=get_backend_type(args.backend), soc_model=args.soc_model, ) @@ -206,14 +204,15 @@ def inference( args, decoder_model_config: LLMModelConfig, runtime_tokenizer_path, - tokenizer_wrapper: TokenizerWrapper, + tokenizer, + chat_template, text_decoder_pte_path: str, encoder_pte_paths: Dict[str, str], tok_embedding_pte_path: str, attention_sink_evictor_pte_path: str, + calibration_data, is_multimodal, ): - tokenizer = tokenizer_wrapper.tokenizer assert args.model_mode in EVAL_MODE, f"Unknown model_mode: {args.model_mode}." 
@@ -251,35 +250,15 @@ def inference( {modality: encoder_pte_path}, ) - multi_modal_mgr = MultiModalManager(control_args=args, config=decoder_model_config) - audio_encoder = multi_modal_mgr.audio_encoder.model - vision_encoder = multi_modal_mgr.vision_encoder.model - tok_embedding = multi_modal_mgr.text_decoder.calibration_prefill.tok_embedding - source_model = multi_modal_mgr.text_decoder.calibration_prefill.decoder - audio_token_id = multi_modal_mgr.text_decoder.calibration_prefill.meta.get( - "audio_token_id", None - ) - image_token_id = multi_modal_mgr.text_decoder.calibration_prefill.meta.get( - "image_token_id", None - ) - dataset_builder = DatasetBuilder( - DataConfig.from_args(args), - decoder_model_config, - tokenizer_wrapper, - attn_mask=source_model.get_example_inputs()[1], - ) if PROMPT_EVAL in args.eval_methods: prompt_evaluator = DefaultEval( args=args, - decoder_model_config=decoder_model_config, pte_paths=pte_paths, runtime_tokenizer_path=runtime_tokenizer_path, is_multimodal=is_multimodal, - dataset_builder=dataset_builder, - ) - output_prompt = prompt_evaluator.run( - prompt=args.prompt, audio_paths=args.audio_path, image_paths=args.image_path + modality_inputs=calibration_data, ) + output_prompt = prompt_evaluator.run(prompt=args.prompt) eval_results.update( { "inference_speed": prompt_evaluator.inference_speed, @@ -291,31 +270,31 @@ def inference( if SQNR_EVAL in args.eval_methods: assert not is_multimodal, "Modality Model does not support SQNR_EVAL." 
- runtime_message = tokenizer_wrapper.prepare_messages(args.prompt)[0] - message = MessageSample( - files=runtime_message["files_path"], - messages=tokenizer_wrapper.make_chat_template( - runtime_message["text"], args.system_prompt - ), + tokenizer_wrapper = TokenizerWrapper( + args, + decoder_model_config, + ) + prompt = ( + tokenizer_wrapper.apply_prompt_template( + chat_template, args.prompt[0], args.system_prompt + ) + if chat_template is not None + else args.prompt[0] ) + multi_modal_mgr = MultiModalManager( + control_args=args, config=decoder_model_config + ) + source_model = multi_modal_mgr.text_decoder.decode.decoder sqnr_evaluator = SqnrEval( source_model=source_model, get_example_inputs=source_model.get_example_inputs, args=args, pte_paths=pte_paths, - tokenizer_wrapper=tokenizer_wrapper, - decoder_model_config=decoder_model_config, + tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, is_multimodal=is_multimodal, - dataset_builder=dataset_builder, - encoder=audio_encoder or vision_encoder, - tok_embedding=tok_embedding, - audio_token_id=audio_token_id, - image_token_id=image_token_id, - ) - sqnr, golden_logits, _ = sqnr_evaluator.run( - message, audio_paths=args.audio_path, image_paths=args.image_path ) + sqnr, golden_logits, _ = sqnr_evaluator.run(prompt=prompt) logging.info(f"SQNR Eval Score between FP32 nn.Module and QNN: {sqnr}") eval_results.update( { @@ -336,19 +315,11 @@ def inference( get_example_inputs=source_model.get_example_inputs, args=args, pte_paths=pte_paths, - tokenizer_wrapper=tokenizer_wrapper, - decoder_model_config=decoder_model_config, + tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, is_multimodal=is_multimodal, - dataset_builder=dataset_builder, - encoder=audio_encoder or vision_encoder, - tok_embedding=tok_embedding, - audio_token_id=audio_token_id, - image_token_id=image_token_id, - ) - qdq_sqnr, cpu_qdq_logits, _ = qdq_sqnr_evaluator.run( - message, audio_paths=args.audio_path, 
image_paths=args.image_path ) + qdq_sqnr, cpu_qdq_logits, _ = qdq_sqnr_evaluator.run(prompt=prompt) eval_results["qdq_sqnr"] = qdq_sqnr logging.info(f"SQNR Eval Score between CPU QDQ and QNN: {qdq_sqnr}") logging.info( @@ -364,7 +335,6 @@ def inference( # Generate the eval wrapper ppl_evaluator = TaskEval( args=args, - decoder_model_config=decoder_model_config, pte_paths=pte_paths, tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, @@ -440,7 +410,7 @@ def _build_parser(): parser.add_argument( "--prompt", - help="User prompts used during runtime inference only (not compilation or calibration). When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.", + help="User prompts for Llama. When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.", required=True, type=str, nargs="+", @@ -536,7 +506,7 @@ def _build_parser(): parser.add_argument( "--audio_path", - help="Path to the audio file used during runtime inference only (not compilation or calibration). For multimodal language models (MLLM). If not specified, the default audio from encoder/encoder_config.py will be used. The audio should be preprocessed and saved in raw binary format.", + help="Path to the audio file for multimodal language models (MLLM). If not specified, the default audio from encoder/encoder_config.py will be used. The audio should be preprocessed and saved in raw binary format.", default=[], type=str, nargs="+", @@ -544,7 +514,7 @@ def _build_parser(): parser.add_argument( "--image_path", - help="Path to the image file used during runtime inference only (not compilation or calibration). For multimodal language models (MLLM). If not specified, the default image from encoder/encoder_config.py will be used. 
The image should be preprocessed and saved in raw binary format.", + help="Path to the image file for multimodal language models (MLLM). If not specified, the default image from encoder/encoder_config.py will be used. The image should be preprocessed and saved in raw binary format.", default=[], type=str, nargs="+", @@ -558,7 +528,7 @@ def _build_parser(): help="Choose eval methods(default: prompt_eval). Users can provide more than 1 eval methods. For example: --eval_methods tasks_eval sqnr_eval." "Following eval methods are supported:" "1) prompt_eval: Model will generate the output response based on the provided prompt through the flag --prompt." - "2) tasks_eval: This will eval the tasks provided through the flag --eval_tasks." + "2) tasks_eval: This will eval the tasks provided through the flag --eval_tasks." "3) sqnr_eval: This will eval the sqnr between between QNN's output logit V.S. Static Llama nn.Module's output logit. Eval is based on the provided prompt through the --prompt flag. Please note that sqnr will only eval the prompt's logit but not the new generated token's logit.", ) @@ -576,7 +546,6 @@ def _build_parser(): default=1, help="number of samples to evalulate. If not set, evaluate all samples", ) - parser.add_argument( "--eval_num_fewshot", type=int, @@ -608,19 +577,6 @@ def _build_parser(): help="Number of examples to calibrate in few-shot context", ) - parser.add_argument( - "--calib_samples", - nargs="+", - type=str, - default=None, - help="One or more paths to calibration sample JSON files. Only JSON format is supported. " - "Each file must be a flat list of sample objects: " - '[{"files": ["path_or_url", ...], "messages": [{"role": "user"|"assistant", "content": "..." | [...]}]}]. ' - '"files" is optional and only required for multimodal models (VLM: image paths/URLs, ALM: audio paths/URLs). ' - '"messages" follows the HuggingFace chat template; "content" can be a plain string or a list of content blocks. 
' - "Multiple files are merged.", - ) - parser.add_argument( "-F", "--use_fp16", @@ -631,16 +587,31 @@ def _build_parser(): parser.add_argument("-v", "--verbose", action="store_true") + parser.add_argument( + "--calibration_num_threads", + type=int, + default=0, + help="Thread count for calibration forward passes. 0 = auto-tune (default).", + ) + parser.add_argument( "--quant_recipe_suggestion", action="store_true", help="Enable automatic quant recipe suggestion in PTQ", ) + parser.add_argument( + "--skip_user_prompt_calibration", + action="store_true", + help="Skip using user prompt for calibration. Useful when only dataset-based calibration is desired.", + ) + return parser def export_llama(args) -> None: + if args.calibration_num_threads < 0: + raise ValueError("--calibration_num_threads must be >= 0") if args.compile_only and args.pre_gen_pte: raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true") if (TASKS_EVAL or SQNR_EVAL) in args.eval_methods and args.model_mode not in { @@ -651,12 +622,6 @@ def export_llama(args) -> None: "Eval device perplexity is only supported for KV mode. Hybrid mode will only use KV mode when evaluating tasks/sqnr." ) if TASKS_EVAL in args.eval_methods and args.eval_tasks is None: - if args.calib_tasks is None: - logging.warning( - "--eval_tasks is set but --calib_tasks is not; quantization " - "calibration will use --prompt instead of a task dataset. " - "Pass --calib_tasks to match the previous --tasks behavior." 
- ) raise RuntimeError("Please provide --eval_tasks to eval perplexity") assert ( args.decoder_model in SUPPORTED_LLM_MODELS @@ -709,9 +674,17 @@ def export_llama(args) -> None: args, decoder_model_config, ) - runtime_tokenizer_path = tokenizer_wrapper.runtime_tokenizer_path + runtime_tokenizer_path, tokenizer, chat_template = ( + tokenizer_wrapper.get_runtime_tokenizer( + args.tokenizer_model, args.tokenizer_bin + ) + ) # Prepare dataset + dataset_builder = DatasetBuilder(args, decoder_model_config, tokenizer_wrapper) + calibration_data = dataset_builder.prepare_calibration_dataset( + args.prompt, chat_template + ) text_decoder_pte_path = f"{args.artifact}/{pte_filenames[TEXT_DECODER]}.pte" attention_sink_evictor_pte_path = f"{args.artifact}/{ATTENTION_SINK_EVICTOR}.pte" tok_embedding_pte_path = f"{args.artifact}/{pte_filenames[TOK_EMBEDDING]}.pte" @@ -728,26 +701,13 @@ def export_llama(args) -> None: hasattr(decoder_model_config, AUDIO_ENCODER), ] ) - if is_multimodal: - # TODO: Implement attention sink support for multimodal models (vision/audio). - if args.use_attention_sink is not None: - raise ValueError( - "Multimodal models currently do not support attention sink feature." - ) - if args.eval_tasks is not None: - raise ValueError("Multimodal models do not support --eval_tasks.") - - if not args.pre_gen_pte: - if is_multimodal and args.calib_samples is None: - raise ValueError( - "For MLLMs calibration data is required for compilation. " - "Provide --calib_samples with a vision/audio JSON file." - ) - if not is_multimodal and not any((args.calib_tasks, args.calib_samples)): - raise ValueError( - "For LLMs calibration data is required for compilation. " - "Provide --calib_tasks or --calib_samples." - ) + # TODO: Implement attention sink support for multimodal models (vision/audio). + assert ( + not is_multimodal or args.use_attention_sink is None + ), "Multimodal models currently do not support attention sink feature." 
+ assert ( + not is_multimodal or not args.skip_user_prompt_calibration + ), "--skip_user_prompt_calibration is not supported for multimodal models (VLM/ALM) as they do not support task-based calibration yet." if args.pre_gen_pte: text_decoder_pte_path = f"{args.pre_gen_pte}/{pte_filenames[TEXT_DECODER]}.pte" @@ -775,11 +735,13 @@ def export_llama(args) -> None: args, decoder_model_config, runtime_tokenizer_path, - tokenizer_wrapper, + tokenizer, + chat_template, text_decoder_pte_path, encoder_pte_paths, tok_embedding_pte_path, attention_sink_evictor_pte_path, + calibration_data, is_multimodal, ) print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") @@ -789,7 +751,8 @@ def export_llama(args) -> None: args, decoder_model_config, pte_filenames, - tokenizer_wrapper, + tokenizer, + calibration_data, is_multimodal, ) if args.use_attention_sink: @@ -834,11 +797,13 @@ def export_llama(args) -> None: args, decoder_model_config, runtime_tokenizer_path, - tokenizer_wrapper, + tokenizer, + chat_template, text_decoder_pte_path, encoder_pte_paths, tok_embedding_pte_path, attention_sink_evictor_pte_path, + calibration_data, is_multimodal, ) diff --git a/examples/qualcomm/oss_scripts/llama/masking_utils.py b/examples/qualcomm/oss_scripts/llama/masking_utils.py index a09cdf1240f..7725b7589e1 100644 --- a/examples/qualcomm/oss_scripts/llama/masking_utils.py +++ b/examples/qualcomm/oss_scripts/llama/masking_utils.py @@ -5,12 +5,10 @@ # LICENSE file in the root directory of this source tree. 
from abc import ABC, abstractmethod -from typing import List, Tuple, Union +from typing import List, Union import torch -PADDING_MASK_VALUE = -255.0 - def create_causal_attn_mask(max_batch_size: int, ar_len: int, max_context_len: int): """ @@ -23,14 +21,14 @@ def create_causal_attn_mask(max_batch_size: int, ar_len: int, max_context_len: i ● = activate (can attend), ○ = inactivate (masked) """ - mask = torch.full((ar_len, ar_len), PADDING_MASK_VALUE) + mask = torch.full((ar_len, ar_len), -255.0) mask_cond = torch.arange(ar_len) mask.masked_fill_(mask_cond.view(1, ar_len) <= mask_cond.view(ar_len, 1), 0) if max_context_len != ar_len: mask = torch.cat( [ - torch.ones(ar_len, max_context_len - ar_len) * PADDING_MASK_VALUE, + torch.ones(ar_len, max_context_len - ar_len) * -255.0, mask, ], dim=-1, @@ -52,7 +50,7 @@ def create_sliding_window_attn_mask( ● = activate (can attend), ○ = inactivate (masked) """ - mask = torch.full((ar_len, ar_len), PADDING_MASK_VALUE) + mask = torch.full((ar_len, ar_len), -255.0) mask_cond = torch.arange(ar_len) mask.masked_fill_( (mask_cond.view(1, ar_len) <= mask_cond.view(ar_len, 1)) @@ -63,7 +61,7 @@ def create_sliding_window_attn_mask( if max_context_len != ar_len: mask = torch.cat( [ - torch.ones(ar_len, max_context_len - ar_len) * PADDING_MASK_VALUE, + torch.ones(ar_len, max_context_len - ar_len) * -255.0, mask, ], dim=-1, @@ -98,6 +96,7 @@ def mask(self) -> torch.Tensor: def smart_mask_init(self, pos): """ Initialize the attention mask by smart mask initialization method after model forward. + Args: pos (int): Current position in the sequence. 
""" @@ -115,17 +114,6 @@ def smart_mask_update(self, pos, n_updates, lade_pos_offset): """ pass - def _extra_init_kwargs(self) -> dict: - return {} - - def _mask_padding_positions( - self, input_ids: List[List[int]], max_seq_length: int - ) -> None: - """Mask positions beyond each sequence's actual length.""" - actual_lens = torch.tensor([len(seq) for seq in input_ids]) - pad_rows = torch.arange(max_seq_length).unsqueeze(0) >= actual_lens.unsqueeze(1) - self.mask.masked_fill_(pad_rows.unsqueeze(-1), PADDING_MASK_VALUE) - class CausalAttentionMask(BaseAttentionMask): def __init__(self, max_batch_size: int, ar_len: int, max_context_len: int): @@ -146,22 +134,28 @@ def smart_mask_init(self, pos): def smart_mask_update(self, pos, n_updates, _): """ Smart Mask mechanism for attention mask updating + Initial mask(5x15) layout (before any updates): Each row represents a query token in the autoregressive context. ● = activate (can attend), ○ = inactivate (masked) + 0 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ○ ○ ○ ○ 1 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ○ ○ ○ 2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ 3 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ● ○ 4 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ● ● + After 1st update (e.g., pos=0, n_updates=5, sliding_window=3): Newly added tokens are unmasked (set to 0). 
+ 0 ● ● ● ● ● ○ ○ ○ ○ ○ ● ○ ○ ○ ○ 1 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ○ ○ ○ 2 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ● ○ ○ 3 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ● ● ○ 4 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ● ● ● + After 2nd update (e.g., pos=5, n_updates=5): + 0 ● ● ● ● ● ● ● ● ● ● ● ○ ○ ○ ○ 1 ● ● ● ● ● ● ● ● ● ● ● ● ○ ○ ○ 2 ● ● ● ● ● ● ● ● ● ● ● ● ● ○ ○ @@ -172,16 +166,6 @@ def smart_mask_update(self, pos, n_updates, _): end_pos = pos + n_updates self.mask[:, :, start_pos:end_pos] = 0 - @classmethod - def from_input_ids( - cls, input_ids: List[List[int]], max_seq_length: int, **kwargs - ) -> "CausalAttentionMask": - """Build a causal mask and apply padding for variable-length sequences.""" - mask = cls(len(input_ids), max_seq_length, max_seq_length) - mask._mask = mask._mask.clone() - mask._mask_padding_positions(input_ids, max_seq_length) - return mask - class SlidingWindowAttentionMask(BaseAttentionMask): def __init__( @@ -210,24 +194,31 @@ def smart_mask_init(self, pos): def smart_mask_update(self, pos, n_updates, lade_pos_offset): """ Smart Mask mechanism for attention mask updating + Initial mask(5x15) layout (before any updates): Each row represents a query token in the autoregressive context. ● = activate (can attend), ○ = inactivate (masked) + 0 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ○ ○ ○ ○ 1 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ○ ○ ○ 2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ 3 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ 4 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● + After 1st update (e.g., pos=0, n_updates=5, sliding_window=3): Newly added tokens are unmasked (set to 0). Earlier tokens lose access to older cache due to sliding window limits. + 0 ○ ○ ○ ● ● ○ ○ ○ ○ ○ ● ○ ○ ○ ○ 1 ○ ○ ○ ○ ● ○ ○ ○ ○ ○ ● ● ○ ○ ○ 2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ 3 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ 4 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● + + After 2nd update (e.g., pos=5, n_updates=5, sliding_window=3): Sliding window shifts again, masking older positions and activate new position. 
+ 0 ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ ○ ○ 1 ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ ○ 2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ @@ -249,24 +240,7 @@ def smart_mask_update(self, pos, n_updates, lade_pos_offset): if end_pos > available_cache_len: # Mask tokens that are no longer within the sliding window # TODO: [Optional]: it can be optimized by computing the exact start index - self.mask[:, i, : end_pos - available_cache_len] = PADDING_MASK_VALUE - - def _extra_init_kwargs(self) -> dict: - return {"sliding_window": self.sliding_window} - - @classmethod - def from_input_ids( - cls, - input_ids: List[List[int]], - max_seq_length: int, - sliding_window: int, - **kwargs, - ) -> "SlidingWindowAttentionMask": - """Build a sliding-window mask and apply padding for variable-length sequences.""" - mask = cls(len(input_ids), max_seq_length, max_seq_length, sliding_window) - mask._mask = mask._mask.clone() - mask._mask_padding_positions(input_ids, max_seq_length) - return mask + self.mask[:, i, : end_pos - available_cache_len] = -255.0 class AttentionMask: @@ -283,28 +257,3 @@ def smart_mask_update(self, pos, n_updates, lade_pos_offset=None): def __iter__(self): return iter([mask.mask for mask in self.masks]) - - def __getitem__(self, idx: int) -> Tuple[torch.Tensor, ...]: - return tuple(m.mask[idx] for m in self.masks) - - @classmethod - def from_input_ids( - cls, - template: "AttentionMask", - input_ids: List[List[int]], - max_seq_length: int, - ) -> "AttentionMask": - """ - Build a calibration AttentionMask that mirrors template's mask types. - - Delegates construction to each mask's own classmethod so that adding a - new mask type only requires implementing from_input_ids on that class — - no edits needed here. 
- """ - masks = [ - type(base_mask).from_input_ids( - input_ids, max_seq_length, **base_mask._extra_init_kwargs() - ) - for base_mask in template.masks - ] - return cls(masks) diff --git a/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py b/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py index b16a5e2a252..02f19a0b676 100644 --- a/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py +++ b/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py @@ -26,9 +26,7 @@ from executorch.devtools.inspector._intermediate_output_capturer import ( IntermediateOutputCapturer, ) -from executorch.examples.qualcomm.oss_scripts.llama.inference import DecoderInference from executorch.exir.debug_handle_utils import DEBUG_HANDLE_KEY -from torch.utils.data import DataLoader from torchao.quantization.pt2e import MinMaxObserver from torchao.quantization.utils import compute_error @@ -76,49 +74,45 @@ def __init__( torch.ops.quantized_decomposed.dequantize_per_tensor.default, } - def analyze( - self, - decoder_inference: DecoderInference, - text_dataloader: DataLoader, - num_sharding: int = 5, - ) -> "SqnrReport": + def analyze(self, samples: List[Tuple], num_sharding: int = 5) -> "SqnrReport": """ - Evaluates both the fp32 and QDQ graphs using batches from text_dataloader + Evaluates both the fp32 and QDQ graphs using the provided input_samples and computes the per-node Signal-to-Quantization-Noise Ratio (SQNR). Args: - decoder_inference: Provides get_inputs() to assemble each - batch into the compiled model's input signature. - text_dataloader: DataLoader for text-only calibration batches. - num_sharding: Number of contiguous layer groups to bucket the model - into for SQNR aggregation. + input_samples: A list of tuples containing tensors corresponding to the model's inputs. + num_sharding: Number of contiguous layer groups to bucket the model into for SQNR + aggregation. 
Rather than flagging individual layers, layers are grouped into + ``num_sharding`` consecutive ranges (e.g. layers 0-7, 8-15, …) and the SQNR + is averaged within each group. Because upgrading isolated layers is usually ineffective: quantization error from surrounding + low-precision layers accumulates and dominates downstream behavior. Returns: An ``SqnrReport`` object containing the aggregated analysis results. """ + input_samples = [sample for sample in samples if sample is not None] + + if not input_samples: + logging.warning("No input samples provided for analysis.") + return SqnrReport( + self.model_name, defaultdict(list), [], self.analysis_recipe + ) + self._assign_debug_handles(self.fp32_gm) self._assign_debug_handles(self.qdq_gm) - num_samples = 0 + num_samples = len(input_samples) + logging.info(f"num samples: {num_samples}") + + # Accumulate SQNR per module path across all input samples path_sqnr_sum = defaultdict(float) - for text_batch in text_dataloader: - input_ids = text_batch["input_ids"] - attn_mask = text_batch["attention_mask"] - sample = tuple(decoder_inference.get_inputs(input_ids, attn_mask)) + for sample in input_samples: fp_outputs = self._capture(self.fp32_gm, sample) qdq_outputs = self._capture(self.qdq_gm, sample) for path, sqnr in self._match_and_score(fp_outputs, qdq_outputs).items(): path_sqnr_sum[path] += sqnr - num_samples += 1 - - if num_samples == 0: - logging.warning("No input samples provided for analysis.") - return SqnrReport( - self.model_name, defaultdict(list), [], self.analysis_recipe - ) - - logging.info(f"num samples: {num_samples}") + # Average the SQNRs and group them by normalized layer ranges report = defaultdict(list) for path, total_sqnr in path_sqnr_sum.items(): group = self._normalize_group_name( diff --git a/examples/qualcomm/oss_scripts/llama/tokenizer.py b/examples/qualcomm/oss_scripts/llama/tokenizer.py index 954b73384fa..2894777f776 100644 --- a/examples/qualcomm/oss_scripts/llama/tokenizer.py +++ 
b/examples/qualcomm/oss_scripts/llama/tokenizer.py @@ -8,21 +8,19 @@ import json import logging import re -from typing import Dict, List +import warnings +from typing import Callable, List from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( AUDIO_ENCODER, VISION_ENCODER, ) -from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import ModelArgs from pytorch_tokenizers import get_tokenizer, TiktokenTokenizer from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer from transformers import AutoTokenizer - -# Generic special tokens for multimodality, used for runtime identification. IMG_TOKEN = "" AUDIO_TOKEN = "