From 64b7c92738daa67d8cfa6074f44ad6eec6ac5224 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@fb.com>
Date: Tue, 23 Jun 2026 09:38:07 -0700
Subject: [PATCH 1/2] shelve changes to: Arm backend: Add a repro command when
 VGF model-converter fails (#20443)

---
 .ci/scripts/export_model_artifact.sh          |   3 +-
 .ci/scripts/test_model_e2e.sh                 | 101 ----
 .claude/skills/qualcomm/new_op_development.md |  15 +-
 .flake8                                       |   1 -
 .github/workflows/build-cadence-runner.yml    |  64 ++-
 .github/workflows/mlx.yml                     |  23 -
 .github/workflows/validate_flatbuffer_gen.yml |  16 +-
 .gitignore                                    |   1 -
 .lintrunner.toml                              |   3 -
 Makefile                                      |  12 +-
 backends/aoti/aoti_backend.py                 |  19 +-
 backends/aoti/aoti_delegate_handle.h          |  26 -
 backends/aoti/aoti_partitioner.py             | 120 ++--
 backends/aoti/tests/TARGETS                   |  12 -
 .../apple/metal/runtime/metal_backend.cpp     |  10 +-
 backends/arm/_passes/__init__.py              |   2 -
 backends/arm/_passes/arm_pass_manager.py      |   7 +-
 .../aten_to_tosa_activation_functions.py      |  18 -
 backends/arm/_passes/decompose_round_pass.py  |  14 +-
 .../arm/_passes/deduplicate_get_attr_pass.py  |  27 +-
 backends/arm/_passes/exir_to_tosa_pass.py     |  43 +-
 .../arm/_passes/insert_dynamic_padding.py     |  14 +-
 backends/arm/_passes/insert_rescales_pass.py  |  35 +-
 backends/arm/_passes/insert_table_ops.py      |   1 -
 backends/arm/_passes/rewrite_conv_pass.py     |  26 +-
 backends/arm/_passes/rewrite_mxfp_linear.py   |  98 +---
 .../arm/_passes/size_adjust_input_pass.py     |  43 +-
 backends/arm/_passes/symbolic_value_range.py  | 113 +---
 backends/arm/ao_ext/mxfp.py                   |  79 +--
 backends/arm/ao_ext/mxfp_tosa_lib.py          |   1 -
 backends/arm/ao_ext/mxfp_transform.py         |   3 -
 backends/arm/ao_ext/ops/__init__.py           |   2 -
 backends/arm/ao_ext/ops/mxfp_linear_op.py     |  77 +--
 backends/arm/operator_support/TARGETS         |   1 -
 backends/arm/operator_support/__init__.py     |   1 -
 .../tosa_profile_supported_op_lists.py        |   3 -
 .../tosa_supported_operators.py               | 406 ++++---------
 backends/arm/operators/__init__.py            |   2 -
 .../operators/op_tosa_cast_to_block_scaled.py |  85 +--
 .../op_tosa_matmul_t_block_scaled.py          |   8 +-
 backends/arm/operators/op_tosa_shapes.py      | 218 +------
 backends/arm/process_node.py                  | 101 +---
 .../arm/quantizer/quantization_annotator.py   |   1 -
 .../arm/scripts/install_models_for_test.sh    |   7 +-
 backends/arm/scripts/pre-push                 |   2 +-
 backends/arm/test/misc/test_mxfp_linear_ao.py | 103 +---
 backends/arm/test/misc/test_process_node.py   |  80 +--
 backends/arm/test/misc/test_runner_utils.py   | 116 ----
 backends/arm/test/misc/test_vgf_backend.py    | 188 +-----
 backends/arm/test/misc/test_vgf_check_env.py  |  99 +---
 .../test_tosa_dialect_cast_to_block_scaled.py |  50 +-
 .../test_tosa_dialect_mxfp_linear.py          |  34 --
 .../arm/test/ops/mxfp/test_mxfp_linear.py     | 218 ++-----
 backends/arm/test/ops/test_round.py           |   2 +
 .../test_insert_dynamic_padding_pass.py       | 154 ++---
 .../arm/test/passes/test_rewrite_conv_pass.py |  38 +-
 .../passes/test_rewrite_mxfp_linear_pass.py   |  42 +-
 .../test/passes/test_symbolic_value_range.py  |  13 -
 backends/arm/test/runner_utils.py             | 142 +----
 backends/arm/test/targets.bzl                 |   3 -
 backends/arm/tosa/dialect/__init__.py         |   1 -
 .../tosa/dialect/ops/cast_to_block_scaled.py  |  25 +-
 backends/arm/tosa/dialect/ops/conv2d.py       |  31 +-
 backends/arm/tosa/dialect/ops/conv3d.py       |  28 +-
 .../arm/tosa/dialect/ops/depthwise_conv2d.py  |  19 +-
 .../tosa/dialect/ops/matmul_t_block_scaled.py |  44 +-
 backends/arm/tosa/mapping.py                  |  17 +-
 backends/arm/tosa/partitioner.py              |  12 +-
 backends/arm/tosa/utils.py                    |   4 -
 backends/arm/vgf/backend.py                   | 160 +-----
 backends/arm/vgf/check_env.py                 | 177 +++++-
 backends/arm/vgf/model_converter.py           | 180 +-----
 backends/cadence/fused_quant/op_add.cpp       |   2 +-
 backends/cadence/fused_quant/op_add.h         |  13 +-
 backends/cadence/fused_quant/op_bmm.cpp       |   2 +-
 backends/cadence/fused_quant/op_bmm.h         |  13 +-
 backends/cadence/fused_quant/op_hardswish.cpp |   2 +-
 backends/cadence/fused_quant/op_hardswish.h   |   8 +-
 backends/cadence/fused_quant/op_mul.cpp       |   2 +-
 backends/cadence/fused_quant/op_mul.h         |  13 +-
 backends/cadence/fused_quant/op_relu.cpp      |   2 +-
 backends/cadence/fused_quant/op_relu.h        |   8 +-
 backends/cadence/fused_quant/quant_utils.h    |   4 +-
 .../cadence/fused_quant/tests/test_op_add.cpp |   2 +-
 .../cadence/fused_quant/tests/test_op_bmm.cpp |   2 +-
 .../fused_quant/tests/test_op_hardswish.cpp   |   2 +-
 .../cadence/fused_quant/tests/test_op_mul.cpp |   2 +-
 .../fused_quant/tests/test_op_relu.cpp        |   2 +-
 .../generic/operators/op_avg_pool2d.cpp       |   2 +-
 .../cadence/generic/operators/op_avg_pool2d.h |   5 +-
 .../generic/operators/op_fully_connected.cpp  |   2 +-
 .../generic/operators/op_fully_connected.h    |   2 +-
 .../generic/operators/op_linalg_svd.cpp       |   2 +-
 .../cadence/generic/operators/op_linalg_svd.h |   2 +-
 .../operators/op_quantized_conv1d_nlc.cpp     |   2 +-
 .../operators/op_quantized_conv1d_nlc.h       |   2 +-
 .../generic/operators/op_quantized_conv2d.cpp |   4 +-
 .../generic/operators/op_quantized_conv2d.h   |   2 +-
 .../op_quantized_depthwise_conv1d_nlc.cpp     |   2 +-
 .../operators/op_quantized_embedding_byte.cpp |   2 +-
 .../operators/op_quantized_embedding_byte.h   |   3 +-
 .../op_quantized_fully_connected.cpp          |   2 +-
 .../operators/op_quantized_fully_connected.h  |   8 +-
 .../operators/op_quantized_layer_norm.cpp     |   2 +-
 .../generic/operators/op_quantized_linear.cpp |   2 +-
 .../generic/operators/op_quantized_linear.h   |   4 +-
 .../generic/operators/op_quantized_matmul.cpp |   2 +-
 .../generic/operators/op_quantized_matmul.h   |   2 +-
 .../generic/operators/op_quantized_mul.cpp    |   2 +-
 .../generic/operators/op_quantized_relu.cpp   |   2 +-
 .../generic/operators/op_requantize.cpp       |   2 +-
 .../cadence/generic/operators/op_rope.cpp     |   4 +-
 backends/cadence/generic/operators/op_rope.h  |   4 +-
 .../cadence/generic/operators/op_softmax.cpp  |   2 +-
 .../cadence/generic/operators/op_softmax.h    |   2 +-
 .../operators/op_transposed_convolution.cpp   |   2 +-
 backends/cadence/hifi/operators/op_mean.cpp   |   2 +-
 .../operators/op_quantized_conv1d_nlc.cpp     |   2 +-
 .../op_quantized_conv2d_nhwc_out.cpp          |   2 +-
 .../op_quantized_depthwise_conv1d_nlc.cpp     |   2 +-
 .../hifi/operators/op_quantized_matmul_out.h  |   2 +-
 .../hifi/operators/op_softmax_f32_f32.cpp     |   4 +-
 backends/cadence/hifi/operators/operators.h   |   6 +-
 .../operators/op_quantized_conv_out.cpp       |   2 +-
 .../op_quantized_fully_connected_out.cpp      |   2 +-
 .../operators/op_quantized_linear_out.cpp     |   4 +-
 .../operators/op_quantized_matmul_out.cpp     |   4 +-
 .../cadence/vision/operators/op_softmax.cpp   |   2 +-
 backends/cadence/vision/operators/operators.h |   2 +-
 backends/cortex_m/TARGETS                     |  13 +-
 .../ops/cmsis_scratch_buffer_context.h        |   2 +-
 backends/cortex_m/ops/op_quantized_conv2d.cpp |   4 +-
 .../ops/op_quantized_depthwise_conv2d.cpp     |   4 +-
 backends/cortex_m/ops/op_quantized_linear.cpp |   4 +-
 .../ops/op_quantized_transpose_conv2d.cpp     |   4 +-
 backends/cortex_m/passes/BUCK                 |   2 -
 backends/cortex_m/passes/__init__.py          |  30 +
 .../cortex_m/passes/aten_to_cortex_m_pass.py  |   2 +-
 .../cortex_m/passes/scratch_buffer_sizes.py   |   2 +-
 backends/cortex_m/target_config.py            |   3 +-
 .../cortex_m/test/misc/test_cmsis_pybind.py   |   3 +-
 .../cortex_m/test/misc/test_target_config.py  |   3 +-
 backends/cortex_m/test/ops/test_avg_pool2d.py |   2 +-
 backends/cuda/runtime/cuda_backend.cpp        |  25 +-
 backends/cuda/tests/test_cuda_partitioner.py  | 160 +-----
 backends/cuda/tests/test_tq4_sdpa.py          | 438 +-------------
 .../nxp/backend/edge_program_converter.py     |   1 -
 backends/nxp/backend/graph_utils.py           |   2 +-
 .../ops_converters/__init__.py                |   4 -
 .../ops_converters/clamp_converter.py         |  50 +-
 .../ops_converters/hardtanh_converter.py      |  85 ++-
 .../ops_converters/mean_dim_converter.py      | 125 +---
 .../ir/converter/quantization_utils.py        |   5 +-
 backends/nxp/backend/node_format_inference.py |  73 +--
 backends/nxp/neutron_partitioner.py           |   3 +-
 backends/nxp/nxp_backend.py                   |  55 +-
 backends/nxp/quantizer/neutron_quantizer.py   |   2 -
 backends/nxp/quantizer/patterns.py            |  43 +-
 backends/nxp/run_unittests.sh                 |   2 +-
 backends/nxp/tests/conftest.py                |   2 +-
 backends/nxp/tests/executorch_pipeline.py     |   4 -
 .../nxp/tests/generic_tests/test_cifarnet.py  |  10 +-
 .../generic_tests/test_convert_div_to_mul.py  |   3 +-
 .../tests/generic_tests/test_integration.py   |   2 +-
 .../test_quantized_input_data.py              |  43 +-
 .../node_converter/test_abs_converter.py      |   6 +-
 .../test_adaptive_avg_pool2d_converter.py     |  11 +-
 .../test_add_tensor_converter.py              |  83 +--
 .../test_avg_pool2d_converter.py              |  19 +-
 .../node_converter/test_cat_converter.py      |  24 +-
 .../node_converter/test_clamp_converter.py    |  38 +-
 .../test_constant_pad_nd_converter.py         |  29 +-
 .../node_converter/test_hardtanh_converter.py | 313 +++-------
 .../test_leaky_relu_converter.py              |  18 +-
 .../node_converter/test_log_converter.py      |   6 +-
 .../test_max_pool_2d_converter.py             |  35 +-
 .../node_converter/test_mean_dim_converter.py | 340 +++--------
 .../test_mul_tensor_converter.py              |  13 +-
 .../test_permute_copy_converter.py            |  49 +-
 .../node_converter/test_relu_converter.py     |  18 +-
 .../node_converter/test_sigmoid_converter.py  |  13 +-
 .../test_slice_tensor_converter.py            |  40 +-
 .../test_sub_tensor_converter.py              |  89 +--
 .../node_converter/test_tanh_converter.py     |  18 +-
 .../test_upsample_bilinear2d.py               |  37 +-
 .../node_converter/test_upsample_nearest2d.py |  17 +-
 backends/nxp/tests/model_output_comparator.py |  30 +-
 backends/nxp/tests/nsys_testing.py            | 106 +---
 backends/nxp/tests/ops_aliases.py             |   2 -
 backends/nxp/tests/utils.py                   |  32 --
 backends/qualcomm/_passes/__init__.py         |   4 -
 backends/qualcomm/_passes/decompose_acos.py   |   4 +-
 backends/qualcomm/_passes/decompose_atan2.py  |   4 +-
 .../_passes/decompose_log_variants.py         |   6 +-
 .../qualcomm/_passes/decompose_remainder.py   |   6 +-
 backends/qualcomm/_passes/decompose_var.py    |   4 +-
 backends/qualcomm/_passes/qnn_pass_manager.py |   8 -
 backends/qualcomm/_passes/utils.py            |   2 +-
 backends/qualcomm/aot/wrappers/targets.bzl    |   3 +-
 backends/qualcomm/builders/README.md          |   6 -
 backends/qualcomm/debugger/README.md          |   4 +-
 backends/qualcomm/export_utils.py             |   7 +-
 .../quantizer/annotators/htp_rules.py         |   6 +-
 .../quantizer/annotators/lpai_rules.py        |   6 +-
 backends/qualcomm/runtime/targets.bzl         |   5 +-
 backends/qualcomm/targets.bzl                 |  32 +-
 backends/qualcomm/tests/models.py             |  50 --
 backends/qualcomm/tests/test_qnn_delegate.py  | 166 +-----
 .../postpone_permute_below_squeeze_view.py    |   6 +-
 .../test/test_permute_optimization_passes.py  |  35 --
 .../runtime/graph/ops/impl/Q8taBinary.cpp     |  28 +-
 .../runtime/graph/ops/impl/Q8taConv2d.cpp     |  52 +-
 .../runtime/graph/ops/impl/Q8taConv2d.h       |   7 +-
 .../runtime/graph/ops/impl/Q8taConv2dDW.cpp   |  52 +-
 .../graph/ops/impl/Q8taConv2dIm2Col.cpp       |  72 +--
 .../runtime/graph/ops/impl/Q8taConv2dPW.cpp   |  89 +--
 .../graph/ops/impl/Q8taConv2dTransposed.cpp   |  58 +-
 .../runtime/graph/ops/impl/Q8taLinear.cpp     |  30 +-
 .../graph/ops/impl/Q8taQuantizeDequantize.cpp |  23 +-
 .../test/custom_ops/impl/TestConv2dDw.cpp     |   8 +-
 backends/vulkan/test/custom_ops/utils.cpp     |   4 -
 backends/vulkan/test/custom_ops/utils.h       |  18 -
 backends/webgpu/CMakeLists.txt                |  39 +-
 backends/webgpu/runtime/WebGPUBackend.cpp     |  11 +-
 backends/webgpu/runtime/WebGPUGraph.cpp       | 247 ++------
 backends/webgpu/runtime/WebGPUGraph.h         |  52 +-
 backends/webgpu/runtime/WebGPUUtils.h         |  22 -
 .../webgpu/scripts/test_webgpu_native_ci.sh   | 125 ++--
 backends/webgpu/test/test_build_webgpu.sh     |  29 +-
 backends/webgpu/test/test_webgpu_native.cpp   | 533 +++++-------------
 codegen/api/et_cpp.py                         |   3 +-
 codegen/api/types/types.py                    |   3 +-
 devtools/bundled_program/schema/README.md     |  10 -
 devtools/bundled_program/serialize/BUCK       |   3 +-
 .../bundled_program/serialize/__init__.py     | 184 +-----
 docs/source/backends/nxp/op-support.csv       |   1 -
 .../executor_runner/arm_memory_allocator.cpp  |  25 +-
 .../executor_runner/arm_memory_allocator.h    |  15 +-
 examples/espressif/README.md                  |   2 +
 .../espressif/executor_runner/CMakeLists.txt  |   3 +-
 .../executor_runner/esp_executor_runner.cpp   |   9 +-
 examples/models/BUCK                          |   3 -
 examples/models/__init__.py                   |   8 -
 examples/models/gemma4_31b/README.md          |  21 +-
 .../gemma4_31b/cuda_source_transformations.py |  14 +-
 examples/models/gemma4_31b/export.py          |   9 -
 examples/models/gemma4_31b/gguf_loader.py     |  70 +--
 examples/models/gemma4_31b/main.cpp           | 122 +---
 examples/models/gemma4_31b/model.py           |   2 +-
 examples/models/gemma4_31b/sampler.py         |   7 +-
 .../gemma4_31b/tests/test_cuda_pipeline.py    |  49 +-
 examples/models/parakeet/CMakeLists.txt       |  61 +-
 examples/models/parakeet/CMakePresets.json    |  12 +-
 examples/models/parakeet/README.md            |  25 -
 examples/models/parakeet/main.cpp             | 274 +++++++--
 examples/models/qwen3_5_moe/CMakeLists.txt    |  19 +-
 examples/models/qwen3_5_moe/CMakePresets.json |  43 +-
 examples/models/qwen3_5_moe/README.md         |  83 +--
 examples/models/qwen3_5_moe/export.py         |   6 -
 .../models/qwen3_5_moe/qwen35_moe_engine.cpp  | 111 +---
 examples/qualcomm/oss_scripts/llama/README.md | 108 +---
 examples/qualcomm/oss_scripts/llama/TARGETS   | 131 +----
 .../llama/encoder/encoder_config.py           |   7 +
 examples/qualcomm/oss_scripts/llama/llama.py  | 183 +++---
 .../oss_scripts/llama/masking_utils.py        |  91 +--
 .../llama/mix_precision_analyzer.py           |  50 +-
 .../qualcomm/oss_scripts/llama/tokenizer.py   | 142 +++--
 .../llama/wrappers/base_component.py          |  15 +-
 .../llama/wrappers/llm_wrappers.py            | 493 +++++++++++-----
 .../llm_utils/decoder_model_wrapper.py        |  13 +-
 .../llm_utils/qnn_decoder_model_manager.py    |  21 +-
 .../qualcomm/oss_scripts/qwen2_5/qwen2_5.py   |   5 +-
 exir/backend/utils.py                         |  10 +-
 exir/pass_base.py                             |  91 ---
 exir/tensor.py                                |   4 +-
 exir/tests/test_pass_infra.py                 | 108 ----
 exir/tests/test_tensor.py                     |  20 -
 .../make_aten_functor_from_et_functor.h       |   6 +-
 ...make_aten_functor_from_et_functor_test.cpp |  90 +--
 extension/data_loader/file_data_loader.cpp    |   8 +-
 .../test/file_data_loader_test.cpp            |  95 ----
 .../flat_tensor/flat_tensor_data_map.cpp      |   8 +-
 extension/flat_tensor/flat_tensor_data_map.h  |   6 +-
 extension/llm/modules/turboquant/kv_cache.py  |  12 +-
 extension/named_data_map/merged_data_map.cpp  |   2 +-
 extension/named_data_map/merged_data_map.h    |   6 +-
 kernels/portable/cpu/op_index_put.cpp         |   3 +-
 kernels/portable/cpu/op_log_softmax.cpp       |   8 +-
 kernels/portable/cpu/op_native_dropout.cpp    |   2 +-
 kernels/test/op_native_dropout_test.cpp       |   2 +-
 runtime/core/exec_aten/exec_aten.h            |   6 +-
 .../core/exec_aten/util/scalar_type_util.h    |   2 +-
 runtime/core/memory_allocator.h               |  16 -
 runtime/core/named_data_map.h                 |  10 +-
 runtime/core/portable_type/optional.h         |  19 +-
 runtime/core/portable_type/string_view.h      |   6 +-
 .../core/portable_type/test/CMakeLists.txt    |  10 +-
 runtime/core/portable_type/test/targets.bzl   |   8 +
 runtime/core/test/memory_allocator_test.cpp   |  83 ---
 runtime/executor/merged_data_map.h            |   7 +-
 runtime/executor/pte_data_map.cpp             |   3 +-
 runtime/executor/pte_data_map.h               |   7 +-
 runtime/executor/test/method_meta_test.cpp    |   4 +-
 test/utils/OSSTestConfig.json                 |   1 +
 304 files changed, 2993 insertions(+), 8623 deletions(-)

diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
index 0e28098a1e8..b5bf19f4155 100755
--- a/.ci/scripts/export_model_artifact.sh
+++ b/.ci/scripts/export_model_artifact.sh
@@ -422,9 +422,8 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
       --no-compile
   echo "::endgroup::"
 
-  # Copy tokenizer files for the runner and model-specific serving launcher.
+  # Copy tokenizer for the runner
   cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json"
-  cp "$LOCAL_MODEL_DIR/tokenizer_config.json" "${OUTPUT_DIR}/tokenizer_config.json"
 
   # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
   echo "::group::Export"
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
index d8bca45e695..503bd381a8d 100755
--- a/.ci/scripts/test_model_e2e.sh
+++ b/.ci/scripts/test_model_e2e.sh
@@ -447,105 +447,4 @@ case "$MODEL_NAME" in
 esac
 echo "::endgroup::"
 
-if [ "$DEVICE" = "cuda" ] && [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
-  echo "::group::Run $MODEL_NAME OpenAI serving smoke"
-  pip install -r examples/llm_server/python/requirements.txt "transformers==5.0.0rc1"
-  python -m pip install --no-deps --no-build-isolation --editable . -v
-
-  PORT=$(python - <<'PY'
-import socket
-
-with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-    s.bind(("127.0.0.1", 0))
-    print(s.getsockname()[1])
-PY
-)
-  SERVER_LOG=$(mktemp)
-  WORKER_BIN="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
-  python -u -m executorch.examples.models.qwen3_5_moe.serve \
-    --model-path "${MODEL_DIR}/model.pte" \
-    --data-path "${MODEL_DIR}/aoti_cuda_blob.ptd" \
-    --tokenizer-path "${MODEL_DIR}/tokenizer.json" \
-    --hf-tokenizer "${MODEL_DIR}" \
-    --model-id qwen3.5-moe \
-    --max-context 4096 \
-    --max-sessions 2 \
-    --no-think \
-    --worker-bin "$WORKER_BIN" \
-    --host 127.0.0.1 \
-    --port "$PORT" >"$SERVER_LOG" 2>&1 &
-  SERVER_PID=$!
-
-  cleanup_qwen_server() {
-    if kill -0 "$SERVER_PID" 2>/dev/null; then
-      kill "$SERVER_PID" 2>/dev/null || true
-      wait "$SERVER_PID" 2>/dev/null || true
-    fi
-    rm -f "$SERVER_LOG"
-  }
-  trap cleanup_qwen_server EXIT
-
-  if ! python - "$PORT" "$SERVER_LOG" <<'PY'
-import json
-import sys
-import time
-import urllib.request
-
-port = sys.argv[1]
-log_path = sys.argv[2]
-base = f"http://127.0.0.1:{port}"
-
-
-def request(path, payload=None):
-    data = None
-    headers = {}
-    if payload is not None:
-        data = json.dumps(payload).encode("utf-8")
-        headers["Content-Type"] = "application/json"
-    req = urllib.request.Request(base + path, data=data, headers=headers)
-    with urllib.request.urlopen(req, timeout=120) as resp:
-        return json.loads(resp.read().decode("utf-8"))
-
-
-last = None
-for _ in range(180):
-    try:
-        request("/health")
-        break
-    except Exception as e:
-        last = e
-        time.sleep(1)
-else:
-    print(open(log_path, encoding="utf-8", errors="replace").read())
-    raise RuntimeError(f"server did not become healthy: {last}")
-
-models = request("/v1/models")
-ids = {m["id"] for m in models["data"]}
-if "qwen3.5-moe" not in ids:
-    raise AssertionError(f"qwen3.5-moe missing from /v1/models: {ids}")
-
-body = {
-    "model": "qwen3.5-moe",
-    "messages": [{"role": "user", "content": "What is the capital of France?"}],
-    "max_tokens": 32,
-    "temperature": 0,
-}
-resp = request("/v1/chat/completions", body)
-content = resp["choices"][0]["message"].get("content") or ""
-if "Paris" not in content:
-    raise AssertionError(f"expected Paris in serving response, got: {content!r}")
-
-print("Qwen3.5-MoE serving smoke passed")
-PY
-  then
-    echo "Qwen3.5-MoE serving smoke failed; server log:"
-    cat "$SERVER_LOG"
-    exit 1
-  fi
-
-  cleanup_qwen_server
-  trap - EXIT
-  echo "::endgroup::"
-fi
-
 popd
diff --git a/.claude/skills/qualcomm/new_op_development.md b/.claude/skills/qualcomm/new_op_development.md
index 149940b0796..4133a92ea48 100644
--- a/.claude/skills/qualcomm/new_op_development.md
+++ b/.claude/skills/qualcomm/new_op_development.md
@@ -217,17 +217,8 @@ class DecomposeMyOp(ExportPass):
 
 ### Registration (all decompose passes)
 1. `_passes/__init__.py` — import + `__all__`
-2. `_passes/qnn_pass_manager.py` — The pass manager uses classmethods for pipeline definitions:
-   - **Import** — add to the import block at top of file
-   - **`get_annotation_passes()`** — add pass class to the returned list (runs before quantizer, ATen IR)
-   - **`get_export_passes()`** — add pass class if needed for float-only path (runs after quantization, before to-edge)
-   - **`get_default_pass_activations()`** — add `(PassClass, True)` ONLY if the pass also needs to run in the to-edge pipeline
-   - **`get_passes_dependency_for_capture_program()`** — add `PassClass: [RemoveRedundancy]` dependency ONLY if also in `get_default_pass_activations`
-
-**When to add to which pipeline:**
-- **Annotation only** (most common for decompose passes): `get_annotation_passes()` — pass decomposes the op before the quantizer sees it
-- **Export pipeline** too: if the float-only test fails without it (op doesn't get handled by PyTorch's built-in decomposition during to-edge)
-- **Capture program** (to-edge) too: if the op can appear in edge dialect and needs decomposition there (e.g., `DecomposeVar`, `DecomposeCDist`, `DecomposeDiagonal`)
+2. `_passes/qnn_pass_manager.py` — import + `transform_for_annotation_pipeline` + `transform_for_export_pipeline` + `get_capture_program_passes`
+3. `_passes/utils.py` — add to `get_passes_dependency_for_capture_program()` with `[RemoveRedundancy]` dependency
 
 ---
 
@@ -264,4 +255,4 @@ class DecomposeMyOp(ExportPass):
 
 **Native QNN Op:** `qnn_constants.py` → `op_my_op.py` → `builders/__init__.py` → `htp_rules.py` → `lpai_rules.py` → `layout_transform.py` → `tests/models.py` → `test_qnn_delegate.py` → `partition/utils.py` (skip decomp) → `common_defs.py` (remove to_be_implemented) → `builders/README.md`
 
-**Decompose Pass:** `_passes/decompose_my_op.py` → `_passes/__init__.py` → `qnn_pass_manager.py` (`get_annotation_passes` + optionally `get_export_passes`; if also needed in to-edge: `get_default_pass_activations` + `get_passes_dependency_for_capture_program`) → `tests/models.py` → `test_qnn_delegate.py` → `common_defs.py` → `builders/README.md`
+**Decompose Pass:** `_passes/decompose_my_op.py` → `_passes/__init__.py` → `qnn_pass_manager.py` (annotation + export + capture) → `_passes/utils.py` (dependency) → `tests/models.py` → `test_qnn_delegate.py` → `common_defs.py` → `builders/README.md`
diff --git a/.flake8 b/.flake8
index ffb419da1e4..fc9feb45d8b 100644
--- a/.flake8
+++ b/.flake8
@@ -75,7 +75,6 @@ exclude =
     ./configurations,
     ./docs,
     ./exir/_serialize/generated/executorch_flatbuffer,
-    ./devtools/bundled_program/serialize/generated,
     ./third_party,
     *.pyi
 
diff --git a/.github/workflows/build-cadence-runner.yml b/.github/workflows/build-cadence-runner.yml
index c447e4f9a20..49f750eeea2 100644
--- a/.github/workflows/build-cadence-runner.yml
+++ b/.github/workflows/build-cadence-runner.yml
@@ -19,18 +19,36 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  # Same-repo PRs run on pull_request, which reads the PR's own workflow AND code
-  # -- so CI changes, new test jobs, code, and tests are all validated pre-merge.
-  # Fork PRs can't get credentials (OIDC) on pull_request, so Meta-exported forks
-  # (labeled CLA Signed + meta-exported) run on pull_request_target instead. The
-  # run condition is inlined per job (GitHub Actions has no YAML anchors and env
-  # is unavailable in job-level if), so keep the copies in sync.
+  gate:
+    runs-on: ubuntu-latest
+    outputs:
+      run-cadence: ${{ steps.decide.outputs.run }}
+    steps:
+      - id: decide
+        env:
+          EVENT: ${{ github.event_name }}
+          IS_FORK: ${{ github.event.pull_request.head.repo.full_name != github.repository }}
+          HAS_CLA: ${{ contains(github.event.pull_request.labels.*.name, 'CLA Signed') }}
+          HAS_EXPORT: ${{ contains(github.event.pull_request.labels.*.name, 'meta-exported') }}
+        run: |
+          run=false
+          case "${EVENT}" in
+            push|schedule|workflow_dispatch)
+              run=true
+              ;;
+            pull_request)
+              [ "${IS_FORK}" = "false" ] && run=true
+              ;;
+            pull_request_target)
+              if [ "${IS_FORK}" = "true" ] && [ "${HAS_CLA}" = "true" ] && [ "${HAS_EXPORT}" = "true" ]; then
+                run=true
+              fi
+              ;;
+          esac
+          echo "run=${run}" >> "${GITHUB_OUTPUT}"
+
   cpu-build:
-    if: >-
-      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
-      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
-      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
+    if: github.event_name != 'pull_request_target'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -40,7 +58,7 @@ jobs:
       runner: linux.2xlarge
       docker-image: ci-image:executorch-ubuntu-22.04-clang12
       submodules: recursive
-      ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       upload-artifact: cadence-runner-build
       script: |
@@ -57,28 +75,21 @@ jobs:
 
   cpu-test:
     needs: cpu-build
-    if: >-
-      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
-      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
-      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
+    if: github.event_name != 'pull_request_target'
     permissions:
       id-token: write
       contents: read
     uses: ./.github/workflows/_test_cadence.yml
     with:
-      ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
 
   # Cross-compile cadence_executor_runner for each Cadence Xtensa core, one job
   # per backend so they show as separate lines (no matrix grouping). Shared logic
   # lives in _xtensa_build.yml. fusion_g3 is omitted until the upstream fusion_g3
   # <-> nnlib-FusionG3 API skew is fixed (its runner does not link).
   hifi-build:
-    if: >-
-      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
-      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
-      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
+    needs: gate
+    if: needs.gate.outputs.run-cadence == 'true'
     permissions:
       id-token: write
       contents: read
@@ -88,11 +99,8 @@ jobs:
       ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }}
 
   vision-build:
-    if: >-
-      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
-      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
-      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
+    needs: gate
+    if: needs.gate.outputs.run-cadence == 'true'
     permissions:
       id-token: write
       contents: read
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
index 5a4ccbb4952..acc6b4840cf 100644
--- a/.github/workflows/mlx.yml
+++ b/.github/workflows/mlx.yml
@@ -161,29 +161,6 @@ jobs:
         fi
         echo "::endgroup::"
 
-        echo "::group::Verify chunked == unchunked prefill"
-        QWEN_TINY_PTE=/tmp/qwen35_moe_mlx_tiny/model.pte \
-          ${CONDA_RUN} python -m pytest \
-          examples/models/qwen3_5_moe/test_chunked_prefill.py -v
-        echo "::endgroup::"
-
-        echo "::group::Build Qwen 3.5 MoE MLX C++ runner"
-        # Validates the MLX C++ runner build wiring (compile + link + metallib).
-        # The tiny model has no compatible tokenizer (vocab 256, random weights),
-        # so we don't run C++ inference here — only confirm it builds.
-        ${CONDA_RUN} make qwen3_5_moe-mlx
-        RUNNER=cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner
-        if [ ! -x "$RUNNER" ]; then
-          echo "Failed: runner not found at $RUNNER"
-          exit 1
-        fi
-        if [ ! -f "$(dirname "$RUNNER")/mlx.metallib" ]; then
-          echo "Failed: mlx.metallib not copied next to runner"
-          exit 1
-        fi
-        echo "Success: built $RUNNER"
-        echo "::endgroup::"
-
   backend-tester:
     needs: run-decision
     if: |
diff --git a/.github/workflows/validate_flatbuffer_gen.yml b/.github/workflows/validate_flatbuffer_gen.yml
index 6c0455784c6..96eeda95e04 100644
--- a/.github/workflows/validate_flatbuffer_gen.yml
+++ b/.github/workflows/validate_flatbuffer_gen.yml
@@ -5,9 +5,7 @@ on:
   pull_request:
     paths:
       - "schema/**"
-      - "devtools/bundled_program/schema/**"
-      - "exir/_serialize/generated/**"
-      - "devtools/bundled_program/serialize/generated/**"
+      - "exir/_serialize/generated/executorch_flatbuffer/**"
 
 jobs:
   exir-flatbuffer:
@@ -35,15 +33,3 @@ jobs:
             echo "Please run 'python exir/_serialize/generate_program.py' to regenerate the files and commit the changes."
             exit 1
           fi
-
-      - name: Generate bundled program flatbuffer Python
-        run: python devtools/bundled_program/serialize/generate_bundled_program.py
-
-      - name: Validate bundled_program_flatbuffer is unchanged
-        run: |
-          git add -A devtools/bundled_program/serialize/generated
-          if ! git diff --cached --quiet -- devtools/bundled_program/serialize/generated; then
-            echo "Error: bundled_program_flatbuffer has uncommitted changes."
-            echo "Please run 'python devtools/bundled_program/serialize/generate_bundled_program.py' to regenerate the files and commit the changes."
-            exit 1
-          fi
diff --git a/.gitignore b/.gitignore
index ee206e23d94..87772e21014 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,7 +26,6 @@ arm-scratch/
 executorch.egg-info
 pip-out/
 build-profiling/
-**/ddr_*_temp
 
 # Any exported models and profiling outputs
 *.bin
diff --git a/.lintrunner.toml b/.lintrunner.toml
index 98c46c78960..ab498a5d0ac 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -9,7 +9,6 @@ exclude_patterns = [
     '.github/scripts/**',
     'exir/serde/**',
     'exir/_serialize/generated/executorch_flatbuffer/**',
-    'devtools/bundled_program/serialize/generated/**',
 ]
 command = [
     'python',
@@ -42,7 +41,6 @@ exclude_patterns = [
     '**/third-party/**',
     'exir/serde/**',
     'exir/_serialize/generated/executorch_flatbuffer/**',
-    'devtools/bundled_program/serialize/generated/**',
 ]
 command = [
     'python',
@@ -391,7 +389,6 @@ exclude_patterns = [
     '**/*.gif',
     'extension/llm/tokenizers',
     'extension/llm/tokenizers/**',
-    'examples/llm_server',
     'backends/cadence/utils/FACTO',
     'examples/cuda',
     'examples/qualcomm',
diff --git a/Makefile b/Makefile
index 552bbf89bd7..c93085115aa 100644
--- a/Makefile
+++ b/Makefile
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal qwen3_5_moe-mlx clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -131,7 +131,6 @@ help:
 	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
-	@echo "  qwen3_5_moe-mlx     - Build Qwen3.5 MoE runner with MLX backend"
 	@echo "  clean               - Clean build artifacts"
 
 voxtral-cuda:
@@ -468,15 +467,6 @@ qwen3_5_moe-metal:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
 
-qwen3_5_moe-mlx:
-	@echo "==> Building and installing ExecuTorch with MLX..."
-	cmake --workflow --preset mlx-release
-	@echo "==> Building Qwen3.5 MoE runner with MLX..."
-	cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-mlx
-	@echo ""
-	@echo "✓ Build complete!"
-	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
-
 clean:
 	rm -rf cmake-out \
 	       extension/llm/tokenizers/build \
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index 91a8a60078e..a478b43cf0f 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 
 import contextlib
-import hashlib
 import os
 import typing
 from abc import ABC, abstractmethod
@@ -277,21 +276,18 @@ def preprocess(
 
         # Create named data store
         named_data_store = NamedDataStore()
+        method_name = cls.method_name_from_compile_specs(compile_specs)
 
-        # Key each blob by a content hash so partitions in one method get distinct
-        # keys (a method-name-only key collides). Runtime recovers them from
-        # processed_bytes below.
-        so_blob_key = hashlib.sha256(so_data).hexdigest() + "_so_blob"
-        weights_blob_key = hashlib.sha256(blob_data).hexdigest() + "_weights_blob"
-
-        named_data_store.add_named_data(so_blob_key, so_data, 1, None)
+        named_data_store.add_named_data(method_name + "_so_blob", so_data, 1, None)
         # Determine whether to save named data externally based on backend setting
         # External: save to separate .ptd file, otherwise merge with .pte file
         external_tag = (
             f"aoti_{device_name}_blob" if cls.save_data_externally() else None
         )
 
-        named_data_store.add_named_data(weights_blob_key, blob_data, 1, external_tag)
+        named_data_store.add_named_data(
+            method_name + "_weights_blob", blob_data, 1, external_tag
+        )
 
         # Clean up the generated files
         os.remove(so_path)
@@ -303,11 +299,8 @@ def preprocess(
         # the next preprocess call (e.g. for the next method).
         cls.release_moved_tensors(device_edge_program, compile_specs)
 
-        # The runtime cannot recompute these hash keys, so carry them (one per line).
-        processed_bytes = (so_blob_key + "\n" + weights_blob_key).encode("utf-8")
-
         return PreprocessResult(
-            processed_bytes=processed_bytes,
+            processed_bytes=b"",
             debug_handle_map={},
             data_store_output=named_data_store.get_named_data_store_output(),
         )
diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h
index fbd748306cc..2d1a3146ae5 100644
--- a/backends/aoti/aoti_delegate_handle.h
+++ b/backends/aoti/aoti_delegate_handle.h
@@ -10,7 +10,6 @@
 
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
-#include <executorch/runtime/core/freeable_buffer.h>
 #include <string>
 
 namespace executorch {
@@ -18,7 +17,6 @@ namespace backends {
 namespace aoti {
 
 using executorch::runtime::Error;
-using executorch::runtime::FreeableBuffer;
 using executorch::runtime::etensor::Tensor;
 
 extern "C" {
@@ -150,30 +148,6 @@ struct AOTIDelegateHandle {
       update_user_managed_constant_buffer_pairs;
 };
 
-// New-format payload is "<so_key>\n<weights_key>"; an empty payload is a
-// pre-this-change artifact, so fall back to the legacy method-name keys.
-inline Error resolve_blob_keys(
-    const FreeableBuffer* processed,
-    const std::string& method_name,
-    std::string& so_blob_key,
-    std::string& weights_blob_key) {
-  if (processed != nullptr && processed->size() > 0) {
-    const std::string keys(
-        static_cast<const char*>(processed->data()), processed->size());
-    const size_t newline = keys.find('\n');
-    if (newline == std::string::npos) {
-      return Error::Internal;
-    }
-    so_blob_key = keys.substr(0, newline);
-    weights_blob_key = keys.substr(newline + 1);
-  } else {
-    so_blob_key = method_name.empty() ? "so_blob" : method_name + "_so_blob";
-    weights_blob_key =
-        method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
-  }
-  return Error::Ok;
-}
-
 } // namespace aoti
 } // namespace backends
 } // namespace executorch
diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py
index f84febbdc24..b263d0f9c81 100644
--- a/backends/aoti/aoti_partitioner.py
+++ b/backends/aoti/aoti_partitioner.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Callable, Dict, List, Mapping, Optional, Tuple
+from typing import Callable, Dict, List, Optional, Tuple
 
 import torch
 from executorch.exir._warnings import experimental
@@ -21,8 +21,6 @@
 )
 from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
 from torch.export.exported_program import ExportedProgram
-from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
-from torch.fx.passes.operator_support import OperatorSupportBase
 
 
 @experimental(
@@ -32,10 +30,12 @@ class AotiPartitioner(Partitioner):
     """
     Base partitioner for AOTInductor-driven backend integration.
 
-    Delegates the non-lowered operators to AOTInductor as one or more convex
-    partitions (a single partition when nothing else has claimed part of the
-    graph). It skips core ATen decomposition, letting the backend decompose via
+    This partitioner creates a single partition containing all operators from the input graph.
+    It skips core ATen decomposition, allowing the backend to handle decomposition using
     AOTInductor's backend-specific decomposition table.
+
+    Only operators that cannot be handled by the aoti library will be excluded from
+    the partition and fall back to ExecuTorch's default or custom handling.
     """
 
     def __init__(self, backend_name: str, compile_spec: List[CompileSpec]) -> None:
@@ -49,76 +49,62 @@ def __init__(self, backend_name: str, compile_spec: List[CompileSpec]) -> None:
         self.delegation_spec = DelegationSpec(backend_name, compile_spec)
 
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
-        """Delegate the non-lowered ops to AOTInductor.
-
-        Uses CapabilityBasedPartitioner rather than a single tag because a
-        delegated submodule must be convex: if a node that is not delegated sits
-        between the delegated ops, one tag would span a non-convex set and fusion
-        would fail with a dependency cycle.
         """
-        # Only nodes not already lowered are candidates for this backend.
-        non_lowered_nodes = set(get_non_lowered_nodes(exported_program.graph))
-
-        control_flow_targets = [
-            torch.ops.higher_order.cond,
-            torch.ops.higher_order.map_impl,
-            torch.ops.higher_order.while_loop,
-            torch.ops.higher_order.scan,
-        ]
-
-        class AotiOperatorSupport(OperatorSupportBase):
-            def is_node_supported(
-                self, submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node
-            ) -> bool:
-                return node.op == "call_function" and node in non_lowered_nodes
-
-        partitioner = CapabilityBasedPartitioner(
-            exported_program.graph_module,
-            AotiOperatorSupport(),
-            allows_single_node_partition=True,
-        )
+        Delegate to AOTInductor by tagging all non-lowered call_function nodes with a single tag.
+        """
 
         partition_tags: Dict[str, DelegationSpec] = {}
-        for partition in partitioner.propose_partitions():
-            tag = f"aoti_{partition.id}"
-            partition_tags[tag] = self.delegation_spec
-            for node in partition.nodes:
-                node.meta["delegation_tag"] = tag
+        tag = "tag0"
+
+        # Identify torch.cond and other control flow operations
+        def is_control_flow(node: torch.fx.Node) -> bool:
+            return node.op == "call_function" and node.target in [
+                torch.ops.higher_order.cond,
+                torch.ops.higher_order.map_impl,
+                torch.ops.higher_order.while_loop,
+            ]
+
+        # Nodes already lowered by an earlier partitioner (e.g. a preceding
+        # TensorRT partition) appear as executorch_call_delegate calls and their
+        # output getitems; re-delegating them would nest a foreign delegate. Tag
+        # only the remaining non-lowered ops so this partitioner composes after
+        # others.
+        non_lowered_nodes = set(get_non_lowered_nodes(exported_program.graph))
 
-        # A control-flow op carries its branch GraphModules as get_attr operands;
-        # they must share the op's tag so they land inside the same submodule. A
-        # branch module feeds a single control-flow op, so first match wins.
         for node in exported_program.graph.nodes:
-            if node.op != "get_attr":
-                continue
-            for user in node.users:
-                if (
-                    user.op == "call_function"
-                    and user.target in control_flow_targets
-                    and "delegation_tag" in user.meta
-                ):
-                    node.meta["delegation_tag"] = user.meta["delegation_tag"]
-                    break
+            if node.op == "call_function":
+                if node not in non_lowered_nodes:
+                    continue
+                node.meta["delegation_tag"] = tag
+            # Tag get_attr nodes that are used by control flow operations
+            elif node.op == "get_attr":
+                # Check if any user is a control flow operation
+                for user in node.users:
+                    if is_control_flow(user):
+                        node.meta["delegation_tag"] = tag
+                        break
+
+        partition_tags[tag] = self.delegation_spec
 
         tag_constant_data(exported_program)
         tag_mutated_buffer(exported_program)
 
-        # tag_constant_data only tags constants that have users; tag the
-        # genuinely unused ones too so none are left dangling.
-        if partition_tags:
-            fallback_tag = next(iter(partition_tags))
-            for node in exported_program.graph.nodes:
-                if (
-                    node.op == "placeholder"
-                    and not node.users
-                    and "delegation_tag" not in node.meta
-                    and (
-                        is_param(exported_program, node)
-                        or is_buffer(exported_program, node)
-                        or is_lifted_tensor_constant(exported_program, node)
-                    )
-                ):
-                    node.meta["delegation_tag"] = fallback_tag
+        # A constant that still has users feeds only a prior delegate; tagging it
+        # would fail backend lowering's same-tag check (its user keeps the prior
+        # tag). tag_constant_data already claimed the ones this partition uses, so
+        # tag only the genuinely unused constants here.
+        for node in exported_program.graph.nodes:
+            if (
+                node.op == "placeholder"
+                and not node.users
+                and "delegation_tag" not in node.meta
+                and (
+                    is_param(exported_program, node)
+                    or is_buffer(exported_program, node)
+                    or is_lifted_tensor_constant(exported_program, node)
+                )
+            ):
+                node.meta["delegation_tag"] = tag
 
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
diff --git a/backends/aoti/tests/TARGETS b/backends/aoti/tests/TARGETS
index f41c1bfb517..d92e0e32a1f 100644
--- a/backends/aoti/tests/TARGETS
+++ b/backends/aoti/tests/TARGETS
@@ -3,18 +3,6 @@ load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils")
 
 oncall("executorch")
 
-cpp_unittest(
-    name = "test_resolve_blob_keys",
-    srcs = [
-        "test_resolve_blob_keys.cpp",
-    ],
-    deps = [
-        "//executorch/backends/aoti:delegate_handle",
-        "//executorch/runtime/core:core",
-        "//executorch/runtime/core:evalue",
-    ],
-)
-
 cpp_unittest(
     name = "test_common_shims",
     srcs = [
diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp
index b9579d59d9c..c0d996df62b 100644
--- a/backends/apple/metal/runtime/metal_backend.cpp
+++ b/backends/apple/metal/runtime/metal_backend.cpp
@@ -245,12 +245,8 @@ class ET_EXPERIMENTAL MetalBackend final
       }
     }
 
-    std::string so_blob_key;
-    std::string weights_blob_key;
-    ET_CHECK_OK_OR_RETURN_ERROR(
-        executorch::backends::aoti::resolve_blob_keys(
-            processed, method_name, so_blob_key, weights_blob_key),
-        "Malformed named-data key payload");
+    std::string so_blob_key =
+        method_name.empty() ? "so_blob" : method_name + "_so_blob";
     ET_LOG(Info, "MetalBackend::init - so_blob_key: %s", so_blob_key.c_str());
 
     const NamedDataMap* named_data_map = context.get_named_data_map();
@@ -262,6 +258,8 @@ class ET_EXPERIMENTAL MetalBackend final
     // Prefetch the weights blob — trigger async readahead so pages are
     // resident by the time update_constants_from_blob memcpy's them.
     // This overlaps disk I/O with the .so write + dlopen (~200ms).
+    std::string weights_blob_key =
+        method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
     {
       auto prefetch_buf = named_data_map->get_data(weights_blob_key.c_str());
       if (prefetch_buf.ok() && prefetch_buf->data() != nullptr) {
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
index 27e5088fc72..29062b57579 100644
--- a/backends/arm/_passes/__init__.py
+++ b/backends/arm/_passes/__init__.py
@@ -171,14 +171,12 @@
 from .rewrite_le_lt_to_ge_gt_pass import RewriteLeLtToGeGtPass  # noqa
 from .rewrite_matmul import RewriteMatmulPass  # noqa
 from .rewrite_max_pool2d_pass import RewriteMaxPool2dPass  # noqa
-from .rewrite_mxfp_conv2d import RewriteMXFPConv2dPass  # noqa
 from .rewrite_mxfp_linear import RewriteMXFPLinearPass  # noqa
 from .rewrite_pad import RewritePadPass  # noqa
 from .rewrite_slice import RewriteSlicePass  # noqa
 from .rewrite_upsample import RewriteUpsamplePass  # noqa
 from .scalars_to_attribute_pass import ScalarsToAttributePass  # noqa
 from .size_adjust_input_pass import SizeAdjustInputPass  # noqa
-from .symbolic_to_tosa_shape_pass import SymbolicToTosaShapesPass  # noqa
 from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass  # noqa
 from .unsqueeze_scalar_placeholders_pass import UnsqueezeScalarPlaceholdersPass  # noqa
 from .replace_inf_and_limit_values_pass import (  # noqa  # usort: skip
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
index fedca6eb65b..5ec57ee1787 100644
--- a/backends/arm/_passes/arm_pass_manager.py
+++ b/backends/arm/_passes/arm_pass_manager.py
@@ -117,7 +117,6 @@
     InsertConstShapesPass,
     InsertControlFlowRescalesPass,
     InsertDataLayoutCastsPass,
-    InsertDynamicPaddingPass,
     InsertInt32CastsAfterInt64PlaceholdersPass,
     InsertRescaleInt32Pass,
     InsertRescalePass,
@@ -147,14 +146,12 @@
     RewriteLeLtToGeGtPass,
     RewriteMatmulPass,
     RewriteMaxPool2dPass,
-    RewriteMXFPConv2dPass,
     RewriteMXFPLinearPass,
     RewritePadPass,
     RewriteSlicePass,
     RewriteUpsamplePass,
     ScalarsToAttributePass,
     SizeAdjustInputPass,
-    SymbolicToTosaShapesPass,
     UnsqueezeBeforeRepeatPass,
     UnsqueezeScalarPlaceholdersPass,
 )
@@ -613,7 +610,6 @@ def _tosa_pipeline(
                 RewriteMaxPool2dPass(),
                 DecomposeAdaptiveMaxPool2dPass(),
                 RewriteConvPass(exported_program),
-                RewriteMXFPConv2dPass(exported_program),
                 RewriteMXFPLinearPass(exported_program),
                 RewriteMatmulPass(),
                 RewritePadPass(),
@@ -634,8 +630,6 @@ def _tosa_pipeline(
             [
                 CastInt64BuffersToInt32Pass(exported_program),
                 FuseEqualPlaceholdersPass(exported_program),
-                SymbolicToTosaShapesPass(),
-                InsertDynamicPaddingPass(),
                 FuseConsecutiveConcatShapesPass(),
                 EnsureUniqueOutputNodesPass(),
                 RemoveNoopPass(),
@@ -683,6 +677,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
                     InsertInt32CastsAfterInt64PlaceholdersPass(tfa_pass=True),
                     DecomposeEmbeddingPass(tfa_pass=True),
                     DecomposeScaledDotProductAttentionPass(tfa_pass=True),
+                    DecomposeRoundPass(tfa_pass=True),
                     DecomposeLogitPass(tfa_pass=True),
                     PromoteBoolOperandsPass(tfa_pass=True),
                     DecomposeSignPass(tfa_pass=True),
diff --git a/backends/arm/_passes/aten_to_tosa_activation_functions.py b/backends/arm/_passes/aten_to_tosa_activation_functions.py
index 8d51f092991..9b92b31e630 100644
--- a/backends/arm/_passes/aten_to_tosa_activation_functions.py
+++ b/backends/arm/_passes/aten_to_tosa_activation_functions.py
@@ -128,21 +128,3 @@ def rewrite_clamp(node: Node, pass_: AtenToDialectPass) -> DialectNodeSpec | Non
         exir_ops.backend.tosa.CLAMP.default,
         (node.args[0], *min_max_args),
     )
-
-
-def get_activation_replacement(
-    node: Node, pass_: AtenToDialectPass
-) -> DialectNodeSpec | None:
-    # Dispatch activation rewrites from their ATen target to the matching TOSA
-    # dialect node builder.
-    match node.target:
-        case exir_ops.edge.aten.clamp.default:
-            return rewrite_clamp(node, pass_)
-        case exir_ops.edge.aten.erf.default:
-            return rewrite_erf(node, pass_)
-        case exir_ops.edge.aten.sigmoid.default:
-            return rewrite_sigmoid(node, pass_)
-        case exir_ops.edge.aten.tanh.default:
-            return rewrite_tanh(node, pass_)
-        case _:
-            return None
diff --git a/backends/arm/_passes/decompose_round_pass.py b/backends/arm/_passes/decompose_round_pass.py
index 48b26f1d027..476f75d6b56 100644
--- a/backends/arm/_passes/decompose_round_pass.py
+++ b/backends/arm/_passes/decompose_round_pass.py
@@ -5,6 +5,7 @@
 
 from typing import Set, Type
 
+import torch
 from executorch.backends.arm._passes import ArmOpTargetedPass
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
@@ -32,6 +33,16 @@ def _get_round_decomposition_ops(op) -> tuple[Op, Op, Op, Op, Op, Op, Op]:
             exir_ops.edge.aten.ceil.default,
             exir_ops.edge.aten.where.self,
         )
+    elif op == torch.ops.aten.round.default:
+        return (
+            torch.ops.aten.full.default,
+            torch.ops.aten.ge.Tensor,
+            torch.ops.aten.add.Scalar,
+            torch.ops.aten.sub.Scalar,
+            torch.ops.aten.floor.default,
+            torch.ops.aten.ceil.default,
+            torch.ops.aten.where.self,
+        )
     raise RuntimeError(f"Can't get round decomposition ops for op {op}")
 
 
@@ -54,10 +65,11 @@ class DecomposeRoundPass(ArmOpTargetedPass):
 
     target_ops = {
         exir_ops.edge.aten.round.default,
+        torch.ops.aten.round.default,
     }
 
     def call_operator(self, op, args, kwargs, meta, updated=False):
-        if op not in self.target_ops or self._is_quantized_meta(meta):
+        if op not in self.target_ops or not self.allowed_to_transform(meta):
             return super().call_operator(op, args, kwargs, meta, updated)
         x = args[0]
         input_dtype = x.node.meta["val"].dtype
diff --git a/backends/arm/_passes/deduplicate_get_attr_pass.py b/backends/arm/_passes/deduplicate_get_attr_pass.py
index f5760a2fcb8..201a9036e34 100644
--- a/backends/arm/_passes/deduplicate_get_attr_pass.py
+++ b/backends/arm/_passes/deduplicate_get_attr_pass.py
@@ -9,7 +9,6 @@
 from executorch.backends.arm._passes import ArmPass
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx import GraphModule, Node
-from torch.fx.node import map_arg
 from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix
 
 
@@ -25,13 +24,6 @@ class DeduplicateGetAttrPass(ArmPass):
 
     _passes_required_after: Set[Type[ExportPass]] = set()
 
-    def _replace_input_node(self, node: Node, old_node: Node, new_node: Node) -> None:
-        def maybe_replace_node(arg: Any) -> Any:
-            return new_node if arg is old_node else arg
-
-        node.args = map_arg(node.args, maybe_replace_node)
-        node.kwargs = map_arg(node.kwargs, maybe_replace_node)
-
     def _get_attr(self, graph_module: GraphModule, target: str) -> Any:
         attr: Any = graph_module
         for target_atom in target.split("."):
@@ -59,26 +51,9 @@ def _copy_attr(self, graph_module: GraphModule, node: Node) -> str:
 
         return attr_name
 
-    def _split_shared_get_attrs(self, graph_module: GraphModule) -> bool:
-        modified = False
-
-        for node in list(graph_module.graph.find_nodes(op="get_attr")):
-            users = list(node.users)
-            if len(users) <= 1:
-                continue
-
-            for user in users[1:]:
-                with graph_module.graph.inserting_before(user):
-                    new_node = graph_module.graph.get_attr(node.target)
-                    new_node.meta.update(node.meta)
-                self._replace_input_node(user, node, new_node)
-                modified = True
-
-        return modified
-
     def call(self, graph_module: GraphModule) -> PassResult:
         seen_targets: set[str] = set()
-        modified = self._split_shared_get_attrs(graph_module)
+        modified = False
 
         for node in graph_module.graph.find_nodes(op="get_attr"):
 
diff --git a/backends/arm/_passes/exir_to_tosa_pass.py b/backends/arm/_passes/exir_to_tosa_pass.py
index c0c6efb1a6c..b77171b9eaf 100644
--- a/backends/arm/_passes/exir_to_tosa_pass.py
+++ b/backends/arm/_passes/exir_to_tosa_pass.py
@@ -5,38 +5,37 @@
 
 import executorch.backends.arm.tosa.dialect  # noqa: F401
 from executorch.backends.arm._passes.aten_to_tosa_activation_functions import (
-    get_activation_replacement,
-)
-from executorch.backends.arm._passes.aten_to_tosa_tensor_operators import rewrite_argmax
-from executorch.backends.transforms.aten_to_dialect_pass import (
-    AtenToDialectPass,
-    DialectNodeSpec,
+    rewrite_clamp,
+    rewrite_erf,
+    rewrite_sigmoid,
+    rewrite_tanh,
 )
+from executorch.backends.transforms.aten_to_dialect_pass import AtenToDialectPass
 from executorch.exir.dialects._ops import ops as exir_ops
-from torch.fx import Node
 
 
 class ExirToTosaPass(AtenToDialectPass):
     """Rewrite simple EXIR ops to equivalent backend TOSA dialect ops.
 
-    Rewrite functions are registered with the shared ATen-to-dialect pass
-    infrastructure.
+    Rewrite functions are grouped by op category and registered with the shared
+    ATen-to-dialect pass infrastructure.
 
     """
 
 
-@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.argmax.default)
-def _get_tensor_operators_replacement(
-    node: Node, pass_: AtenToDialectPass
-) -> DialectNodeSpec:
-    return rewrite_argmax(node, pass_)
+_ACTIVATION_FUNCTION_REWRITES = {
+    exir_ops.edge.aten.clamp.default: rewrite_clamp,
+    exir_ops.edge.aten.erf.default: rewrite_erf,
+    exir_ops.edge.aten.sigmoid.default: rewrite_sigmoid,
+    exir_ops.edge.aten.tanh.default: rewrite_tanh,
+}
 
+_DIRECT_REWRITE_CATEGORIES = {
+    "activation_functions": _ACTIVATION_FUNCTION_REWRITES,
+}
 
-@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.clamp.default)
-@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.erf.default)
-@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.sigmoid.default)
-@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.tanh.default)
-def _get_activation_replacement(
-    node: Node, pass_: AtenToDialectPass
-) -> DialectNodeSpec | None:
-    return get_activation_replacement(node, pass_)
+# Register each category's ATen targets with the function that builds the
+# corresponding TOSA dialect node spec.
+for _rewrite_category in _DIRECT_REWRITE_CATEGORIES.values():
+    for _edge_target, _rewrite_fn in _rewrite_category.items():
+        ExirToTosaPass.register_dialect_substitution(_edge_target)(_rewrite_fn)
diff --git a/backends/arm/_passes/insert_dynamic_padding.py b/backends/arm/_passes/insert_dynamic_padding.py
index b1d998268eb..bfc0382e4ad 100644
--- a/backends/arm/_passes/insert_dynamic_padding.py
+++ b/backends/arm/_passes/insert_dynamic_padding.py
@@ -29,7 +29,6 @@ class InsertDynamicPaddingPass(ArmOpTargetedPass):
     _passes_required_after: Set[Type[ExportPass]] = set()
     target_ops = (
         exir_ops.backend.tosa.CONV2D.default,
-        exir_ops.backend.tosa.CONV3D.default,
         exir_ops.backend.tosa.DEPTHWISE_CONV2D.default,
         exir_ops.backend.tosa.MAX_POOL2D.default,
         exir_ops.backend.tosa.AVG_POOL2D.default,
@@ -58,12 +57,11 @@ def call_operator(self, op, args, kwargs, meta, updated=False) -> ProxyValue:
         if not self._is_dynamic_padding(padding):
             return super().call_operator(op, args, kwargs, meta, updated)
 
-        # Create a pad op before the convolution/pool op.
+        # Create a pad op before the conv/pool op
         input_tensor = args[0]
 
         zero_padding_pair = [0, 0]
-        spatial_rank = 3 if op == exir_ops.backend.tosa.CONV3D.default else 2
-        zero_spatial_padding = [0] * (spatial_rank * 2)
+        zero_spatial_padding = [0, 0, 0, 0]
         N_padding = super().call_shape_operator(
             exir_ops.backend.tosa.CONST_SHAPE.default,
             (zero_padding_pair,),
@@ -95,7 +93,7 @@ def call_operator(self, op, args, kwargs, meta, updated=False) -> ProxyValue:
             meta,
             True,
         )
-        new_args = list(args)
-        new_args[0] = pad_res
-        new_args[padding_index] = zero_spatial_padding
-        return super().call_operator(op, tuple(new_args), kwargs, meta, updated)
+        new_conv2d_args = list(args)
+        new_conv2d_args[0] = pad_res
+        new_conv2d_args[padding_index] = zero_spatial_padding
+        return super().call_operator(op, tuple(new_conv2d_args), kwargs, meta, updated)
diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py
index f84ec5b678e..45374c12c3b 100644
--- a/backends/arm/_passes/insert_rescales_pass.py
+++ b/backends/arm/_passes/insert_rescales_pass.py
@@ -18,7 +18,6 @@
 
 from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.backends.arm.constants import DQ_OPS, Q_OPS
-from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx import GraphModule, Node
@@ -36,12 +35,6 @@ class InsertRescalePass(ArmPass):
 
     _passes_required_after: Set[Type[ExportPass]] = set()
 
-    _mxfp_payload_dtypes = {
-        TosaSpecialDtype.FP4E2M1,
-        TosaSpecialDtype.FP6E2M3,
-        TosaSpecialDtype.FP6E3M2,
-    }
-
     def _ensure_uint8_io_only(self, graph_module: GraphModule) -> None:
         """Ensure uint8 tensors only appear at IO boundaries.
 
@@ -58,23 +51,21 @@ def _ensure_uint8_io_only(self, graph_module: GraphModule) -> None:
                 continue
             if node.op in ("placeholder", "output"):
                 continue
-            if node.op == "call_function":
-                if node.target == operator.getitem and all(
-                    user.op == "output" for user in node.users
-                ):
-                    continue
-                if node.target == exir_ops.backend.tosa.RESCALE.default:
-                    continue
-                if (
-                    node.target
-                    == exir_ops.edge.dim_order_ops._to_dim_order_copy.default
-                ):
-                    # dim_order is a view-like transform; allow it to preserve uint8 at IO.
+            if node.op == "call_function" and node.target == operator.getitem:
+                if all(user.op == "output" for user in node.users):
                     continue
-            if node.meta.get(TosaSpecialDtype.meta_key()) in self._mxfp_payload_dtypes:
-                # Sub-byte FP types are stored uint8 arrays, so we need an exception for those.
+            if (
+                node.op == "call_function"
+                and node.target
+                == exir_ops.edge.dim_order_ops._to_dim_order_copy.default
+            ):
+                # dim_order is a view-like transform; allow it to preserve uint8 at IO.
+                continue
+            if (
+                node.op == "call_function"
+                and node.target == exir_ops.backend.tosa.RESCALE.default
+            ):
                 continue
-
             raise ValueError(
                 f"Found internal uint8 tensor at node {node.name} "
                 f"({node.target}). Uint8 is only allowed at IO boundaries."
diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py
index 82d2ff1dbe0..10b85149dad 100644
--- a/backends/arm/_passes/insert_table_ops.py
+++ b/backends/arm/_passes/insert_table_ops.py
@@ -58,7 +58,6 @@ class TableOps:
         exir_ops.edge.aten.acos.default: torch.acos,
         exir_ops.edge.aten.tan.default: torch.tan,
         exir_ops.edge.aten.silu.default: torch.nn.functional.silu,
-        exir_ops.edge.aten.round.default: torch.round,
     }
 
     # Targets that must be treated explicitly
diff --git a/backends/arm/_passes/rewrite_conv_pass.py b/backends/arm/_passes/rewrite_conv_pass.py
index 3ae5ae9f9fb..6f588a1a1f1 100644
--- a/backends/arm/_passes/rewrite_conv_pass.py
+++ b/backends/arm/_passes/rewrite_conv_pass.py
@@ -97,25 +97,23 @@ def _adjust_pad_if_needed(
 
         if isinstance(mod_remainder, torch.SymInt):
             shape_env = get_context_shape_env()
-            exact_values = evaluate_symbolic_expr_values(mod_remainder, shape_env)
+            exact_values = evaluate_symbolic_expr_values(
+                mod_remainder.node.expr, shape_env
+            )
             if exact_values is not None:
                 mod_remainder_upper = max(exact_values)
-                if len(exact_values) == 1:
-                    mod_remainder = int(next(iter(exact_values)))
-                elif mod_remainder_upper == 0:
-                    mod_remainder = 0
-                else:
-                    return pad - mod_remainder
             else:
-                # SizeAdjustInputPass already trims symbolic remainder classes
-                # that would force negative padding. Keep the symbolic
-                # expression here instead of asking ShapeEnv to normalize it.
-                return pad - mod_remainder
-        if mod_remainder > pad:
+                value_ranges = shape_env.bound_sympy(mod_remainder.node.expr)
+                mod_remainder_upper = int(value_ranges.upper)
+            if mod_remainder_upper == 0:
+                mod_remainder = 0
+        else:
+            mod_remainder_upper = mod_remainder
+
+        if mod_remainder_upper > pad:
             raise RuntimeError(
-                "This case should be handled by SizeAdjustInputPass, is it enabled?\n"
+                "This case should be handled by the SizeAdjustInputPass, is it enabled?\n"
             )
-
         return pad - mod_remainder
 
     def _is_depthwise_conv2d(self, node: torch.fx.Node) -> bool:
diff --git a/backends/arm/_passes/rewrite_mxfp_linear.py b/backends/arm/_passes/rewrite_mxfp_linear.py
index 6f4a475e46e..d4ca436dc41 100644
--- a/backends/arm/_passes/rewrite_mxfp_linear.py
+++ b/backends/arm/_passes/rewrite_mxfp_linear.py
@@ -8,53 +8,16 @@
 from typing import Any, cast, Sequence, Set, Type
 
 import torch
-from executorch.backends.arm._passes import ArmOpTargetedPass
+from executorch.backends.arm._passes import ArmPass
 from executorch.backends.arm._passes.arm_pass_utils import (
     create_node,
     get_first_fake_tensor,
 )
-from executorch.backends.arm.ao_ext.mxfp import (
-    mxfp_dtype_to_str,
-    mxfp_str_to_dtype,
-    MXFPDType,
-)
-from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
-from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2
-
-
-def _get_weights_payload_dtype(
-    qdata_node: torch.fx.Node,
-    dtype: str = "",
-) -> MXFPDType:
-    if dtype:
-        return mxfp_str_to_dtype(dtype)
-    qdata = get_first_fake_tensor(qdata_node)
-    if qdata.dtype == torch.uint8:
-        return torch.float4_e2m1fn_x2
-    return qdata.dtype
-
-
-def _mark_mxfp_payload(node: torch.fx.Node, payload_dtype: MXFPDType) -> None:
-    """Annotate uint8-backed MXFP payload nodes with their TOSA dtype.
 
-    PyTorch represents sub-byte MXFP payloads as ``torch.uint8`` tensors, so
-    the tensor dtype alone cannot distinguish FP4E2M1, FP6E2M3, and FP6E3M2.
-    Store the logical TOSA dtype in node metadata so later lowering and
-    serialization treat the payload as MXFP data rather than ordinary uint8.
-    FP8 payloads have native PyTorch dtypes and do not need this metadata.
 
-    """
-    if payload_dtype == torch.float4_e2m1fn_x2:
-        node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.FP4E2M1
-    elif payload_dtype == DTYPE_FP6_E2M3:
-        node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.FP6E2M3
-    elif payload_dtype == DTYPE_FP6_E3M2:
-        node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.FP6E3M2
-
-
-class RewriteMXFPLinearPass(ArmOpTargetedPass):
+class RewriteMXFPLinearPass(ArmPass):
     """Rewrite ``tosa_mxfp.linear`` into explicit TOSA MXFP operators.
 
     For each MXFP linear custom op, the pass:
@@ -69,24 +32,15 @@ class RewriteMXFPLinearPass(ArmOpTargetedPass):
 
     """
 
-    target_ops = {
-        torch.ops.tosa_mxfp.linear.default,
-        exir_ops.edge.tosa_mxfp.linear.default,
-    }
     _passes_required_after: Set[Type[ExportPass]] = set()
 
     def __init__(self, exported_program: torch.export.ExportedProgram, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.exported_program = exported_program
 
-    def _get_linear_args(self, node: torch.fx.Node) -> tuple[
-        torch.fx.Node,
-        torch.fx.Node,
-        torch.fx.Node,
-        torch.fx.Node | None,
-        int,
-        MXFPDType,
-    ]:
+    def _get_linear_args(
+        self, node: torch.fx.Node
+    ) -> tuple[torch.fx.Node, torch.fx.Node, torch.fx.Node, torch.fx.Node | None, int]:
         """Extract the MXFP linear operands from a custom-op node."""
         input_node = cast(torch.fx.Node, node.args[0])
         weight_qdata_node = cast(torch.fx.Node, node.args[1])
@@ -99,26 +53,7 @@ def _get_linear_args(self, node: torch.fx.Node) -> tuple[
             int,
             node.args[4] if len(node.args) > 4 else node.kwargs.get("block_size", 32),
         )
-        payload_dtype_str = cast(
-            str,
-            (
-                node.args[5]
-                if len(node.args) > 5
-                else node.kwargs.get(
-                    "weight_payload_dtype",
-                    node.kwargs.get("weight_dtype", ""),
-                )
-            ),
-        )
-        payload_dtype = _get_weights_payload_dtype(weight_qdata_node, payload_dtype_str)
-        return (
-            input_node,
-            weight_qdata_node,
-            weight_scale_node,
-            bias_node,
-            block_size,
-            payload_dtype,
-        )
+        return input_node, weight_qdata_node, weight_scale_node, bias_node, block_size
 
     def _reshape_with_view(
         self,
@@ -149,15 +84,12 @@ def _create_block_scaled_inputs(
         weight_qdata_node: torch.fx.Node,
         weight_scale_node: torch.fx.Node,
         block_size: int,
-        payload_dtype: MXFPDType,
     ) -> tuple[torch.fx.Node, torch.fx.Node]:
         """Create rank-3 inputs for the block-scaled cast and matmul ops."""
         graph = graph_module.graph
         input_fake = get_first_fake_tensor(input_node)
         weight_qdata_fake = get_first_fake_tensor(weight_qdata_node)
         weight_scale_fake = get_first_fake_tensor(weight_scale_node)
-        payload_dtype_str = mxfp_dtype_to_str(payload_dtype)
-        _mark_mxfp_payload(weight_qdata_node, payload_dtype)
 
         batches = reduce(operator.mul, input_fake.shape[:-1], 1)
         input_reshape_shape = [1, batches, input_fake.shape[-1]]
@@ -177,13 +109,13 @@ def _create_block_scaled_inputs(
             graph=graph,
             op_target=exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default,
             args=(input_reshaped, block_size),
-            kwargs={"output_dtype": payload_dtype_str},
+            kwargs={"output_dtype": weight_qdata_fake.dtype},
             from_node=mxfp_linear_node,
         )
         cast_node.meta["val"] = exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default(
             get_first_fake_tensor(input_reshaped),
             block_size,
-            output_dtype=payload_dtype_str,
+            output_dtype=weight_qdata_fake.dtype,
         )
 
         input_qdata_node = create_node(
@@ -194,7 +126,6 @@ def _create_block_scaled_inputs(
             from_node=mxfp_linear_node,
         )
         input_qdata_node.meta["val"] = cast_node.meta["val"][0]
-        _mark_mxfp_payload(input_qdata_node, payload_dtype)
 
         input_scale_node = create_node(
             graph=graph,
@@ -219,10 +150,8 @@ def _create_matmul_node(
         weight_qdata_node: torch.fx.Node,
         weight_scale_node: torch.fx.Node,
         block_size: int,
-        payload_dtype: MXFPDType,
     ) -> torch.fx.Node:
         """Insert ``MATMUL_T_BLOCK_SCALED`` with updated fake metadata."""
-        payload_dtype_str = mxfp_dtype_to_str(payload_dtype)
         matmul_node = create_node(
             graph=graph_module.graph,
             op_target=exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default,
@@ -233,7 +162,7 @@ def _create_matmul_node(
                 weight_scale_node,
                 block_size,
             ),
-            kwargs={"payload_dtype": payload_dtype_str},
+            kwargs={},
             from_node=mxfp_linear_node,
         )
         matmul_node.meta["val"] = exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default(
@@ -242,7 +171,6 @@ def _create_matmul_node(
             get_first_fake_tensor(weight_qdata_node),
             get_first_fake_tensor(weight_scale_node),
             block_size,
-            payload_dtype=payload_dtype_str,
         )
         return matmul_node
 
@@ -327,7 +255,6 @@ def _rewrite_mxfp_linear_node(
             weight_scale_node,
             bias_node,
             block_size,
-            payload_dtype,
         ) = self._get_linear_args(mxfp_linear_node)
 
         with graph.inserting_before(mxfp_linear_node):
@@ -341,7 +268,6 @@ def _rewrite_mxfp_linear_node(
                 weight_qdata_node,
                 weight_scale_node,
                 block_size,
-                payload_dtype,
             )
             matmul_node = self._create_matmul_node(
                 graph_module,
@@ -351,7 +277,6 @@ def _rewrite_mxfp_linear_node(
                 weight_qdata_node,
                 weight_scale_node,
                 block_size,
-                payload_dtype,
             )
 
         with graph.inserting_after(matmul_node):
@@ -374,7 +299,10 @@ def call(self, graph_module: torch.fx.GraphModule):
         graph = graph_module.graph
 
         for node in list(graph.nodes):
-            if node.op != "call_function" or node.target not in self.target_ops:
+            if node.op != "call_function" or node.target not in (
+                torch.ops.tosa_mxfp.linear.default,
+                exir_ops.edge.tosa_mxfp.linear.default,
+            ):
                 continue
 
             modified = True
diff --git a/backends/arm/_passes/size_adjust_input_pass.py b/backends/arm/_passes/size_adjust_input_pass.py
index 6028e618d65..1c331b9c329 100644
--- a/backends/arm/_passes/size_adjust_input_pass.py
+++ b/backends/arm/_passes/size_adjust_input_pass.py
@@ -62,41 +62,6 @@ def _greater_than(input: SymIntLike, other: int) -> bool | torch.SymBool:
         return input > other
 
 
-def _get_slice_adjustment(
-    remainder: SymIntLike,
-    pad: int,
-    stride: int,
-) -> SymIntLike | None:
-    """Return the amount to slice from the end of a conv dimension.
-
-    The required trim is ``max(remainder - pad, 0)``. For symbolic shapes we
-    encode that clamp using only integer arithmetic that the TOSA shape
-    materializer already supports: a sum of floor-div terms over the possible
-    residue classes.
-
-    """
-    if not isinstance(remainder, torch.SymInt):
-        return remainder - pad if remainder > pad else None
-
-    shape_env = get_context_shape_env()
-    exact_values = evaluate_symbolic_expr_values(remainder.node.expr, shape_env)
-    if exact_values is not None:
-        adjustments = {max(value - pad, 0) for value in exact_values}
-        if len(adjustments) == 1:
-            adjustment = next(iter(adjustments))
-            return adjustment if adjustment > 0 else None
-
-    if pad >= stride - 1:
-        return None
-
-    adjustment: SymIntLike | None = None  # type: ignore[no-redef]
-    for threshold in range(pad + 1, stride):
-        term = (remainder + stride - threshold) // stride
-        adjustment = term if adjustment is None else adjustment + term
-
-    return adjustment
-
-
 def get_slices_convolution(conv_node: torch.fx.Node) -> Slices:
     slices: Slices = []
 
@@ -120,12 +85,8 @@ def get_slices_convolution(conv_node: torch.fx.Node) -> Slices:
         remainder = conv_remainder(
             input_shape[dim], pad, dilation, weight_shape[dim], stride
         )
-        adjustment = _get_slice_adjustment(
-            remainder,
-            pad,
-            stride,
-        )
-        if adjustment is not None:
+        if _greater_than(remainder, pad):
+            adjustment = remainder - pad
             args = (dim, 0, input_shape[dim] - adjustment)
             slices.append(args)
 
diff --git a/backends/arm/_passes/symbolic_value_range.py b/backends/arm/_passes/symbolic_value_range.py
index 609a84edc54..0753fefa270 100644
--- a/backends/arm/_passes/symbolic_value_range.py
+++ b/backends/arm/_passes/symbolic_value_range.py
@@ -39,70 +39,11 @@ def _symbol_values(symbol: sympy.Symbol, shape_env: ShapeEnv) -> _ExactValues:
     return frozenset(sympy.Integer(value) for value in range(lower, upper + 1))
 
 
-def _expr_symbols_to_values(
-    expr: sympy.Basic,
-    shape_env: ShapeEnv,
-) -> dict[sympy.Symbol, _ExactValues]:
-    return {symbol: _symbol_values(symbol, shape_env) for symbol in expr.free_symbols}
-
-
-def _try_expr_to_int(expr: sympy.Basic) -> Optional[int]:
-    integer_value = _expr_to_int(expr)
-    if integer_value is not None:
-        return integer_value
-
-    try:
-        return _expr_to_int(sympy.simplify(expr))
-    except (RecursionError, TypeError):
-        return None
-
-
-def _constant_expr_values(expr: sympy.Basic) -> Optional[set[int]]:
-    if expr.free_symbols:
-        return None
-
-    integer_value = _try_expr_to_int(expr)
-    return {integer_value} if integer_value is not None else None
-
-
-def _evaluate_exact_values(
-    expr: sympy.Basic,
-    shape_env: ShapeEnv,
-) -> _ExactValues:
-    try:
-        return sympy_interp(
-            _ExactValueAnalysis,
-            _expr_symbols_to_values(expr, shape_env),
-            expr,
-            missing_handler=lambda symbol: _symbol_values(symbol, shape_env),
-        )
-    except (RecursionError, TypeError):
-        return None
-
-
-def _exact_values_to_ints(exact_values: _ExactValues) -> Optional[set[int]]:
-    if exact_values is None:
-        return None
-
-    result: set[int] = set()
-    for value in exact_values:
-        integer_value = _try_expr_to_int(value)
-        if integer_value is None:
-            return None
-        result.add(integer_value)
-    return result
-
-
 def _map_values(values: _ExactValues, fn) -> _ExactValues:
     if values is None:
         return None
 
-    result = set()
-    for value in values:
-        try:
-            result.add(fn(value))
-        except (RecursionError, TypeError):
-            return None
+    result = {sympy.simplify(fn(value)) for value in values}
     if len(result) > _MAX_SET_SIZE:
         return None
     return frozenset(result)
@@ -114,13 +55,7 @@ def _combine_values(lhs: _ExactValues, rhs: _ExactValues, fn) -> _ExactValues:
     if len(lhs) * len(rhs) > _MAX_SET_SIZE * _MAX_SET_SIZE:
         return None
 
-    result = set()
-    for a in lhs:
-        for b in rhs:
-            try:
-                result.add(fn(a, b))
-            except (RecursionError, TypeError):
-                return None
+    result = {sympy.simplify(fn(a, b)) for a in lhs for b in rhs}
     if len(result) > _MAX_SET_SIZE:
         return None
     return frozenset(result)
@@ -145,12 +80,6 @@ def mod(lhs: _ExactValues, rhs: _ExactValues) -> _ExactValues:
             return None
         return _combine_values(lhs, rhs, lambda a, b: sympy.Mod(a, b))
 
-    @staticmethod
-    def floordiv(lhs: _ExactValues, rhs: _ExactValues) -> _ExactValues:
-        if rhs is None or any(value == 0 for value in rhs):
-            return None
-        return _combine_values(lhs, rhs, lambda a, b: sympy.floor(a / b))
-
     @staticmethod
     def pow(lhs: _ExactValues, rhs: _ExactValues) -> _ExactValues:
         return _combine_values(lhs, rhs, lambda a, b: a**b)
@@ -175,15 +104,35 @@ def evaluate_symbolic_expr_values(
 ) -> Optional[set[int]]:
     """Return a best-effort finite set of possible integer values.
 
-    The helper avoids ShapeEnv bound queries here because some exported dynamic
-    expressions trigger very deep SymPy normalization. Instead, it relies on a
-    small exact-set analysis over bounded symbols using ``sympy_interp``.
+    The helper first relies on ``bound_sympy`` for cheap singleton detection.
+    When interval bounds are not precise enough, it falls back to a small
+    exact-set analysis over bounded symbols using ``sympy_interp``.
 
     """
-    root_expr = expr.node.expr if isinstance(expr, torch.SymInt) else expr
-
-    constant_values = _constant_expr_values(root_expr)
-    if constant_values is not None:
-        return constant_values
+    root_expr = sympy.simplify(
+        expr.node.expr if isinstance(expr, torch.SymInt) else expr
+    )
+    value_range = shape_env.bound_sympy(root_expr)
+    if value_range.is_int and value_range.is_singleton():
+        singleton = _expr_to_int(value_range.lower)
+        return {singleton} if singleton is not None else None
+
+    exact_values = sympy_interp(
+        _ExactValueAnalysis,
+        {
+            symbol: _symbol_values(symbol, shape_env)
+            for symbol in root_expr.free_symbols
+        },
+        root_expr,
+        missing_handler=lambda symbol: _symbol_values(symbol, shape_env),
+    )
+    if exact_values is None:
+        return None
 
-    return _exact_values_to_ints(_evaluate_exact_values(root_expr, shape_env))
+    result: set[int] = set()
+    for value in exact_values:
+        integer_value = _expr_to_int(sympy.simplify(value))
+        if integer_value is None:
+            return None
+        result.add(integer_value)
+    return result
diff --git a/backends/arm/ao_ext/mxfp.py b/backends/arm/ao_ext/mxfp.py
index f3b611ce14c..783da92590e 100644
--- a/backends/arm/ao_ext/mxfp.py
+++ b/backends/arm/ao_ext/mxfp.py
@@ -10,85 +10,12 @@
 from executorch.exir._warnings import experimental
 from torchao.core.config import AOBaseConfig
 from torchao.prototype.mx_formats.config import ScaleCalculationMode
-from torchao.prototype.mx_formats.mx_tensor import (
-    DTYPE_FP6_E2M3,
-    DTYPE_FP6_E3M2,
-    to_dtype,
-    to_mx,
-)
 from torchao.quantization import quantize_
 
 
-# Pytorch lacks dtypes for the FP6 types, so we use ao's string representations for those.
-MXFPDType = torch.dtype | str
-
-
-SUPPORTED_MXFP_DTYPES: set[MXFPDType] = {
-    torch.float4_e2m1fn_x2,
-    torch.float8_e4m3fn,
-    torch.float8_e5m2,
-    # Use ao's string representations.
-    DTYPE_FP6_E2M3,
-    DTYPE_FP6_E3M2,
-}
-
-
-_DTYPE_TO_STR: dict[MXFPDType, str] = {
-    DTYPE_FP6_E2M3: "fp6e2m3",
-    DTYPE_FP6_E3M2: "fp6e3m2",
-    torch.float4_e2m1fn_x2: "f4e2m1",
-    torch.float8_e4m3fn: "f8e4m3",
-    torch.float8_e5m2: "f8e5m2",
-}
-
-
-_STR_TO_DTYPE = {value: key for (key, value) in _DTYPE_TO_STR.items()}
-
-
-def mxfp_dtype_to_str(dtype: MXFPDType) -> str:
-    try:
-        return _DTYPE_TO_STR[dtype]
-    except KeyError as e:
-        supported = ", ".join(str(dtype) for dtype in _DTYPE_TO_STR)
-        raise ValueError(
-            f"Unsupported MXFP dtype {dtype}. Supported dtypes: {supported}"
-        ) from e
-
-
-def mxfp_str_to_dtype(dtype: str) -> MXFPDType:
-    try:
-        return _STR_TO_DTYPE[dtype]
-    except KeyError as e:
-        supported = ", ".join(sorted(_STR_TO_DTYPE))
-        raise ValueError(
-            f"Unsupported MXFP dtype string {dtype!r}. Supported strings: {supported}"
-        ) from e
-
-
 def _match_supported_modules(module: torch.nn.Module, _name: str) -> bool:
     """Default filter function that matches supported modules."""
-    return isinstance(module, (torch.nn.Linear, torch.nn.Conv2d))
-
-
-def _cast_to_block_scaled_cpu_ref(
-    input: torch.Tensor,
-    output_dtype: MXFPDType,
-    block_size: int,
-) -> torch.Tensor:
-    """Emulate the current TOSA activation cast in eager mode."""
-    input_scale, input_qdata = to_mx(
-        input.to(torch.float32).contiguous(),
-        elem_dtype=output_dtype,
-        block_size=block_size,
-        scaling_mode=ScaleCalculationMode.RCEIL,
-    )
-    return to_dtype(
-        input_qdata,
-        input_scale,
-        output_dtype,
-        block_size,
-        torch.float32,
-    )
+    return isinstance(module, torch.nn.Linear)
 
 
 @experimental("This API is experimental and may change without notice.")
@@ -96,7 +23,7 @@ def _cast_to_block_scaled_cpu_ref(
 class MXFPOpConfig(AOBaseConfig):
     """Configuration for Arm MXFP source transforms."""
 
-    weight_dtype: MXFPDType = torch.float8_e4m3fn
+    weight_dtype: torch.dtype = torch.float8_e4m3fn
     weight_scaling_mode: ScaleCalculationMode = ScaleCalculationMode.RCEIL
 
     # Only block size of 32 is currently supported for now, so we hardcode it here.
@@ -105,7 +32,7 @@ def block_size(self) -> int:
         return 32
 
     def __post_init__(self) -> None:
-        if self.weight_dtype not in SUPPORTED_MXFP_DTYPES:
+        if self.weight_dtype not in (torch.float8_e4m3fn, torch.float8_e5m2):
             raise ValueError(f"Unsupported weight_dtype: {self.weight_dtype}")
         if not isinstance(self.weight_scaling_mode, ScaleCalculationMode):
             raise ValueError(
diff --git a/backends/arm/ao_ext/mxfp_tosa_lib.py b/backends/arm/ao_ext/mxfp_tosa_lib.py
index 911d944c720..4459ec59126 100644
--- a/backends/arm/ao_ext/mxfp_tosa_lib.py
+++ b/backends/arm/ao_ext/mxfp_tosa_lib.py
@@ -8,5 +8,4 @@
 # MXFP TOSA library definition for the Arm backend containing.
 # This library will generate custom ops like the following example:
 #   torch.ops.tosa_mxfp.linear.default
-#   torch.ops.tosa_mxfp.conv2d.default
 MXFP_TOSA_LIB = Library("tosa_mxfp", "DEF")
diff --git a/backends/arm/ao_ext/mxfp_transform.py b/backends/arm/ao_ext/mxfp_transform.py
index e1f119aa0a0..b7823524475 100644
--- a/backends/arm/ao_ext/mxfp_transform.py
+++ b/backends/arm/ao_ext/mxfp_transform.py
@@ -6,7 +6,6 @@
 import torch
 
 from executorch.backends.arm.ao_ext.mxfp import MXFPOpConfig
-from executorch.backends.arm.ao_ext.ops.mxfp_conv2d_op import transform_conv2d_to_mxfp
 from executorch.backends.arm.ao_ext.ops.mxfp_linear_op import transform_linear_to_mxfp
 from torchao.quantization.transform_module import register_quantize_module_handler
 
@@ -21,7 +20,5 @@ def _transform_to_mxfp(
     """
     if isinstance(module, torch.nn.Linear):
         return transform_linear_to_mxfp(module, config)
-    elif isinstance(module, torch.nn.Conv2d):
-        return transform_conv2d_to_mxfp(module, config)
     else:
         return module
diff --git a/backends/arm/ao_ext/ops/__init__.py b/backends/arm/ao_ext/ops/__init__.py
index d4c602154fe..a690c4b7b02 100644
--- a/backends/arm/ao_ext/ops/__init__.py
+++ b/backends/arm/ao_ext/ops/__init__.py
@@ -3,10 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from .mxfp_conv2d_op import MXFPConv2dOp
 from .mxfp_linear_op import MXFPLinearOp
 
 __all__ = [
-    "MXFPConv2dOp",
     "MXFPLinearOp",
 ]
diff --git a/backends/arm/ao_ext/ops/mxfp_linear_op.py b/backends/arm/ao_ext/ops/mxfp_linear_op.py
index 565d8695c5a..5238f85a847 100644
--- a/backends/arm/ao_ext/ops/mxfp_linear_op.py
+++ b/backends/arm/ao_ext/ops/mxfp_linear_op.py
@@ -12,50 +12,17 @@
 
 import torch
 import torch.nn.functional as F
-from executorch.backends.arm.ao_ext.mxfp import (
-    _cast_to_block_scaled_cpu_ref,
-    mxfp_dtype_to_str,
-    mxfp_str_to_dtype,
-    MXFPDType,
-    MXFPOpConfig,
-)
+from executorch.backends.arm.ao_ext.mxfp import MXFPOpConfig
 from executorch.backends.arm.ao_ext.mxfp_tosa_lib import MXFP_TOSA_LIB
+from torchao.prototype.mx_formats.config import ScaleCalculationMode
 from torchao.prototype.mx_formats.mx_tensor import to_dtype, to_mx
 
-
-# Define the custom TOSA operator. Note that weight_payload_dtype is needed as
-# an extra argument because sub-byte dtypes (FP4 and FP6) are contained
-# in uint8 tensors, meaning the weight tensor itself does not contain
-# the dtype.
 MXFP_TOSA_LIB.define(
     "linear(Tensor input, Tensor weight_qdata, Tensor weight_scale, "
-    "Tensor? bias=None, SymInt block_size=32, str weight_payload_dtype='') -> Tensor"
+    "Tensor? bias=None, SymInt block_size=32) -> Tensor"
 )
 
 
-def _get_mx_elem_dtype(
-    weight_qdata: torch.Tensor,
-    weight_payload_dtype: str = "",
-) -> MXFPDType:
-    if weight_payload_dtype:
-        return mxfp_str_to_dtype(weight_payload_dtype)
-    if weight_qdata.dtype == torch.uint8:
-        return torch.float4_e2m1fn_x2
-    return weight_qdata.dtype
-
-
-def _get_num_input_features(
-    weight_qdata: torch.Tensor, weight_payload_dtype: str = ""
-) -> int:
-    num_input_features = weight_qdata.shape[-1]
-    if weight_qdata.dtype == torch.uint8 and weight_payload_dtype == mxfp_dtype_to_str(
-        torch.float4_e2m1fn_x2
-    ):
-        # FP4 elements are packed pairwise in each byte in a uint8 tensor.
-        num_input_features *= 2
-    return num_input_features
-
-
 @torch.library.register_fake("tosa_mxfp::linear", lib=MXFP_TOSA_LIB)  # type: ignore[misc]
 def _mxfp_linear_fake(
     input: torch.Tensor,
@@ -63,7 +30,6 @@ def _mxfp_linear_fake(
     weight_scale: torch.Tensor,
     bias: torch.Tensor | None = None,
     block_size: int = 32,
-    weight_payload_dtype: str = "",
 ) -> torch.Tensor:
     if weight_qdata.ndim != 3:
         raise ValueError(
@@ -73,16 +39,15 @@ def _mxfp_linear_fake(
         raise ValueError(
             f"Expected weight_qdata batch dim to be 1, got {weight_qdata.shape[0]}"
         )
-    num_input_features = _get_num_input_features(weight_qdata, weight_payload_dtype)
-    if input.shape[-1] != num_input_features:
+    if input.shape[-1] != weight_qdata.shape[-1]:
         raise ValueError(
             f"Input last dim {input.shape[-1]} must match linear in_features "
-            f"{num_input_features}"
+            f"{weight_qdata.shape[-1]}"
         )
     expected_scale_shape = (
         1,
         weight_qdata.shape[1],
-        num_input_features // block_size,
+        weight_qdata.shape[-1] // block_size,
     )
     if tuple(weight_scale.shape) != expected_scale_shape:
         raise ValueError(
@@ -93,6 +58,27 @@ def _mxfp_linear_fake(
     return input.new_empty(output_shape, dtype=torch.float32)
 
 
+def _cast_to_block_scaled_cpu_ref(
+    input: torch.Tensor,
+    output_dtype: torch.dtype,
+    block_size: int,
+) -> torch.Tensor:
+    """Emulate the current TOSA activation cast in eager mode."""
+    input_scale, input_qdata = to_mx(
+        input.to(torch.float32).contiguous(),
+        elem_dtype=output_dtype,
+        block_size=block_size,
+        scaling_mode=ScaleCalculationMode.RCEIL,
+    )
+    return to_dtype(
+        input_qdata,
+        input_scale,
+        output_dtype,
+        block_size,
+        torch.float32,
+    )
+
+
 @torch.library.impl("tosa_mxfp::linear", "cpu", lib=MXFP_TOSA_LIB)
 def _mxfp_linear_cpu(
     input: torch.Tensor,
@@ -100,26 +86,23 @@ def _mxfp_linear_cpu(
     weight_scale: torch.Tensor,
     bias: torch.Tensor | None = None,
     block_size: int = 32,
-    weight_payload_dtype: str = "",
 ) -> torch.Tensor:
     """CPU reference implementation of the MXFP linear op."""
 
     if weight_qdata.ndim != 3 or weight_scale.ndim != 3:
         raise ValueError("Expected rank-3 weight tensors for MXFP linear")
 
-    elem_dtype = _get_mx_elem_dtype(weight_qdata, weight_payload_dtype)
-
     # Cast the input to block-scaled format and back again to match the
     # expected input format of the TOSA
     dequantized_input = _cast_to_block_scaled_cpu_ref(
         input,
-        elem_dtype,
+        weight_qdata.dtype,
         block_size,
     )
     dequantized_weight = to_dtype(
         weight_qdata,
         weight_scale,
-        elem_dtype,
+        weight_qdata.dtype,
         block_size,
         torch.float32,
     )
@@ -141,7 +124,6 @@ def __init__(
     ) -> None:
         super().__init__()
         self.config = config
-        self.weight_dtype = mxfp_dtype_to_str(config.weight_dtype)
 
         self.register_buffer("weight_qdata", weight_qdata, persistent=True)
         self.register_buffer("weight_scale", weight_scale, persistent=True)
@@ -164,7 +146,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             self.weight_scale,
             self.bias,
             self.config.block_size,
-            self.weight_dtype,
         )
 
 
diff --git a/backends/arm/operator_support/TARGETS b/backends/arm/operator_support/TARGETS
index 88e112feac5..a2fd054d472 100644
--- a/backends/arm/operator_support/TARGETS
+++ b/backends/arm/operator_support/TARGETS
@@ -4,7 +4,6 @@ runtime.python_library(
     name = "operator_support",
     srcs = glob(["*.py"]),
     deps = [
-        "//executorch/backends/arm:ao_ext",
         "//executorch/backends/arm:constants",
         "//executorch/backends/arm/_passes:passes",
         "//executorch/backends/arm/tosa:resize_utils",
diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py
index 4d48d6ad0ff..066b5462f64 100644
--- a/backends/arm/operator_support/__init__.py
+++ b/backends/arm/operator_support/__init__.py
@@ -21,7 +21,6 @@
     reduce_sum_support,
     right_shift_support,
     slice_copy_support,
-    sym_size_int_support,
     to_dim_order_copy_support,
     tosa_supported_operators,
     unfold_copy_support,
diff --git a/backends/arm/operator_support/tosa_profile_supported_op_lists.py b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
index dc448ba0d5f..fab4e6c60c1 100644
--- a/backends/arm/operator_support/tosa_profile_supported_op_lists.py
+++ b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
@@ -99,7 +99,6 @@
     exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
     exir_ops.edge.aten.pad.default,
     exir_ops.edge.aten.constant_pad_nd.default,
-    exir_ops.edge.aten.argmax.default,
     exir_ops.edge.aten.amax.default,
     exir_ops.edge.aten.amin.default,
     exir_ops.edge.aten.eye.default,
@@ -129,7 +128,6 @@
     exir_ops.edge.aten.tan.default,
     exir_ops.edge.aten.silu.default,
     exir_ops.edge.aten.detach_copy.default,
-    exir_ops.edge.aten.round.default,
 }
 
 
@@ -239,7 +237,6 @@
     operator.getitem,
     exir_ops.edge.aten.pad.default,
     exir_ops.edge.aten.constant_pad_nd.default,
-    exir_ops.edge.aten.argmax.default,
     exir_ops.edge.aten.amax.default,
     exir_ops.edge.aten.amin.default,
     exir_ops.edge.aten.eye.default,
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
index 82a529d62a2..2e640b758d2 100644
--- a/backends/arm/operator_support/tosa_supported_operators.py
+++ b/backends/arm/operator_support/tosa_supported_operators.py
@@ -14,12 +14,9 @@
 import typing
 from typing import final, Optional, Sequence, Type
 
-# Register Arm-specific torch.library ops and MXFP transforms at package
-# import time.
-import executorch.backends.arm.ao_ext  # noqa: F401
-
 import torch
 import torch.fx as fx
+
 from executorch.backends.arm._passes.arm_pass_utils import (
     get_first_fake_tensor,
     is_submodule_node,
@@ -87,7 +84,7 @@ def __init__(self, tosa_spec: TosaSpecification, reporter: WhyNoPartitionReporte
 
     # Class attributes populated by subclasses
     tosa_specs: list[TosaSpecification] = TosaSpecification.all_versions_and_profiles()
-    targets: list[object] = []
+    targets: list[str] = []
 
     @final
     def is_node_supported(
@@ -243,10 +240,7 @@ def get_registered_tosa_support_checks(
 class MXOpsSupportList(OperatorSupportBase):
     """Accept Arm MX custom ops when the active spec enables MX support."""
 
-    targets = (
-        exir_ops.edge.tosa_mxfp.conv2d.default,
-        exir_ops.edge.tosa_mxfp.linear.default,
-    )
+    targets = (exir_ops.edge.tosa_mxfp.linear.default,)
 
     def is_node_supported(
         self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
@@ -254,141 +248,88 @@ def is_node_supported(
         return node.op == "call_function" and node.target in self.targets
 
 
-def _profile_support_check(
+def tosa_support_factory(
     tosa_spec: TosaSpecification,
-) -> Optional[OperatorSupportBase]:
-    if tosa_spec.support_integer() and tosa_spec.support_float():
-        return TOSAProINTFPSupportList()
-    if tosa_spec.support_integer():
-        return TOSAProINTSupportList()
-    if tosa_spec.support_float():
-        return TOSAProFPSupportList()
-    return None
+    exported_program: ExportedProgram,
+    reporter: WhyNoPartitionReporter,
+    additional_checks: Optional[Sequence[OperatorSupportBase]] = None,
+) -> OperatorSupportBase:
+    """Create an OperatorSupport composite for a TOSA spec.
 
+    Combine profile-specific positive checks, registered operator checks, and
+    negative checks into a single :py:class:`OperatorSupportBase` chain.
 
-def _registered_support_checks(
-    tosa_spec: TosaSpecification,
-    reporter: WhyNoPartitionReporter,
-) -> list[OperatorSupportBase]:
-    return [
-        check(tosa_spec, reporter)
-        for check in get_registered_tosa_support_checks(tosa_spec)
-    ]
+    Args:
+        tosa_spec (TosaSpecification): Active TOSA specification.
+        exported_program (ExportedProgram): Program context for checks.
+        reporter (WhyNoPartitionReporter): Reporter for rejections.
+        additional_checks (Optional[Sequence[OperatorSupportBase]]): Extra
+            negative checks to apply.
 
+    Returns:
+        OperatorSupportBase: Composite checker for the given spec.
 
-def _positive_checks(
-    tosa_spec: TosaSpecification,
-    exported_program: ExportedProgram,
-    reporter: WhyNoPartitionReporter,
-) -> list[OperatorSupportBase]:
-    checks: list[OperatorSupportBase] = [
+    """
+    # Positive checks: Add nodes to partitioning
+    positive_checks: list[OperatorSupportBase] = [
         ControlFlowSubmoduleSupported(exported_program, tosa_spec, reporter),
         ControlFlowOpSupported(exported_program, tosa_spec, reporter),
     ]
 
-    if profile_check := _profile_support_check(tosa_spec):
-        checks.append(profile_check)
-
+    if tosa_spec.support_integer() and tosa_spec.support_float():
+        positive_checks.append(TOSAProINTFPSupportList())
+    elif tosa_spec.support_integer():
+        positive_checks.append(TOSAProINTSupportList())
+    elif tosa_spec.support_float():
+        positive_checks.append(TOSAProFPSupportList())
     if tosa_spec.support_extension("mxfp"):
-        checks.append(MXOpsSupportList())
-
+        positive_checks.append(MXOpsSupportList())
     # TODO: Refactor to use TOSAProSupportLists + negtive checks
-    checks.extend(_registered_support_checks(tosa_spec, reporter))
+    positive_checks += [
+        check(tosa_spec, reporter)
+        for check in get_registered_tosa_support_checks(tosa_spec)
+    ]
 
-    return checks
+    # Negative checks: Remove nodes from partitioning
+    negative_checks: list[OperatorSupportBase] = [
+        CheckInt64InputsAndOutputs(exported_program, reporter),
+        RankCheck(reporter, max_rank=MAX_RANK),
+        *[
+            reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}")
+            for check in (additional_checks if additional_checks else [])
+        ],
+    ]
 
+    if tosa_spec.support_float():
+        negative_checks.append(CheckMixedFloatingInputs(reporter))
+    else:
+        negative_checks.append(CheckArmQuantized(reporter))
+        negative_checks.append(CheckProperQuantization(reporter))
 
-def _disallowed_dtypes(tosa_spec: TosaSpecification) -> list[torch.dtype]:
-    dtypes = [torch.float64]
+    disallowed_dtypes = [torch.float64]
     if not tosa_spec.support_extension("bf16"):
-        dtypes.append(torch.bfloat16)
+        disallowed_dtypes.append(torch.bfloat16)
     if not (
         tosa_spec.support_extension("fp8e4m3") or tosa_spec.support_extension("mxfp")
     ):
-        dtypes.append(torch.float8_e4m3fn)
+        disallowed_dtypes.append(torch.float8_e4m3fn)
     if not (
         tosa_spec.support_extension("fp8e5m2") or tosa_spec.support_extension("mxfp")
     ):
-        dtypes.append(torch.float8_e5m2)
+        disallowed_dtypes.append(torch.float8_e5m2)
     if tosa_spec.is_U55_subset:
-        dtypes.append(torch.bool)
-    return dtypes
-
-
-def _wrapped_additional_checks(
-    additional_checks: Optional[Sequence[OperatorSupportBase]],
-    reporter: WhyNoPartitionReporter,
-) -> list[OperatorSupportBase]:
-    if not additional_checks:
-        return []
-    return [
-        reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}")
-        for check in additional_checks
-    ]
-
-
-def _negative_checks(
-    tosa_spec: TosaSpecification,
-    exported_program: ExportedProgram,
-    reporter: WhyNoPartitionReporter,
-    additional_checks: Optional[Sequence[OperatorSupportBase]],
-) -> list[OperatorSupportBase]:
-    checks: list[OperatorSupportBase] = [RankCheck(reporter, MAX_RANK)]
-
-    if not tosa_spec.support_extension("int64"):
-        checks.append(CheckInt64InputsAndOutputs(exported_program, reporter, tosa_spec))
-
-    checks.extend(_wrapped_additional_checks(additional_checks, reporter))
-
-    if tosa_spec.support_float():
-        checks.append(CheckMixedFloatingInputs(reporter))
-    else:
-        checks.append(CheckArmQuantized(reporter))
-        checks.append(CheckProperQuantization(reporter))
-
-    checks.append(
+        disallowed_dtypes.append(torch.bool)
+    negative_checks.append(
         CheckDtypeInputsAndOutputs(
-            exported_program, reporter, _disallowed_dtypes(tosa_spec), tosa_spec
+            exported_program, reporter, disallowed_dtypes, tosa_spec
         )
     )
-
     if tosa_spec.is_U55_subset:
-        checks.append(EthosU55NotSupported(reporter))
-        checks.append(EthosU55DtypeSupport(reporter))
-        checks.append(EthosU55CastCheck(reporter))
-
+        negative_checks.append(EthosU55NotSupported(reporter))
+        negative_checks.append(EthosU55DtypeSupport(reporter))
+        negative_checks.append(EthosU55CastCheck(reporter))
     if not tosa_spec.support_extension("shape"):
-        checks.append(SymbolicShapeSupportCheck(reporter))
-
-    return checks
-
-
-def tosa_support_factory(
-    tosa_spec: TosaSpecification,
-    exported_program: ExportedProgram,
-    reporter: WhyNoPartitionReporter,
-    additional_checks: Optional[Sequence[OperatorSupportBase]] = None,
-) -> OperatorSupportBase:
-    """Create an OperatorSupport composite for a TOSA spec.
-
-    Combine profile-specific positive checks, registered operator checks, and
-    negative checks into a single :py:class:`OperatorSupportBase` chain.
-
-    Args:
-        tosa_spec (TosaSpecification): Active TOSA specification.
-        exported_program (ExportedProgram): Program context for checks.
-        reporter (WhyNoPartitionReporter): Reporter for rejections.
-        additional_checks (Optional[Sequence[OperatorSupportBase]]): Extra
-            negative checks to apply.
-
-    Returns:
-        OperatorSupportBase: Composite checker for the given spec.
-
-    """
-    positive_checks = _positive_checks(tosa_spec, exported_program, reporter)
-    negative_checks = _negative_checks(
-        tosa_spec, exported_program, reporter, additional_checks
-    )
+        negative_checks.append(SymbolicShapeSupportCheck(reporter))
 
     return chain(
         reporter.wrap_check(
@@ -427,40 +368,6 @@ def _has_symbolic_shape(node: fx.Node) -> bool:
 
         return False
 
-    def _partition_dynamic_upmsample_nearest2d(self, node: fx.Node) -> bool:
-        """Check if the node is an upsample_nearest2d with symbolic shapes.
-
-        Args:
-            node (fx.Node): FX node to check.
-
-        Returns:
-            bool: True if the node is an upsample_nearest2d with symbolic
-                shapes; otherwise, False.
-
-        """
-        if node.target != exir_ops.edge.aten.upsample_nearest2d.vec:
-            return False
-
-        try:
-            input_tensor = get_first_fake_tensor(node.all_input_nodes[0])
-            output_tensor = get_first_fake_tensor(node)
-        except Exception as exc:
-            self.reporter.report_reject(
-                node,
-                f"upsample_nearest2d symbolic shapes need tensor metadata: {exc}",
-            )
-            return False
-
-        input_size_xy = input_tensor.shape[2:4]
-        output_size_xy = output_tensor.shape[2:4]
-        if len(input_size_xy) != 2 or len(output_size_xy) != 2:
-            self.reporter.report_reject(
-                node, "upsample_nearest2d expects 2D spatial input/output."
-            )
-            return False
-
-        return True
-
     def is_node_supported(
         self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
     ) -> bool:
@@ -487,13 +394,14 @@ def is_node_supported(
             self._has_symbolic_shape(input_node) for input_node in node.all_input_nodes
         ):
             if node.target == exir_ops.edge.aten.upsample_nearest2d.vec:
-                return self._partition_dynamic_upmsample_nearest2d(node)
-            else:
-                self.reporter.report_reject(
-                    node,
-                    "Node has symbolic shape, has the TOSA spec shape extension support?",
-                )
-                return False
+                return True
+
+            self.reporter.report_reject(
+                node,
+                "Node has symbolic shape but the TOSA spec does not support "
+                "the shape extension.",
+            )
+            return False
 
         return True
 
@@ -654,10 +562,7 @@ def is_node_supported(
             self.reporter.report_reject(node, "One or more inputs were not quantized.")
             return False
 
-        all_q_users = all(
-            output_node.target in (*Q_OPS, torch.ops.aten.sym_size.int)
-            for output_node in node.users
-        )
+        all_q_users = all((output_node.target in Q_OPS) for output_node in node.users)
         output_dtype = get_first_fake_tensor(node).dtype
         output_quantized = (
             output_quantized or all_q_users or _is_integer_dtype(output_dtype)
@@ -683,10 +588,7 @@ class CheckInt64InputsAndOutputs(OperatorSupportBase):
     """
 
     def __init__(
-        self,
-        exported_program: ExportedProgram,
-        reporter: WhyNoPartitionReporter,
-        tosa_spec: TosaSpecification,
+        self, exported_program: ExportedProgram, reporter: WhyNoPartitionReporter
     ):
         """Initialize the check with program context and reporter."""
         self.input_names = [
@@ -695,7 +597,6 @@ def __init__(
             if spec.kind == InputKind.USER_INPUT
         ]
         self.reporter = reporter
-        self.tosa_spec = tosa_spec
         self.int32_min = torch.iinfo(torch.int32).min
         self.int32_max = torch.iinfo(torch.int32).max
         super().__init__()
@@ -708,138 +609,6 @@ def inside_int32_bounds(self, node: torch.fx.Node) -> bool:
         min_val, max_val = int(torch.min(data)), int(torch.max(data))
         return min_val >= self.int32_min and max_val <= self.int32_max
 
-    def has_rejected_int64_output(
-        self, node: torch.fx.Node, tensor_list: Sequence[typing.Any]
-    ) -> bool:
-        if node.target in (
-            torch.ops.aten.argmax.default,
-            exir_ops.edge.aten.argmax.default,
-        ):
-            return not self._is_tosa_argmax_supported(node)
-        return any(
-            tensor.dtype == torch.int64
-            for tensor in tensor_list
-            if isinstance(tensor, FakeTensor)
-        )
-
-    def _is_tosa_argmax_dtype_supported(
-        self, node: torch.fx.Node, input_dtype: torch.dtype
-    ) -> bool:
-        if input_dtype == torch.int8:
-            if not self.tosa_spec.support_integer():
-                self.reporter.report_reject(
-                    node, "TOSA ARGMAX requires PRO-INT for int8 input."
-                )
-                return False
-        elif input_dtype == torch.int16:
-            if not (
-                self.tosa_spec.support_integer()
-                and self.tosa_spec.support_extension("int16")
-            ):
-                self.reporter.report_reject(
-                    node, "TOSA ARGMAX requires EXT-INT16 for int16 input."
-                )
-                return False
-        elif input_dtype in (torch.float16, torch.float32):
-            if not self.tosa_spec.support_float():
-                self.reporter.report_reject(
-                    node, f"TOSA ARGMAX requires PRO-FP for {input_dtype} input."
-                )
-                return False
-        elif input_dtype == torch.bfloat16:
-            if not (
-                self.tosa_spec.support_float()
-                and self.tosa_spec.support_extension("bf16")
-            ):
-                self.reporter.report_reject(
-                    node, "TOSA ARGMAX requires EXT-BF16 for bfloat16 input."
-                )
-                return False
-        else:
-            self.reporter.report_reject(
-                node, f"TOSA ARGMAX does not support {input_dtype} input."
-            )
-            return False
-        return True
-
-    def _is_tosa_argmax_supported(self, node: torch.fx.Node) -> bool:
-        dim = node.kwargs.get("dim", node.args[1] if len(node.args) > 1 else None)
-        if dim is None:
-            self.reporter.report_reject(
-                node, "TOSA ARGMAX requires an explicit reduction dimension."
-            )
-            return False
-        if not isinstance(dim, int):
-            self.reporter.report_reject(
-                node, "TOSA ARGMAX requires a statically known reduction dimension."
-            )
-            return False
-
-        input_node = typing.cast(torch.fx.Node, node.args[0])
-        input_tensor = get_first_fake_tensor(input_node)
-        if not self._is_tosa_argmax_dtype_supported(node, input_tensor.dtype):
-            return False
-
-        input_rank = len(input_tensor.shape)
-        if input_rank == 0:
-            self.reporter.report_reject(
-                node, "TOSA ARGMAX requires an input with rank at least 1."
-            )
-            return False
-
-        axis = dim + input_rank if dim < 0 else dim
-        if axis < 0 or axis >= input_rank:
-            self.reporter.report_reject(
-                node,
-                f"TOSA ARGMAX axis must be in [0, {input_rank - 1}] but got {dim}.",
-            )
-            return False
-
-        keepdim = node.kwargs.get(
-            "keepdim", node.args[2] if len(node.args) > 2 else False
-        )
-        if keepdim:
-            self.reporter.report_reject(
-                node, "TOSA ARGMAX does not support keepdim=True."
-            )
-            return False
-
-        return True
-
-    def _check_int64_input_nodes(self, node: torch.fx.Node) -> bool:
-        """Check if all int64 input nodes are constant and will be
-        partitioned.
-        """
-        for input_node in (
-            input_node
-            for input_node in node.all_input_nodes
-            if input_node.op != "get_attr"
-        ):
-            if isinstance(input_node.meta["val"], torch.SymInt):
-                continue
-            tensor_in = get_first_fake_tensor(input_node)
-            if tensor_in.dtype != torch.int64:
-                continue
-            # Constant placeholder
-            if (
-                input_node.op != "call_function"
-                and input_node.name not in self.input_names
-            ):
-                continue
-            # Constant operator
-            if input_node.op == "call_function":
-                if input_node.target in ComputeConstantOpsAOTPass.targeted_ops:
-                    # This is not perfect since the input_node can still be rejected by other checks but
-                    # this should cover the majority of cases.
-                    if self.is_node_supported({}, input_node):
-                        continue
-            self.reporter.report_reject(
-                node, f"Non-constant int64 input {input_node.name}"
-            )
-            return False
-
-        return True
-
     def is_node_supported(
         self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
     ) -> bool:
@@ -849,7 +618,7 @@ def is_node_supported(
         vals = node.meta["val"]
         tensor_list = vals if isinstance(vals, (list, tuple)) else [vals]
 
-        any_int64 = self.has_rejected_int64_output(node, tensor_list)
+        any_int64 = any(tensor.dtype == torch.int64 for tensor in tensor_list)
         # Don't partition nodes with int64 output...
         if any_int64:
             # ... Except for constant ops that are directly cast to something non-int64.
@@ -883,7 +652,35 @@ def is_node_supported(
                 )
                 return False
 
-        return self._check_int64_input_nodes(node)
+        # Ops with int64 inputs are only partitioned if input nodes are constant and will be partitioned.
+        # If it is not partitioned, the partition will get an int64 input and fail.
+        for input_node in (
+            input_node
+            for input_node in node.all_input_nodes
+            if input_node.op != "get_attr"
+        ):
+            tensor_in = get_first_fake_tensor(input_node)
+            if tensor_in.dtype != torch.int64:
+                continue
+            # Constant placeholder
+            if (
+                input_node.op != "call_function"
+                and input_node.name not in self.input_names
+            ):
+                continue
+            # Constant operator
+            if input_node.op == "call_function":
+                if input_node.target in ComputeConstantOpsAOTPass.targeted_ops:
+                    # This is not perfect since the input_node can still be rejected by other checks but
+                    # this should cover the majority of cases.
+                    if self.is_node_supported({}, input_node):
+                        continue
+            self.reporter.report_reject(
+                node, f"Non-constant int64 input {input_node.name}"
+            )
+            return False
+
+        return True
 
 
 class CheckDtypeInputsAndOutputs(OperatorSupportBase):
@@ -915,9 +712,6 @@ def is_node_supported(
             for input_node in node.all_input_nodes
             if input_node.op != "get_attr"
         ):
-            if isinstance(input_node.meta["val"], torch.SymInt):
-                continue
-
             tensor = get_first_fake_tensor(input_node)
             if tensor.dtype in self.disallowed_dtypes:
                 self.reporter.report_reject(
@@ -978,8 +772,6 @@ def is_node_supported(
             for input_node in node.all_input_nodes
             if input_node.op != "get_attr"
         ):
-            if isinstance(input_node.meta["val"], torch.SymInt):
-                continue
             dtype = get_first_fake_tensor(input_node).dtype
             if dtype.is_floating_point:
                 floating_dtypes.add(dtype)
@@ -1017,8 +809,6 @@ def is_node_supported(
         )
         # check if any input node has an unsupported rank
         for input_node in input_nodes:
-            if isinstance(input_node.meta["val"], torch.SymInt):
-                continue
             input_node_shape = get_first_fake_tensor(input_node).shape
             if len(input_node_shape) > self.max_rank:
                 self.reporter.report_reject(
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
index 1acaf4e65ef..aa988a1ccd7 100644
--- a/backends/arm/operators/__init__.py
+++ b/backends/arm/operators/__init__.py
@@ -42,13 +42,11 @@
     op_sub,
     op_sum,
     op_to_dim_order_copy,
-    op_tosa_argmax,
     op_tosa_avg_pool2d,
     op_tosa_avg_pool2d_adaptive,
     op_tosa_cast_to_block_scaled,
     op_tosa_clamp,
     op_tosa_conv2d,
-    op_tosa_conv2d_block_scaled,
     op_tosa_conv3d,
     op_tosa_custom,
     op_tosa_depthwise_conv2d,
diff --git a/backends/arm/operators/op_tosa_cast_to_block_scaled.py b/backends/arm/operators/op_tosa_cast_to_block_scaled.py
index b49fa521866..454c28ddfe2 100644
--- a/backends/arm/operators/op_tosa_cast_to_block_scaled.py
+++ b/backends/arm/operators/op_tosa_cast_to_block_scaled.py
@@ -5,7 +5,7 @@
 """Provide a visitor for lowering block-scaled casts to TOSA."""
 
 import operator
-from typing import Any, List
+from typing import Any, cast, List
 
 import torch
 import tosa_serializer as ts
@@ -16,36 +16,25 @@
 )
 from executorch.backends.arm.operators.operator_validation_utils import (
     validate_num_inputs,
-    validate_valid_dtype,
 )
-from executorch.backends.arm.tosa.mapping import TosaArg, TosaSpecialDtype
+from executorch.backends.arm.tosa.mapping import TosaArg
 from executorch.backends.arm.tosa.specification import TosaSpecification
 
 
-def _getitem_index(node: torch.fx.Node) -> int:
-    index = node.args[1]
-    if not isinstance(index, int):
-        raise ValueError(
-            f"CAST_TO_BLOCK_SCALED: expected integer getitem index, got {index!r}"
-        )
-    return index
-
-
-def _ordered_getitem_outputs(node: torch.fx.Node) -> list[torch.fx.Node]:
+def _ordered_getitem_output_names(node: torch.fx.Node) -> list[str]:
     getitem_users = [
         user
         for user in node.users
         if user.op == "call_function" and user.target == operator.getitem
     ]
 
-    ordered_users = sorted(getitem_users, key=_getitem_index)
+    ordered_users = sorted(getitem_users, key=lambda user: cast(int, user.args[1]))
     if len(ordered_users) != 2:
         raise ValueError(
-            f"{CastToBlockScaledVisitor.target}: Expected exactly two getitem "
-            f"outputs, got {len(ordered_users)}"
+            f"{CastToBlockScaledVisitor.target}: Expected exactly two getitem outputs, got {len(ordered_users)}"
         )
 
-    return ordered_users
+    return [user.name for user in ordered_users]
 
 
 @register_node_visitor
@@ -69,67 +58,15 @@ def define_node(
             raise ValueError(f"{self.target} requires the TOSA mxfp extension")
 
         input_tensor = inputs[0]
-        block_size = inputs[1].number if hasattr(inputs[1], "number") else None
-        if not isinstance(block_size, int) or isinstance(block_size, bool):
-            raise ValueError(f"{self.target}: missing block_size argument")
-
-        validate_valid_dtype(
-            self.target,
-            input_tensor,
-            [ts.DType.FP32, ts.DType.BF16, ts.DType.FP16],
-            self.tosa_spec,
-        )
-
-        if not isinstance(node.meta.get("val"), tuple) or len(node.meta["val"]) != 2:
-            raise ValueError(
-                f"{self.target}: expected tuple metadata with two outputs, got {node.meta.get('val')!r}"
-            )
+        block_size = inputs[1].number
         output_data_tensor, output_scale_tensor = node.meta["val"]
-        output_getitems = _ordered_getitem_outputs(node)
-        output_names = [user.name for user in output_getitems]
-        output_payload_dtype = output_getitems[0].meta.get(TosaSpecialDtype.meta_key())
-
-        if output_payload_dtype in (
-            TosaSpecialDtype.FP4E2M1,
-            TosaSpecialDtype.FP6E2M3,
-            TosaSpecialDtype.FP6E3M2,
-        ):
-            output_data_dtype = output_payload_dtype.get_tosa_dtype()
-        elif output_data_tensor.dtype == torch.float8_e4m3fn:
-            output_data_dtype = ts.DType.FP8E4M3
-        elif output_data_tensor.dtype == torch.float8_e5m2:
-            output_data_dtype = ts.DType.FP8E5M2
-        else:
-            raise ValueError(
-                f"{self.target}: unsupported payload dtype {output_data_tensor.dtype}"
-            )
-        if output_data_dtype not in (
-            ts.DType.FP4E2M1,
-            ts.DType.FP6E2M3,
-            ts.DType.FP6E3M2,
-            ts.DType.FP8E4M3,
-            ts.DType.FP8E5M2,
-        ):
-            raise ValueError(
-                f"{self.target}: unsupported payload dtype {output_data_dtype}"
-            )
-        if output_scale_tensor.dtype != torch.float8_e8m0fnu:
-            raise ValueError(
-                f"{self.target}: unsupported scale dtype {output_scale_tensor.dtype}"
-            )
 
-        if not hasattr(ts.Op, "CAST_TO_BLOCK_SCALED"):
-            raise NotImplementedError(
-                "tosa_serializer does not provide CAST_TO_BLOCK_SCALED yet"
-            )
+        # TODO(MLETORCH-2018): This is a local workaround for multi-output TOSA ops.
+        # Remove it once we can handle multiple outputs generally.
+        output_names = _ordered_getitem_output_names(node)
 
         attr = ts.TosaSerializerAttribute()
-        attr_ctor = getattr(attr, "CastToBlockScaledAttribute", None)
-        if attr_ctor is None:
-            raise NotImplementedError(
-                "tosa_serializer does not provide CastToBlockScaledAttribute yet"
-            )
-        attr_ctor(block_size)
+        attr.CastToBlockScaledAttribute(block_size)
 
         self._serialize_operator(
             node,
diff --git a/backends/arm/operators/op_tosa_matmul_t_block_scaled.py b/backends/arm/operators/op_tosa_matmul_t_block_scaled.py
index 4c3a8ba99b2..2f1bd88c2bb 100644
--- a/backends/arm/operators/op_tosa_matmul_t_block_scaled.py
+++ b/backends/arm/operators/op_tosa_matmul_t_block_scaled.py
@@ -53,13 +53,7 @@ def define_node(
         validate_valid_dtype(
             self.target,
             [A_data, B_data],
-            [
-                ts.DType.FP4E2M1,
-                ts.DType.FP6E2M3,
-                ts.DType.FP6E3M2,
-                ts.DType.FP8E4M3,
-                ts.DType.FP8E5M2,
-            ],
+            [ts.DType.FP8E4M3, ts.DType.FP8E5M2],
             self.tosa_spec,
         )
         validate_valid_dtype(
diff --git a/backends/arm/operators/op_tosa_shapes.py b/backends/arm/operators/op_tosa_shapes.py
index b7480d78a4d..25c861a403d 100644
--- a/backends/arm/operators/op_tosa_shapes.py
+++ b/backends/arm/operators/op_tosa_shapes.py
@@ -13,7 +13,6 @@
     NodeVisitor,
     register_node_visitor,
 )
-from executorch.backends.arm.tosa import TosaSpecification
 from executorch.backends.arm.tosa.mapping import TosaArg
 from executorch.backends.arm.tosa.utils import normalize_symint
 
@@ -22,6 +21,9 @@
 class TosaConstShapeVisitor(NodeVisitor):
     target = "tosa.CONST_SHAPE.default"
 
+    def __init__(self, *args):
+        super().__init__(*args)
+
     def define_node(
         self,
         node: torch.fx.Node,
@@ -41,217 +43,3 @@ def define_node(
             vals=vals,
             name=output.name,
         )
-
-
-class TosaShapeNodeVisitor(NodeVisitor):
-
-    tosa_specs = TosaSpecification.all_profiles_for_version("1.1")
-
-    def define_node(
-        self,
-        node: torch.fx.Node,
-        tosa_graph: Any,
-        inputs: List[TosaArg],
-        output: TosaArg,
-    ) -> None:
-        tosa_graph = cast(ts.TosaSerializer, tosa_graph)
-        tosa_graph.currRegion.currBasicBlock.addShape(
-            output.name,
-            output.shape[0],
-        )
-
-
-class TosaBasicShapeVisitor(TosaShapeNodeVisitor):
-    tosa_op: ts.Op
-    attr_method: str
-
-    def define_node(
-        self,
-        node: torch.fx.Node,
-        tosa_graph: Any,
-        inputs: List[TosaArg],
-        output: TosaArg,
-    ) -> None:
-        super().define_node(node, tosa_graph, inputs, output)
-        self.serialize(
-            node,
-            tosa_graph,
-            tosa_op=self.tosa_op,
-            inputs=inputs,
-            output=output,
-            attr_method=self.attr_method,
-        )
-
-
-@register_node_visitor
-class TosaDimShapeVisitor(TosaShapeNodeVisitor):
-    target = "tosa.DIM.default"
-
-    def define_node(
-        self,
-        node: torch.fx.Node,
-        tosa_graph: Any,
-        inputs: List[TosaArg],
-        output: TosaArg,
-    ) -> None:
-        super().define_node(node, tosa_graph, inputs, output)
-
-        attr = ts.TosaSerializerAttribute()
-        attr.DimAttribute(axis=node.kwargs["axis"])
-        self._serialize_operator(
-            node,
-            tosa_graph,
-            ts.Op.DIM,
-            [inputs[0].name],
-            [output.name],
-            attr,
-        )
-
-
-@register_node_visitor
-class TosaAddShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.ADD_SHAPE.default"
-
-    tosa_op = ts.Op.ADD_SHAPE
-    attr_method = "AddShapeAttribute"
-
-
-@register_node_visitor
-class TosaSubShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.SUB_SHAPE.default"
-
-    tosa_op = ts.Op.SUB_SHAPE
-    attr_method = "SubShapeAttribute"
-
-
-@register_node_visitor
-class TosaAssertEqualShapeVisitor(TosaShapeNodeVisitor):
-    target = "tosa.ASSERT_EQUAL_SHAPE.default"
-
-    def define_node(
-        self,
-        node: torch.fx.Node,
-        tosa_graph: Any,
-        inputs: List[TosaArg],
-        output: TosaArg,
-    ) -> None:
-        super().define_node(node, tosa_graph, inputs, output)
-        tosa_graph = cast(ts.TosaSerializer, tosa_graph)
-        attr = ts.TosaSerializerAttribute()
-        attr.AssertEqualShapeAttribute(allow_broadcast=node.kwargs["allow_broadcast"])
-        self._serialize_operator(
-            node,
-            tosa_graph,
-            ts.Op.ASSERT_EQUAL_SHAPE,
-            [inputs[0].name, inputs[1].name],
-            [output.name],
-            attr,
-        )
-
-
-@register_node_visitor
-class TosaCatShapeVisitor(TosaShapeNodeVisitor):
-    target = "tosa.CONCAT_SHAPE.default"
-
-    def define_node(
-        self,
-        node: torch.fx.Node,
-        tosa_graph: Any,
-        inputs: List[TosaArg],
-        output: TosaArg,
-    ) -> None:
-        super().define_node(node, tosa_graph, inputs, output)
-        tosa_graph = cast(ts.TosaSerializer, tosa_graph)
-
-        input_shape_list = [input.name for input in inputs[0].special]
-
-        attr = ts.TosaSerializerAttribute()
-        attr.ConcatShapeAttribute()
-        self._serialize_operator(
-            node,
-            tosa_graph,
-            ts.Op.CONCAT_SHAPE,
-            input_shape_list,
-            [output.name],
-            attr,
-        )
-
-
-@register_node_visitor
-class TosaDivCeilShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.DIV_CEIL_SHAPE.default"
-
-    tosa_op = ts.Op.DIV_CEIL_SHAPE
-    attr_method = "DivCeilShapeAttribute"
-
-
-@register_node_visitor
-class TosaDivShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.DIV_FLOOR_SHAPE.default"
-
-    tosa_op = ts.Op.DIV_FLOOR_SHAPE
-    attr_method = "DivFloorShapeAttribute"
-
-
-@register_node_visitor
-class TosaExp2ShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.EXP2_SHAPE.default"
-
-    tosa_op = ts.Op.EXP2_SHAPE
-    attr_method = "Exp2ShapeAttribute"
-
-
-@register_node_visitor
-class TosaLog2CeilShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.LOG2_CEIL_SHAPE.default"
-
-    tosa_op = ts.Op.LOG2_CEIL_SHAPE
-    attr_method = "Log2CeilShapeAttribute"
-
-
-@register_node_visitor
-class TosaLog2FloorShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.LOG2_FLOOR_SHAPE.default"
-
-    tosa_op = ts.Op.LOG2_FLOOR_SHAPE
-    attr_method = "Log2FloorShapeAttribute"
-
-
-@register_node_visitor
-class TosaMaxShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.MAX_SHAPE.default"
-
-    tosa_op = ts.Op.MAX_SHAPE
-    attr_method = "MaxShapeAttribute"
-
-
-@register_node_visitor
-class TosaMinShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.MIN_SHAPE.default"
-
-    tosa_op = ts.Op.MIN_SHAPE
-    attr_method = "MinShapeAttribute"
-
-
-@register_node_visitor
-class TosaMulShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.MUL_SHAPE.default"
-
-    tosa_op = ts.Op.MUL_SHAPE
-    attr_method = "MulShapeAttribute"
-
-
-@register_node_visitor
-class TosaSliceShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.SLICE_SHAPE.default"
-
-    tosa_op = ts.Op.SLICE_SHAPE
-    attr_method = "SliceShapeAttribute"
-
-
-@register_node_visitor
-class TosaModShapeVisitor(TosaBasicShapeVisitor):
-    target = "tosa.MOD_SHAPE.default"
-
-    tosa_op = ts.Op.MOD_SHAPE
-    attr_method = "ModShapeAttribute"
diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py
index a0c2dbeb1fb..5f9c3e3938c 100644
--- a/backends/arm/process_node.py
+++ b/backends/arm/process_node.py
@@ -56,82 +56,14 @@ def _tensor_to_numpy(tensor: torch.Tensor) -> np.ndarray:
 
 
 def _prepare_const_values_for_tosa_dtype(
-    values: np.ndarray, tosa_arg: TosaArg
+    values: np.ndarray, tosa_dtype: ts.DType
 ) -> np.ndarray:
     """Normalize constant storage to the expected TOSA serializer dtype."""
-    if tosa_arg.dtype == ts.DType.INT48 and values.dtype != np.int64:
+    if tosa_dtype == ts.DType.INT48 and values.dtype != np.int64:
         return values.astype(np.int64)
-    if tosa_arg.dtype in (ts.DType.FP6E2M3, ts.DType.FP6E3M2):
-        if values.dtype == np.uint8:
-            try:
-                import ml_dtypes  # type: ignore[import-not-found]
-            except ImportError as e:
-                raise RuntimeError(
-                    "ml_dtypes is required to serialize FP6 tensors for TOSA. "
-                    "Have you run setup.sh?"
-                ) from e
-            ml_dtype = {
-                ts.DType.FP6E2M3: ml_dtypes.float6_e2m3fn,
-                ts.DType.FP6E3M2: ml_dtypes.float6_e3m2fn,
-            }[tosa_arg.dtype]
-            return values.view(ml_dtype)
     return values
 
 
-def _get_const_shape(values: np.ndarray, tosa_arg: TosaArg) -> list[int]:
-    """Return the TOSA logical shape for a serialized constant."""
-    if tosa_arg.dtype == ts.DType.FP4E2M1:
-        return normalize_symint(tosa_arg.shape)
-    return normalize_symint(values.shape)
-
-
-def _is_packed_fp4_const(values: np.ndarray, tosa_arg: TosaArg) -> bool:
-    """FP4 elements are pairwise in each byte of a uint8 tensor.
-
-    This function checks if the given values and TOSA argument represent a
-    packed FP4 constant.
-
-    """
-
-    return (
-        tosa_arg.dtype == ts.DType.FP4E2M1
-        and values.dtype == np.uint8
-        and values.shape[-1] * 2 == tosa_arg.shape[-1]
-    )
-
-
-def _add_const(
-    tosa_graph: Any,
-    values: np.ndarray,
-    tosa_arg: TosaArg,
-    name: str,
-) -> None:
-    """Add a constant, preserving packed FP4 storage when required."""
-    if _is_packed_fp4_const(values, tosa_arg):
-        # TOSA FP4 tensors have logical FP4 shape, but constants are stored as
-        # packed bytes (two values per byte). Add the raw bytes as INT8 first
-        # then set TOSA dtype and shape correctly on the tensor metadata.
-        tosa_graph.addConst(
-            normalize_symint(values.shape),
-            ts.DType.INT8,
-            values,
-            name=name,
-        )
-        tensor = tosa_graph.currRegion.currBasicBlock.tensors[name]
-        tensor.setDtype(ts.DType.FP4E2M1)
-        for dim, size in enumerate(normalize_symint(tosa_arg.shape)):
-            tensor.SetDimSize(dim, size)
-        return
-
-    prepared_values = _prepare_const_values_for_tosa_dtype(values, tosa_arg)
-    tosa_graph.addConst(
-        _get_const_shape(prepared_values, tosa_arg),
-        tosa_arg.dtype,
-        prepared_values,
-        name=name,
-    )
-
-
 def process_call_function(
     node: torch.fx.Node,
     tosa_graph: Any,
@@ -222,7 +154,16 @@ def process_inputs_to_parameters(
             f"{type(parameter_data).__name__}"
         )
     parameter_values = _tensor_to_numpy(parameter_data)
-    _add_const(tosa_graph, parameter_values, tosa_arg, name=tosa_arg.name)
+    parameter_values = _prepare_const_values_for_tosa_dtype(
+        parameter_values, tosa_arg.dtype
+    )
+
+    tosa_graph.addConst(
+        normalize_symint(parameter_values.shape),
+        tosa_arg.dtype,
+        parameter_values,
+        name=tosa_arg.name,
+    )
 
 
 def process_inputs_to_buffers(
@@ -247,7 +188,14 @@ def process_inputs_to_buffers(
             f"{type(buffer_data).__name__}"
         )
     buffer_values = _tensor_to_numpy(buffer_data)
-    _add_const(tosa_graph, buffer_values, tosa_arg, name=tosa_arg.name)
+    buffer_values = _prepare_const_values_for_tosa_dtype(buffer_values, tosa_arg.dtype)
+
+    tosa_graph.addConst(
+        normalize_symint(buffer_values.shape),
+        tosa_arg.dtype,
+        buffer_values,
+        name=tosa_arg.name,
+    )
 
 
 def process_inputs_to_lifted_tensor_constants(
@@ -269,7 +217,14 @@ def process_inputs_to_lifted_tensor_constants(
         f"{type(tensor).__name__}"
     )
     tensor_values = _tensor_to_numpy(tensor)
-    _add_const(tosa_graph, tensor_values, tosa_arg, name=tosa_arg.name)
+    tensor_values = _prepare_const_values_for_tosa_dtype(tensor_values, tosa_arg.dtype)
+
+    tosa_graph.addConst(
+        normalize_symint(tensor_values.shape),
+        tosa_arg.dtype,
+        tensor_values,
+        name=tosa_arg.name,
+    )
 
 
 def _is_submodule_input(
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
index 3b713659e84..7810077a679 100644
--- a/backends/arm/quantizer/quantization_annotator.py
+++ b/backends/arm/quantizer/quantization_annotator.py
@@ -532,7 +532,6 @@ def _get_fixed_qparams_qspec(
     torch.ops.aten.selu.default,
     torch.ops.aten.celu.default,
     torch.ops.aten.floor.default,
-    torch.ops.aten.round.default,
     torch.ops.aten.log.default,
     torch.ops.aten.reciprocal.default,
     torch.ops.aten.rsqrt.default,
diff --git a/backends/arm/scripts/install_models_for_test.sh b/backends/arm/scripts/install_models_for_test.sh
index 1e91cd9c08f..d6a7b9cdec0 100644
--- a/backends/arm/scripts/install_models_for_test.sh
+++ b/backends/arm/scripts/install_models_for_test.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2025-2026 Arm Limited and/or its affiliates.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -8,8 +8,7 @@ set -e
 pip install -r backends/arm/requirements-arm-models-test.txt
 
 # Install model gym repository
-MODEL_GYM_REF="${MODEL_GYM_REF:-v0.3.0}"
-git clone --depth 1 --branch "$MODEL_GYM_REF" https://github.com/arm/neural-graphics-model-gym.git
+git clone https://github.com/arm/neural-graphics-model-gym.git
 cd neural-graphics-model-gym
 # Remove model-converter installation from model-gym repository (to prevent overwriting executorch version)
 if [[ "$(uname)" == "Darwin" ]]; then
@@ -19,4 +18,4 @@ else
 fi
 pip install . --no-deps
 cd ..
-rm -rf neural-graphics-model-gym
+rm -rf neural-graphics-model-gym
\ No newline at end of file
diff --git a/backends/arm/scripts/pre-push b/backends/arm/scripts/pre-push
index 9c324e0d784..1aa51a8f9ac 100755
--- a/backends/arm/scripts/pre-push
+++ b/backends/arm/scripts/pre-push
@@ -177,7 +177,7 @@ for COMMIT in ${COMMITS}; do
     for committed_file in "${license_files[@]}"; do
         # Skip files with certain extensions
         case "$committed_file" in
-            *.md|*.md.in|*.json|*.yml|*.yaml|*.cmake|*.patch|.gitignore|*.bzl|BUCK|*/BUCK|TARGETS|*/TARGETS|*/generated/*)
+            *.md|*.md.in|*.json|*.yml|*.yaml|*.cmake|*.patch|.gitignore|*.bzl|BUCK|*/BUCK|TARGETS|*/TARGETS)
                 echo -e "${INFO} Skipping license check for ${committed_file} (excluded extension)"
                 continue
                 ;;
diff --git a/backends/arm/test/misc/test_mxfp_linear_ao.py b/backends/arm/test/misc/test_mxfp_linear_ao.py
index 1412d8ffdfe..0f2b6b9198c 100644
--- a/backends/arm/test/misc/test_mxfp_linear_ao.py
+++ b/backends/arm/test/misc/test_mxfp_linear_ao.py
@@ -5,11 +5,9 @@
 
 import torch
 from executorch.backends.arm.ao_ext import MXFPOpConfig, to_mxfp
-from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str, MXFPDType
 from executorch.backends.arm.ao_ext.ops import MXFPLinearOp
 
 from torch.export import export
-from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2
 
 
 class LinearModule(torch.nn.Module):
@@ -21,86 +19,21 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.linear(x)
 
 
-def _test_mxfp_linear_quantize_swaps_module(
-    weight_dtype: MXFPDType,
-    expected_weight_qdata_dtype: torch.dtype,
-    expected_weight_qdata_shape: tuple[int, ...],
-) -> None:
+def test_mxfp_linear_quantize_swaps_module() -> None:
     model = LinearModule().eval()
 
-    to_mxfp(
-        model,
-        MXFPOpConfig(weight_dtype=weight_dtype),
-    )
+    to_mxfp(model, MXFPOpConfig())
 
     assert isinstance(model.linear, MXFPLinearOp)
-    assert model.linear.weight_qdata.dtype == expected_weight_qdata_dtype
-    assert model.linear.weight_dtype == mxfp_dtype_to_str(weight_dtype)
+    assert model.linear.weight_qdata.dtype == torch.float8_e4m3fn
     assert model.linear.weight_scale.dtype == torch.float8_e8m0fnu
-    assert tuple(model.linear.weight_qdata.shape) == expected_weight_qdata_shape
+    assert tuple(model.linear.weight_qdata.shape) == (1, 8, 32)
     assert tuple(model.linear.weight_scale.shape) == (1, 8, 1)
 
 
-def test_mxfp8_e4m3_linear_quantize_swaps_module() -> None:
-    _test_mxfp_linear_quantize_swaps_module(
-        torch.float8_e4m3fn,
-        torch.float8_e4m3fn,
-        (1, 8, 32),
-    )
-
-
-def test_mxfp4_linear_quantize_swaps_module() -> None:
-    _test_mxfp_linear_quantize_swaps_module(
-        torch.float4_e2m1fn_x2,
-        torch.uint8,
-        (1, 8, 16),
-    )
-
-
-def test_mxfp6_e2m3_linear_quantize_swaps_module() -> None:
-    _test_mxfp_linear_quantize_swaps_module(
-        DTYPE_FP6_E2M3,
-        torch.uint8,
-        (1, 8, 32),
-    )
-
-
-def test_mxfp6_e3m2_linear_quantize_swaps_module() -> None:
-    _test_mxfp_linear_quantize_swaps_module(
-        DTYPE_FP6_E3M2,
-        torch.uint8,
-        (1, 8, 32),
-    )
-
-
-def test_mxfp_linear_quantize_filter_fn_selects_modules() -> None:
-    class TwoLinearModule(torch.nn.Module):
-        def __init__(self) -> None:
-            super().__init__()
-            self.selected = torch.nn.Linear(32, 8)
-            self.skipped = torch.nn.Linear(32, 8)
-
-        def forward(self, x: torch.Tensor) -> torch.Tensor:
-            return self.selected(x) + self.skipped(x)
-
-    def _is_selected_linear(module: torch.nn.Module, fqn: str) -> bool:
-        return isinstance(module, torch.nn.Linear) and fqn == "selected"
-
-    model = TwoLinearModule().eval()
-
-    to_mxfp(
-        model,
-        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn),
-        filter_fn=_is_selected_linear,
-    )
-
-    assert isinstance(model.selected, MXFPLinearOp)
-    assert isinstance(model.skipped, torch.nn.Linear)
-
-
-def _test_mxfp_linear_export_preserves_custom_op(config: MXFPOpConfig) -> None:
+def test_mxfp_linear_export_preserves_custom_op() -> None:
     model = LinearModule().eval()
-    to_mxfp(model, config)
+    to_mxfp(model, MXFPOpConfig())
 
     exported = export(model, (torch.randn(4, 32),), strict=False)
 
@@ -111,27 +44,3 @@ def _test_mxfp_linear_export_preserves_custom_op(config: MXFPOpConfig) -> None:
     ]
 
     assert torch.ops.tosa_mxfp.linear.default in targets
-
-
-def test_mxfp8_e4m3_linear_export_preserves_custom_op() -> None:
-    _test_mxfp_linear_export_preserves_custom_op(
-        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn)
-    )
-
-
-def test_mxfp4_linear_export_preserves_custom_op() -> None:
-    _test_mxfp_linear_export_preserves_custom_op(
-        MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2)
-    )
-
-
-def test_mxfp6_e2m3_linear_export_preserves_custom_op() -> None:
-    _test_mxfp_linear_export_preserves_custom_op(
-        MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3)
-    )
-
-
-def test_mxfp6_e3m2_linear_export_preserves_custom_op() -> None:
-    _test_mxfp_linear_export_preserves_custom_op(
-        MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2)
-    )
diff --git a/backends/arm/test/misc/test_process_node.py b/backends/arm/test/misc/test_process_node.py
index 02d2a5e012b..1ef348abdbf 100644
--- a/backends/arm/test/misc/test_process_node.py
+++ b/backends/arm/test/misc/test_process_node.py
@@ -3,19 +3,14 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from types import SimpleNamespace
-from typing import cast
-
 import numpy as np
 import torch
 import tosa_serializer as ts
-from executorch.backends.arm.process_node import _add_const, process_placeholder
-from executorch.backends.arm.tosa.mapping import TosaArg, TosaSpecialDtype
+from executorch.backends.arm.process_node import process_placeholder
+from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.backends.arm.tosa.specification import TosaSpecification
 from executorch.exir import to_edge
 from torch._export.utils import is_param
-from tosa.TosaGraph import TosaGraph  # type: ignore[import-not-found, import-untyped]
-from tosa_serializer.numpy_utils import pack_6bit_array
 
 
 class Int32BiasModule(torch.nn.Module):
@@ -99,74 +94,3 @@ def test_process_placeholder_int48_normalizes_int32_const_values() -> None:
     assert tosa_graph.values is not None
     assert tosa_graph.values.dtype == np.int64
     assert tosa_graph.serialized_bytes == _expected_int48_bytes(module.bias)
-
-
-def test_add_const_fp4_in_packed_storage() -> None:
-    packed_values = np.array([0xDE, 0xFE, 0x6D, 0x55], dtype=np.uint8).reshape(
-        1,
-        1,
-        4,
-    )
-    tosa_arg = cast(
-        TosaArg,
-        SimpleNamespace(dtype=ts.DType.FP4E2M1, shape=(1, 1, 8)),
-    )
-    tosa_graph = ts.TosaSerializer()
-
-    _add_const(tosa_graph, packed_values, tosa_arg, name="fp4_weight")
-
-    graph = TosaGraph.GetRootAs(bytes(tosa_graph.serialize()), 0)
-    block = graph.Regions(0).Blocks(0)
-    tensors = {
-        block.Tensors(index).Name().decode(): block.Tensors(index)
-        for index in range(block.TensorsLength())
-    }
-    tensor = tensors["fp4_weight"]
-
-    assert tensor.Type() == ts.DType.FP4E2M1
-    assert [tensor.Shape(index) for index in range(tensor.ShapeLength())] == [1, 1, 8]
-    assert [tensor.Data(index) for index in range(tensor.DataLength())] == [
-        0xDE,
-        0xFE,
-        0x6D,
-        0x55,
-    ]
-
-
-def _test_add_const_fp6_in_packed_storage(dtype: int) -> None:
-    values = np.arange(32, dtype=np.uint8).reshape(1, 1, 32)
-
-    tosa_arg = cast(
-        TosaArg,
-        SimpleNamespace(dtype=dtype, shape=(1, 1, 32)),
-    )
-    tosa_graph = ts.TosaSerializer()
-
-    _add_const(tosa_graph, values, tosa_arg, name="fp6_weight")
-
-    graph = TosaGraph.GetRootAs(bytes(tosa_graph.serialize()), 0)
-    block = graph.Regions(0).Blocks(0)
-    tensors = {
-        block.Tensors(index).Name().decode(): block.Tensors(index)
-        for index in range(block.TensorsLength())
-    }
-    tensor = tensors["fp6_weight"]
-
-    assert tensor.Type() == dtype
-    assert [tensor.Shape(index) for index in range(tensor.ShapeLength())] == [
-        1,
-        1,
-        32,
-    ]
-    assert tensor.DataLength() == 24
-    assert [tensor.Data(index) for index in range(tensor.DataLength())] == (
-        pack_6bit_array(values).reshape(-1).tolist()
-    )
-
-
-def test_add_const_fp6e2m3_in_packed_storage() -> None:
-    _test_add_const_fp6_in_packed_storage(ts.DType.FP6E2M3)
-
-
-def test_add_const_fp6e3m2_in_packed_storage() -> None:
-    _test_add_const_fp6_in_packed_storage(ts.DType.FP6E3M2)
diff --git a/backends/arm/test/misc/test_runner_utils.py b/backends/arm/test/misc/test_runner_utils.py
index 54d41548a22..3c78b21e008 100644
--- a/backends/arm/test/misc/test_runner_utils.py
+++ b/backends/arm/test/misc/test_runner_utils.py
@@ -3,13 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import json
 from pathlib import Path
-from types import SimpleNamespace
 from typing import Any, cast
 
-import numpy as np
-import torch
 from executorch.backends.arm.test import runner_utils
 
 
@@ -117,115 +113,3 @@ def test_get_elf_path_accepts_nested_runner_output(monkeypatch, tmp_path: Path)
     monkeypatch.setattr(runner_utils, "_elf_search_roots", lambda: [tmp_path])
 
     assert runner_utils.get_elf_path("corstone-300") == str(elf_path)
-
-
-def test_shape_inference_json_uses_tosa_input_layout(tmp_path: Path) -> None:
-    test_case_path = tmp_path / "test_case.json"
-    artifact_path = tmp_path / "model.tosa"
-    input_tensor = torch.randn(1, 3, 4, 5).to(memory_format=torch.channels_last)
-
-    runner_utils.TosaReferenceModelDispatch()._generate_shape_inference_json(
-        b"",
-        artifact_path,
-        test_case_path,
-        ["input"],
-        (input_tensor,),
-    )
-
-    test_case = json.loads(test_case_path.read_text(encoding="utf-8"))
-
-    assert test_case == {
-        "tosa_file": str(artifact_path),
-        "shapes": {"input": [1, 4, 5, 3]},
-    }
-
-
-def test_numpy_to_torch_tensor_converts_dynamic_nhwc_output(monkeypatch) -> None:
-    symbolic_dim = object()
-    output_tensor = SimpleNamespace(
-        shape=(1, 3, symbolic_dim, 5),
-        dtype=torch.float32,
-        dim_order=lambda: runner_utils.NHWC_ORDER,
-    )
-    monkeypatch.setattr(
-        runner_utils, "get_first_fake_tensor", lambda output_node: output_tensor
-    )
-    array = np.arange(60, dtype=np.float32).reshape(1, 4, 5, 3)
-
-    result = runner_utils.numpy_to_torch_tensor(array, cast(Any, object()))
-
-    assert result.shape == (1, 3, 4, 5)
-    assert result.is_contiguous(memory_format=torch.channels_last)
-    torch.testing.assert_close(result, torch.from_numpy(array).permute(0, 3, 1, 2))
-
-
-def test_numpy_to_torch_tensor_converts_dynamic_nnhwc_output(monkeypatch) -> None:
-    symbolic_dim = object()
-    output_tensor = SimpleNamespace(
-        shape=(1, 2, 3, symbolic_dim, 5),
-        dtype=torch.float32,
-        dim_order=lambda: runner_utils.NNHWC_ORDER,
-    )
-    monkeypatch.setattr(
-        runner_utils, "get_first_fake_tensor", lambda output_node: output_tensor
-    )
-    array = np.arange(120, dtype=np.float32).reshape(1, 2, 4, 5, 3)
-
-    result = runner_utils.numpy_to_torch_tensor(array, cast(Any, object()))
-
-    assert result.shape == (1, 2, 3, 4, 5)
-    assert result.dim_order() == runner_utils.NNHWC_ORDER
-    torch.testing.assert_close(result, torch.from_numpy(array).permute(0, 1, 4, 2, 3))
-
-
-def _program_with_user_input(name: str) -> SimpleNamespace:
-    return SimpleNamespace(
-        graph_signature=SimpleNamespace(user_inputs=[name]),
-        graph=SimpleNamespace(nodes=[SimpleNamespace(op="placeholder", name=name)]),
-    )
-
-
-def test_user_inputs_need_shape_inference_rejects_static_input(monkeypatch) -> None:
-    monkeypatch.setattr(
-        runner_utils,
-        "get_first_fake_tensor",
-        lambda node: SimpleNamespace(shape=(1, 2)),
-    )
-
-    assert not runner_utils.user_inputs_need_shape_inference(
-        cast(Any, _program_with_user_input("input"))
-    )
-
-
-def test_user_inputs_need_shape_inference_accepts_symbolic_input(monkeypatch) -> None:
-    symbolic_dim = object()
-    monkeypatch.setattr(
-        runner_utils,
-        "get_first_fake_tensor",
-        lambda node: SimpleNamespace(shape=(1, symbolic_dim)),
-    )
-
-    assert runner_utils.user_inputs_need_shape_inference(
-        cast(Any, _program_with_user_input("input"))
-    )
-
-
-def test_user_inputs_need_shape_inference_ignores_non_user_inputs(monkeypatch) -> None:
-    program = SimpleNamespace(
-        graph_signature=SimpleNamespace(user_inputs=["input"]),
-        graph=SimpleNamespace(
-            nodes=[
-                SimpleNamespace(op="placeholder", name="input"),
-                SimpleNamespace(op="placeholder", name="param"),
-            ]
-        ),
-    )
-
-    def fake_tensor(node):
-        if node.name == "input":
-            return SimpleNamespace(shape=(1, 2))
-        return SimpleNamespace(shape=(1, object()))
-
-    monkeypatch.setattr(runner_utils, "get_first_fake_tensor", fake_tensor)
-
-    assert not runner_utils.user_inputs_need_shape_inference(cast(Any, program))
diff --git a/backends/arm/test/misc/test_vgf_backend.py b/backends/arm/test/misc/test_vgf_backend.py
index 406ba1b405a..22a8607fbc7 100644
--- a/backends/arm/test/misc/test_vgf_backend.py
+++ b/backends/arm/test/misc/test_vgf_backend.py
@@ -3,10 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import os
 from types import SimpleNamespace
 from typing import cast
-from unittest import mock
 
 import pytest
 
@@ -16,14 +14,7 @@
     clear_registered_pass_insertions,
     PassInsertions,
 )
-
-from executorch.backends.arm.vgf import backend, backend as vgf_backend, VgfCompileSpec
-from executorch.backends.arm.vgf.backend import (
-    _copy_failure_artifacts,
-    _format_repro_command,
-    _replace_converter_input_path,
-    vgf_compile,
-)
+from executorch.backends.arm.vgf import backend as vgf_backend, VgfCompileSpec
 from executorch.exir.backend.backend_details import PreprocessResult
 from executorch.exir.pass_base import ExportPass
 from torch.export.exported_program import ExportedProgram
@@ -114,180 +105,3 @@ def _raise(*args, **kwargs):
         assert _registry_state() == original_registry
     finally:
         clear_registered_pass_insertions()
-
-
-def test_format_repro_command_quotes_shell_metacharacters():
-    command = [
-        "model-converter",
-        "--flag=value with spaces",
-        "-i",
-        "input file.tosa",
-        "-o",
-        "output file.vgf",
-    ]
-
-    formatted = _format_repro_command(command)
-
-    assert formatted == (
-        "model-converter "
-        "'--flag=value with spaces' "
-        "-i "
-        "'input file.tosa' "
-        "-o "
-        "'output file.vgf'"
-    )
-
-
-def test_replace_converter_input_path_replaces_input_after_i():
-    command = [
-        "model-converter",
-        "--some-flag",
-        "-i",
-        "original.tosa",
-        "-o",
-        "output.vgf",
-    ]
-
-    replaced = _replace_converter_input_path(command, "preserved.tosa")
-
-    assert replaced == [
-        "model-converter",
-        "--some-flag",
-        "-i",
-        "preserved.tosa",
-        "-o",
-        "output.vgf",
-    ]
-    assert command[3] == "original.tosa"
-
-
-def test_copy_failure_artifacts_returns_none_without_artifact_path(tmp_path):
-    tosa_path = tmp_path / "input.tosa"
-    tosa_path.write_bytes(b"tosa bytes")
-
-    copied_path = _copy_failure_artifacts(
-        str(tosa_path),
-        artifact_path=None,
-        tag_name="delegate_0",
-    )
-
-    assert copied_path is None
-
-
-def test_copy_failure_artifacts_copies_tosa_with_tag_name(tmp_path):
-    tosa_path = tmp_path / "input.tosa"
-    artifact_path = tmp_path / "artifacts"
-    tosa_path.write_bytes(b"tosa bytes")
-
-    copied_path = _copy_failure_artifacts(
-        str(tosa_path),
-        str(artifact_path),
-        tag_name="delegate_0",
-    )
-
-    assert copied_path == os.path.join(
-        str(artifact_path),
-        "failed_model_converter_input_delegate_0.tosa",
-    )
-    assert os.path.exists(copied_path)
-    assert open(copied_path, "rb").read() == b"tosa bytes"
-
-
-def test_copy_failure_artifacts_copies_tosa_without_tag_name(tmp_path):
-    tosa_path = tmp_path / "input.tosa"
-    artifact_path = tmp_path / "artifacts"
-    tosa_path.write_bytes(b"tosa bytes")
-
-    copied_path = _copy_failure_artifacts(
-        str(tosa_path),
-        str(artifact_path),
-        tag_name="",
-    )
-
-    assert copied_path == os.path.join(
-        str(artifact_path),
-        "failed_model_converter_input.tosa",
-    )
-    assert os.path.exists(copied_path)
-    assert open(copied_path, "rb").read() == b"tosa bytes"
-
-
-@mock.patch("executorch.backends.arm.vgf.backend.model_converter_env")
-@mock.patch("executorch.backends.arm.vgf.backend.require_model_converter_binary")
-@mock.patch("executorch.backends.arm.vgf.backend.subprocess.run")
-def test_vgf_compile_failure_includes_repro_command_and_copies_tosa(
-    mock_run,
-    mock_require_model_converter_binary,
-    mock_model_converter_env,
-    tmp_path,
-):
-    artifact_path = tmp_path / "artifacts"
-
-    mock_require_model_converter_binary.return_value = "model-converter"
-    mock_model_converter_env.return_value = {"PATH": "/test/bin"}
-    mock_run.side_effect = backend.subprocess.CalledProcessError(
-        returncode=1,
-        cmd=["model-converter"],
-        output=b"converter stdout",
-        stderr=b"converter stderr",
-    )
-
-    with pytest.raises(RuntimeError) as exc_info:
-        vgf_compile(
-            b"serialized tosa",
-            ["--flag=value with spaces"],
-            artifact_path=str(artifact_path),
-            tag_name="delegate_0",
-        )
-
-    copied_tosa_path = os.path.join(
-        str(artifact_path),
-        "failed_model_converter_input_delegate_0.tosa",
-    )
-
-    assert os.path.exists(copied_tosa_path)
-    assert open(copied_tosa_path, "rb").read() == b"serialized tosa"
-
-    error = str(exc_info.value)
-    assert "Vgf compiler failed." in error
-    assert "Repro command:" in error
-    assert "model-converter '--flag=value with spaces' -i" in error
-    assert copied_tosa_path in error
-    assert " -o " in error
-    assert "Stderr:\nconverter stderr" in error
-    assert "Stdout:\nconverter stdout" in error
-
-
-@mock.patch("executorch.backends.arm.vgf.backend.model_converter_env")
-@mock.patch("executorch.backends.arm.vgf.backend.require_model_converter_binary")
-@mock.patch("executorch.backends.arm.vgf.backend.subprocess.run")
-def test_vgf_compile_failure_includes_temp_repro_command_without_artifact_path(
-    mock_run,
-    mock_require_model_converter_binary,
-    mock_model_converter_env,
-):
-    mock_require_model_converter_binary.return_value = "model-converter"
-    mock_model_converter_env.return_value = {"PATH": "/test/bin"}
-    mock_run.side_effect = backend.subprocess.CalledProcessError(
-        returncode=1,
-        cmd=["model-converter"],
-        output=b"converter stdout",
-        stderr=b"converter stderr",
-    )
-
-    with pytest.raises(RuntimeError) as exc_info:
-        vgf_compile(
-            b"serialized tosa",
-            ["--some-flag"],
-            artifact_path=None,
-            tag_name="delegate_0",
-        )
-
-    error = str(exc_info.value)
-    assert "Vgf compiler failed." in error
-    assert "Repro command:" in error
-    assert "model-converter --some-flag -i" in error
-    assert "output_delegate_0.tosa.vgf" in error
-    assert "failed_model_converter_input_delegate_0.tosa" not in error
-    assert "Stderr:\nconverter stderr" in error
-    assert "Stdout:\nconverter stdout" in error
diff --git a/backends/arm/test/misc/test_vgf_check_env.py b/backends/arm/test/misc/test_vgf_check_env.py
index 646eb6b9a79..499a9f35db0 100644
--- a/backends/arm/test/misc/test_vgf_check_env.py
+++ b/backends/arm/test/misc/test_vgf_check_env.py
@@ -9,10 +9,8 @@
 from pathlib import Path
 
 import executorch.backends.arm.vgf.check_env as check_env
-import executorch.backends.arm.vgf.model_converter as model_converter
 
 import pytest
-from executorch.backends.arm.vgf import backend as vgf_backend
 from executorch.backends.arm.vgf.compile_spec import VgfCompileSpec
 
 
@@ -121,7 +119,7 @@ def test_is_vgf_runtime_available(monkeypatch):
 
 
 def test_model_converter_check_fails_when_missing(monkeypatch):
-    monkeypatch.setattr(model_converter, "find_model_converter_binary", lambda: None)
+    monkeypatch.setattr(check_env, "find_model_converter_binary", lambda: None)
 
     result = check_env._check_model_converter()
 
@@ -141,7 +139,7 @@ def test_model_converter_check_reports_version(monkeypatch, tmp_path):
         "raise SystemExit(1)\n",
     )
     monkeypatch.setattr(
-        model_converter, "find_model_converter_binary", lambda: str(converter)
+        check_env, "find_model_converter_binary", lambda: str(converter)
     )
 
     result = check_env._check_model_converter()
@@ -174,20 +172,20 @@ def test_find_existing_lib_finds_libvgf(tmp_path):
 
 def test_runtime_backend_check_passes_when_vgf_registered(monkeypatch):
     class BackendRegistry:
-        registered_backend_names = [vgf_backend.VGF_BACKEND_NAME]
+        registered_backend_names = [check_env.VGF_BACKEND_NAME]
 
         def is_available(self, backend_name):
-            return backend_name == vgf_backend.VGF_BACKEND_NAME
+            return backend_name == check_env.VGF_BACKEND_NAME
 
     class Runtime:
         backend_registry = BackendRegistry()
 
-    monkeypatch.setattr(vgf_backend, "_load_runtime", lambda: Runtime())
+    monkeypatch.setattr(check_env, "_load_runtime", lambda: Runtime())
 
     result = check_env._check_runtime_vgf_backend()
 
     assert result.status == check_env.STATUS_OK
-    assert vgf_backend.VGF_BACKEND_NAME in result.detail
+    assert check_env.VGF_BACKEND_NAME in result.detail
 
 
 def test_runtime_backend_check_fails_when_vgf_not_registered(monkeypatch):
@@ -200,12 +198,12 @@ def is_available(self, backend_name):
     class Runtime:
         backend_registry = BackendRegistry()
 
-    monkeypatch.setattr(vgf_backend, "_load_runtime", lambda: Runtime())
+    monkeypatch.setattr(check_env, "_load_runtime", lambda: Runtime())
 
     result = check_env._check_runtime_vgf_backend()
 
     assert result.status == check_env.STATUS_FAIL
-    assert vgf_backend.VGF_BACKEND_NAME in result.detail
+    assert check_env.VGF_BACKEND_NAME in result.detail
     assert "XnnpackBackend" in result.detail
 
 
@@ -359,84 +357,3 @@ def test_main_source_build_mode(monkeypatch, capsys):
 def test_main_rejects_build_dir_without_source_build():
     with pytest.raises(SystemExit):
         check_env.main(["--build-dir", "cmake-out-vkml"])
-
-
-def test_check_env_model_converter_probe_delegates_to_model_converter_module(
-    monkeypatch,
-):
-    monkeypatch.setattr(
-        model_converter,
-        "check_model_converter_environment",
-        lambda: model_converter.ModelConverterEnvironmentCheck(
-            "converter", model_converter.STATUS_OK, "from-owner"
-        ),
-    )
-
-    result = check_env._check_model_converter()
-
-    assert result.status == check_env.STATUS_OK
-    assert result.detail == "from-owner"
-
-
-def test_check_env_model_converter_lib_dir_probe_delegates_to_model_converter_module(
-    monkeypatch,
-):
-    monkeypatch.setattr(
-        model_converter,
-        "check_model_converter_lib_dir_environment",
-        lambda: model_converter.ModelConverterEnvironmentCheck(
-            "lib-dir", model_converter.STATUS_OK, "from-owner"
-        ),
-    )
-
-    result = check_env._check_model_converter_lib_dir()
-
-    assert result.status == check_env.STATUS_OK
-    assert result.detail == "from-owner"
-
-
-def test_check_env_runtime_probe_delegates_to_backend_module(monkeypatch):
-    monkeypatch.setattr(
-        vgf_backend,
-        "check_vgf_runtime_backend_environment",
-        lambda: vgf_backend.VgfRuntimeEnvironmentCheck(
-            "runtime", vgf_backend.STATUS_OK, "from-owner"
-        ),
-    )
-
-    result = check_env._check_runtime_vgf_backend()
-
-    assert result.status == check_env.STATUS_OK
-    assert result.detail == "from-owner"
-
-
-def test_model_converter_preflight_and_vgf_compile_share_executable_resolution(
-    monkeypatch,
-    tmp_path,
-):
-    converter = _make_executable(
-        tmp_path / "model-converter",
-        "#!/usr/bin/env python3\n"
-        "from pathlib import Path\n"
-        "import sys\n"
-        "\n"
-        "if '--version' in sys.argv:\n"
-        "    print('model-converter integration-test')\n"
-        "    raise SystemExit(0)\n"
-        "\n"
-        "out_index = sys.argv.index('-o') + 1\n"
-        "Path(sys.argv[out_index]).write_bytes(b'compiled-vgf')\n"
-        "raise SystemExit(0)\n",
-    )
-
-    monkeypatch.setenv("MODEL_CONVERTER_PATH", str(converter))
-
-    preflight = check_env._check_model_converter()
-    compiled = vgf_backend.vgf_compile(
-        tosa_flatbuffer=b"fake-tosa-flatbuffer",
-        compile_flags=[],
-    )
-
-    assert preflight.status == check_env.STATUS_OK
-    assert str(converter) in preflight.detail
-    assert compiled == b"compiled-vgf"
diff --git a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py
index 77c42bf9f24..940023fa624 100644
--- a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py
+++ b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py
@@ -5,7 +5,6 @@
 
 import pytest
 import torch
-from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str
 from executorch.backends.arm.tosa.dialect.lib import TosaValueError
 from executorch.backends.arm.tosa.dialect.ops import cast_to_block_scaled  # noqa: F401
 from executorch.backends.arm.tosa.specification import (
@@ -14,7 +13,6 @@
 )
 from executorch.exir.dialects._ops import ops as exir_ops
 from torch._subclasses.fake_tensor import FakeTensorMode
-from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2
 
 
 def test_cast_to_block_scaled_requires_mxfp_extension() -> None:
@@ -29,7 +27,7 @@ def test_cast_to_block_scaled_requires_mxfp_extension() -> None:
             exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default(
                 mode.from_tensor(sample_input),
                 32,
-                output_dtype=mxfp_dtype_to_str(torch.float8_e4m3fn),
+                output_dtype=torch.float8_e4m3fn,
             )
 
 
@@ -41,7 +39,7 @@ def test_cast_to_block_scaled_tosa_fp_mxfp() -> None:
         output_data, output_scale = exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default(
             mode.from_tensor(sample_input),
             32,
-            output_dtype=mxfp_dtype_to_str(torch.float8_e4m3fn),
+            output_dtype=torch.float8_e4m3fn,
         )
 
     assert output_data.dtype == torch.float8_e4m3fn
@@ -50,48 +48,6 @@ def test_cast_to_block_scaled_tosa_fp_mxfp() -> None:
     assert tuple(output_scale.shape) == (2, 1)
 
 
-def test_cast_to_block_scaled_tosa_fp_mxfp4() -> None:
-    tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
-    sample_input = torch.randn((2, 32), dtype=torch.float32)
-
-    with TosaLoweringContext(tosa_spec), FakeTensorMode() as mode:
-        output_data, output_scale = exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default(
-            mode.from_tensor(sample_input),
-            32,
-            output_dtype=mxfp_dtype_to_str(torch.float4_e2m1fn_x2),
-        )
-
-    assert output_data.dtype == torch.uint8
-    assert tuple(output_data.shape) == (2, 16)
-    assert output_scale.dtype == torch.float8_e8m0fnu
-    assert tuple(output_scale.shape) == (2, 1)
-
-
-def _test_cast_to_block_scaled_tosa_fp_mxfp6(dtype: str) -> None:
-    tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
-    sample_input = torch.randn((2, 32), dtype=torch.float32)
-
-    with TosaLoweringContext(tosa_spec), FakeTensorMode() as mode:
-        output_data, output_scale = exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default(
-            mode.from_tensor(sample_input),
-            32,
-            output_dtype=mxfp_dtype_to_str(dtype),
-        )
-
-    assert output_data.dtype == torch.uint8
-    assert tuple(output_data.shape) == (2, 32)
-    assert output_scale.dtype == torch.float8_e8m0fnu
-    assert tuple(output_scale.shape) == (2, 1)
-
-
-def test_cast_to_block_scaled_tosa_fp_mxfp6e2m3() -> None:
-    _test_cast_to_block_scaled_tosa_fp_mxfp6(DTYPE_FP6_E2M3)
-
-
-def test_cast_to_block_scaled_tosa_fp_mxfp6e3m2() -> None:
-    _test_cast_to_block_scaled_tosa_fp_mxfp6(DTYPE_FP6_E3M2)
-
-
 def test_cast_to_block_scaled_invalid_shape() -> None:
     tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
 
@@ -103,5 +59,5 @@ def test_cast_to_block_scaled_invalid_shape() -> None:
             exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default(
                 mode.from_tensor(torch.randn((2, 30), dtype=torch.float32)),
                 32,
-                output_dtype=mxfp_dtype_to_str(torch.float8_e4m3fn),
+                output_dtype=torch.float8_e4m3fn,
             )
diff --git a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py
index 7dcffdeb4d9..74ce04bf3c1 100644
--- a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py
+++ b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py
@@ -5,7 +5,6 @@
 
 import pytest
 import torch
-from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str, MXFPDType
 from executorch.backends.arm.tosa.dialect.lib import TosaValueError
 from executorch.backends.arm.tosa.dialect.ops import matmul_t_block_scaled  # noqa: F401
 from executorch.backends.arm.tosa.specification import (
@@ -14,7 +13,6 @@
 )
 from executorch.exir.dialects._ops import ops as exir_ops
 from torch._subclasses.fake_tensor import FakeTensorMode
-from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E3M2
 
 
 def test_matmul_t_block_scaled_tosa_fp_mxfp() -> None:
@@ -37,38 +35,6 @@ def test_matmul_t_block_scaled_tosa_fp_mxfp() -> None:
     assert tuple(output.shape) == (1, 4, 8)
 
 
-def _test_matmul_t_block_scaled_tosa_fp_subbyte(
-    payload_dtype: MXFPDType,
-    qdata_last_dim: int,
-) -> None:
-    tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
-    a_data = torch.empty((1, 4, qdata_last_dim), dtype=torch.uint8)
-    a_scale = torch.empty((1, 4, 1), dtype=torch.float8_e8m0fnu)
-    b_data = torch.empty((1, 8, qdata_last_dim), dtype=torch.uint8)
-    b_scale = torch.empty((1, 8, 1), dtype=torch.float8_e8m0fnu)
-
-    with TosaLoweringContext(tosa_spec), FakeTensorMode() as mode:
-        output = exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default(
-            mode.from_tensor(a_data),
-            mode.from_tensor(a_scale),
-            mode.from_tensor(b_data),
-            mode.from_tensor(b_scale),
-            32,
-            payload_dtype=mxfp_dtype_to_str(payload_dtype),
-        )
-
-    assert output.dtype == torch.float32
-    assert tuple(output.shape) == (1, 4, 8)
-
-
-def test_matmul_t_block_scaled_tosa_fp_mxfp4() -> None:
-    _test_matmul_t_block_scaled_tosa_fp_subbyte(torch.float4_e2m1fn_x2, 16)
-
-
-def test_matmul_t_block_scaled_tosa_fp_mxfp6() -> None:
-    _test_matmul_t_block_scaled_tosa_fp_subbyte(DTYPE_FP6_E3M2, 32)
-
-
 def test_matmul_t_block_scaled_invalid_scale_shape() -> None:
     tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
     a_data = torch.randn((1, 4, 32), dtype=torch.float32).to(torch.float8_e4m3fn)
diff --git a/backends/arm/test/ops/mxfp/test_mxfp_linear.py b/backends/arm/test/ops/mxfp/test_mxfp_linear.py
index fbec9307795..5cdd44cf138 100644
--- a/backends/arm/test/ops/mxfp/test_mxfp_linear.py
+++ b/backends/arm/test/ops/mxfp/test_mxfp_linear.py
@@ -10,7 +10,7 @@
 
 import torch
 from executorch.backends.arm.ao_ext import MXFPOpConfig, to_mxfp
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common as arm_common
 from executorch.backends.arm.test.ops.mxfp.common import (
     MXFPTosaPipelineFP,
     MXFPVgfPipeline,
@@ -18,12 +18,14 @@
 from executorch.backends.arm.test.tester.analyze_output_utils import (
     compare_rel_frobenius_and_cosine_similarity,
 )
-from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2
 
 aten_op = "torch.ops.tosa_mxfp.linear.default"
 
 input_t1 = Tuple[torch.Tensor]
 
+_MXFP_FROBENIUS_THRESHOLD = 0.06
+_MXFP_COSINE_THRESHOLD = 0.995
+
 
 def _block_input_rank1() -> torch.Tensor:
     """Create a rank-1 input with distinct MXFP activation block scales."""
@@ -159,7 +161,6 @@ def _channels_last_rank4_input() -> torch.Tensor:
 
 test_data_vgf_fp = test_data_fp
 
-# TODO: MLETORCH-2141
 _vgf_xfail_reason = (
     "MXFP is not yet supported in the VGF toolchain. Enable this test when "
     "toolchain support is available."
@@ -214,45 +215,35 @@ def _is_linear(module: torch.nn.Module, _fqn: str) -> bool:
     return isinstance(module, torch.nn.Linear)
 
 
-def _test_mxfp_linear_eager_cpu(
-    test_data,
-    config: MXFPOpConfig,
-    frobenius_threshold=0.3,
-    cosine_threshold=0.95,
-) -> None:
+@arm_common.parametrize("test_data", test_data_fp)
+def test_mxfp_linear_tosa_FP(test_data) -> None:
     test_input, out_features, has_bias, set_block_weights = test_data()
     in_features = test_input.shape[-1]
-    ref_model = Linear(
+    module = Linear(
         in_features=in_features,
         out_features=out_features,
         bias=has_bias,
     ).eval()
 
     if set_block_weights:
-        ref_model.set_block_test_weights()
-    test_model = copy.deepcopy(ref_model).eval()
-
-    to_mxfp(test_model, config, filter_fn=_is_linear)
-
-    test_output = test_model(test_input)
-    ref_output = ref_model(test_input)
+        module.set_block_test_weights()
 
-    compare_rel_frobenius_and_cosine_similarity(
-        ref_output,
-        test_output,
-        quantization_parameters=None,
-        frobenius_threshold=frobenius_threshold,
-        cosine_threshold=cosine_threshold,
-        clean_reference=False,
+    pipeline = MXFPTosaPipelineFP[input_t1](
+        module,
+        (test_input,),
+        aten_op,
+        filter_fn=_is_linear,
+        frobenius_threshold=_MXFP_FROBENIUS_THRESHOLD,
+        cosine_threshold=_MXFP_COSINE_THRESHOLD,
+        tosa_version="1.1",
+        tosa_extensions=["mxfp"],
     )
+    pipeline.run()
 
 
-def _test_mxfp_linear_vgf(
-    test_data,
-    config: MXFPOpConfig,
-    frobenius_threshold,
-    cosine_threshold,
-) -> None:
+@arm_common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails)
+@arm_common.SkipIfNoModelConverter
+def test_mxfp_linear_vgf(test_data) -> None:
     test_input, out_features, has_bias, set_block_weights = test_data()
     in_features = test_input.shape[-1]
     module = Linear(
@@ -269,169 +260,36 @@ def _test_mxfp_linear_vgf(
         (test_input,),
         aten_op,
         filter_fn=_is_linear,
-        frobenius_threshold=frobenius_threshold,
-        cosine_threshold=cosine_threshold,
-        mxfp_config=config,
+        frobenius_threshold=_MXFP_FROBENIUS_THRESHOLD,
+        cosine_threshold=_MXFP_COSINE_THRESHOLD,
         tosa_spec="TOSA-1.1+FP+mxfp",
     )
     pipeline.run()
 
 
-def _test_mxfp_linear_tosa_FP(
-    test_data,
-    config: MXFPOpConfig,
-    frobenius_threshold=0.08,
-    cosine_threshold=0.995,
-) -> None:
+@arm_common.parametrize("test_data", test_data_fp)
+def test_mxfp_linear_eager_cpu(test_data) -> None:
     test_input, out_features, has_bias, set_block_weights = test_data()
     in_features = test_input.shape[-1]
-    module = Linear(
+    ref_model = Linear(
         in_features=in_features,
         out_features=out_features,
         bias=has_bias,
     ).eval()
-
     if set_block_weights:
-        module.set_block_test_weights()
-
-    pipeline = MXFPTosaPipelineFP[input_t1](
-        module,
-        (test_input,),
-        aten_op,
-        filter_fn=_is_linear,
-        frobenius_threshold=frobenius_threshold,
-        cosine_threshold=cosine_threshold,
-        mxfp_config=config,
-        tosa_version="1.1",
-        tosa_extensions=["mxfp"],
-    )
-    pipeline.run()
-
-
-@common.parametrize("test_data", test_data_fp)
-def test_mxfp8_linear_tosa_FP(test_data: torch.Tensor) -> None:
-    _test_mxfp_linear_tosa_FP(
-        test_data,
-        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn),
-    )
-
-
-@common.parametrize("test_data", test_data_fp)
-def test_mxfp4_linear_tosa_FP(test_data: torch.Tensor) -> None:
-    _test_mxfp_linear_tosa_FP(
-        test_data,
-        MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2),
-        frobenius_threshold=0.3,
-        cosine_threshold=0.95,
-    )
-
-
-@common.parametrize("test_data", test_data_fp)
-def test_mxfp6_e2m3_linear_tosa_FP(test_data: torch.Tensor) -> None:
-    _test_mxfp_linear_tosa_FP(
-        test_data,
-        MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3),
-        frobenius_threshold=0.2,
-        cosine_threshold=0.98,
-    )
-
-
-@common.parametrize("test_data", test_data_fp)
-def test_mxfp6_e3m2_linear_tosa_FP(test_data: torch.Tensor) -> None:
-    _test_mxfp_linear_tosa_FP(
-        test_data,
-        MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2),
-        frobenius_threshold=0.2,
-        cosine_threshold=0.98,
-    )
-
-
-@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails)
-@common.SkipIfNoModelConverter
-def test_mxfp8_linear_vgf(test_data: torch.Tensor) -> None:
-    _test_mxfp_linear_vgf(
-        test_data,
-        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn),
-        frobenius_threshold=0.08,
-        cosine_threshold=0.995,
-    )
-
-
-@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails)
-@common.SkipIfNoModelConverter
-def test_mxfp4_linear_vgf(test_data: torch.Tensor) -> None:
-    _test_mxfp_linear_vgf(
-        test_data,
-        MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2),
-        frobenius_threshold=0.3,
-        cosine_threshold=0.95,
-    )
-
-
-@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails)
-@common.SkipIfNoModelConverter
-def test_mxfp6_e2m3_linear_vgf(test_data: torch.Tensor) -> None:
-    _test_mxfp_linear_vgf(
-        test_data,
-        MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3),
-        frobenius_threshold=0.2,
-        cosine_threshold=0.98,
-    )
-
-
-@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails)
-@common.SkipIfNoModelConverter
-def test_mxfp6_e3m2_linear_vgf(test_data: torch.Tensor) -> None:
-    _test_mxfp_linear_vgf(
-        test_data,
-        MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2),
-        frobenius_threshold=0.2,
-        cosine_threshold=0.98,
-    )
-
-
-@common.parametrize("test_data", test_data_fp)
-def test_mxfp8_linear_eager_cpu(test_data: torch.Tensor) -> None:
-    """Check eager MXFP implementation.
-
-    The Arm lowering tests compare lowered output against the eager CPU
-    implementation, so the eager implementation must be accurate for it to be
-    used as a reference in other tests.
-
-    """
-    _test_mxfp_linear_eager_cpu(
-        test_data,
-        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn),
-        frobenius_threshold=0.08,
-        cosine_threshold=0.995,
-    )
-
-
-@common.parametrize("test_data", test_data_fp)
-def test_mxfp4_linear_eager_cpu(test_data: torch.Tensor) -> None:
-    _test_mxfp_linear_eager_cpu(
-        test_data,
-        MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2),
-        frobenius_threshold=0.3,
-        cosine_threshold=0.95,
-    )
-
+        ref_model.set_block_test_weights()
+    test_model = copy.deepcopy(ref_model).eval()
 
-@common.parametrize("test_data", test_data_fp)
-def test_mxfp6_e2m3_linear_eager_cpu(test_data: torch.Tensor) -> None:
-    _test_mxfp_linear_eager_cpu(
-        test_data,
-        MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3),
-        frobenius_threshold=0.2,
-        cosine_threshold=0.98,
-    )
+    to_mxfp(test_model, MXFPOpConfig(), filter_fn=_is_linear)
 
+    test_output = test_model(test_input)
+    ref_output = ref_model(test_input)
 
-@common.parametrize("test_data", test_data_fp)
-def test_mxfp6_e3m2_linear_eager_cpu(test_data: torch.Tensor) -> None:
-    _test_mxfp_linear_eager_cpu(
-        test_data,
-        MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2),
-        frobenius_threshold=0.2,
-        cosine_threshold=0.98,
+    compare_rel_frobenius_and_cosine_similarity(
+        ref_output,
+        test_output,
+        quantization_parameters=None,
+        frobenius_threshold=_MXFP_FROBENIUS_THRESHOLD,
+        cosine_threshold=_MXFP_COSINE_THRESHOLD,
+        clean_reference=False,
     )
diff --git a/backends/arm/test/ops/test_round.py b/backends/arm/test/ops/test_round.py
index bcc71b70725..ff86dbffff0 100644
--- a/backends/arm/test/ops/test_round.py
+++ b/backends/arm/test/ops/test_round.py
@@ -6,6 +6,7 @@
 
 from typing import Tuple
 
+import pytest
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -66,6 +67,7 @@ def test_round_tosa_INT(test_data: torch.Tensor):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone300
+@pytest.mark.xfail(reason="where.self not supported on U55")
 def test_round_u55_INT(test_data: torch.Tensor):
     pipeline = EthosU55PipelineINT[input_t1](
         Round(),
diff --git a/backends/arm/test/passes/test_insert_dynamic_padding_pass.py b/backends/arm/test/passes/test_insert_dynamic_padding_pass.py
index 69e1830e3ee..64594403dae 100644
--- a/backends/arm/test/passes/test_insert_dynamic_padding_pass.py
+++ b/backends/arm/test/passes/test_insert_dynamic_padding_pass.py
@@ -3,122 +3,72 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import executorch.backends.arm.tosa.dialect  # noqa: F401
 import torch
 from executorch.backends.arm._passes.insert_dynamic_padding import (
     InsertDynamicPaddingPass,
 )
-from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
+from executorch.backends.arm._passes.rewrite_conv_pass import RewriteConvPass
 from executorch.backends.arm.tosa.specification import (
     TosaLoweringContext,
     TosaSpecification,
 )
-from executorch.backends.test.graph_builder import GraphBuilder
+from executorch.exir import to_edge
 from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.pass_base import ExportPass
-from torch.fx import GraphModule
-from torch.fx.passes.infra.pass_base import PassResult
+from torch._export.utils import _get_shape_env_from_gm
+from torch.export import Dim, export
 
 
-SPEC = TosaSpecification.create_from_string("TOSA-1.1+FP+shape")
+class ConvModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 16, kernel_size=2, stride=3, padding=2)
 
-
-def _build_conv_graph(
-    target_op,
-    input_shape: tuple[int, ...],
-    weight_shape: tuple[int, ...],
-    padding: list[int],
-    stride: list[int],
-    dilation: list[int],
-) -> GraphModule:
-    with TosaLoweringContext(SPEC):
-        builder = GraphBuilder()
-        input_tensor = builder.placeholder("input", torch.randn(input_shape))
-        weight = builder.placeholder("weight", torch.randn(weight_shape))
-        bias = builder.placeholder("bias", torch.randn(weight_shape[0]))
-        padding_shape = builder.call_operator(
-            exir_ops.backend.tosa.CONST_SHAPE.default, (padding,)
-        )
-        padding_shape.node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.SHAPE
-        conv = builder.call_operator(
-            target_op,
-            (input_tensor, weight, bias, stride, padding_shape, dilation),
-        )
-        builder.output([conv])
-        return ExportPass().call(builder.get_graph_module()).graph_module
-
-
-def _run_insert_dynamic_padding(graph_module: GraphModule) -> GraphModule:
-    with TosaLoweringContext(SPEC):
-        result = InsertDynamicPaddingPass()(graph_module)
-    assert isinstance(result, PassResult)
-    return result.graph_module
-
-
-def _assert_inserted_padding(
-    graph_module: GraphModule,
-    target_op,
-    zero_spatial_padding: list[int],
-    expected_full_padding_len: int,
-) -> None:
-    nodes = graph_module.graph.nodes
-    conv_node = next(n for n in nodes if n.target == target_op)
-    assert conv_node.args[4] == zero_spatial_padding
-
-    padding_node = next(
-        n for n in nodes if n.target == exir_ops.backend.tosa.PAD.default
-    )
-    padding_shape_node = padding_node.args[1]
-    assert padding_shape_node.target == exir_ops.backend.tosa.CONCAT_SHAPE.default
-
-    n_padding, spatial_padding, c_padding = padding_shape_node.args[0]
-    assert n_padding.meta["val"] == [0, 0]
-    assert spatial_padding.target == exir_ops.backend.tosa.CONST_SHAPE.default
-    assert c_padding.meta["val"] == [0, 0]
-
-    pad_list = padding_shape_node.meta["val"]
-    spatial_padding_value = spatial_padding.meta["val"]
-    assert len(pad_list) == expected_full_padding_len
-    assert pad_list[:2] == [0, 0]
-    assert pad_list[2:-2] == spatial_padding_value
-    assert pad_list[-2:] == [0, 0]
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.conv(x)
 
 
 def test_insert_dynamic_padding():
-    graph_module = _build_conv_graph(
-        exir_ops.backend.tosa.CONV2D.default,
-        input_shape=(1, 8, 8, 3),
-        weight_shape=(16, 2, 2, 3),
-        padding=[2, 2, 2, 2],
-        stride=[3, 3],
-        dilation=[1, 1],
-    )
-
-    graph_module = _run_insert_dynamic_padding(graph_module)
-
-    _assert_inserted_padding(
-        graph_module,
-        exir_ops.backend.tosa.CONV2D.default,
-        zero_spatial_padding=[0, 0, 0, 0],
-        expected_full_padding_len=8,
-    )
-
-
-def test_insert_dynamic_padding_conv3d():
-    graph_module = _build_conv_graph(
-        exir_ops.backend.tosa.CONV3D.default,
-        input_shape=(1, 8, 8, 8, 3),
-        weight_shape=(16, 2, 2, 2, 3),
-        padding=[2, 2, 2, 2, 2, 2],
-        stride=[3, 3, 3],
-        dilation=[1, 1, 1],
+    model = ConvModule()
+    example_inputs = (torch.randn(1, 3, 8, 8),)
+    ep = export(
+        model,
+        example_inputs,
+        dynamic_shapes={
+            "x": {2: Dim("height", min=4, max=10), 3: Dim("width", min=4, max=10)}
+        },
     )
+    edge_model = to_edge(ep)
+    shape_env = _get_shape_env_from_gm(edge_model.exported_program().graph_module)
+    with TosaLoweringContext(
+        TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env
+    ):
+        edge_model = edge_model.transform(
+            [RewriteConvPass(edge_model.exported_program())]
+        )
+        nodes = edge_model.exported_program().graph.nodes
+        conv_node = next(
+            n for n in nodes if n.target == exir_ops.backend.tosa.CONV2D.default
+        )
+        initial_padding = conv_node.args[4]
+        assert any(isinstance(p, torch.SymInt) for p in initial_padding)
 
-    graph_module = _run_insert_dynamic_padding(graph_module)
-
-    _assert_inserted_padding(
-        graph_module,
-        exir_ops.backend.tosa.CONV3D.default,
-        zero_spatial_padding=[0, 0, 0, 0, 0, 0],
-        expected_full_padding_len=10,
-    )
+        edge_model = edge_model.transform(
+            [
+                InsertDynamicPaddingPass(),
+            ]
+        )
+        nodes = edge_model.exported_program().graph.nodes
+        conv_node = next(
+            n for n in nodes if n.target == exir_ops.backend.tosa.CONV2D.default
+        )
+        padding = conv_node.args[4]
+        assert padding == [0, 0, 0, 0]
+        padding_node = next(
+            n for n in nodes if n.target == exir_ops.backend.tosa.PAD.default
+        )
+        assert padding_node is not None
+        pad_list = padding_node.args[1].meta["val"]
+        assert len(pad_list) == 8
+        assert pad_list[:2] == [0, 0]  # N-padding
+        assert pad_list[2:6] == initial_padding  # HW-padding in NHWC order
+        assert pad_list[6:] == [0, 0]  # C-padding
diff --git a/backends/arm/test/passes/test_rewrite_conv_pass.py b/backends/arm/test/passes/test_rewrite_conv_pass.py
index 736aa685b86..fc8478afee5 100644
--- a/backends/arm/test/passes/test_rewrite_conv_pass.py
+++ b/backends/arm/test/passes/test_rewrite_conv_pass.py
@@ -336,15 +336,11 @@ def test_rewrite_conv_dynamic_keeps_static_padding_when_symbolic_remainder_is_ze
     assert all(not isinstance(p, torch.SymInt) for p in padding)
 
 
-def test_rewrite_conv_adjust_pad_if_needed_static_allows_negative_padding_until_later_validation():
+def test_rewrite_conv_adjust_pad_if_needed_static_raises_before_negative_padding():
     rewrite_pass, _, _ = _make_rewrite_pass((torch.randn(1, 3, 9, 12),))
 
-    try:
+    with pytest.raises(RuntimeError, match="SizeAdjustInputPass"):
         rewrite_pass._adjust_pad_if_needed(6, 2, 3, 0, 1)
-    except RuntimeError as e:
-        assert "SizeAdjustInputPass" in str(e)
-    else:
-        pytest.fail("Expected RuntimeError was not raised")
 
 
 def test_rewrite_conv_adjust_pad_if_needed_static_positive_padding_stays_non_negative():
@@ -391,7 +387,7 @@ def test_rewrite_conv_adjust_pad_if_needed_symbolic_exact_zero_keeps_positive_pa
     assert adjusted_pad == 1
 
 
-def test_rewrite_conv_adjust_pad_if_needed_symbolic_positive_padding_range_returns_symbolic_padding():
+def test_rewrite_conv_adjust_pad_if_needed_symbolic_positive_padding_range_raises_before_negative_padding():
     rewrite_pass, shape_env, input_len = _make_rewrite_pass(
         (torch.randn(1, 3, 8, 8),),
         dynamic_shapes={
@@ -403,9 +399,8 @@ def test_rewrite_conv_adjust_pad_if_needed_symbolic_positive_padding_range_retur
     with TosaLoweringContext(
         TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env
     ):
-        adjusted_pad = rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 1, 1)
-
-    assert isinstance(adjusted_pad, torch.SymInt)
+        with pytest.raises(RuntimeError, match="SizeAdjustInputPass"):
+            rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 1, 1)
 
 
 def test_rewrite_conv_symbolic_comparison_with_int_specializes_to_hint():
@@ -443,12 +438,11 @@ def unsafe_adjust(input_len, input_weight, stride, pad, dilation):
     with TosaLoweringContext(
         TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env
     ):
-        adjusted_pad = rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1)
-
-    assert isinstance(adjusted_pad, torch.SymInt)
+        with pytest.raises(RuntimeError, match="SizeAdjustInputPass"):
+            rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1)
 
 
-def test_rewrite_conv_adjust_pad_if_needed_symbolic_zero_padding_range_returns_symbolic_padding():
+def test_rewrite_conv_adjust_pad_if_needed_symbolic_zero_padding_range_raises_before_negative_padding():
     rewrite_pass, shape_env, input_len = _make_rewrite_pass(
         (torch.randn(1, 3, 8, 8),),
         dynamic_shapes={
@@ -457,22 +451,8 @@ def test_rewrite_conv_adjust_pad_if_needed_symbolic_zero_padding_range_returns_s
         },
     )
 
-    with TosaLoweringContext(
-        TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env
-    ):
-        adjusted_pad = rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1)
-
-    assert isinstance(adjusted_pad, torch.SymInt)
-
-
-def test_rewrite_conv_adjust_pad_if_needed_symbolic_singleton_overflow_still_raises():
-    rewrite_pass, shape_env, input_len = _make_rewrite_pass(
-        (torch.randn(1, 3, 9, 12),),
-        dynamic_shapes=_multiples_of_three_dynamic_shapes(),
-    )
-
     with TosaLoweringContext(
         TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env
     ):
         with pytest.raises(RuntimeError, match="SizeAdjustInputPass"):
-            rewrite_pass._adjust_pad_if_needed(input_len, 3, 3, 1, 1)
+            rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1)
diff --git a/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py b/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py
index f89872f93b8..572a2b247e9 100644
--- a/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py
+++ b/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py
@@ -9,15 +9,12 @@
 import torch
 from executorch.backends.arm._passes.rewrite_mxfp_linear import RewriteMXFPLinearPass
 from executorch.backends.arm.ao_ext import MXFPOpConfig, to_mxfp
-from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str
-from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.backends.arm.tosa.specification import (
     TosaLoweringContext,
     TosaSpecification,
 )
 from executorch.exir.dialects._ops import ops as exir_ops
 from torch.export import export
-from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3
 
 
 class _LinearModule(torch.nn.Module):
@@ -52,11 +49,9 @@ def _get_nodes_from_target(
     ]
 
 
-def _rewrite_linear_module(
-    config: MXFPOpConfig,
-) -> tuple[torch.fx.GraphModule, list[torch.fx.Node], list[torch.fx.Node]]:
+def test_rewrite_mxfp_linear_replaces_custom_op() -> None:
     model = _LinearModule(bias=True).eval()
-    to_mxfp(model, config, filter_fn=_is_linear)
+    to_mxfp(model, MXFPOpConfig(), filter_fn=_is_linear)
     exported = export(model, (torch.randn(4, 5, 32),), strict=False)
     tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
 
@@ -71,11 +66,6 @@ def _rewrite_linear_module(
     matmul_nodes = _get_nodes_from_target(
         graph_module, exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default
     )
-    return graph_module, cast_nodes, matmul_nodes
-
-
-def test_rewrite_mxfp_linear_replaces_custom_op() -> None:
-    graph_module, cast_nodes, matmul_nodes = _rewrite_linear_module(MXFPOpConfig())
 
     assert (
         len(_get_nodes_from_target(graph_module, torch.ops.tosa_mxfp.linear.default))
@@ -98,34 +88,6 @@ def test_rewrite_mxfp_linear_replaces_custom_op() -> None:
     assert tuple(output_node.meta["val"][0].shape) == (4, 5, 8)
 
 
-def test_rewrite_mxfp6_linear_marks_payload_dtype() -> None:
-    graph_module, cast_nodes, matmul_nodes = _rewrite_linear_module(
-        MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3)
-    )
-    cast_node = cast_nodes[0]
-    matmul_node = matmul_nodes[0]
-    input_qdata_node = next(
-        node
-        for node in graph_module.graph.nodes
-        if node.op == "call_function"
-        and node.target == operator.getitem
-        and node.args[0] == cast_node
-        and node.args[1] == 0
-    )
-    weight_qdata_node = matmul_node.args[2]
-    assert isinstance(weight_qdata_node, torch.fx.Node)
-
-    assert cast_node.kwargs["output_dtype"] == mxfp_dtype_to_str(DTYPE_FP6_E2M3)
-    assert matmul_node.kwargs["payload_dtype"] == mxfp_dtype_to_str(DTYPE_FP6_E2M3)
-    assert tuple(cast_node.meta["val"][0].shape) == (1, 4 * 5, 32)
-    assert (
-        input_qdata_node.meta[TosaSpecialDtype.meta_key()] == TosaSpecialDtype.FP6E2M3
-    )
-    assert (
-        weight_qdata_node.meta[TosaSpecialDtype.meta_key()] == TosaSpecialDtype.FP6E2M3
-    )
-
-
 def test_rewrite_mxfp_dual_linear() -> None:
     model = _DualLinearModule().eval()
     to_mxfp(model, MXFPOpConfig(), filter_fn=_is_linear)
diff --git a/backends/arm/test/passes/test_symbolic_value_range.py b/backends/arm/test/passes/test_symbolic_value_range.py
index 99dfafc93a6..7a6ecfdf79c 100644
--- a/backends/arm/test/passes/test_symbolic_value_range.py
+++ b/backends/arm/test/passes/test_symbolic_value_range.py
@@ -68,16 +68,3 @@ def test_evaluate_symbolic_expr_values_bails_out_for_large_symbol_ranges() -> No
     shape_env, symint = _make_shape_env(hint=3, compiler_min=1, compiler_max=400)
 
     assert evaluate_symbolic_expr_values(symint, shape_env) is None
-
-
-def test_evaluate_symbolic_expr_values_does_not_require_shape_env_bounds(
-    monkeypatch,
-) -> None:
-    shape_env, symint = _make_shape_env(hint=3, compiler_min=2, compiler_max=6)
-
-    def raise_recursion(_expr):
-        raise RecursionError
-
-    monkeypatch.setattr(shape_env, "bound_sympy", raise_recursion)
-
-    assert evaluate_symbolic_expr_values(symint, shape_env) == {2, 3, 4, 5, 6}
diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py
index 9a63452e325..ff26d17ee13 100644
--- a/backends/arm/test/runner_utils.py
+++ b/backends/arm/test/runner_utils.py
@@ -2,10 +2,10 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+
 import importlib.resources as _resources
 import json
 import logging
-import numbers
 import os
 import re
 import shutil
@@ -14,11 +14,13 @@
 import tempfile
 from collections.abc import Iterable
 from pathlib import Path
+
 from types import NoneType
 from typing import Any, cast, Dict, List, Optional, Tuple
 
 import executorch.backends.arm.test as arm_test_package
 import executorch.backends.arm.tosa.schemas as tosa_schemas_package
+
 import numpy as np
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
@@ -29,6 +31,7 @@
     NNHWC_INVERSE_ORDER,
     NNHWC_ORDER,
 )
+
 from executorch.backends.arm.ethosu import EthosUCompileSpec
 from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
 from executorch.backends.arm.tosa.specification import Tosa_1_00, TosaSpecification
@@ -40,6 +43,7 @@
 from executorch.exir import ExecutorchProgramManager, ExportedProgram
 from executorch.exir.lowered_backend_module import LoweredBackendModule
 from torch.fx.node import Node
+
 from torch.overrides import TorchFunctionMode
 from tosa.TosaGraph import TosaGraph  # type: ignore[import-not-found, import-untyped]
 
@@ -75,7 +79,6 @@
     "corstone-320",
     "vkml_emulation_layer",
 }
-INFER_SHAPES_PATH = "infer_shapes"
 
 
 class QuantizationParams:
@@ -99,9 +102,7 @@ def __init__(
         self.dtype = dtype
 
 
-def get_input_names(
-    program: ExportedProgram, is_lowered_module: bool = False
-) -> list[str]:
+def get_input_names(program: ExportedProgram) -> list[str]:
     """Get a list[str] with the names of the inputs to this model.
 
     Args:
@@ -110,15 +111,7 @@ def get_input_names(
         A list of strings with the names of the model input.
 
     """
-
-    if not is_lowered_module:
-        return [spec.arg.name for spec in program.graph_signature.input_specs]
-    else:
-        return [
-            user_input
-            for user_input in program.graph_signature.user_inputs
-            if isinstance(user_input, str)
-        ]
+    return [spec.arg.name for spec in program.graph_signature.input_specs]
 
 
 def get_input_quantization_params(
@@ -211,59 +204,25 @@ def torch_tensor_to_numpy(tensor: torch.Tensor) -> np.ndarray:
     return tensor.numpy()
 
 
-def torch_tensor_to_tosa_shape(tensor: torch.Tensor) -> list[int]:
-    shape = list(tensor.shape)
-    dim_order = tensor.dim_order()
-    if dim_order in (NHWC_ORDER, NNHWC_ORDER):
-        shape = [shape[index] for index in dim_order]
-    return [int(dim) for dim in shape]
-
-
-def user_inputs_need_shape_inference(program: ExportedProgram) -> bool:
-    user_inputs = {
-        user_input
-        for user_input in program.graph_signature.user_inputs
-        if isinstance(user_input, str)
-    }
-    for node in program.graph.nodes:
-        if node.op != "placeholder" or node.name not in user_inputs:
-            continue
-        input_tensor = get_first_fake_tensor(node)
-        if any(not isinstance(dim, numbers.Integral) for dim in input_tensor.shape):
-            return True
-    return False
-
-
 def numpy_to_torch_tensor(array: np.ndarray, output_node: Node) -> torch.Tensor:
     output_tensor = get_first_fake_tensor(output_node)
     shape = output_tensor.shape
     dim_order = output_tensor.dim_order()
-
-    def is_concrete_shape(shape_like) -> bool:
-        return all(isinstance(dim, numbers.Integral) for dim in shape_like)
-
-    def to_torch_tensor() -> torch.Tensor:
-        if array.dtype.type is np.void:
-            # If dtype is void, "cheat" and use the output_tensor dtype.
-            return torch.frombuffer(array, dtype=output_tensor.dtype)
-        return torch.from_numpy(array)
-
     if dim_order == NHWC_ORDER:
-        tensor = to_torch_tensor()
-        if is_concrete_shape(shape):
-            tensor = tensor.reshape([shape[i] for i in NHWC_ORDER])
+        shape_with_dim_order = [shape[i] for i in NHWC_ORDER]
+        tensor = torch.from_numpy(array).reshape(shape_with_dim_order)
         return tensor.permute(NHWC_INVERSE_ORDER).to(memory_format=torch.channels_last)
     elif dim_order == NNHWC_ORDER:
-        tensor = to_torch_tensor()
-        if is_concrete_shape(shape):
-            tensor = tensor.reshape([shape[i] for i in NNHWC_ORDER])
-        return tensor.permute(NNHWC_INVERSE_ORDER)
+        shape_with_dim_order = [shape[i] for i in NNHWC_ORDER]
+        tensor = torch.from_numpy(array).reshape(shape_with_dim_order)
+        return tensor.permute(NNHWC_INVERSE_ORDER).to(memory_format=torch.channels_last)
     else:
-        tensor = to_torch_tensor()
-
-        if is_concrete_shape(shape):
-            return tensor.reshape(shape)
-        return tensor
+        if array.dtype.type is np.void:
+            # If dtype is void, "cheat" and use the output_tensor dtype.
+            tensor = torch.frombuffer(array, dtype=output_tensor.dtype)
+        else:
+            tensor = torch.from_numpy(array)
+        return tensor.reshape(shape)
 
 
 class TosaReferenceModelDispatch(TorchFunctionMode):
@@ -275,65 +234,12 @@ def __init__(self):
         self.ran_tosa_dispatch = False
         super().__init__()
 
-    def _generate_shape_inference_json(
-        self,
-        tosa_buffer: bytes,
-        artifact_path: Path,
-        test_case_path: Path,
-        input_names: list[str],
-        inputs: Tuple[torch.Tensor, ...],
-    ):
-        shapes = dict(
-            zip(input_names, [torch_tensor_to_tosa_shape(input) for input in inputs])
-        )
-        with open(test_case_path, "w", encoding="utf-8") as f:
-            json.dump({"tosa_file": str(artifact_path), "shapes": shapes}, f, indent=2)
-
-    def _run_infer_shapes(
-        self,
-        tosa_buffer: bytes,
-        input_names: list[str],
-        inputs: Tuple[torch.Tensor, ...],
-        temp_dir_path: Path,
-        infer_shapes_path: str = INFER_SHAPES_PATH,
-    ) -> bytes:
-        model_suffix = "model.tosa"
-        tosa_sym_int_model = temp_dir_path / model_suffix
-        tosa_sym_int_model.write_bytes(tosa_buffer)
-        test_case_file = temp_dir_path / "test_case.json"
-
-        self._generate_shape_inference_json(
-            tosa_buffer, tosa_sym_int_model, test_case_file, input_names, inputs
-        )
-        subprocess.run(
-            [
-                infer_shapes_path,
-                f"{test_case_file}",
-            ],
-            check=True,
-            capture_output=True,
-            text=True,
-        )  # nosec
-        resolved_file = temp_dir_path / f"resolved_{model_suffix}"
-        with open(resolved_file, "rb") as f:
-            return f.read()
-
     def _tosa_dispatch(self, lowered_backend_module: LoweredBackendModule, inputs):
         tosa_buffer = lowered_backend_module.processed_bytes
         compile_spec = TosaCompileSpec._from_list(lowered_backend_module.compile_specs)
-        tosa_spec = compile_spec.tosa_spec
-        output_node = lowered_backend_module.original_module.graph.output_node()
-        if tosa_spec.support_extension("shape") and user_inputs_need_shape_inference(
-            lowered_backend_module.original_module
-        ):
-            input_names = get_input_names(lowered_backend_module.original_module, True)
-            # Generate json file for shape inference extension, which is required by the reference model.
-            with tempfile.TemporaryDirectory() as temp_dir:
-                tosa_buffer = self._run_infer_shapes(
-                    tosa_buffer, input_names, inputs, Path(temp_dir)
-                )
 
-        return run_tosa_graph(tosa_buffer, tosa_spec, inputs, output_node)
+        output_node = lowered_backend_module.original_module.graph.output_node()
+        return run_tosa_graph(tosa_buffer, compile_spec.tosa_spec, inputs, output_node)
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         super().__exit__(exc_type, exc_val, exc_tb)
@@ -376,7 +282,7 @@ def __torch_function__(self, func, types, args=..., kwargs=None):
 
 def run_target(
     executorch_program_manager: ExecutorchProgramManager,
-    inputs: Tuple[torch.Tensor, ...],
+    inputs: Tuple[torch.Tensor],
     intermediate_path: str | Path,
     target_board: str,
     elf_path: str | Path,
@@ -404,7 +310,7 @@ def run_target(
 
 def save_inputs_to_file(
     exported_program: ExportedProgram,
-    inputs: Tuple[torch.Tensor, ...],
+    inputs: Tuple[torch.Tensor],
     intermediate_path: str | Path,
 ):
     input_file_paths: list[str] = []
@@ -436,7 +342,7 @@ def get_output_from_file(
 
 def run_vkml_emulation_layer(
     executorch_program_manager: ExecutorchProgramManager,
-    inputs: Tuple[torch.Tensor, ...],
+    inputs: Tuple[torch.Tensor],
     intermediate_path: str | Path,
     elf_path: str | Path,
 ):
@@ -484,7 +390,7 @@ def run_vkml_emulation_layer(
 
 def run_corstone(
     executorch_program_manager: ExecutorchProgramManager,
-    inputs: Tuple[torch.Tensor, ...],
+    inputs: Tuple[torch.Tensor],
     intermediate_path: str | Path,
     target_board: str,
     elf_path: str | Path,
diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl
index d321766e8d8..4df310f6dc1 100644
--- a/backends/arm/test/targets.bzl
+++ b/backends/arm/test/targets.bzl
@@ -23,7 +23,6 @@ def define_arm_tests():
         "ops/test_log10.py",
         "ops/test_max_pool1d.py",
         "ops/test_mul.py",
-        "ops/test_mxfp_conv2d.py",
         "ops/mxfp/test_mxfp_linear.py",
         "ops/test_permute.py",
         "ops/test_rsqrt.py",
@@ -58,14 +57,12 @@ def define_arm_tests():
         # "misc/test_evaluate_model.py",
         "misc/test_pass_pipeline_config.py",
         "misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py",
-        "misc/tosa_dialect/test_tosa_dialect_mxfp_conv2d.py",
         "misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py",
         "misc/tosa_dialect/test_tosa_resize.py",
         "misc/test_tosa_spec.py",
         "misc/test_bn_relu_folding_qat.py",
         "misc/test_custom_partition.py",
         "misc/test_debug_hook.py",
-        "misc/test_mxfp_conv2d_ao.py",
         "misc/test_mxfp_linear_ao.py",
         "misc/test_post_quant_device_switch.py",
         "misc/test_vgf_check_env.py",
diff --git a/backends/arm/tosa/dialect/__init__.py b/backends/arm/tosa/dialect/__init__.py
index 4d059b64efe..0585f7a1ff8 100644
--- a/backends/arm/tosa/dialect/__init__.py
+++ b/backends/arm/tosa/dialect/__init__.py
@@ -11,7 +11,6 @@
     binary_elementwise,
     cast_to_block_scaled,
     conv2d,
-    conv2d_block_scaled,
     conv3d,
     custom,
     data_layout_ops,
diff --git a/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py b/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py
index 8dbff7c11c5..ed109be6124 100644
--- a/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py
+++ b/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py
@@ -5,28 +5,24 @@
 
 from __future__ import annotations
 
-from typing import cast
-
 import torch
 
-from executorch.backends.arm.ao_ext.mxfp import mxfp_str_to_dtype
 from executorch.backends.arm.tosa.dialect.lib import TosaValueError
 from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op
 from executorch.backends.arm.tosa.specification import (
     get_context_spec,
     TosaSpecification,
 )
-from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2
 
 
 @register_fake_tosa_op(
-    "CAST_TO_BLOCK_SCALED(Tensor input, SymInt block_size, str output_dtype) -> (Tensor, Tensor)",
+    "CAST_TO_BLOCK_SCALED(Tensor input, SymInt block_size, ScalarType output_dtype) -> (Tensor, Tensor)",
     [TosaSpecification.create_from_string("TOSA-1.1+FP")],
 )
 def CAST_TO_BLOCK_SCALED(
     input: torch.Tensor,
     block_size: int,
-    output_dtype: str,
+    output_dtype: torch.dtype,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     tosa_spec = get_context_spec()
 
@@ -66,25 +62,12 @@ def CAST_TO_BLOCK_SCALED(
         )
 
     scale_tensor_dtype = torch.float8_e8m0fnu
-    elem_dtype = mxfp_str_to_dtype(output_dtype)
-    if elem_dtype not in (
-        torch.float4_e2m1fn_x2,
-        DTYPE_FP6_E2M3,
-        DTYPE_FP6_E3M2,
-        torch.float8_e4m3fn,
-        torch.float8_e5m2,
-    ):
+    if output_dtype not in (torch.float8_e4m3fn, torch.float8_e5m2):
         raise TosaValueError(
             f"Unsupported block-scaled output dtype {output_dtype}",
             op="CAST_TO_BLOCK_SCALED",
         )
     scale_shape = (*input.shape[:-1], input.shape[-1] // block_size)
-    if elem_dtype == torch.float4_e2m1fn_x2:
-        output_shape = (*input.shape[:-1], input.shape[-1] // 2)
-        output_data = input.new_empty(output_shape, dtype=torch.uint8)
-    elif elem_dtype in (DTYPE_FP6_E2M3, DTYPE_FP6_E3M2):
-        output_data = input.new_empty(input.shape, dtype=torch.uint8)
-    else:
-        output_data = torch.empty_like(input, dtype=cast(torch.dtype, elem_dtype))
+    output_data = torch.empty_like(input, dtype=output_dtype)
     output_scale = input.new_empty(scale_shape, dtype=scale_tensor_dtype)
     return output_data, output_scale
diff --git a/backends/arm/tosa/dialect/ops/conv2d.py b/backends/arm/tosa/dialect/ops/conv2d.py
index d0db2d60fcd..5af0ca1617a 100644
--- a/backends/arm/tosa/dialect/ops/conv2d.py
+++ b/backends/arm/tosa/dialect/ops/conv2d.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import math
 from typing import Optional
 
 import torch
@@ -88,23 +89,6 @@ def validate_conv2d_args_dtypes(  # noqa: C901
     return output_dtype
 
 
-def conv_output_dim(
-    input_dim: int | torch.SymInt,
-    kernel_dim: int,
-    stride: int,
-    pad_before: int | torch.SymInt,
-    pad_after: int | torch.SymInt,
-    dilation: int,
-) -> int | torch.SymInt:
-    receptive_field = dilation * (kernel_dim - 1) + 1
-    total_pad = pad_before + pad_after
-
-    if stride == 1:
-        return input_dim + total_pad - receptive_field + 1
-
-    return (input_dim + total_pad - receptive_field) // stride + 1
-
-
 @register_fake_tosa_op(
     "CONV2D(Tensor input, "
     "Tensor weight, "
@@ -126,14 +110,17 @@ def CONV2D(
 
     output_dtype = validate_conv2d_args_dtypes(tosa_spec, x, weight, bias, op="CONV2D")
 
+    torch_pad = [pad[0], pad[2]]
     N = x.shape[0]
-    H_in, W_in = x.shape[1:3]
     C_out = weight.shape[0]
-    H_out = conv_output_dim(
-        H_in, weight.shape[1], stride[0], pad[0], pad[1], dilation[0]
+    H_in, W_in = x.shape[1], x.shape[2]
+    H_out = math.floor(
+        (H_in + 2 * torch_pad[0] - dilation[0] * (weight.shape[1] - 1) - 1) / stride[0]
+        + 1
     )
-    W_out = conv_output_dim(
-        W_in, weight.shape[2], stride[1], pad[2], pad[3], dilation[1]
+    W_out = math.floor(
+        (W_in + 2 * torch_pad[1] - dilation[1] * (weight.shape[2] - 1) - 1) / stride[1]
+        + 1
     )
     output_shape = [N, H_out, W_out, C_out]
     return torch.empty(size=output_shape, dtype=output_dtype)
diff --git a/backends/arm/tosa/dialect/ops/conv3d.py b/backends/arm/tosa/dialect/ops/conv3d.py
index a81ae0dae53..67ceb0596c6 100644
--- a/backends/arm/tosa/dialect/ops/conv3d.py
+++ b/backends/arm/tosa/dialect/ops/conv3d.py
@@ -3,14 +3,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import math
 from typing import Optional
 
 import torch
 from executorch.backends.arm.tosa.dialect.lib import TosaValueError
-from executorch.backends.arm.tosa.dialect.ops.conv2d import (
-    conv_output_dim,
-    validate_conv2d_args_dtypes,
-)
+from executorch.backends.arm.tosa.dialect.ops.conv2d import validate_conv2d_args_dtypes
 from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op
 from executorch.backends.arm.tosa.specification import (
     get_context_spec,
@@ -37,7 +35,7 @@ def validate_conv3d_args_dtypes(
     "Tensor weight, "
     "Tensor bias, "
     "int[3] stride, "
-    "SymInt[6] pad, "
+    "int[6] pad, "
     "int[3] dilation) -> Tensor",
     TosaSpecification.all_versions_and_profiles(),
 )
@@ -46,24 +44,28 @@ def CONV3D(
     weight: torch.Tensor,
     bias: torch.Tensor,
     stride: list[int],
-    pad: list[int | torch.SymInt],
+    pad: list[int],
     dilation: list[int],
 ) -> torch.Tensor:
     tosa_spec = get_context_spec()
 
     output_dtype = validate_conv3d_args_dtypes(tosa_spec, x, weight, bias)
 
+    torch_pad = [pad[0], pad[2], pad[4]]
     N = x.shape[0]
     C_out = weight.shape[0]
-    D_in, H_in, W_in = x.shape[1:4]
-    D_out = conv_output_dim(
-        D_in, weight.shape[1], stride[0], pad[0], pad[1], dilation[0]
+    D_in, H_in, W_in = x.shape[1], x.shape[2], x.shape[3]
+    D_out = math.floor(
+        (D_in + 2 * torch_pad[0] - dilation[0] * (weight.shape[1] - 1) - 1) / stride[0]
+        + 1
     )
-    H_out = conv_output_dim(
-        H_in, weight.shape[2], stride[1], pad[2], pad[3], dilation[1]
+    H_out = math.floor(
+        (H_in + 2 * torch_pad[1] - dilation[1] * (weight.shape[2] - 1) - 1) / stride[1]
+        + 1
     )
-    W_out = conv_output_dim(
-        W_in, weight.shape[3], stride[2], pad[4], pad[5], dilation[2]
+    W_out = math.floor(
+        (W_in + 2 * torch_pad[2] - dilation[2] * (weight.shape[3] - 1) - 1) / stride[2]
+        + 1
     )
     output_shape = [N, D_out, H_out, W_out, C_out]
     return torch.empty(size=output_shape, dtype=output_dtype)
diff --git a/backends/arm/tosa/dialect/ops/depthwise_conv2d.py b/backends/arm/tosa/dialect/ops/depthwise_conv2d.py
index 83ef3ff72fb..ae864f29d62 100644
--- a/backends/arm/tosa/dialect/ops/depthwise_conv2d.py
+++ b/backends/arm/tosa/dialect/ops/depthwise_conv2d.py
@@ -3,11 +3,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import math
+
 import torch
-from executorch.backends.arm.tosa.dialect.ops.conv2d import (
-    conv_output_dim,
-    validate_conv2d_args_dtypes,
-)
+from executorch.backends.arm.tosa.dialect.ops.conv2d import validate_conv2d_args_dtypes
 from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op
 
 from executorch.backends.arm.tosa.specification import (
@@ -39,11 +38,17 @@ def DEPTHWISE_CONV2D(
         tosa_spec, x, weight, bias, op="DEPTHWISE_CONV2D"
     )
 
+    torch_pad = [pad[0], pad[2]]
+    # Weight format is [KH, KW, IC, M], where C_out = IC * M.
     kernel_h, kernel_w = weight.shape[0], weight.shape[1]
     C_out = weight.shape[2] * weight.shape[3]
     N = x.shape[0]
-    H_in, W_in = x.shape[1:3]
-    H_out = conv_output_dim(H_in, kernel_h, stride[0], pad[0], pad[1], dilation[0])
-    W_out = conv_output_dim(W_in, kernel_w, stride[1], pad[2], pad[3], dilation[1])
+    H_in, W_in = x.shape[1], x.shape[2]
+    H_out = math.floor(
+        (H_in + 2 * torch_pad[0] - dilation[0] * (kernel_h - 1) - 1) / stride[0] + 1
+    )
+    W_out = math.floor(
+        (W_in + 2 * torch_pad[1] - dilation[1] * (kernel_w - 1) - 1) / stride[1] + 1
+    )
     output_shape = [N, H_out, W_out, C_out]
     return torch.empty(size=output_shape, dtype=output_dtype)
diff --git a/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py b/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py
index fcea104320f..b42e2855e4c 100644
--- a/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py
+++ b/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py
@@ -7,11 +7,6 @@
 
 import torch
 
-from executorch.backends.arm.ao_ext.mxfp import (
-    mxfp_str_to_dtype,
-    MXFPDType,
-    SUPPORTED_MXFP_DTYPES,
-)
 from executorch.backends.arm.tosa.dialect.lib import TosaValueError
 from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op
 from executorch.backends.arm.tosa.specification import (
@@ -33,39 +28,18 @@ def _validate_block_size(block_size: int) -> None:
         )
 
 
-def _get_payload_dtype(
-    data: torch.Tensor,
-    payload_dtype: str = "",
-) -> MXFPDType:
-    if payload_dtype:
-        return mxfp_str_to_dtype(payload_dtype)
-    if data.dtype == torch.uint8:
-        return torch.float4_e2m1fn_x2
-    return data.dtype
-
-
-def _get_logical_last_dim(data: torch.Tensor, payload_dtype: str = "") -> int:
-    last_dim = data.shape[-1]
-    if _get_payload_dtype(data, payload_dtype) == torch.float4_e2m1fn_x2:
-        return last_dim * 2
-    return last_dim
-
-
 def _validate_dtypes(
     A_data: torch.Tensor,
     A_scale: torch.Tensor,
     B_data: torch.Tensor,
     B_scale: torch.Tensor,
-    payload_dtype: str = "",
 ) -> None:
-    A_dtype = _get_payload_dtype(A_data, payload_dtype)
-    B_dtype = _get_payload_dtype(B_data, payload_dtype)
-    if A_dtype not in SUPPORTED_MXFP_DTYPES:
+    if A_data.dtype not in (torch.float8_e4m3fn, torch.float8_e5m2):
         raise TosaValueError(
             f"Unsupported A_data dtype {A_data.dtype}",
             op="MATMUL_T_BLOCK_SCALED",
         )
-    if B_dtype != A_dtype:
+    if B_data.dtype != A_data.dtype:
         raise TosaValueError(
             f"B_data dtype {B_data.dtype} must match A_data dtype {A_data.dtype}",
             op="MATMUL_T_BLOCK_SCALED",
@@ -83,7 +57,6 @@ def _validate_shapes(
     B_data: torch.Tensor,
     B_scale: torch.Tensor,
     block_size: int,
-    payload_dtype: str = "",
 ) -> tuple[int, int, int]:
     if A_data.ndim != 3 or A_scale.ndim != 3 or B_data.ndim != 3 or B_scale.ndim != 3:
         raise TosaValueError(
@@ -91,10 +64,8 @@ def _validate_shapes(
             op="MATMUL_T_BLOCK_SCALED",
         )
 
-    N, H = A_data.shape[:2]
-    D, W = B_data.shape[:2]
-    C = _get_logical_last_dim(A_data, payload_dtype)
-    Cb = _get_logical_last_dim(B_data, payload_dtype)
+    N, H, C = A_data.shape
+    D, W, Cb = B_data.shape
     if C != Cb:
         raise TosaValueError(
             f"A_data last dim {C} must match B_data last dim {Cb}",
@@ -129,8 +100,7 @@ def _validate_shapes(
 
 
 @register_fake_tosa_op(
-    "MATMUL_T_BLOCK_SCALED(Tensor A_data, Tensor A_scale, Tensor B_data, "
-    "Tensor B_scale, SymInt block_size, str payload_dtype='') -> Tensor",
+    "MATMUL_T_BLOCK_SCALED(Tensor A_data, Tensor A_scale, Tensor B_data, Tensor B_scale, SymInt block_size) -> Tensor",
     [TosaSpecification.create_from_string("TOSA-1.1+FP")],
 )
 def MATMUL_T_BLOCK_SCALED(
@@ -139,7 +109,6 @@ def MATMUL_T_BLOCK_SCALED(
     B_data: torch.Tensor,
     B_scale: torch.Tensor,
     block_size: int,
-    payload_dtype: str = "",
 ) -> torch.Tensor:
     tosa_spec = get_context_spec()
 
@@ -150,13 +119,12 @@ def MATMUL_T_BLOCK_SCALED(
         )
 
     _validate_block_size(block_size)
-    _validate_dtypes(A_data, A_scale, B_data, B_scale, payload_dtype)
+    _validate_dtypes(A_data, A_scale, B_data, B_scale)
     output_shape = _validate_shapes(
         A_data,
         A_scale,
         B_data,
         B_scale,
         block_size,
-        payload_dtype,
     )
     return A_data.new_empty(output_shape, dtype=torch.float32)
diff --git a/backends/arm/tosa/mapping.py b/backends/arm/tosa/mapping.py
index 5e661676149..245a9c00235 100644
--- a/backends/arm/tosa/mapping.py
+++ b/backends/arm/tosa/mapping.py
@@ -35,9 +35,6 @@
 class TosaSpecialDtype(Enum):
     """Special TOSA dtypes not natively expressed in PyTorch."""
 
-    FP4E2M1 = ts.DType.FP4E2M1
-    FP6E2M3 = ts.DType.FP6E2M3
-    FP6E3M2 = ts.DType.FP6E3M2
     INT48 = ts.DType.INT48
     INT4 = ts.DType.INT4
     SHAPE = ts.DType.SHAPE
@@ -105,7 +102,6 @@ def map_dtype(data_type: torch.dtype) -> Any:
         torch.float8_e4m3fn: ts.DType.FP8E4M3,
         torch.float8_e5m2: ts.DType.FP8E5M2,
         torch.float8_e8m0fnu: ts.DType.FP8UE8M0,
-        torch.float4_e2m1fn_x2: ts.DType.FP4E2M1,
         torch.int8: ts.DType.INT8,
         # TOSA uses signless int8; unsigned semantics are expressed via RESCALE.
         torch.uint8: ts.DType.INT8,
@@ -160,10 +156,8 @@ def extract_tensor_meta(meta):
         raise ValueError(
             f"Expected first value in node.meta['val'] to be FakeTensor, got {val.__class__}"
         )
-    shape = tuple(val.size())
-    if special_dtype == TosaSpecialDtype.FP4E2M1 and val.dtype == torch.uint8:
-        shape = (*shape[:-1], shape[-1] * 2)
     dtype = map_dtype(val.dtype)
+    shape = tuple(val.size())
 
     return (dtype, shape)
 
@@ -255,15 +249,6 @@ def __validate(self, tosa_spec: TosaSpecification) -> bool:
                     or tosa_spec.support_extension("mxfp")
                 ):
                     return False
-            case ts.DType.FP4E2M1:
-                if not tosa_spec.support_extension("mxfp"):
-                    return False
-            case ts.DType.FP6E2M3:
-                if not tosa_spec.support_extension("mxfp"):
-                    return False
-            case ts.DType.FP6E3M2:
-                if not tosa_spec.support_extension("mxfp"):
-                    return False
 
         return True
 
diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py
index 8c4257e9472..37b9cd7cc2a 100644
--- a/backends/arm/tosa/partitioner.py
+++ b/backends/arm/tosa/partitioner.py
@@ -309,9 +309,7 @@ def _detag_boundary_nodes(
             elif detag_first_fp_node and not is_q_node and not is_dq_node:
                 # For non Q/DQ nodes, remove tag from first node in partition if any input has fp dtype
                 for input in node.all_input_nodes:
-                    if is_partitioned(input, tag) or isinstance(
-                        input.meta["val"], torch.SymInt
-                    ):
+                    if is_partitioned(input, tag):
                         continue
                     if get_first_fake_tensor(input).dtype.is_floating_point:
                         reporter.report_reject(
@@ -358,13 +356,7 @@ def _partition_has_invalid_uint8(self, partition: Partition, tag: str) -> bool:
                 if dtype is None:
                     try:
                         dtype = get_first_fake_tensor(node).dtype
-                    except (
-                        AttributeError,
-                        KeyError,
-                        RuntimeError,
-                        ValueError,
-                        TypeError,
-                    ):
+                    except (AttributeError, KeyError, RuntimeError, ValueError):
                         dtype = None
             if dtype is None:
                 continue
diff --git a/backends/arm/tosa/utils.py b/backends/arm/tosa/utils.py
index b1d727d7d01..b44793cec5f 100644
--- a/backends/arm/tosa/utils.py
+++ b/backends/arm/tosa/utils.py
@@ -164,10 +164,6 @@ def build_reshape_tosa(
 def normalize_symint(shape):
     """Dynamic shapes in executorch are represented with torch.SymInt objects in
     the shapes, in TOSA we do not have this concept and instead use -1.
-
-    This function replaces each symbolic dimension with -1. Static dimensions
-    are preserved unchanged.
-
     """
     removed_symints = tuple([-1 if isinstance(d, torch.SymInt) else d for d in shape])
     return list(removed_symints)
diff --git a/backends/arm/vgf/backend.py b/backends/arm/vgf/backend.py
index cc2e5a088f4..f062cdc90c6 100644
--- a/backends/arm/vgf/backend.py
+++ b/backends/arm/vgf/backend.py
@@ -14,12 +14,10 @@
 
 import logging
 import os  # nosec B404 - used alongside subprocess for tool invocation
-import shlex
 import shutil
 import subprocess  # nosec B404 - required to drive external converter CLI
 import tempfile
-from dataclasses import dataclass
-from typing import Any, final, List
+from typing import final, List
 
 from executorch.backends.arm._passes import RewriteConvPass
 from executorch.backends.arm._passes.arm_pass_manager import (
@@ -40,7 +38,7 @@
 )
 from executorch.backends.arm.vgf.model_converter import (  # type: ignore[import-not-found]
     model_converter_env,
-    require_model_converter_executable,
+    require_model_converter_binary,
 )
 from executorch.exir.backend.backend_details import (  # type: ignore[import-not-found]
     BackendDetails,
@@ -54,94 +52,6 @@
 # debug functionality
 logger = logging.getLogger(__name__)
 
-STATUS_OK = "PASS"
-STATUS_FAIL = "FAIL"
-VGF_BACKEND_NAME = "VgfBackend"
-
-
-@dataclass(frozen=True)
-class VgfRuntimeEnvironmentCheck:
-    """One VGF runtime backend environment preflight result.
-
-    This lives next to the Python VGF backend name and backend implementation,
-    while importing the actual ExecuTorch runtime lazily so AoT import behavior
-    remains unchanged.
-
-    """
-
-    name: str
-    status: str
-    detail: str
-    action: str | None = None
-
-    @property
-    def ok(self) -> bool:
-        return self.status != STATUS_FAIL
-
-    def to_dict(self) -> dict[str, str | None]:
-        return {
-            "name": self.name,
-            "status": self.status,
-            "detail": self.detail,
-            "action": self.action,
-        }
-
-
-def _load_runtime() -> Any:
-    from executorch.runtime import Runtime
-
-    return Runtime.get()
-
-
-def check_vgf_runtime_backend_environment() -> VgfRuntimeEnvironmentCheck:
-    """Check whether the installed runtime exposes the VGF backend."""
-
-    try:
-        runtime = _load_runtime()
-    except Exception as exc:
-        return VgfRuntimeEnvironmentCheck(
-            "VGF runtime backend",
-            STATUS_FAIL,
-            f"Could not initialize executorch.runtime.Runtime: {exc}",
-            "Install or rebuild ExecuTorch with runtime pybindings. For source "
-            "builds, enable the VGF runtime backend and reinstall the package.",
-        )
-
-    try:
-        registered_backend_names = list(
-            runtime.backend_registry.registered_backend_names
-        )
-        is_available = runtime.backend_registry.is_available(
-            backend_name=VGF_BACKEND_NAME
-        )
-    except Exception as exc:
-        return VgfRuntimeEnvironmentCheck(
-            "VGF runtime backend",
-            STATUS_FAIL,
-            f"Runtime backend registry query failed: {exc}",
-            "Reinstall or rebuild ExecuTorch with backend registry pybindings.",
-        )
-
-    if is_available:
-        return VgfRuntimeEnvironmentCheck(
-            "VGF runtime backend",
-            STATUS_OK,
-            f"{VGF_BACKEND_NAME} is available in the runtime backend registry.",
-        )
-
-    rendered = ", ".join(registered_backend_names[:20])
-    if len(registered_backend_names) > 20:
-        rendered += ", ..."
-
-    return VgfRuntimeEnvironmentCheck(
-        "VGF runtime backend",
-        STATUS_FAIL,
-        f"{VGF_BACKEND_NAME} is not available. Registered backends: "
-        f"{rendered or '<none>'}.",
-        "Use a runtime build/package that includes the VGF backend. For source "
-        "builds, configure with -DEXECUTORCH_BUILD_VGF=ON and reinstall.",
-    )
-
 
 def _register_grid_sampler_rewrite_pass() -> None:
     """Register VGF-only custom shader lowering passes."""
@@ -252,52 +162,6 @@ def preprocess(
         return PreprocessResult(processed_bytes=binary)
 
 
-def _format_repro_command(command: List[str]) -> str:
-    """Return a shell-safe command string for reproducing converter failures."""
-    return " ".join(shlex.quote(arg) for arg in command)
-
-
-def _copy_failure_artifacts(
-    tosa_path: str,
-    artifact_path: str | None,
-    tag_name: str,
-) -> str | None:
-    """Copy the failing TOSA input to the artifact directory, if configured.
-
-    Args:
-        tosa_path: Temporary TOSA flatbuffer passed to model-converter.
-        artifact_path: User-configured intermediate artifact directory.
-        tag_name: Optional delegation tag used to disambiguate artifacts.
-
-    Returns:
-        Path to the copied TOSA file, or None if no artifact path was configured.
-
-    """
-    if not artifact_path:
-        return None
-
-    os.makedirs(artifact_path, exist_ok=True)
-
-    suffix = f"_{tag_name}" if tag_name else ""
-    failure_tosa_path = os.path.join(
-        artifact_path,
-        f"failed_model_converter_input{suffix}.tosa",
-    )
-    shutil.copy2(tosa_path, failure_tosa_path)
-    return failure_tosa_path
-
-
-def _replace_converter_input_path(
-    conversion_command: List[str],
-    input_path: str,
-) -> List[str]:
-    """Return a converter command that uses a preserved TOSA input path."""
-    input_flag_index = conversion_command.index("-i")
-    repro_command = list(conversion_command)
-    repro_command[input_flag_index + 1] = input_path
-    return repro_command
-
-
 def vgf_compile(
     tosa_flatbuffer: bytes,
     compile_flags: List[str],
@@ -327,7 +191,7 @@ def vgf_compile(
             f.write(tosa_flatbuffer)
 
         compile_flags = [f for f in compile_flags if f and f.strip()]
-        converter_binary = str(require_model_converter_executable())
+        converter_binary = require_model_converter_binary()
         vgf_path = tosa_path + ".vgf"
         conversion_command = [
             converter_binary,
@@ -346,21 +210,11 @@ def vgf_compile(
                 env=model_converter_env(),
             )
         except subprocess.CalledProcessError as process_error:
-            failure_tosa_path = _copy_failure_artifacts(
-                tosa_path,
-                artifact_path,
-                tag_name,
-            )
-            repro_command = (
-                _replace_converter_input_path(conversion_command, failure_tosa_path)
-                if failure_tosa_path
-                else conversion_command
-            )
+            conversion_command_str = " ".join(conversion_command)
             raise RuntimeError(
-                "Vgf compiler failed.\n"
-                f"Repro command:\n  {_format_repro_command(repro_command)}\n"
-                f"Stderr:\n{process_error.stderr.decode()}\n"
-                f"Stdout:\n{process_error.stdout.decode()}"
+                f"Vgf compiler ('{conversion_command_str}') failed with error:\n"
+                f"{process_error.stderr.decode()}\n"
+                f"Stdout:\n{process_error.stdout.decode()}"
             )
 
         if artifact_path:
diff --git a/backends/arm/vgf/check_env.py b/backends/arm/vgf/check_env.py
index 2c7fb9c5396..576964df160 100644
--- a/backends/arm/vgf/check_env.py
+++ b/backends/arm/vgf/check_env.py
@@ -26,18 +26,25 @@
 import os
 import re
 import shutil
+import subprocess  # nosec B404 - invoked only for trusted local tools
 import sys
 from collections.abc import Sequence
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
-from executorch.backends.arm.vgf import model_converter
+from executorch.backends.arm.vgf.model_converter import (
+    find_model_converter_binary,
+    model_converter_env,
+)
+
 
 STATUS_OK = "PASS"
 STATUS_WARN = "WARN"
 STATUS_FAIL = "FAIL"
 
+VGF_BACKEND_NAME = "VgfBackend"
+
 _REQUIRED_VKML_INSTANCE_LAYERS = {
     "VK_LAYER_ML_Graph_Emulation",
     "VK_LAYER_ML_Tensor_Emulation",
@@ -209,17 +216,6 @@ def _format_check(check: VgfEnvironmentCheck) -> str:
     return "\n".join(lines)
 
 
-def _as_environment_check(check: Any) -> VgfEnvironmentCheck:
-    """Convert a module-owned preflight result into the CLI report type."""
-
-    return VgfEnvironmentCheck(
-        check.name,
-        check.status,
-        check.detail,
-        getattr(check, "action", None),
-    )
-
-
 def _repo_root() -> Path:
     resolved = Path(__file__).resolve()
     for parent in resolved.parents:
@@ -301,22 +297,165 @@ def _check_tosa_serializer() -> VgfEnvironmentCheck:
     )
 
 
+def _resolve_executable(binary: str) -> Path | None:
+    path = Path(binary)
+    if path.is_absolute() or path.parent != Path("."):
+        if _safe_is_file(path) and os.access(path, os.X_OK):
+            return path
+        return None
+
+    resolved = shutil.which(binary)
+    if resolved:
+        return Path(resolved)
+    return None
+
+
+def _command_output(result: subprocess.CompletedProcess[str]) -> str:
+    text = "\n".join(
+        part.strip() for part in (result.stdout, result.stderr) if part.strip()
+    )
+    lines = text.splitlines()
+    if not lines:
+        return "<no output>"
+    return "\n".join(lines[:4])
+
+
 def _check_model_converter() -> VgfEnvironmentCheck:
-    """Convert a module-owned preflight result into the CLI report type."""
-    return _as_environment_check(model_converter.check_model_converter_environment())
+    binary = find_model_converter_binary()
+    if binary is None:
+        return VgfEnvironmentCheck(
+            "MLSDK model converter",
+            STATUS_FAIL,
+            "Could not find model-converter on PATH and MODEL_CONVERTER_PATH "
+            "does not point to an executable file.",
+            "Install VGF AoT dependencies with "
+            "python -m pip install 'executorch[vgf]' or, in a source checkout, "
+            "python -m pip install -r backends/arm/requirements-arm-vgf.txt. "
+            "Alternatively set MODEL_CONVERTER_PATH to the converter executable.",
+        )
+
+    executable = _resolve_executable(binary)
+    if executable is None:
+        return VgfEnvironmentCheck(
+            "MLSDK model converter",
+            STATUS_FAIL,
+            f"Resolved converter candidate {binary!r}, but it is not executable.",
+            "Fix MODEL_CONVERTER_PATH or place model-converter on PATH.",
+        )
+
+    try:
+        result = subprocess.run(  # nosec B603 - local converter executable
+            [str(executable), "--version"],
+            check=False,
+            capture_output=True,
+            text=True,
+            timeout=20,
+            env=model_converter_env(),
+        )
+    except Exception as exc:
+        return VgfEnvironmentCheck(
+            "MLSDK model converter",
+            STATUS_FAIL,
+            f"Found {executable}, but running '--version' failed: {exc}",
+            "Check MODEL_CONVERTER_LIB_DIR and the process loader paths. "
+            "For source setup, source examples/arm/arm-scratch/setup_path.sh.",
+        )
+
+    if result.returncode != 0:
+        return VgfEnvironmentCheck(
+            "MLSDK model converter",
+            STATUS_FAIL,
+            f"{executable} --version exited with {result.returncode}:\n"
+            f"{_command_output(result)}",
+            "Check that the model-converter binary and its shared libraries are "
+            "from the same MLSDK install.",
+        )
+
+    return VgfEnvironmentCheck(
+        "MLSDK model converter",
+        STATUS_OK,
+        f"{executable} --version succeeded:\n{_command_output(result)}",
+    )
 
 
 def _check_model_converter_lib_dir() -> VgfEnvironmentCheck:
-    """Convert a module-owned preflight result into the CLI report type."""
-    return _as_environment_check(
-        model_converter.check_model_converter_lib_dir_environment()
+    lib_dir = os.environ.get("MODEL_CONVERTER_LIB_DIR")
+    if not lib_dir:
+        return VgfEnvironmentCheck(
+            "MODEL_CONVERTER_LIB_DIR",
+            STATUS_OK,
+            "MODEL_CONVERTER_LIB_DIR is not set; relying on the process loader "
+            "paths. This is OK when model-converter --version succeeds.",
+        )
+
+    path = Path(lib_dir).expanduser()
+    if _safe_is_dir(path):
+        return VgfEnvironmentCheck(
+            "MODEL_CONVERTER_LIB_DIR",
+            STATUS_OK,
+            f"MODEL_CONVERTER_LIB_DIR points to existing directory: {path}",
+        )
+
+    return VgfEnvironmentCheck(
+        "MODEL_CONVERTER_LIB_DIR",
+        STATUS_FAIL,
+        f"MODEL_CONVERTER_LIB_DIR={lib_dir!r} does not exist or is not a directory.",
+        "Unset MODEL_CONVERTER_LIB_DIR or set it to the converter library directory.",
     )
 
 
+def _load_runtime() -> Any:
+    from executorch.runtime import Runtime
+
+    return Runtime.get()
+
+
 def _check_runtime_vgf_backend() -> VgfEnvironmentCheck:
-    from executorch.backends.arm.vgf import backend as vgf_backend
+    try:
+        runtime = _load_runtime()
+    except Exception as exc:
+        return VgfEnvironmentCheck(
+            "VGF runtime backend",
+            STATUS_FAIL,
+            f"Could not initialize executorch.runtime.Runtime: {exc}",
+            "Install or rebuild ExecuTorch with runtime pybindings. For source "
+            "builds, enable the VGF runtime backend and reinstall the package.",
+        )
+
+    try:
+        registered_backend_names = list(
+            runtime.backend_registry.registered_backend_names
+        )
+        is_available = runtime.backend_registry.is_available(
+            backend_name=VGF_BACKEND_NAME
+        )
+    except Exception as exc:
+        return VgfEnvironmentCheck(
+            "VGF runtime backend",
+            STATUS_FAIL,
+            f"Runtime backend registry query failed: {exc}",
+            "Reinstall or rebuild ExecuTorch with backend registry pybindings.",
+        )
+
+    if is_available:
+        return VgfEnvironmentCheck(
+            "VGF runtime backend",
+            STATUS_OK,
+            f"{VGF_BACKEND_NAME} is available in the runtime backend registry.",
+        )
 
-    return _as_environment_check(vgf_backend.check_vgf_runtime_backend_environment())
+    rendered = ", ".join(registered_backend_names[:20])
+    if len(registered_backend_names) > 20:
+        rendered += ", ..."
+
+    return VgfEnvironmentCheck(
+        "VGF runtime backend",
+        STATUS_FAIL,
+        f"{VGF_BACKEND_NAME} is not available. Registered backends: "
+        f"{rendered or '<none>'}.",
+        "Use a runtime build/package that includes the VGF backend. For source "
+        "builds, configure with -DEXECUTORCH_BUILD_VGF=ON and reinstall.",
+    )
 
 
 def _package_dirs(package: str) -> list[Path]:
diff --git a/backends/arm/vgf/model_converter.py b/backends/arm/vgf/model_converter.py
index d76abbbcdf6..2d3868837b1 100644
--- a/backends/arm/vgf/model_converter.py
+++ b/backends/arm/vgf/model_converter.py
@@ -1,4 +1,4 @@
-# Copyright 2025-2026 Arm Limited and/or its affiliates.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -6,45 +6,12 @@
 from __future__ import annotations
 
 import os
-import subprocess  # nosec B404 - invoked only for trusted local converter tools
-from dataclasses import dataclass
-from pathlib import Path
 from shutil import which
 from typing import Optional
 
 MODEL_CONVERTER_BINARY = "model-converter"
 _MODEL_CONVERTER_FALLBACK_BINARY = "model_converter"
 
-STATUS_OK = "PASS"
-STATUS_FAIL = "FAIL"
-
-
-@dataclass(frozen=True)
-class ModelConverterEnvironmentCheck:
-    """One model-converter environment preflight result.
-
-    This lives in the same module that resolves and launches the converter so
-    the standalone VGF preflight CLI cannot drift from the actual compiler path.
-
-    """
-
-    name: str
-    status: str
-    detail: str
-    action: str | None = None
-
-    @property
-    def ok(self) -> bool:
-        return self.status != STATUS_FAIL
-
-    def to_dict(self) -> dict[str, str | None]:
-        return {
-            "name": self.name,
-            "status": self.status,
-            "detail": self.detail,
-            "action": self.action,
-        }
-
 
 def find_model_converter_binary() -> Optional[str]:
     """Return the path/name of the first model converter executable found."""
@@ -58,20 +25,6 @@ def find_model_converter_binary() -> Optional[str]:
     return None
 
 
-def _safe_is_file(path: Path) -> bool:
-    try:
-        return path.is_file()
-    except OSError:
-        return False
-
-
-def _safe_is_dir(path: Path) -> bool:
-    try:
-        return path.is_dir()
-    except OSError:
-        return False
-
-
 def model_converter_env() -> dict[str, str]:
     """Return an env dict suitable for running model-converter as a subprocess.
 
@@ -99,134 +52,3 @@ def require_model_converter_binary() -> str:
             f"Tried: {tried}. Ensure the Model Converter is installed and on PATH."
         )
     return binary
-
-
-def resolve_model_converter_executable(binary: str) -> Path | None:
-    """Resolve a converter candidate to an executable path, if possible.
-
-    This is shared by the VGF compiler path and the preflight checker so both
-    agree on what a usable converter executable means.
-
-    """
-
-    path = Path(binary)
-    if path.is_absolute() or path.parent != Path("."):
-        if _safe_is_file(path) and os.access(path, os.X_OK):
-            return path
-        return None
-
-    resolved = which(binary)
-    if resolved:
-        return Path(resolved)
-    return None
-
-
-def require_model_converter_executable() -> Path:
-    """Return a usable converter executable path or raise a helpful error."""
-
-    binary = require_model_converter_binary()
-    executable = resolve_model_converter_executable(binary)
-    if executable is None:
-        raise RuntimeError(
-            f"Resolved converter candidate {binary!r}, but it is not executable. "
-            "Fix MODEL_CONVERTER_PATH or place model-converter on PATH."
-        )
-    return executable
-
-
-def _command_output(result: subprocess.CompletedProcess[str]) -> str:
-    text = "\n".join(
-        part.strip() for part in (result.stdout, result.stderr) if part.strip()
-    )
-    lines = text.splitlines()
-    if not lines:
-        return "<no output>"
-    return "\n".join(lines[:4])
-
-
-def check_model_converter_environment() -> ModelConverterEnvironmentCheck:
-    """Check the model-converter dependency used by VGF compilation."""
-
-    binary = find_model_converter_binary()
-    if binary is None:
-        return ModelConverterEnvironmentCheck(
-            "MLSDK model converter",
-            STATUS_FAIL,
-            "Could not find model-converter on PATH and MODEL_CONVERTER_PATH "
-            "does not point to an executable file.",
-            "Install VGF AoT dependencies with "
-            "python -m pip install 'executorch[vgf]' or, in a source checkout, "
-            "python -m pip install -r backends/arm/requirements-arm-vgf.txt. "
-            "Alternatively set MODEL_CONVERTER_PATH to the converter executable.",
-        )
-
-    executable = resolve_model_converter_executable(binary)
-    if executable is None:
-        return ModelConverterEnvironmentCheck(
-            "MLSDK model converter",
-            STATUS_FAIL,
-            f"Resolved converter candidate {binary!r}, but it is not executable.",
-            "Fix MODEL_CONVERTER_PATH or place model-converter on PATH.",
-        )
-
-    try:
-        result = subprocess.run(  # nosec B603 - local converter executable
-            [str(executable), "--version"],
-            check=False,
-            capture_output=True,
-            text=True,
-            timeout=20,
-            env=model_converter_env(),
-        )
-    except Exception as exc:
-        return ModelConverterEnvironmentCheck(
-            "MLSDK model converter",
-            STATUS_FAIL,
-            f"Found {executable}, but running '--version' failed: {exc}",
-            "Check MODEL_CONVERTER_LIB_DIR and the process loader paths. "
-            "For source setup, source examples/arm/arm-scratch/setup_path.sh.",
-        )
-
-    if result.returncode != 0:
-        return ModelConverterEnvironmentCheck(
-            "MLSDK model converter",
-            STATUS_FAIL,
-            f"{executable} --version exited with {result.returncode}:\n"
-            f"{_command_output(result)}",
-            "Check that the model-converter binary and its shared libraries are "
-            "from the same MLSDK install.",
-        )
-
-    return ModelConverterEnvironmentCheck(
-        "MLSDK model converter",
-        STATUS_OK,
-        f"{executable} --version succeeded:\n{_command_output(result)}",
-    )
-
-
-def check_model_converter_lib_dir_environment() -> ModelConverterEnvironmentCheck:
-    """Check MODEL_CONVERTER_LIB_DIR used by model_converter_env()."""
-
-    lib_dir = os.environ.get("MODEL_CONVERTER_LIB_DIR")
-    if not lib_dir:
-        return ModelConverterEnvironmentCheck(
-            "MODEL_CONVERTER_LIB_DIR",
-            STATUS_OK,
-            "MODEL_CONVERTER_LIB_DIR is not set; relying on the process loader "
-            "paths. This is OK when model-converter --version succeeds.",
-        )
-
-    path = Path(lib_dir).expanduser()
-    if _safe_is_dir(path):
-        return ModelConverterEnvironmentCheck(
-            "MODEL_CONVERTER_LIB_DIR",
-            STATUS_OK,
-            f"MODEL_CONVERTER_LIB_DIR points to existing directory: {path}",
-        )
-
-    return ModelConverterEnvironmentCheck(
-        "MODEL_CONVERTER_LIB_DIR",
-        STATUS_FAIL,
-        f"MODEL_CONVERTER_LIB_DIR={lib_dir!r} does not exist or is not a directory.",
-        "Unset MODEL_CONVERTER_LIB_DIR or set it to the converter library directory.",
-    )
diff --git a/backends/cadence/fused_quant/op_add.cpp b/backends/cadence/fused_quant/op_add.cpp
index 1aea2ccfb6c..62e58c71c83 100644
--- a/backends/cadence/fused_quant/op_add.cpp
+++ b/backends/cadence/fused_quant/op_add.cpp
@@ -14,10 +14,10 @@ namespace cadence {
 namespace fused_quant {
 namespace native {
 
+using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/op_add.h b/backends/cadence/fused_quant/op_add.h
index b32710f41de..9db1e907294 100644
--- a/backends/cadence/fused_quant/op_add.h
+++ b/backends/cadence/fused_quant/op_add.h
@@ -19,18 +19,19 @@ executorch::aten::Tensor& add_out(
     executorch::runtime::KernelRuntimeContext& ctx,
     const executorch::aten::Tensor& inp,
     const executorch::aten::Tensor& other,
-    const std::optional<executorch::aten::Tensor>& inp_scale,
-    const std::optional<executorch::aten::Tensor>& inp_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    const std::optional<executorch::aten::Tensor>& other_scale,
-    const std::optional<executorch::aten::Tensor>& other_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>&
+        other_zero_point,
     executorch::aten::ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    const std::optional<executorch::aten::Tensor>& out_scale,
-    const std::optional<executorch::aten::Tensor>& out_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
diff --git a/backends/cadence/fused_quant/op_bmm.cpp b/backends/cadence/fused_quant/op_bmm.cpp
index 8d071b48a33..7204ab6c88f 100644
--- a/backends/cadence/fused_quant/op_bmm.cpp
+++ b/backends/cadence/fused_quant/op_bmm.cpp
@@ -14,10 +14,10 @@ namespace cadence {
 namespace fused_quant {
 namespace native {
 
+using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/op_bmm.h b/backends/cadence/fused_quant/op_bmm.h
index c6a4502983f..ef9598eac98 100644
--- a/backends/cadence/fused_quant/op_bmm.h
+++ b/backends/cadence/fused_quant/op_bmm.h
@@ -19,18 +19,19 @@ executorch::aten::Tensor& bmm_out(
     executorch::runtime::KernelRuntimeContext& ctx,
     const executorch::aten::Tensor& inp,
     const executorch::aten::Tensor& other,
-    const std::optional<executorch::aten::Tensor>& inp_scale,
-    const std::optional<executorch::aten::Tensor>& inp_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    const std::optional<executorch::aten::Tensor>& other_scale,
-    const std::optional<executorch::aten::Tensor>& other_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>&
+        other_zero_point,
     executorch::aten::ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    const std::optional<executorch::aten::Tensor>& out_scale,
-    const std::optional<executorch::aten::Tensor>& out_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
diff --git a/backends/cadence/fused_quant/op_hardswish.cpp b/backends/cadence/fused_quant/op_hardswish.cpp
index 4b968cebe6c..452ea90a405 100644
--- a/backends/cadence/fused_quant/op_hardswish.cpp
+++ b/backends/cadence/fused_quant/op_hardswish.cpp
@@ -16,10 +16,10 @@ namespace cadence {
 namespace fused_quant {
 namespace native {
 
+using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/op_hardswish.h b/backends/cadence/fused_quant/op_hardswish.h
index de7d88b427b..ba9e09da23c 100644
--- a/backends/cadence/fused_quant/op_hardswish.h
+++ b/backends/cadence/fused_quant/op_hardswish.h
@@ -18,13 +18,13 @@ namespace native {
 executorch::aten::Tensor& hardswish_out(
     executorch::runtime::KernelRuntimeContext& ctx,
     const executorch::aten::Tensor& inp,
-    const std::optional<executorch::aten::Tensor>& inp_scale,
-    const std::optional<executorch::aten::Tensor>& inp_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    const std::optional<executorch::aten::Tensor>& out_scale,
-    const std::optional<executorch::aten::Tensor>& out_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
diff --git a/backends/cadence/fused_quant/op_mul.cpp b/backends/cadence/fused_quant/op_mul.cpp
index a2595104ae8..3d071f7c2da 100644
--- a/backends/cadence/fused_quant/op_mul.cpp
+++ b/backends/cadence/fused_quant/op_mul.cpp
@@ -14,10 +14,10 @@ namespace cadence {
 namespace fused_quant {
 namespace native {
 
+using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/op_mul.h b/backends/cadence/fused_quant/op_mul.h
index 62314c98003..f7afa016b79 100644
--- a/backends/cadence/fused_quant/op_mul.h
+++ b/backends/cadence/fused_quant/op_mul.h
@@ -19,18 +19,19 @@ executorch::aten::Tensor& mul_out(
     executorch::runtime::KernelRuntimeContext& ctx,
     const executorch::aten::Tensor& inp,
     const executorch::aten::Tensor& other,
-    const std::optional<executorch::aten::Tensor>& inp_scale,
-    const std::optional<executorch::aten::Tensor>& inp_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    const std::optional<executorch::aten::Tensor>& other_scale,
-    const std::optional<executorch::aten::Tensor>& other_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>&
+        other_zero_point,
     executorch::aten::ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    const std::optional<executorch::aten::Tensor>& out_scale,
-    const std::optional<executorch::aten::Tensor>& out_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
diff --git a/backends/cadence/fused_quant/op_relu.cpp b/backends/cadence/fused_quant/op_relu.cpp
index e8e58522d2e..ebe7933a7b9 100644
--- a/backends/cadence/fused_quant/op_relu.cpp
+++ b/backends/cadence/fused_quant/op_relu.cpp
@@ -16,10 +16,10 @@ namespace cadence {
 namespace fused_quant {
 namespace native {
 
+using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/op_relu.h b/backends/cadence/fused_quant/op_relu.h
index 522144eacd0..e8527c7633f 100644
--- a/backends/cadence/fused_quant/op_relu.h
+++ b/backends/cadence/fused_quant/op_relu.h
@@ -18,13 +18,13 @@ namespace native {
 executorch::aten::Tensor& relu_out(
     executorch::runtime::KernelRuntimeContext& ctx,
     const executorch::aten::Tensor& inp,
-    const std::optional<executorch::aten::Tensor>& inp_scale,
-    const std::optional<executorch::aten::Tensor>& inp_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    const std::optional<executorch::aten::Tensor>& out_scale,
-    const std::optional<executorch::aten::Tensor>& out_zero_point,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
diff --git a/backends/cadence/fused_quant/quant_utils.h b/backends/cadence/fused_quant/quant_utils.h
index 78884bfcceb..fff669a9e0e 100644
--- a/backends/cadence/fused_quant/quant_utils.h
+++ b/backends/cadence/fused_quant/quant_utils.h
@@ -64,8 +64,8 @@ struct QParams {
 };
 
 inline QParams extract_qparams(
-    const std::optional<executorch::aten::Tensor>& scale_tensor,
-    const std::optional<executorch::aten::Tensor>& zp_tensor,
+    const executorch::aten::optional<executorch::aten::Tensor>& scale_tensor,
+    const executorch::aten::optional<executorch::aten::Tensor>& zp_tensor,
     int64_t quant_min,
     int64_t quant_max,
     const executorch::aten::Tensor& data_tensor) {
diff --git a/backends/cadence/fused_quant/tests/test_op_add.cpp b/backends/cadence/fused_quant/tests/test_op_add.cpp
index 61124f0b9b2..dca110cf0e1 100644
--- a/backends/cadence/fused_quant/tests/test_op_add.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_add.cpp
@@ -14,10 +14,10 @@
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 
+using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::testing::TensorFactory;
-using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/tests/test_op_bmm.cpp b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
index bae04993a7a..5ede47ea8a9 100644
--- a/backends/cadence/fused_quant/tests/test_op_bmm.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
@@ -14,10 +14,10 @@
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 
+using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::testing::TensorFactory;
-using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/tests/test_op_hardswish.cpp b/backends/cadence/fused_quant/tests/test_op_hardswish.cpp
index eb6231161f2..502d680d2e3 100644
--- a/backends/cadence/fused_quant/tests/test_op_hardswish.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_hardswish.cpp
@@ -14,10 +14,10 @@
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 
+using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::testing::TensorFactory;
-using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/tests/test_op_mul.cpp b/backends/cadence/fused_quant/tests/test_op_mul.cpp
index da27c7287c9..0b9addabc5e 100644
--- a/backends/cadence/fused_quant/tests/test_op_mul.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_mul.cpp
@@ -14,10 +14,10 @@
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 
+using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::testing::TensorFactory;
-using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/tests/test_op_relu.cpp b/backends/cadence/fused_quant/tests/test_op_relu.cpp
index 1096daae202..6b83551fd2b 100644
--- a/backends/cadence/fused_quant/tests/test_op_relu.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_relu.cpp
@@ -14,10 +14,10 @@
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 
+using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::testing::TensorFactory;
-using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/generic/operators/op_avg_pool2d.cpp b/backends/cadence/generic/operators/op_avg_pool2d.cpp
index c33f91151fb..b04187db62e 100644
--- a/backends/cadence/generic/operators/op_avg_pool2d.cpp
+++ b/backends/cadence/generic/operators/op_avg_pool2d.cpp
@@ -19,11 +19,11 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::getLeadingDims;
 using ::executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 // Compute the avg_pool2d for in_data in NCHW layout. IT is the input datatype,
 // and AT is the accumulation datatype. 'quantized' is true when the input is
diff --git a/backends/cadence/generic/operators/op_avg_pool2d.h b/backends/cadence/generic/operators/op_avg_pool2d.h
index 85b5d55a84b..05f1810bb61 100644
--- a/backends/cadence/generic/operators/op_avg_pool2d.h
+++ b/backends/cadence/generic/operators/op_avg_pool2d.h
@@ -23,8 +23,9 @@ ::executorch::aten::Tensor& avg_pool2d_out(
     ::executorch::aten::IntArrayRef padding,
     bool ceil_mode,
     bool count_include_pad,
-    std::optional<int64_t> divisor_override,
-    const std::optional<::executorch::aten::Tensor>& in_zero_point_t,
+    ::executorch::aten::optional<int64_t> divisor_override,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>&
+        in_zero_point_t,
     bool channel_last,
     ::executorch::aten::Tensor& out);
 
diff --git a/backends/cadence/generic/operators/op_fully_connected.cpp b/backends/cadence/generic/operators/op_fully_connected.cpp
index b65f8016880..36befc52102 100644
--- a/backends/cadence/generic/operators/op_fully_connected.cpp
+++ b/backends/cadence/generic/operators/op_fully_connected.cpp
@@ -15,10 +15,10 @@ namespace impl {
 namespace generic {
 namespace native {
 
+using ::executorch::aten::optional;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::getLeadingDims;
 using ::executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 void linear(
     const Tensor& input,
diff --git a/backends/cadence/generic/operators/op_fully_connected.h b/backends/cadence/generic/operators/op_fully_connected.h
index 7e03f5ef664..d23bcbeb70c 100644
--- a/backends/cadence/generic/operators/op_fully_connected.h
+++ b/backends/cadence/generic/operators/op_fully_connected.h
@@ -15,9 +15,9 @@ namespace impl {
 namespace generic {
 namespace native {
 
+using ::executorch::aten::optional;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 Tensor& fully_connected_out(
     KernelRuntimeContext& ctx,
diff --git a/backends/cadence/generic/operators/op_linalg_svd.cpp b/backends/cadence/generic/operators/op_linalg_svd.cpp
index 4cb4f6397ea..4974b617418 100644
--- a/backends/cadence/generic/operators/op_linalg_svd.cpp
+++ b/backends/cadence/generic/operators/op_linalg_svd.cpp
@@ -261,7 +261,7 @@ std::tuple<Tensor&, Tensor&, Tensor&> linalg_svd_out(
     const Tensor& A,
     bool full_matrices,
     bool compute_uv,
-    std::optional<std::string_view> driver,
+    ::executorch::aten::optional<::executorch::aten::string_view> driver,
     Tensor& U,
     Tensor& S,
     Tensor& Vh) {
diff --git a/backends/cadence/generic/operators/op_linalg_svd.h b/backends/cadence/generic/operators/op_linalg_svd.h
index e8335b7fa0e..7635276c4f5 100644
--- a/backends/cadence/generic/operators/op_linalg_svd.h
+++ b/backends/cadence/generic/operators/op_linalg_svd.h
@@ -26,7 +26,7 @@ linalg_svd_out(
     const ::executorch::aten::Tensor& A,
     bool full_matrices,
     bool compute_uv,
-    std::optional<std::string_view> driver,
+    ::executorch::aten::optional<::executorch::aten::string_view> driver,
     ::executorch::aten::Tensor& U,
     ::executorch::aten::Tensor& S,
     ::executorch::aten::Tensor& Vh);
diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp
index 8a427045a83..6f42543cfc1 100644
--- a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp
+++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp
@@ -256,7 +256,7 @@ ::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out(
     int64_t output_zero_point,
     __ET_UNUSED int64_t out_multiplier,
     __ET_UNUSED int64_t out_shift,
-    __ET_UNUSED const std::optional<Tensor>& offset,
+    __ET_UNUSED const ::executorch::aten::optional<Tensor>& offset,
     Tensor& out) {
   (void)ctx;
   quantized_conv1d_nlc(
diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h
index f1780497f73..4f4d2877b27 100644
--- a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h
+++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h
@@ -54,7 +54,7 @@ ::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out(
     int64_t output_zero_point,
     int64_t out_multiplier,
     int64_t out_shift,
-    const std::optional<Tensor>& offset,
+    const ::executorch::aten::optional<Tensor>& offset,
     Tensor& out);
 
 } // namespace native
diff --git a/backends/cadence/generic/operators/op_quantized_conv2d.cpp b/backends/cadence/generic/operators/op_quantized_conv2d.cpp
index f6755f9dda8..0811267a3b8 100644
--- a/backends/cadence/generic/operators/op_quantized_conv2d.cpp
+++ b/backends/cadence/generic/operators/op_quantized_conv2d.cpp
@@ -16,11 +16,11 @@ namespace impl {
 namespace generic {
 namespace native {
 
+using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::quantize;
-using std::optional;
 
 /* This implements a generic 2d conv kernel that operates on raw pointers.
  * The quantized version handles quantized convolutions for 2D inputs.
@@ -936,7 +936,7 @@ Tensor& quantized_conv2d_nhwc_per_tensor_out(
     int64_t output_zero_point,
     ET_UNUSED int64_t out_multiplier,
     ET_UNUSED int64_t out_shift,
-    ET_UNUSED const std::optional<Tensor>& offset,
+    ET_UNUSED const ::executorch::aten::optional<Tensor>& offset,
     Tensor& out) {
   quantized_conv2d_nhwc(
       input,
diff --git a/backends/cadence/generic/operators/op_quantized_conv2d.h b/backends/cadence/generic/operators/op_quantized_conv2d.h
index 02740d3afec..bb9476e2644 100644
--- a/backends/cadence/generic/operators/op_quantized_conv2d.h
+++ b/backends/cadence/generic/operators/op_quantized_conv2d.h
@@ -205,7 +205,7 @@ ::executorch::aten::Tensor& quantized_conv2d_nhwc_per_tensor_out(
     int64_t output_zero_point,
     int64_t out_multiplier,
     int64_t out_shift,
-    const std::optional<Tensor>& offset,
+    const ::executorch::aten::optional<Tensor>& offset,
     Tensor& out);
 
 ::executorch::aten::Tensor& quantized_conv2d_depthwise_nhwc_out(
diff --git a/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp b/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp
index 05fb809cd51..a8f98a76ffc 100644
--- a/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp
+++ b/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp
@@ -57,7 +57,7 @@ ::executorch::aten::Tensor& quantized_depthwise_conv1d_nlc_per_tensor_out(
       output_zero_point,
       out_multiplier,
       out_shift,
-      std::optional<Tensor>(),
+      ::executorch::aten::optional<Tensor>(),
       out);
 }
 
diff --git a/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp b/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp
index d2e0d6a8bd9..55ca67648ca 100644
--- a/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp
+++ b/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp
@@ -19,11 +19,11 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 #define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \
   _(uint8_t, Byte)                           \
diff --git a/backends/cadence/generic/operators/op_quantized_embedding_byte.h b/backends/cadence/generic/operators/op_quantized_embedding_byte.h
index 84fc53620a0..a46bebe09df 100644
--- a/backends/cadence/generic/operators/op_quantized_embedding_byte.h
+++ b/backends/cadence/generic/operators/op_quantized_embedding_byte.h
@@ -19,7 +19,8 @@ ::executorch::aten::Tensor& quantized_embedding_byte_out(
     ::executorch::runtime::KernelRuntimeContext& ctx,
     const ::executorch::aten::Tensor& weight,
     const ::executorch::aten::Tensor& weight_scales,
-    const std::optional<::executorch::aten::Tensor>& weight_zero_points,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>&
+        weight_zero_points,
     const ::executorch::aten::Tensor& indices,
     bool pruned_weights,
     ::executorch::aten::Tensor& out);
diff --git a/backends/cadence/generic/operators/op_quantized_fully_connected.cpp b/backends/cadence/generic/operators/op_quantized_fully_connected.cpp
index ce74b5b8b7f..55e29cb7f52 100644
--- a/backends/cadence/generic/operators/op_quantized_fully_connected.cpp
+++ b/backends/cadence/generic/operators/op_quantized_fully_connected.cpp
@@ -16,10 +16,10 @@ namespace impl {
 namespace generic {
 namespace native {
 
+using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 Tensor& quantized_fully_connected_out(
     ET_UNUSED KernelRuntimeContext& ctx,
diff --git a/backends/cadence/generic/operators/op_quantized_fully_connected.h b/backends/cadence/generic/operators/op_quantized_fully_connected.h
index 408fbabe726..a7510fba95f 100644
--- a/backends/cadence/generic/operators/op_quantized_fully_connected.h
+++ b/backends/cadence/generic/operators/op_quantized_fully_connected.h
@@ -25,7 +25,7 @@ ::executorch::aten::Tensor& quantized_fully_connected_out(
     const ::executorch::aten::Tensor& out_multiplier,
     const ::executorch::aten::Tensor& out_shift,
     int64_t out_zero_point,
-    const std::optional<::executorch::aten::Tensor>& offset,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor& quantized_fully_connected_per_tensor_out(
@@ -38,7 +38,7 @@ ::executorch::aten::Tensor& quantized_fully_connected_per_tensor_out(
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
-    const std::optional<::executorch::aten::Tensor>& offset,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor&
@@ -52,7 +52,7 @@ quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor_out(
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
-    const std::optional<::executorch::aten::Tensor>& offset,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor&
@@ -66,7 +66,7 @@ quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out(
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
-    const std::optional<::executorch::aten::Tensor>& offset,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 } // namespace native
diff --git a/backends/cadence/generic/operators/op_quantized_layer_norm.cpp b/backends/cadence/generic/operators/op_quantized_layer_norm.cpp
index 85825cff94d..e34ed342d22 100644
--- a/backends/cadence/generic/operators/op_quantized_layer_norm.cpp
+++ b/backends/cadence/generic/operators/op_quantized_layer_norm.cpp
@@ -24,6 +24,7 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
@@ -31,7 +32,6 @@ using ::executorch::runtime::getLeadingDims;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::dequantize;
 using ::impl::generic::kernels::quantize;
-using std::optional;
 
 // Compute quantized layer_norm. The current implementation assumes that the
 // input is per-tensor quantized.
diff --git a/backends/cadence/generic/operators/op_quantized_linear.cpp b/backends/cadence/generic/operators/op_quantized_linear.cpp
index 02ff97de74d..87f990a855b 100644
--- a/backends/cadence/generic/operators/op_quantized_linear.cpp
+++ b/backends/cadence/generic/operators/op_quantized_linear.cpp
@@ -18,11 +18,11 @@ namespace impl {
 namespace generic {
 namespace native {
 
+using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::executorch::runtime::toString;
-using std::optional;
 
 Tensor& quantized_linear_out(
     ET_UNUSED KernelRuntimeContext& ctx,
diff --git a/backends/cadence/generic/operators/op_quantized_linear.h b/backends/cadence/generic/operators/op_quantized_linear.h
index 517357d5bf9..b5396cb9701 100644
--- a/backends/cadence/generic/operators/op_quantized_linear.h
+++ b/backends/cadence/generic/operators/op_quantized_linear.h
@@ -25,7 +25,7 @@ ::executorch::aten::Tensor& quantized_linear_out(
     const ::executorch::aten::Tensor& out_multiplier,
     const ::executorch::aten::Tensor& out_shift,
     int64_t out_zero_point,
-    const std::optional<::executorch::aten::Tensor>& offset,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor& quantized_linear_per_tensor_out(
@@ -38,7 +38,7 @@ ::executorch::aten::Tensor& quantized_linear_per_tensor_out(
     const int64_t out_multiplier,
     const int64_t out_shift,
     const int64_t out_zero_point,
-    const std::optional<::executorch::aten::Tensor>& offset,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor&
diff --git a/backends/cadence/generic/operators/op_quantized_matmul.cpp b/backends/cadence/generic/operators/op_quantized_matmul.cpp
index b84c879e65d..e3fb0f00fdc 100644
--- a/backends/cadence/generic/operators/op_quantized_matmul.cpp
+++ b/backends/cadence/generic/operators/op_quantized_matmul.cpp
@@ -21,12 +21,12 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::quantize;
-using std::optional;
 
 // The quantized matmul. The quantized matmul accumulates in a wider register,
 // whose type is TA.
diff --git a/backends/cadence/generic/operators/op_quantized_matmul.h b/backends/cadence/generic/operators/op_quantized_matmul.h
index c28862aa11e..70775380aac 100644
--- a/backends/cadence/generic/operators/op_quantized_matmul.h
+++ b/backends/cadence/generic/operators/op_quantized_matmul.h
@@ -15,9 +15,9 @@ namespace impl {
 namespace generic {
 namespace native {
 
+using ::executorch::aten::optional;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 Tensor& quantized_matmul_out(
     KernelRuntimeContext& ctx,
diff --git a/backends/cadence/generic/operators/op_quantized_mul.cpp b/backends/cadence/generic/operators/op_quantized_mul.cpp
index 359a305b020..30352ee9d52 100644
--- a/backends/cadence/generic/operators/op_quantized_mul.cpp
+++ b/backends/cadence/generic/operators/op_quantized_mul.cpp
@@ -21,13 +21,13 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::dequantize;
 using ::impl::generic::kernels::quantize;
-using std::optional;
 
 DECLARE_POINTWISE_TENSOR_QUANTIZED_BINARY_OP(quantized_mul_, *);
 
diff --git a/backends/cadence/generic/operators/op_quantized_relu.cpp b/backends/cadence/generic/operators/op_quantized_relu.cpp
index ecb87bd1b90..9430951f65b 100644
--- a/backends/cadence/generic/operators/op_quantized_relu.cpp
+++ b/backends/cadence/generic/operators/op_quantized_relu.cpp
@@ -21,12 +21,12 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::quantize;
-using std::optional;
 
 template <typename T>
 void quantized_relu_per_tensor_out_(
diff --git a/backends/cadence/generic/operators/op_requantize.cpp b/backends/cadence/generic/operators/op_requantize.cpp
index b9df6f1f355..f846a1964a3 100644
--- a/backends/cadence/generic/operators/op_requantize.cpp
+++ b/backends/cadence/generic/operators/op_requantize.cpp
@@ -19,13 +19,13 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::dequantize;
 using ::impl::generic::kernels::quantize;
-using std::optional;
 
 // Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor.
 // The scale and zero_point for requantization are in the args.
diff --git a/backends/cadence/generic/operators/op_rope.cpp b/backends/cadence/generic/operators/op_rope.cpp
index fcc7d629cf7..17ee6d2a684 100644
--- a/backends/cadence/generic/operators/op_rope.cpp
+++ b/backends/cadence/generic/operators/op_rope.cpp
@@ -12,8 +12,8 @@ namespace impl {
 namespace generic {
 namespace native {
 
+using ::executorch::aten::optional;
 using ::executorch::aten::Tensor;
-using std::optional;
 
 Tensor& rope_out(
     ET_UNUSED ::executorch::runtime::KernelRuntimeContext& ctx,
@@ -75,8 +75,8 @@ namespace impl {
 namespace generic {
 namespace native {
 
+using ::executorch::aten::optional;
 using ::executorch::aten::Tensor;
-using std::optional;
 
 Tensor& rope_rotate_stacked_halves_out(
     ET_UNUSED ::executorch::runtime::KernelRuntimeContext& ctx,
diff --git a/backends/cadence/generic/operators/op_rope.h b/backends/cadence/generic/operators/op_rope.h
index d738cfda6c1..638677bf118 100644
--- a/backends/cadence/generic/operators/op_rope.h
+++ b/backends/cadence/generic/operators/op_rope.h
@@ -20,7 +20,7 @@ ::executorch::aten::Tensor& rope_out(
     const ::executorch::aten::Tensor& input,
     const ::executorch::aten::Tensor& sin_tensor,
     const ::executorch::aten::Tensor& cos_tensor,
-    const std::optional<::executorch::aten::Tensor>& pos,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& pos,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor& rope_rotate_stacked_halves_out(
@@ -28,7 +28,7 @@ ::executorch::aten::Tensor& rope_rotate_stacked_halves_out(
     const ::executorch::aten::Tensor& input,
     const ::executorch::aten::Tensor& sin_tensor,
     const ::executorch::aten::Tensor& cos_tensor,
-    const std::optional<::executorch::aten::Tensor>& pos,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& pos,
     ::executorch::aten::Tensor& out);
 
 } // namespace native
diff --git a/backends/cadence/generic/operators/op_softmax.cpp b/backends/cadence/generic/operators/op_softmax.cpp
index b680d1e2471..97c64a22511 100644
--- a/backends/cadence/generic/operators/op_softmax.cpp
+++ b/backends/cadence/generic/operators/op_softmax.cpp
@@ -125,7 +125,7 @@ Tensor& _softmax_f32_f32_out(
     __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& X,
     int64_t dim,
-    __ET_UNUSED std::optional<bool> half_to_float,
+    __ET_UNUSED ::executorch::aten::optional<bool> half_to_float,
     Tensor& Y) {
   _softmax_out(ctx, X, dim, false, Y);
 
diff --git a/backends/cadence/generic/operators/op_softmax.h b/backends/cadence/generic/operators/op_softmax.h
index d83703117b0..ec51b1d00c0 100644
--- a/backends/cadence/generic/operators/op_softmax.h
+++ b/backends/cadence/generic/operators/op_softmax.h
@@ -26,7 +26,7 @@ ::executorch::aten::Tensor& _softmax_f32_f32_out(
     __ET_UNUSED ::executorch::runtime::KernelRuntimeContext& ctx,
     const ::executorch::aten::Tensor& X,
     int64_t dim,
-    __ET_UNUSED std::optional<bool> half_to_float,
+    __ET_UNUSED ::executorch::aten::optional<bool> half_to_float,
     ::executorch::aten::Tensor& Y);
 
 } // namespace native
diff --git a/backends/cadence/generic/operators/op_transposed_convolution.cpp b/backends/cadence/generic/operators/op_transposed_convolution.cpp
index b742ec635b2..121b479e65f 100644
--- a/backends/cadence/generic/operators/op_transposed_convolution.cpp
+++ b/backends/cadence/generic/operators/op_transposed_convolution.cpp
@@ -16,12 +16,12 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::quantize;
-using std::optional;
 
 // This implements a generic 2d transposed_conv kernel that operates on raw
 // pointers. The version handles both quantized and fp32 convolutions.
diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp
index ccd54e80698..514813fbe05 100644
--- a/backends/cadence/hifi/operators/op_mean.cpp
+++ b/backends/cadence/hifi/operators/op_mean.cpp
@@ -17,8 +17,8 @@ using executorch::aten::RuntimeContext;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::ArrayRef;
-using std::optional;
 using torch::executor::Error;
+using torch::executor::optional;
 
 namespace impl {
 namespace HiFi {
diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp
index 9d363469f74..5171c2908bc 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp
@@ -238,7 +238,7 @@ void quantized_conv1d_nlc_per_tensor_out(
     int64_t output_zero_point,
     int64_t out_multiplier,
     int64_t out_shift,
-    __ET_UNUSED const std::optional<Tensor>& offset,
+    __ET_UNUSED const ::executorch::aten::optional<Tensor>& offset,
     Tensor& out) {
   // HiFi nnlib kernels only support dilation=1.
   // Fall back to generic implementation for dilation > 1.
diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
index 86ef244711d..ea3a756f995 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
@@ -17,7 +17,7 @@ using Tensor = executorch::aten::Tensor;
 using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
 using ScalarType = executorch::aten::ScalarType;
 using ::executorch::aten::IntArrayRef;
-using std::optional;
+using ::executorch::aten::optional;
 
 namespace impl {
 namespace HiFi {
diff --git a/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp
index a8e2b42d77d..4299990b52a 100644
--- a/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp
@@ -206,7 +206,7 @@ void quantized_depthwise_conv1d_nlc_per_tensor_out(
         output_zero_point,
         out_multiplier,
         out_shift,
-        std::optional<Tensor>(),
+        ::executorch::aten::optional<Tensor>(),
         out);
     return;
   }
diff --git a/backends/cadence/hifi/operators/op_quantized_matmul_out.h b/backends/cadence/hifi/operators/op_quantized_matmul_out.h
index a567c7f650d..c53a07b58aa 100644
--- a/backends/cadence/hifi/operators/op_quantized_matmul_out.h
+++ b/backends/cadence/hifi/operators/op_quantized_matmul_out.h
@@ -21,7 +21,7 @@ ::executorch::aten::Tensor& quantized_matmul_out(
     int64_t X_zero_point,
     const ::executorch::aten::Tensor& Y,
     int64_t Y_zero_point,
-    const std::optional<::executorch::aten::Tensor>& bias,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& bias,
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
diff --git a/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp b/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp
index 907156af1f7..074ff29b301 100644
--- a/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp
+++ b/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp
@@ -22,7 +22,7 @@ inline Tensor& _softmax_f32_f32_out(
     KernelRuntimeContext& ctx,
     const Tensor& in,
     int64_t dim,
-    std::optional<bool> half_to_float,
+    ::executorch::aten::optional<bool> half_to_float,
     Tensor& out) {
   constexpr int kNnlibMaxDim = 16;
 
@@ -146,7 +146,7 @@ Tensor& softmax_f32_f32_out(
     KernelRuntimeContext& ctx,
     const Tensor& in,
     int64_t dim,
-    std::optional<bool> half_to_float,
+    ::executorch::aten::optional<bool> half_to_float,
     Tensor& out) {
   return _softmax_f32_f32_out(ctx, in, dim, half_to_float, out);
 }
diff --git a/backends/cadence/hifi/operators/operators.h b/backends/cadence/hifi/operators/operators.h
index fa6847f744b..3ca505d40cb 100644
--- a/backends/cadence/hifi/operators/operators.h
+++ b/backends/cadence/hifi/operators/operators.h
@@ -72,7 +72,7 @@ void quantized_linear_out(
     const ::executorch::aten::Tensor& out_multiplier,
     const ::executorch::aten::Tensor& out_shift,
     int64_t out_zero_point,
-    const std::optional<::executorch::aten::Tensor>& offset,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 void quantized_linear_per_tensor_out(
@@ -85,7 +85,7 @@ void quantized_linear_per_tensor_out(
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
-    const std::optional<::executorch::aten::Tensor>& offset,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 void quantized_conv2d_nhwc_out(
@@ -158,7 +158,7 @@ void quantized_conv2d_nhwc_per_tensor_out(
     int64_t output_zero_point,
     int64_t out_multiplier,
     int64_t out_shift,
-    const std::optional<::executorch::aten::Tensor>& offset,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor& cat_out(
diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp b/backends/cadence/vision/operators/op_quantized_conv_out.cpp
index aaba9f5696d..be4b34bff03 100644
--- a/backends/cadence/vision/operators/op_quantized_conv_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_conv_out.cpp
@@ -582,7 +582,7 @@ void quantized_conv2d_nhwc_per_tensor_out(
     int64_t output_zero_point,
     int64_t out_multiplier,
     int64_t out_shift,
-    ET_UNUSED const std::optional<Tensor>& offset,
+    ET_UNUSED const ::executorch::aten::optional<Tensor>& offset,
     Tensor& out) {
   quantized_conv_per_tensor_out(
       ctx,
diff --git a/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp b/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp
index c53f7f7667a..29aa8906414 100644
--- a/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp
@@ -13,10 +13,10 @@ namespace impl {
 namespace vision {
 namespace native {
 
+using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
-using std::optional;
 
 void quantized_fully_connected_out(
     __ET_UNUSED KernelRuntimeContext& ctx,
diff --git a/backends/cadence/vision/operators/op_quantized_linear_out.cpp b/backends/cadence/vision/operators/op_quantized_linear_out.cpp
index 7b3daed8ef6..b6b7cdd17bc 100644
--- a/backends/cadence/vision/operators/op_quantized_linear_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_linear_out.cpp
@@ -84,7 +84,7 @@ void quantized_linear_out(
     const Tensor& out_multiplier,
     const Tensor& out_shift,
     int64_t out_zero_point,
-    __ET_UNUSED const std::optional<Tensor>& offset,
+    __ET_UNUSED const executorch::aten::optional<Tensor>& offset,
     Tensor& out) {
   // TODO: refactor to use switch case as quantized_linear_per_tensor_out
   if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
@@ -127,7 +127,7 @@ void quantized_linear_per_tensor_out(
     const int64_t out_multiplier,
     const int64_t out_shift,
     const int64_t out_zero_point,
-    __ET_UNUSED const std::optional<Tensor>& offset,
+    __ET_UNUSED const executorch::aten::optional<Tensor>& offset,
     Tensor& out) {
 #define typed_quantized_linear_per_tensor(ctype, dtype) \
   case executorch::aten::ScalarType::dtype: {           \
diff --git a/backends/cadence/vision/operators/op_quantized_matmul_out.cpp b/backends/cadence/vision/operators/op_quantized_matmul_out.cpp
index e63ae5bdda1..54a303288c3 100644
--- a/backends/cadence/vision/operators/op_quantized_matmul_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_matmul_out.cpp
@@ -60,7 +60,7 @@ void inline _typed_quantized_matmul(
     int64_t X_zero_point,
     const Tensor& Y,
     int64_t Y_zero_point,
-    const std::optional<Tensor>& bias,
+    const executorch::aten::optional<Tensor>& bias,
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
@@ -114,7 +114,7 @@ void quantized_matmul_out(
     int64_t X_zero_point,
     const Tensor& Y,
     int64_t Y_zero_point,
-    const std::optional<Tensor>& bias,
+    const executorch::aten::optional<Tensor>& bias,
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
diff --git a/backends/cadence/vision/operators/op_softmax.cpp b/backends/cadence/vision/operators/op_softmax.cpp
index 6b93709b226..58ca33c6a0b 100644
--- a/backends/cadence/vision/operators/op_softmax.cpp
+++ b/backends/cadence/vision/operators/op_softmax.cpp
@@ -50,7 +50,7 @@ Tensor& _softmax_out(
   // Adjust for negative dim
   dim = dim < 0 ? dim + executorch::runtime::nonzero_dim(in) : dim;
 
-  const std::optional<int64_t>& dim_t = dim;
+  const executorch::aten::optional<int64_t>& dim_t = dim;
   const size_t d = ET_NORMALIZE_IX(dim_t.value(), in.dim());
   const size_t size = in.size(d);
 
diff --git a/backends/cadence/vision/operators/operators.h b/backends/cadence/vision/operators/operators.h
index 1c756c0b237..8b5db4161eb 100644
--- a/backends/cadence/vision/operators/operators.h
+++ b/backends/cadence/vision/operators/operators.h
@@ -31,7 +31,7 @@ using ::executorch::runtime::getLeadingDims;
 inline __attribute__((always_inline)) void linear_(
     const ::executorch::aten::Tensor& input,
     const ::executorch::aten::Tensor& weight,
-    const std::optional<::executorch::aten::Tensor>& bias,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& bias,
     ::executorch::aten::Tensor& output) {
   const float* __restrict__ input_data = input.const_data_ptr<float>();
   const float* __restrict__ weight_data = weight.const_data_ptr<float>();
diff --git a/backends/cortex_m/TARGETS b/backends/cortex_m/TARGETS
index 1b73bb03bfc..b84add05516 100644
--- a/backends/cortex_m/TARGETS
+++ b/backends/cortex_m/TARGETS
@@ -20,23 +20,12 @@ python_library(
     ],
 )
 
-python_library(
-    name = "cmsis_nn",
-    srcs = [
-        "library/__init__.py",
-        "library/cmsis_nn.py",
-    ],
-    deps = [
-        "fbsource//third-party/cmsis-nn:cmsis_nn_py",
-    ],
-)
-
 python_library(
     name = "target_config",
     srcs = [
         "target_config.py",
     ],
     deps = [
-        ":cmsis_nn",
+        "fbsource//third-party/cmsis-nn:cmsis_nn_py",
     ],
 )
diff --git a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
index 807cf18cebc..656309abcee 100644
--- a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
+++ b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
@@ -49,7 +49,7 @@ class CMSISScratchBufferContext final {
       Tensor& scratch_buffer,
       const Tensor& weights,
       const Tensor& weight_zero_point,
-      const std::optional<Tensor>& bias)
+      const torch::executor::optional<Tensor>& bias)
       : scratch_ptr_(scratch_buffer.mutable_data_ptr<int8_t>()),
         total_size_(scratch_buffer.size(0)),
         base_ptr_(reinterpret_cast<uint8_t*>(scratch_ptr_)),
diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp
index 13e8b132410..3d4f19e10d0 100644
--- a/backends/cortex_m/ops/op_quantized_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp
@@ -19,7 +19,7 @@ bool validate_conv2d_arguments(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const std::optional<Tensor>& bias,
+    const torch::executor::optional<Tensor>& bias,
     const Tensor& output,
     const Int64ArrayRef& stride,
     const Int64ArrayRef& padding,
@@ -103,7 +103,7 @@ Tensor& quantized_conv2d_out(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const std::optional<Tensor>& bias,
+    const torch::executor::optional<Tensor>& bias,
     const Int64ArrayRef stride,
     const Int64ArrayRef padding,
     const Int64ArrayRef dilation,
diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
index 0793606de44..a8e1fc21ed7 100644
--- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
@@ -19,7 +19,7 @@ bool validate_depthwise_conv2d_arguments(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const std::optional<Tensor>& bias,
+    const torch::executor::optional<Tensor>& bias,
     const Tensor& output,
     const Int64ArrayRef& stride,
     const Int64ArrayRef& padding,
@@ -140,7 +140,7 @@ Tensor& quantized_depthwise_conv2d_out(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const std::optional<Tensor>& bias,
+    const torch::executor::optional<Tensor>& bias,
     const Int64ArrayRef stride,
     const Int64ArrayRef padding,
     const Int64ArrayRef dilation,
diff --git a/backends/cortex_m/ops/op_quantized_linear.cpp b/backends/cortex_m/ops/op_quantized_linear.cpp
index c92ec493cd5..7448058de8e 100644
--- a/backends/cortex_m/ops/op_quantized_linear.cpp
+++ b/backends/cortex_m/ops/op_quantized_linear.cpp
@@ -18,8 +18,8 @@ Tensor& quantized_linear_out(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weights,
-    const std::optional<Tensor>& bias,
-    const std::optional<Tensor>& kernel_sum,
+    const torch::executor::optional<Tensor>& bias,
+    const torch::executor::optional<Tensor>& kernel_sum,
     const int64_t input_offset,
     const int64_t filter_offset,
     const int64_t output_offset,
diff --git a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
index 04d57d4c693..e7ecbc7c7b4 100644
--- a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
@@ -21,7 +21,7 @@ bool validate_transpose_conv2d_arguments(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const std::optional<Tensor>& bias,
+    const torch::executor::optional<Tensor>& bias,
     const Tensor& output,
     const Tensor& requantize_multipliers,
     const Tensor& requantize_shifts) {
@@ -88,7 +88,7 @@ Tensor& quantized_transpose_conv2d_out(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const std::optional<Tensor>& bias,
+    const torch::executor::optional<Tensor>& bias,
     const Int64ArrayRef stride,
     const Int64ArrayRef padding,
     const Int64ArrayRef output_padding,
diff --git a/backends/cortex_m/passes/BUCK b/backends/cortex_m/passes/BUCK
index c792583f657..20444f16718 100644
--- a/backends/cortex_m/passes/BUCK
+++ b/backends/cortex_m/passes/BUCK
@@ -1,7 +1,6 @@
 load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target")
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -41,7 +40,6 @@ fbcode_target(_kind = runtime.python_library,
     deps=[
         "//caffe2:torch",
         "//executorch/backends/arm/_passes:passes",
-        "//executorch/backends/cortex_m:cmsis_nn",
         "//executorch/backends/cortex_m:target_config",
         "//executorch/backends/cortex_m/ops:ops",
         "//executorch/backends/cortex_m/passes:passes_utils",
diff --git a/backends/cortex_m/passes/__init__.py b/backends/cortex_m/passes/__init__.py
index ec3d67c4d31..6d6783488fe 100644
--- a/backends/cortex_m/passes/__init__.py
+++ b/backends/cortex_m/passes/__init__.py
@@ -3,6 +3,36 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from importlib.util import find_spec
+
+
+def _missing_dependencies_error(missing: str) -> ModuleNotFoundError:
+    return ModuleNotFoundError(
+        "Cortex-M backend dependencies are not installed "
+        f"(missing: {missing}). Install ExecuTorch with "
+        "`pip install executorch[cortex_m]`, or if building from source run "
+        "`examples/arm/setup.sh --i-agree-to-the-contained-eula`."
+    )
+
+
+def _ensure_cortex_m_dependencies() -> None:
+    required_modules = {
+        "cmsis_nn": "cmsis_nn",
+    }
+    missing_packages = []
+    for module_name, package_name in required_modules.items():
+        try:
+            if find_spec(module_name) is None:
+                missing_packages.append(package_name)
+        except (ImportError, ValueError):
+            missing_packages.append(package_name)
+
+    if missing_packages:
+        raise _missing_dependencies_error(", ".join(missing_packages))
+
+
+_ensure_cortex_m_dependencies()
+
 from .cortex_m_pass import CortexMPass  # noqa  # usort: skip
 from .activation_fusion_pass import ActivationFusionPass  # noqa
 from .aten_to_cortex_m_pass import AtenToCortexMPass  # noqa
diff --git a/backends/cortex_m/passes/aten_to_cortex_m_pass.py b/backends/cortex_m/passes/aten_to_cortex_m_pass.py
index 3f5a6055331..ecc7187797d 100644
--- a/backends/cortex_m/passes/aten_to_cortex_m_pass.py
+++ b/backends/cortex_m/passes/aten_to_cortex_m_pass.py
@@ -8,12 +8,12 @@
 import math
 from typing import cast, Optional
 
+import cmsis_nn  # type: ignore[import-not-found, import-untyped]
 import executorch.backends.cortex_m.ops.operators  # noqa
 import executorch.exir as exir
 import torch
 import torch.fx
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
-from executorch.backends.cortex_m.library import cmsis_nn
 
 from executorch.backends.cortex_m.passes.passes_utils import (
     build_activation_lut,
diff --git a/backends/cortex_m/passes/scratch_buffer_sizes.py b/backends/cortex_m/passes/scratch_buffer_sizes.py
index b247e2be944..95a9c441f61 100644
--- a/backends/cortex_m/passes/scratch_buffer_sizes.py
+++ b/backends/cortex_m/passes/scratch_buffer_sizes.py
@@ -6,11 +6,11 @@
 from collections.abc import Callable
 from typing import Any, cast
 
+import cmsis_nn  # type: ignore[import-not-found, import-untyped]
 import executorch.backends.cortex_m.ops.operators  # noqa
 
 import torch
 import torch.fx
-from executorch.backends.cortex_m.library import cmsis_nn
 
 from executorch.exir.dialects._ops import ops as exir_ops
 
diff --git a/backends/cortex_m/target_config.py b/backends/cortex_m/target_config.py
index 341ae612cb5..23cb15c4a53 100644
--- a/backends/cortex_m/target_config.py
+++ b/backends/cortex_m/target_config.py
@@ -1,6 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -11,7 +10,7 @@
 from enum import auto, Enum
 from typing import Optional
 
-from executorch.backends.cortex_m.library import cmsis_nn
+import cmsis_nn  # type: ignore[import-not-found, import-untyped]
 
 
 class CortexM(Enum):
diff --git a/backends/cortex_m/test/misc/test_cmsis_pybind.py b/backends/cortex_m/test/misc/test_cmsis_pybind.py
index 08a1d973234..f85a4bacece 100644
--- a/backends/cortex_m/test/misc/test_cmsis_pybind.py
+++ b/backends/cortex_m/test/misc/test_cmsis_pybind.py
@@ -1,4 +1,5 @@
 # Copyright 2026 Arm Limited and/or its affiliates.
+# All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -10,7 +11,7 @@
 
 def _import_cmsis_nn():
     try:
-        return importlib.import_module("executorch.backends.cortex_m.library.cmsis_nn")
+        return importlib.import_module("cmsis_nn")
     except Exception as exc:
         pytest.fail(f"Failed to resolve cmsis_nn: {exc}")
 
diff --git a/backends/cortex_m/test/misc/test_target_config.py b/backends/cortex_m/test/misc/test_target_config.py
index 472d1927886..3e648b0a81c 100644
--- a/backends/cortex_m/test/misc/test_target_config.py
+++ b/backends/cortex_m/test/misc/test_target_config.py
@@ -1,13 +1,12 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import cmsis_nn  # type: ignore[import-not-found, import-untyped]
 import pytest
 
-from executorch.backends.cortex_m.library import cmsis_nn
 from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig
 
 
diff --git a/backends/cortex_m/test/ops/test_avg_pool2d.py b/backends/cortex_m/test/ops/test_avg_pool2d.py
index a2992b50905..315d968188f 100644
--- a/backends/cortex_m/test/ops/test_avg_pool2d.py
+++ b/backends/cortex_m/test/ops/test_avg_pool2d.py
@@ -93,7 +93,7 @@ def test_dialect_avg_pool2d(test_case, cortex_m_target):
         qtol=1,
     )
 
-    from executorch.backends.cortex_m.library import cmsis_nn
+    import cmsis_nn  # type: ignore[import-not-found, import-untyped]
 
     module = tester.get_artifact(StageType.RUN_PASSES).exported_program().module()
     pool_target = exir_ops.edge.cortex_m.quantized_avg_pool2d.default
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index c3d7446eaa2..f7d095540ad 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -319,12 +319,8 @@ class ET_EXPERIMENTAL CudaBackend final
       }
     }
 
-    std::string so_blob_key;
-    std::string weights_blob_key;
-    ET_CHECK_OK_OR_RETURN_ERROR(
-        executorch::backends::aoti::resolve_blob_keys(
-            processed, method_name, so_blob_key, weights_blob_key),
-        "Malformed named-data key payload");
+    std::string so_blob_key =
+        method_name.empty() ? "so_blob" : method_name + "_so_blob";
 
     const NamedDataMap* named_data_map = context.get_named_data_map();
     auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str());
@@ -398,11 +394,11 @@ class ET_EXPERIMENTAL CudaBackend final
     // methods are independent sub-graphs that may have FQN collisions
     // (e.g. parakeet).
     if (is_weight_sharing_across_methods_enabled()) {
-      ET_CHECK_OK_OR_RETURN_ERROR(load_constants_with_cache(
-          handle, named_data_map, method_name, weights_blob_key));
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          load_constants_with_cache(handle, named_data_map, method_name));
     } else {
       ET_CHECK_OK_OR_RETURN_ERROR(
-          load_constants_legacy(handle, named_data_map, weights_blob_key));
+          load_constants_legacy(handle, named_data_map, method_name));
     }
 
     // Use shared CUDA stream if enabled via options, otherwise create one.
@@ -1015,14 +1011,13 @@ class ET_EXPERIMENTAL CudaBackend final
   Error load_constants_with_cache(
       cuda::CudaDelegateHandle* handle,
       const NamedDataMap* named_data_map,
-      const std::string& method_name,
-      const std::string& weights_blob_key) const {
+      const std::string& method_name) const {
     // Check if the required APIs are available
     if (!handle->get_num_constants || !handle->get_constant_name ||
         !handle->get_constant_original_fqn || !handle->extract_constants_map ||
         !handle->update_user_managed_constant_buffer_pairs) {
       // Fall back to the legacy path
-      return load_constants_legacy(handle, named_data_map, weights_blob_key);
+      return load_constants_legacy(handle, named_data_map, method_name);
     }
 
     // Step 1: Enumerate constants and partition into cached/uncached
@@ -1074,6 +1069,8 @@ class ET_EXPERIMENTAL CudaBackend final
     if (!uncached_fqns.empty()) {
       // Need to load from blob — use update_constants_from_blob for all,
       // then extract the new constants into the cache.
+      std::string weights_blob_key =
+          method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
       auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
 
       ET_CHECK_OR_RETURN_ERROR(
@@ -1193,7 +1190,9 @@ class ET_EXPERIMENTAL CudaBackend final
   Error load_constants_legacy(
       cuda::CudaDelegateHandle* handle,
       const NamedDataMap* named_data_map,
-      const std::string& weights_blob_key) const {
+      const std::string& method_name) const {
+    std::string weights_blob_key =
+        method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
     auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
     if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
       ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
diff --git a/backends/cuda/tests/test_cuda_partitioner.py b/backends/cuda/tests/test_cuda_partitioner.py
index 89c1204ea00..0ee345be08a 100644
--- a/backends/cuda/tests/test_cuda_partitioner.py
+++ b/backends/cuda/tests/test_cuda_partitioner.py
@@ -12,18 +12,17 @@
 from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 from executorch.exir.backend.partitioner import PartitionResult
 from executorch.exir.delegate import executorch_call_delegate
-from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
+from torch._export.utils import is_buffer
 from torch.export import export
-from torch.fx.passes.utils.fuser_utils import validate_partition
 
 
 class TestCudaPartitioner(unittest.TestCase):
     """
     Test CUDA partitioner functionality.
 
-    A fully delegatable graph collapses to a single partition. When a
-    non-delegated node splits the delegatable ops, the partitioner emits one
-    convex partition per island.
+    After CUDA partitioning, there should be exactly one partitioned graph that contains
+    all operators from the input graph. This means all operators should be tagged with
+    the same delegation tag, indicating they will all be executed by the CUDA backend.
     """
 
     def _get_partition_result(
@@ -179,6 +178,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         for node in partition_result.tagged_exported_program.graph.nodes:
             if node.op == "placeholder":
                 # Check if this is a constant (param, buffer, or lifted tensor constant)
+                from torch._export.utils import (
+                    is_buffer,
+                    is_lifted_tensor_constant,
+                    is_param,
+                )
+
                 is_constant = (
                     is_param(partition_result.tagged_exported_program, node)
                     or is_buffer(partition_result.tagged_exported_program, node)
@@ -211,9 +216,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             f"All constant placeholders should be tagged. Found untagged constants: {untagged_constants}",
         )
 
-        # Verify all tagged constants share the (single) partition's tag.
-        self.assertEqual(len(partition_result.partition_tags), 1)
-        expected_tag = next(iter(partition_result.partition_tags))
+        # Verify all tagged constants have the expected tag
+        expected_tag = "tag0"
         for node in constant_placeholders:
             actual_tag = node.meta.get("delegation_tag")
             self.assertEqual(
@@ -316,143 +320,3 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         self.assertNotIn("delegation_tag", buffer_placeholder.meta)
         self.assertNotIn("delegation_tag", delegate.meta)
         self.assertIn("delegation_tag", aten_node.meta)
-
-    def test_multiple_partitions_for_split_graph(self) -> None:
-        """Ops split by a non-delegated node must land in separate partitions.
-
-        One tag over the disconnected islands would be non-convex and fail fusion.
-        """
-
-        class TwoAddModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                a = x + 1.0
-                return a + 2.0
-
-        exported_program = export(TwoAddModule(), (torch.randn(3, 4),), strict=True)
-        graph_module = exported_program.graph_module
-        graph = graph_module.graph
-
-        add_nodes = [
-            n
-            for n in graph.nodes
-            if n.op == "call_function" and n.target != operator.getitem
-        ]
-        first_add, second_add = add_nodes[0], add_nodes[1]
-
-        # Splice an already-lowered region between the two adds so the second add
-        # depends on the first only through that non-delegated node.
-        graph_module.lowered_module_0 = torch.nn.Module()
-        with graph.inserting_before(second_add):
-            lowered = graph.get_attr("lowered_module_0")
-            delegate = graph.call_function(
-                executorch_call_delegate, (lowered, first_add)
-            )
-            delegate_output = graph.call_function(operator.getitem, (delegate, 0))
-        second_add.replace_input_with(first_add, delegate_output)
-        graph.lint()
-
-        result = CudaPartitioner([]).partition(exported_program)
-
-        # Separated by the delegate, the adds must land in different partitions.
-        self.assertEqual(len(result.partition_tags), 2)
-        self.assertIn("delegation_tag", first_add.meta)
-        self.assertIn("delegation_tag", second_add.meta)
-        self.assertNotEqual(
-            first_add.meta["delegation_tag"], second_add.meta["delegation_tag"]
-        )
-        self.assertNotIn("delegation_tag", delegate.meta)
-        self.assertNotIn("delegation_tag", delegate_output.meta)
-
-        # Each partition must be convex on its own so fusion does not cycle.
-        for tag in result.partition_tags:
-            tagged = [
-                n
-                for n in exported_program.graph.nodes
-                if n.meta.get("delegation_tag") == tag
-            ]
-            self.assertTrue(validate_partition(tagged))
-
-    def test_control_flow_get_attr_shares_op_tag(self) -> None:
-        """A control-flow op's branch get_attrs must share the op's partition tag.
-
-        They are not call_function nodes, so the capability partitioner does not
-        claim them; they must be lowered into the same submodule as the op.
-        """
-
-        class CondModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return torch.cond(x.sum() > 0, torch.sin, torch.cos, (x,))
-
-        exported_program = export(CondModule(), (torch.randn(3, 4),), strict=True)
-        result = CudaPartitioner([]).partition(exported_program)
-
-        cond_node = next(
-            n
-            for n in exported_program.graph.nodes
-            if n.op == "call_function" and n.target is torch.ops.higher_order.cond
-        )
-        branch_get_attrs = [
-            arg
-            for arg in cond_node.args
-            if isinstance(arg, torch.fx.Node) and arg.op == "get_attr"
-        ]
-
-        self.assertEqual(len(branch_get_attrs), 2)
-        self.assertIn(cond_node.meta["delegation_tag"], result.partition_tags)
-        for get_attr in branch_get_attrs:
-            self.assertEqual(
-                get_attr.meta.get("delegation_tag"),
-                cond_node.meta["delegation_tag"],
-            )
-
-    def test_shared_constant_across_partitions(self) -> None:
-        """A constant read by two partitions is claimed, not dropped.
-
-        tag_constant_data assigns it one partition's tag; backend lowering later
-        duplicates it per consumer, so partitioning must not crash or drop it.
-        """
-
-        class SharedWeightModule(torch.nn.Module):
-            def __init__(self) -> None:
-                super().__init__()
-                self.register_buffer("w", torch.randn(3, 4))
-
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return (x + self.w) + self.w
-
-        exported_program = export(
-            SharedWeightModule(), (torch.randn(3, 4),), strict=True
-        )
-        graph_module = exported_program.graph_module
-        graph = graph_module.graph
-
-        add_nodes = [
-            n
-            for n in graph.nodes
-            if n.op == "call_function" and n.target != operator.getitem
-        ]
-        first_add, second_add = add_nodes[0], add_nodes[1]
-
-        # Split the two adds (both reading w) with an already-lowered region.
-        graph_module.lowered_module_0 = torch.nn.Module()
-        with graph.inserting_before(second_add):
-            lowered = graph.get_attr("lowered_module_0")
-            delegate = graph.call_function(
-                executorch_call_delegate, (lowered, first_add)
-            )
-            delegate_output = graph.call_function(operator.getitem, (delegate, 0))
-        second_add.replace_input_with(first_add, delegate_output)
-        graph.lint()
-
-        result = CudaPartitioner([]).partition(exported_program)
-
-        # Two islands, and the shared buffer is claimed by one of them, not dropped.
-        self.assertEqual(len(result.partition_tags), 2)
-        buffer_placeholder = next(
-            n
-            for n in graph.nodes
-            if n.op == "placeholder" and is_buffer(exported_program, n)
-        )
-        self.assertIn(
-            buffer_placeholder.meta.get("delegation_tag"), result.partition_tags
-        )
diff --git a/backends/cuda/tests/test_tq4_sdpa.py b/backends/cuda/tests/test_tq4_sdpa.py
index f9543b1ff18..9cf1e9e2d57 100644
--- a/backends/cuda/tests/test_tq4_sdpa.py
+++ b/backends/cuda/tests/test_tq4_sdpa.py
@@ -20,6 +20,7 @@
 import numpy as np
 import torch
 import torch.nn.functional as F
+
 from executorch.backends.cuda.cuda_backend import CudaBackend
 from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 from executorch.backends.cuda.triton.kernels.tq4_sdpa import tq4_sdpa
@@ -252,7 +253,7 @@ def test_gqa_prefill(self):
                 self._run_test(1, H_q, H_kv, 64, 64, 128, is_causal=True)
 
     def test_gqa_8x_head_dim_256(self):
-        """GQA 8:1 with head_dim=256."""
+        """GQA 8:1 with head_dim=256 — matches Qwen 3.5 MoE config."""
         self._run_test(1, 16, 2, 1, 128, 256)
         L = 64
         mask = torch.tril(torch.ones(1, 1, L, L, dtype=torch.bool, device="cuda"))
@@ -374,8 +375,8 @@ def test_float_mask_rejected(self):
                 float_mask,
             )
 
-    def test_config_hd256_gqa_16_2(self):
-        """head_dim=256, GQA 16:2, decode + prefill."""
+    def test_qwen35_moe_config(self):
+        """Qwen 3.5 MoE: head_dim=256, GQA 16:2, decode + prefill."""
         self._run_test(1, 16, 2, 1, 256, 256)
         self._run_test(1, 16, 2, 128, 128, 256, is_causal=True)
 
@@ -437,437 +438,6 @@ def test_output_shape_and_dtype(self):
                     self.assertEqual(out.shape, (1, H_q, Lq, D))
                     self.assertEqual(out.dtype, torch.bfloat16)
 
-    # ------------------------------------------------------------------
-    # 128k code path: kv_len clamp (decode) + mask_is_causal (prefill)
-    #
-    # Every test above calls tq4_sdpa WITHOUT kv_len and WITHOUT
-    # mask_is_causal, so they only exercise the kv_len=None fallback
-    # (full-Lk loop) at short KV. The cases below drive the actual
-    # long-context paths at two representative GQA shapes (head_dim=512
-    # GQA 8:4, and head_dim=256 GQA 16:2):
-    #   * the on-device kv_len scalar that bounds the KV loop to the
-    #     filled context (decode), and
-    #   * the mask_is_causal per-tile causal block-skip (prefill).
-    #
-    # "GARBAGE TAIL": in production the KV cache is a fixed buffer
-    # pre-allocated to max_seq_len (e.g. 131072). At any step only the
-    # first kv_len positions hold real K/V; the rest is stale /
-    # uninitialized memory that attention must ignore. We simulate that
-    # tail by writing large-magnitude (x1000) values into [kv_len:]. If
-    # the clamp / block-skip works the kernel never reads the tail and
-    # the output matches a reference built from [0, kv_len) only; if it
-    # is broken the huge tail values dominate the softmax and the cosine
-    # collapses to ~0. So the garbage tail is a built-in negative control
-    # (verified: dropping kv_len drives the cosine to ~-0.01 and fails).
-    #
-    # CAUSAL ALIGNMENT (top-left vs bottom-right): when L_q < L_kv (a
-    # chunked prefill / decode, where the Lq new queries sit at the END
-    # of a kv_len-long context) there are two ways to place the causal
-    # triangle. PyTorch F.sdpa(is_causal=True) uses TOP-LEFT alignment
-    # (query row i attends to keys [0, i]) -- wrong for a KV cache. This
-    # kernel (and a KV-cache decoder's mask builder) use BOTTOM-RIGHT
-    # alignment: query row i is absolute position (kv_len - Lq + i) and
-    # attends to keys [0, kv_len - Lq + i]. So the reference below builds
-    # an explicit bottom-right mask (q_pos >= cache_pos) rather than
-    # passing is_causal=True, which would otherwise mismatch the kernel.
-    # ------------------------------------------------------------------
-
-    def _run_long_kv_test(
-        self,
-        *,
-        H_q,
-        H_kv,
-        D,
-        Lq,
-        kv_len,
-        buffer_len,
-        causal=False,
-        garbage=True,
-        pass_kv_len=True,
-        min_cosine=0.99,
-        seed=42,
-    ):
-        """Drive tq4_sdpa over a buffer whose first ``kv_len`` positions are
-        real and whose ``[kv_len:]`` tail is large-magnitude garbage, then
-        compare against an fp32 reference built from the first ``kv_len``
-        positions only.
-
-        The kernel sees the full (garbage-tailed) compressed buffer; the
-        on-device ``kv_len`` scalar (and, for prefill, the bottom-right
-        causal mask) must confine attention to ``[0, kv_len)``.
-
-        ``causal=True`` builds a bottom-right-aligned mask (the Lq queries
-        are the last Lq positions of a kv_len-long context), mirroring a
-        KV-cache decoder's ``q_pos >= cache_pos`` mask and the kernel's
-        ``(kv_len - Lq) + seq_pos`` block bound. We deliberately do NOT use
-        ``F.sdpa(is_causal=True)`` for the reference: PyTorch aligns
-        is_causal top-left when L_q < L_kv, while this kernel (and such a
-        decoder) align bottom-right.
-        """
-        torch.manual_seed(seed)
-        centroids, boundaries, rotation = _make_codebook_and_rotation(D)
-        centroids = centroids.cuda()
-        boundaries = boundaries.cuda()
-        rotation = rotation.cuda()
-
-        B = 1
-        k = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda")
-        v = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda")
-        if garbage and buffer_len > kv_len:
-            g = buffer_len - kv_len
-            k[:, :, kv_len:, :] = (
-                torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0
-            )
-            v[:, :, kv_len:, :] = (
-                torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0
-            )
-
-        q = torch.randn(B, H_q, Lq, D, dtype=torch.bfloat16, device="cuda")
-
-        k_packed, k_norms = _compress(k, boundaries, rotation)
-        v_packed, v_norms = _compress(v, boundaries, rotation)
-
-        attn_mask = None
-        if causal:
-            cache_pos = torch.arange(buffer_len, device="cuda")
-            q_pos = torch.arange(kv_len - Lq, kv_len, device="cuda").unsqueeze(1)
-            attn_mask = (q_pos >= cache_pos.unsqueeze(0)).view(1, 1, Lq, buffer_len)
-
-        kv_len_t = (
-            torch.tensor([kv_len], dtype=torch.int32, device="cuda")
-            if pass_kv_len
-            else None
-        )
-
-        out = self.tq4_sdpa(
-            q,
-            k_packed,
-            k_norms,
-            v_packed,
-            v_norms,
-            centroids,
-            rotation,
-            attn_mask=attn_mask,
-            is_causal=False,
-            scale=None,
-            kv_len=kv_len_t,
-            mask_is_causal=causal,
-        )
-
-        # Reference: the same decompress-then-fp32-SDPA path the other tests
-        # use (_reference_tq4_sdpa), but over ONLY the first kv_len positions
-        # so the garbage tail can never influence it. _compress is per-row,
-        # so compressing the sliced K/V here is bit-identical to the kernel's
-        # view of the full buffer sliced to [:, :, :kv_len]; the helper also
-        # handles the GQA repeat_interleave and mask broadcast internally.
-        ref_mask = attn_mask[:, :, :, :kv_len] if attn_mask is not None else None
-        ref, *_ = _reference_tq4_sdpa(
-            q,
-            k[:, :, :kv_len],
-            v[:, :, :kv_len],
-            centroids,
-            boundaries,
-            rotation,
-            attn_mask=ref_mask,
-        )
-
-        self.assertFalse(torch.isnan(out).any(), "NaN in output")
-        cos = _cosine_sim(out, ref)
-        self.assertGreater(
-            cos,
-            min_cosine,
-            f"Cosine {cos:.5f} < {min_cosine} "
-            f"(H_q={H_q} H_kv={H_kv} D={D} Lq={Lq} kv_len={kv_len} "
-            f"buffer={buffer_len} causal={causal} kv_len_passed={pass_kv_len})",
-        )
-        return cos
-
-    def _run_splitk_vs_fused_test(
-        self,
-        *,
-        H_q,
-        H_kv,
-        D,
-        Lq,
-        kv_len,
-        buffer_len,
-        B=1,
-        seed=42,
-    ):
-        """Verify split-K output matches fused kernel output for same inputs.
-
-        Runs tq4_sdpa twice: once with kv_len (triggers split-K for Lq=1, kv_len>=256),
-        and once without kv_len (forces fused kernel path). Both outputs must match
-        within fp tolerance, proving split-K computes the same result.
-        """
-        torch.manual_seed(seed)
-        centroids, boundaries, rotation = _make_codebook_and_rotation(D)
-        centroids = centroids.cuda()
-        boundaries = boundaries.cuda()
-        rotation = rotation.cuda()
-
-        k = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda")
-        v = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda")
-        # Add garbage tail to ensure split-K respects kv_len bound
-        if buffer_len > kv_len:
-            g = buffer_len - kv_len
-            k[:, :, kv_len:, :] = (
-                torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0
-            )
-            v[:, :, kv_len:, :] = (
-                torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0
-            )
-
-        q = torch.randn(B, H_q, Lq, D, dtype=torch.bfloat16, device="cuda")
-
-        k_packed, k_norms = _compress(k, boundaries, rotation)
-        v_packed, v_norms = _compress(v, boundaries, rotation)
-
-        # Split-K path: with kv_len (triggers split-K for Lq=1, kv_len>=256)
-        kv_len_t = torch.tensor([kv_len], dtype=torch.int32, device="cuda")
-        out_splitk = self.tq4_sdpa(
-            q,
-            k_packed,
-            k_norms,
-            v_packed,
-            v_norms,
-            centroids,
-            rotation,
-            attn_mask=None,
-            is_causal=False,
-            scale=None,
-            kv_len=kv_len_t,
-            mask_is_causal=False,
-        )
-
-        # Fused kernel path: without kv_len (forces fused kernel)
-        # But we need to slice the buffer to kv_len to avoid garbage
-        k_packed_sliced = k_packed[:, :, :kv_len, :]
-        k_norms_sliced = k_norms[:, :, :kv_len, :]
-        v_packed_sliced = v_packed[:, :, :kv_len, :]
-        v_norms_sliced = v_norms[:, :, :kv_len, :]
-
-        out_fused = self.tq4_sdpa(
-            q,
-            k_packed_sliced,
-            k_norms_sliced,
-            v_packed_sliced,
-            v_norms_sliced,
-            centroids,
-            rotation,
-            attn_mask=None,
-            is_causal=False,
-            scale=None,
-            kv_len=None,
-            mask_is_causal=False,
-        )
-
-        # Both outputs must match (split-K computes same result as fused)
-        self.assertFalse(torch.isnan(out_splitk).any(), "NaN in split-K output")
-        self.assertFalse(torch.isnan(out_fused).any(), "NaN in fused output")
-        cos = _cosine_sim(out_splitk, out_fused)
-        self.assertGreater(
-            cos,
-            0.99,
-            f"Split-K vs Fused cosine {cos:.5f} < 0.99 "
-            f"(B={B} H_q={H_q} H_kv={H_kv} D={D} kv_len={kv_len})",
-        )
-
-    def test_splitk_batch2(self):
-        """Split-K decode (Lq=1) with batch size B=2.
-
-        Exercises the per-batch indexing in the split-K and reduce kernels
-        (b = pid_bh // H_grid). Split-K output must match the fused-kernel
-        path for the same inputs."""
-        self._run_splitk_vs_fused_test(
-            H_q=16, H_kv=2, D=256, Lq=1, kv_len=512, buffer_len=1024, B=2
-        )
-
-    def test_splitk_noncontiguous_query(self):
-        """Split-K decode (Lq=1, B=2) with a non-contiguous query.
-
-        The host wrapper rotates Q (Q @ Pi^T) before launching the kernel,
-        so a strided query must yield the same result as its contiguous
-        copy. Builds a query whose last-dim stride is 2 by slicing a padded
-        buffer, then checks it matches the contiguous query."""
-        H_q, H_kv, D, kv_len, B = 16, 2, 256, 512, 2
-        torch.manual_seed(42)
-        centroids, boundaries, rotation = _make_codebook_and_rotation(D)
-        centroids = centroids.cuda()
-        boundaries = boundaries.cuda()
-        rotation = rotation.cuda()
-
-        k = torch.randn(B, H_kv, kv_len, D, dtype=torch.bfloat16, device="cuda")
-        v = torch.randn(B, H_kv, kv_len, D, dtype=torch.bfloat16, device="cuda")
-        k_packed, k_norms = _compress(k, boundaries, rotation)
-        v_packed, v_norms = _compress(v, boundaries, rotation)
-
-        q = torch.randn(B, H_q, 1, D, dtype=torch.bfloat16, device="cuda")
-        # Non-contiguous alias with identical values (last-dim stride 2).
-        q_pad = torch.empty(B, H_q, 1, D, 2, dtype=torch.bfloat16, device="cuda")
-        q_pad[..., 0] = q
-        q_nc = q_pad[..., 0]
-        self.assertFalse(q_nc.is_contiguous(), "query should be non-contiguous")
-
-        kv_len_t = torch.tensor([kv_len], dtype=torch.int32, device="cuda")
-
-        def _run(query):
-            return self.tq4_sdpa(
-                query,
-                k_packed,
-                k_norms,
-                v_packed,
-                v_norms,
-                centroids,
-                rotation,
-                attn_mask=None,
-                is_causal=False,
-                scale=None,
-                kv_len=kv_len_t,
-                mask_is_causal=False,
-            )
-
-        out_contig = _run(q)
-        out_nc = _run(q_nc)
-
-        self.assertFalse(torch.isnan(out_nc).any(), "NaN in non-contiguous output")
-        cos = _cosine_sim(out_nc, out_contig)
-        self.assertGreater(
-            cos, 0.999, f"non-contiguous vs contiguous query cosine {cos:.5f}"
-        )
-
-    def test_kv_len_clamp_decode_hd512_gqa_8_4(self):
-        """Decode (Lq=1) kv_len clamp at a head_dim=512, GQA 8:4 shape.
-        N=8192 leaves a 24k garbage tail in a 32k buffer (clamp guard);
-        N=32768 fills the buffer (full 32k loop)."""
-        for N in (8192, 32768):
-            with self.subTest(N=N):
-                self._run_long_kv_test(
-                    H_q=8, H_kv=4, D=512, Lq=1, kv_len=N, buffer_len=32768
-                )
-
-    def test_kv_len_clamp_decode_hd512_gqa_8_4_splitk(self):
-        """Split-K decode (Lq=1) at a head_dim=512, GQA 8:4 shape with long
-        KV. Verifies split-K output matches BOTH (a) fp32 reference over first
-        kv_len positions AND (b) existing fused-kernel output (byte-identical
-        within fp tolerance). Uses garbage tail as negative control."""
-        for N in (8192, 32768):
-            with self.subTest(N=N):
-                # Run with split-K (kv_len >= 256 triggers split-K)
-                _ = self._run_long_kv_test(
-                    H_q=8,
-                    H_kv=4,
-                    D=512,
-                    Lq=1,
-                    kv_len=N,
-                    buffer_len=32768,
-                    min_cosine=0.99,
-                )
-                # Also verify split-K matches fused kernel by running without kv_len
-                # (which forces fused kernel path) and comparing outputs
-                self._run_splitk_vs_fused_test(
-                    H_q=8, H_kv=4, D=512, Lq=1, kv_len=N, buffer_len=32768
-                )
-
-    def test_kv_len_clamp_decode_hd256_gqa_16_2(self):
-        """Decode (Lq=1) kv_len clamp at a head_dim=256, GQA 16:2 shape."""
-        for N in (8192, 32768):
-            with self.subTest(N=N):
-                self._run_long_kv_test(
-                    H_q=16, H_kv=2, D=256, Lq=1, kv_len=N, buffer_len=32768
-                )
-
-    def test_kv_len_clamp_decode_hd256_gqa_16_2_splitk(self):
-        """Split-K decode (Lq=1) at a head_dim=256, GQA 16:2 shape with long
-        KV. Verifies split-K output matches BOTH fp32 reference AND fused
-        kernel."""
-        for N in (8192, 32768):
-            with self.subTest(N=N):
-                _ = self._run_long_kv_test(
-                    H_q=16,
-                    H_kv=2,
-                    D=256,
-                    Lq=1,
-                    kv_len=N,
-                    buffer_len=32768,
-                    min_cosine=0.99,
-                )
-                self._run_splitk_vs_fused_test(
-                    H_q=16, H_kv=2, D=256, Lq=1, kv_len=N, buffer_len=32768
-                )
-
-    def test_mask_is_causal_prefill_hd512_gqa_8_4(self):
-        """Chunked prefill (Lq>1) with mask_is_causal at a head_dim=512,
-        GQA 8:4 shape. The Lq queries are the last Lq of a kv_len-long
-        context; the per-tile causal block-skip plus bottom-right mask must
-        match the fp32 causal reference over the first kv_len positions. A
-        garbage tail beyond kv_len also exercises the clamp."""
-        for Lq, kv_len, buf in ((256, 4096, 8192), (2048, 8192, 16384)):
-            with self.subTest(Lq=Lq, kv_len=kv_len):
-                self._run_long_kv_test(
-                    H_q=8,
-                    H_kv=4,
-                    D=512,
-                    Lq=Lq,
-                    kv_len=kv_len,
-                    buffer_len=buf,
-                    causal=True,
-                )
-
-    def test_mask_is_causal_prefill_hd256_gqa_16_2(self):
-        """Chunked prefill (Lq>1) with mask_is_causal at a head_dim=256,
-        GQA 16:2 shape."""
-        for Lq, kv_len, buf in ((256, 4096, 8192), (2048, 8192, 16384)):
-            with self.subTest(Lq=Lq, kv_len=kv_len):
-                self._run_long_kv_test(
-                    H_q=16,
-                    H_kv=2,
-                    D=256,
-                    Lq=Lq,
-                    kv_len=kv_len,
-                    buffer_len=buf,
-                    causal=True,
-                )
-
-    def test_kv_len_none_fallback_hd256_gqa_16_2(self):
-        """Regression: the kv_len=None fallback (HAS_KV_LEN False, full-Lk
-        loop) still matches the fp32 reference. This guards the original
-        behavior the kv_len feature must preserve for callers that pass
-        neither kv_len nor mask_is_causal."""
-        self._run_long_kv_test(
-            H_q=16,
-            H_kv=2,
-            D=256,
-            Lq=1,
-            kv_len=256,
-            buffer_len=256,
-            garbage=False,
-            pass_kv_len=False,
-        )
-
-    @unittest.skipUnless(
-        os.environ.get("TQ4_RUN_128K") == "1",
-        "128k case is heavy for the 24GB CI runner; set TQ4_RUN_128K=1 to run",
-    )
-    def test_kv_len_clamp_128k(self):
-        """Full 131072-entry buffer (head_dim=256, GQA 16:2). (a) kv_len=8192
-        with a ~123k garbage tail — the clamp keeps decode O(context) and
-        never touches the tail; (b) kv_len=131072 — correctness at true 128k
-        scale. Gated behind TQ4_RUN_128K because the fp32 reference for (b)
-        needs >~6GB and CI runs on a 24GB A10G."""
-        self._run_long_kv_test(
-            H_q=16, H_kv=2, D=256, Lq=1, kv_len=8192, buffer_len=131072
-        )
-        self._run_long_kv_test(
-            H_q=16,
-            H_kv=2,
-            D=256,
-            Lq=1,
-            kv_len=131072,
-            buffer_len=131072,
-            garbage=False,
-        )
-
     # ------------------------------------------------------------------
     # Validation errors
     # ------------------------------------------------------------------
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py
index ff8cbb660cb..6ec8ee80688 100644
--- a/backends/nxp/backend/edge_program_converter.py
+++ b/backends/nxp/backend/edge_program_converter.py
@@ -39,7 +39,6 @@
     exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter,  # noqa F405
     exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter,  # noqa F405
     exir_ops.edge.aten.convolution.default: ConvolutionConverter,  # noqa F405
-    exir_ops.edge.aten.exp.default: ExpConverter,  # noqa F405
     exir_ops.edge.aten.hardtanh.default: HardTanhConverter,  # noqa F405
     exir_ops.edge.aten.leaky_relu.default: LeakyReluConverter,  # noqa F405
     exir_ops.edge.aten.log.default: LogConverter,  # noqa F405
diff --git a/backends/nxp/backend/graph_utils.py b/backends/nxp/backend/graph_utils.py
index f5d8e16475c..88cd996d6fd 100644
--- a/backends/nxp/backend/graph_utils.py
+++ b/backends/nxp/backend/graph_utils.py
@@ -56,7 +56,7 @@ def get_output_shape(node: Node) -> tuple[torch.Size] | torch.Size | None:
 
 
 def is_clamp_preserved_under_quantization(
-    node: Node, min_val: float = 0, max_val: float | None = None
+    node: Node, min_val: int = 0, max_val: int | None = None
 ) -> bool:
     """
     Checks if Clamp/ReLU/HardTanh is preserved under quantization and did
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
index 93ba24e61bd..5f19b2e48dc 100755
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
@@ -31,9 +31,6 @@
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.convolution_converter import (
     ConvolutionConverter,
 )
-from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.exp_converter import (
-    ExpConverter,
-)
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.getitem_converter import (
     GetItemConverter,
 )
@@ -114,7 +111,6 @@
     "CloneConverter",
     "ConstantPadNDConverter",
     "ConvolutionConverter",
-    "ExpConverter",
     "GetItemConverter",
     "HardTanhConverter",
     "LeakyReluConverter",
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py
index a1e8c19e9bd..25cf6074701 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py
@@ -42,6 +42,17 @@
 from torch.nn import Parameter
 
 
+def _is_convertible_to_relu(node):
+    bounds = ClampConverter._get_clamp_bounds(node)
+    bounds = tuple(v if v is not None and math.isfinite(v) else None for v in bounds)
+
+    # Some specific bounds can be replaced with single op ReLU.
+    if bounds not in ClampConverter.RELU_COMPATIBLE_BOUNDS.values():
+        return False
+
+    return True
+
+
 class ClampConverter(NodeConverter):
     RELU_COMPATIBLE_BOUNDS = {
         "ReluN1To1": (-1, 1),
@@ -59,25 +70,12 @@ class ClampConverter(NodeConverter):
 
     # noinspection PyShadowingBuiltins
     @staticmethod
-    def _get_bounds(node: Node) -> tuple[float | None, float | None]:
+    def _get_clamp_bounds(clamp_node: Node) -> tuple[float | None, float | None]:
         """Extract min and max bounds from `aten.clamp.default` node."""
-        min = try_get_arg(node, 1)
-        max = try_get_arg(node, 2)
+        min = try_get_arg(clamp_node, 1)
+        max = try_get_arg(clamp_node, 2)
         return min, max
 
-    @classmethod
-    def _is_convertible_to_relu(cls, node):
-        bounds = cls._get_bounds(node)
-        bounds = tuple(
-            v if v is not None and math.isfinite(v) else None for v in bounds
-        )
-
-        # Some specific bounds can be replaced with single op ReLU.
-        if bounds not in cls.RELU_COMPATIBLE_BOUNDS.values():
-            return False
-
-        return True
-
     @staticmethod
     def _is_supported_in_IR(
         node: Node,
@@ -102,21 +100,20 @@ def _io_quant_is_same(node: Node):
         dq_params = dequant.args[1:]
         return all(q == dq for q, dq in zip(q_params, dq_params))
 
-    @classmethod
+    @staticmethod
     def _is_supported_on_target(
-        cls,
         node: Node,
         neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        relu_compatible = cls._is_convertible_to_relu(node)
-        bounds = cls._get_bounds(node)
+        relu_compatible = _is_convertible_to_relu(node)
+        bounds = ClampConverter._get_clamp_bounds(node)
 
         if all(b is None or math.isinf(b) for b in bounds):
             return False
 
-        io_quant_consistent = cls._io_quant_is_same(node)
+        io_quant_consistent = ClampConverter._io_quant_is_same(node)
         quant_supported = NodeConverter.uses_quantization_type_for_io(
             node,
             supported_types=[torch.int8, torch.uint8],
@@ -141,20 +138,19 @@ def supports_partitioning_result(
         neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
     ) -> bool:
-        bounds = cls._get_bounds(node)
+        bounds = cls._get_clamp_bounds(node)
 
         # Neutron cannot delegate a partition where ReLU or ReLU6 is the only operator
         # and at the same time the node does not satisfy delegation requirements.
-        # In contrast, ReLUN1To1 and ReLU0To1 are supported and delegated successfully.
+        # In contrast, ReLUN1To1 and ReLU0To1 are supported and delegated successfully.
         if bounds in cls.RELU_COMPATIBLE_BOUNDS.values():
             is_alone_in_partition = cls.is_node_alone_in_partition(
                 node, partition_list, filter_fn=is_not_qdq_node
             )
             if is_alone_in_partition:
-                # noinspection PyTypeChecker
                 return is_clamp_preserved_under_quantization(
                     node,
-                    min_val=bounds[0] if bounds[0] is not None else 0,
+                    min_val=bounds[0],
                     max_val=bounds[1],
                 )
 
@@ -171,9 +167,9 @@ def convert(self, node: Node):
             ) -> Tensor
         """
         self.assert_convertible(node)
-        to_relu = self._is_convertible_to_relu(node)
+        to_relu = _is_convertible_to_relu(node)
 
-        bounds = self._get_bounds(node)
+        bounds = self._get_clamp_bounds(node)
         bounds = tuple(
             v if v is not None and math.isfinite(v) else None for v in bounds
         )
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py
index 0159143c5f7..f67851895c2 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py
@@ -3,16 +3,43 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-
-from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.clamp_converter import (
-    ClampConverter,
+from executorch.backends.nxp.backend.ir.converter.node_converter import (
+    CustomDelegationOptions,
+    is_not_qdq_node,
+    NodeConverter,
+    Partition,
+)
+from executorch.backends.nxp.backend.ir.lib.tflite.BuiltinOperator import (
+    BuiltinOperator,
 )
+from executorch.backends.nxp.backend.neutron_operator_support import (
+    activation_supported_on_target,
+)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.fx import Node
+from torch.nn import Parameter
+
 
+class HardTanhConverter(NodeConverter):
+
+    # Maps possible input parameters of HardTanh to equivalent ReLU-based operators supported by TFLite.
+    SUPPORTED_MODES_MAP = {
+        (0.0, 6.0): BuiltinOperator.RELU6,
+        (-1.0, 1.0): BuiltinOperator.RELU_N1_TO_1,
+        (0.0, 1.0): BuiltinOperator.RELU_0_TO_1,
+        (0.0, float("inf")): BuiltinOperator.RELU,
+    }
+
+    # Maps possible modes of HardTanh to equivalent ReLU bounds.
+    SUPPORTED_BOUNDS_MAP = {
+        "ReluN1To1": (-1.0, 1.0),
+        "Relu0To1": (0.0, 1.0),
+        "Relu6": (0.0, 6.0),
+        "Relu": (0.0, float("inf")),
+    }
 
-class HardTanhConverter(ClampConverter):
     @staticmethod
-    def _get_bounds(node: Node) -> tuple[float | None, float | None]:
+    def _get_hardtanh_bounds(node: Node) -> tuple[float, float]:
         args = node.args
 
         match len(args):
@@ -35,3 +62,51 @@ def _get_bounds(node: Node) -> tuple[float | None, float | None]:
                 )
 
         return min_val, max_val
+
+    @staticmethod
+    def _is_supported_in_IR(
+        node: Node,
+        parameters_mapping: dict[str, Parameter],
+        custom_delegation_options: CustomDelegationOptions,
+    ) -> bool:
+        bounds = HardTanhConverter._get_hardtanh_bounds(node)
+        return bounds in HardTanhConverter.SUPPORTED_MODES_MAP
+
+    @classmethod
+    def supports_partitioning_result(
+        cls,
+        node: Node,
+        partition_list: list[Partition],
+        custom_delegation_options: CustomDelegationOptions,
+        neutron_target_spec: NeutronTargetSpec,
+        parameters_mapping: dict[str, Parameter],
+    ) -> bool:
+        bounds = HardTanhConverter._get_hardtanh_bounds(node)
+
+        # Neutron cannot delegate a partition where ReLU or ReLU6 is the only operator
+        # and at the same time the node does not satisfy delegation requirements.
+        # In contrast, ReLUN1To1 and ReLU0To1 are supported and delegated successfully.
+        if bounds in [
+            cls.SUPPORTED_BOUNDS_MAP["Relu"],
+            cls.SUPPORTED_BOUNDS_MAP["Relu6"],
+        ]:
+            is_alone_in_partition = cls.is_node_alone_in_partition(
+                node, partition_list, filter_fn=is_not_qdq_node
+            )
+            if is_alone_in_partition:
+                return activation_supported_on_target(node)
+
+        return True
+
+    def convert(self, node: Node):
+        """Convert 'aten::hardtanh' to its supported ReLU equivalent."""
+        self.assert_convertible(node)
+
+        t_op = self._create_tflite_op_with_io_tensors(node)
+
+        bounds = HardTanhConverter._get_hardtanh_bounds(node)
+
+        op = self.SUPPORTED_MODES_MAP[bounds]
+        t_op.opcode_index = self.builder.op_code_index_for_op_type(op)
+
+        self.builder.append_operators([t_op])
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
index 8674bf697c7..a76abfbef91 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
@@ -5,9 +5,6 @@
 
 import torch
 
-from executorch.backends.nxp.backend.data_format import DataFormat
-from executorch.backends.nxp.backend.ir.converter.conversion import translator
-from executorch.backends.nxp.backend.ir.converter.conversion.common import OpsList
 from executorch.backends.nxp.backend.ir.converter.conversion.translator import (
     create_channels_last_to_channels_first_permutation,
 )
@@ -92,15 +89,10 @@ def _is_supported_in_IR(
     def _to_pos_dim(d: int, rank: int):
         return d + rank if d < 0 else d
 
-    @staticmethod
-    def _normalize_dim(dim: list[int], rank: int) -> list[int]:
-        # convert negative index to positive
-        return [MeanDimConverter._to_pos_dim(d, rank) for d in dim]
-
     @staticmethod
     def _normalize_and_to_channel_last_dim(dim: list[int], rank: int) -> list[int]:
         # convert negative index to positive
-        dim = MeanDimConverter._normalize_dim(dim, rank)
+        dim = [MeanDimConverter._to_pos_dim(d, rank) for d in dim]
 
         perm = create_channels_last_to_channels_first_permutation(rank, True)
         dim = [perm[d] for d in dim]
@@ -114,114 +106,6 @@ def _get_attrs(node: Node) -> tuple[list[int], bool]:
         keepdim = node.args[2] if len(node.args) >= 3 else False
         return dim, keepdim
 
-    def _get_dim_and_handle_io_formats(
-        self, ops: OpsList, dim: list[int], keep_dim: bool
-    ):
-        t_op = ops.middle_op
-        x = t_op.tmp_inputs[0]
-        y = t_op.tmp_outputs[0]
-
-        channels_last_input = x.tensor_format.is_channels_last()
-        channels_last_output = y.tensor_format.is_channels_last()
-        formatless_input = not channels_last_input
-        formatless_output = not channels_last_output
-
-        dim = self._normalize_dim(dim, x.rank)
-
-        if keep_dim:
-            # The rank is preserved and the io formats should always be equal.
-            assert (
-                x.tensor_format == y.tensor_format
-            ), "NXP backend: There is a bug in `mean.dim` format inference."
-
-            # Just adjust the dim to match the input format.
-            if channels_last_input:
-                dim = self._normalize_and_to_channel_last_dim(dim, x.rank)
-
-        else:
-            # `keep_dim = False`, so the output rank != input rank, and the operator changes the tensor format.
-
-            if channels_last_input and formatless_output:
-                if 1 in dim:
-                    # If we are reducing over the channels, the channels dimension gets removed and the output ends up
-                    #  exactly equal in channels last and channels first, regardless of which other dimensions are
-                    #  removed. Therefore, we can just adjust the `dim` and we don't need to insert any `Transpose` ops.
-                    dim = self._normalize_and_to_channel_last_dim(dim, x.rank)
-                elif all(spatial_dim in dim for spatial_dim in range(2, x.rank)):
-                    # All spatial dims are reduced, leaving only batch and channels (both optionally). So the result is
-                    #  equal in channels first and channels last as long as we adjust the `dim` to match a channels last
-                    #  input (similarly to the case above).
-                    dim = self._normalize_and_to_channel_last_dim(dim, x.rank)
-                else:
-                    # If the channels dimension is preserved, we must transpose the input to channels first (to match
-                    #  the edge model) and we must keep the `dim` unchanged (referencing channels first dimensions).
-                    #  Otherwise, the output would not match the input.
-                    to_channels_first_perm = (
-                        translator.create_channels_last_to_channels_first_permutation(
-                            x.rank
-                        )
-                    )
-                    ops.add_pre(
-                        self.builder.create_transpose_operator_before(
-                            t_op, 0, to_channels_first_perm
-                        )
-                    )
-                    t_op.tmp_inputs[0].tensor_format = DataFormat.CHANNELS_FIRST
-
-            elif formatless_input and channels_last_output:
-                # We need apply the `mean` with the original `dim`, which will produce a channels first output. Then,
-                #  we need to append a `Transpose` operator to make the output channels last.
-                to_channels_last_perm = (
-                    translator.create_channels_first_to_channels_last_permutation(
-                        y.rank, True
-                    )
-                )
-                ops.add_post(
-                    self.builder.create_transpose_operator_after(
-                        t_op, 0, to_channels_last_perm
-                    )
-                )
-                t_op.tmp_outputs[0].tensor_format = DataFormat.CHANNELS_FIRST
-
-            elif formatless_input and formatless_output:
-                # No action needed.
-                pass
-
-            else:  # channels_last_input and channels_last_output
-                # This case cannot currently occur, as it would require the case:
-                #       channels last 4D -> mean -> channels_last 3D
-                #  which cannot currently happen as the 3D conv/pooling/... is supported by adding `view_copy` nodes in
-                #  the edge dialect and converting the node to 4D, and the `view_copy` nodes prevent the propagation of
-                #  the format to the `mean.dim` output.
-                # Therefore, the implementation cannot be tested. But from experience with other operators, it should
-                #  work correctly. We just need to add 2 `Transpose` ops to make the IO channels first, and keep the
-                #  `dim` unchanged.
-                to_channels_first_perm = (
-                    translator.create_channels_last_to_channels_first_permutation(
-                        x.rank
-                    )
-                )
-                ops.add_pre(
-                    self.builder.create_transpose_operator_before(
-                        t_op, 0, to_channels_first_perm
-                    )
-                )
-                t_op.tmp_inputs[0].tensor_format = DataFormat.CHANNELS_FIRST
-
-                to_channels_last_perm = (
-                    translator.create_channels_first_to_channels_last_permutation(
-                        y.rank, True
-                    )
-                )
-                ops.add_post(
-                    self.builder.create_transpose_operator_after(
-                        t_op, 0, to_channels_last_perm
-                    )
-                )
-                t_op.tmp_outputs[0].tensor_format = DataFormat.CHANNELS_FIRST
-
-        return dim
-
     def convert(self, node: Node):
         """Convert the 'mean.dim' operator to NeutronIR 'Mean'.
         The ExecuTorch schema is:
@@ -239,9 +123,10 @@ def convert(self, node: Node):
 
         t_op = self._create_tflite_op_with_io_tensors(node)
         t_op.builtin_options = mean_options.Mean(keepdim)
+        x = t_op.tmp_inputs[0]
 
-        ops = OpsList(middle_op=t_op)
-        dim = self._get_dim_and_handle_io_formats(ops, dim, keepdim)
+        if x.tensor_format.is_channels_last():
+            dim = self._normalize_and_to_channel_last_dim(dim, x.rank)
 
         convert_axes_from_attribute(t_op, self.builder, dim)
-        self.builder.append_operators(ops.flatten())
+        self.builder.append_operators([t_op])
diff --git a/backends/nxp/backend/ir/converter/quantization_utils.py b/backends/nxp/backend/ir/converter/quantization_utils.py
index f3fe868ae83..ba4ad14222b 100755
--- a/backends/nxp/backend/ir/converter/quantization_utils.py
+++ b/backends/nxp/backend/ir/converter/quantization_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2026 NXP
+# Copyright 2023-2025 NXP
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -135,12 +135,11 @@ def set_quantization_parameters_to_tensor(
 def quantize_int8(
     data: np.ndarray, scale: List[float], zero_point: List[int]
 ) -> np.ndarray:
-    # noinspection PyTypeChecker
     return quantize(data, zero_point=zero_point, scale=scale)
 
 
 def quantize(
-    value: np.ndarray | float,
+    value: np.ndarray | int,
     zero_point: List[int] | int,
     scale: List[float] | float,
     quant_min: int = -128,
diff --git a/backends/nxp/backend/node_format_inference.py b/backends/nxp/backend/node_format_inference.py
index 030873c88ab..65e34b7fbde 100644
--- a/backends/nxp/backend/node_format_inference.py
+++ b/backends/nxp/backend/node_format_inference.py
@@ -9,27 +9,10 @@
 import torch
 
 from executorch.backends.nxp.backend.data_format import DataFormat, NXP_NODE_FORMAT
-from executorch.backends.nxp.backend.edge_helper import (
-    is_channels_last_dim_order,
-    try_get_arg,
-)
+
+from executorch.backends.nxp.backend.edge_helper import is_channels_last_dim_order
 from executorch.backends.nxp.backend.edge_program_converter import functions_converters
-from executorch.backends.nxp.tests.ops_aliases import (
-    AdaptiveAvgPool2D,
-    AvgPool2D,
-    Convolution,
-    DequantizePerChannel,
-    DequantizePerTensor,
-    GetItem,
-    MaxPool2D,
-    MaxPool2DWithIndices,
-    MeanDim,
-    PermuteCopy,
-    QuantizePerTensor,
-    UpsampleBilinear2D,
-    UpsampleNearest2D,
-    ViewCopy,
-)
+from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from torch.export import ExportedProgram
 from torch.fx import Node
@@ -42,22 +25,21 @@ class NodeFormatInference:
     # The op in the dictionary is mapped to a dictionary, which holds indices to input nodes
     # that are always channels first.
     ops_with_channels_first_nodes = {
-        AdaptiveAvgPool2D: {"inputs": [0]},
+        exir_ops.edge.aten._adaptive_avg_pool2d.default: {"inputs": [0]},
         torch.ops.aten.adaptive_avg_pool2d.default: {"inputs": [0]},
-        AvgPool2D: {"inputs": [0]},
-        Convolution: {"inputs": [0, 1]},
-        MaxPool2DWithIndices: {"inputs": [0]},
-        MaxPool2D: {"inputs": [0]},
-        UpsampleBilinear2D: {"inputs": [0]},
-        UpsampleNearest2D: {"inputs": [0]},
+        exir_ops.edge.aten.avg_pool2d.default: {"inputs": [0]},
+        exir_ops.edge.aten.convolution.default: {"inputs": [0, 1]},
+        exir_ops.edge.aten.max_pool2d_with_indices.default: {"inputs": [0]},
+        exir_ops.edge.aten.max_pool2d.default: {"inputs": [0]},
+        exir_ops.edge.aten.upsample_bilinear2d.vec: {"inputs": [0]},
+        exir_ops.edge.aten.upsample_nearest2d.vec: {"inputs": [0]},
     }
 
     # A set of Edge Aten ops, which have the ability to change the format (for example - input nodes
     # are channels first but output is formatless).
     ops_that_can_change_tensor_format = {
-        ViewCopy,
-        PermuteCopy,
-        MeanDim,
+        exir_ops.edge.aten.view_copy.default,
+        exir_ops.edge.aten.permute_copy.default,
     }
 
     _type_changed_during_last_run: bool
@@ -89,10 +71,10 @@ def __init__(self, edge_program: ExportedProgram, only_for_op_support_check=Fals
         self._type_changed_during_last_run = False
 
         self._known_targets = list(functions_converters) + [
-            DequantizePerTensor,
-            DequantizePerChannel,
-            QuantizePerTensor,
-            GetItem,
+            exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            operator.getitem,
         ]
 
     def identify_node_formats(self):
@@ -122,7 +104,10 @@ def _infer_format_of_nodes(self, node: Node):
             self._handle_node_which_uses_channels_first_format(node)
 
         elif op_type in self.ops_that_can_change_tensor_format:
-            if op_type in [ViewCopy, PermuteCopy]:
+            if op_type in [
+                exir_ops.edge.aten.view_copy.default,
+                exir_ops.edge.aten.permute_copy.default,
+            ]:
                 # Try to assign the `formatless` format to the input and output. The converter will then handle the
                 #  transition.
                 # Note: If the format for the input/output has already been assigned as channels first, it will NOT be
@@ -134,28 +119,10 @@ def _infer_format_of_nodes(self, node: Node):
                     self._node_inputs[node][0], DataFormat.FORMATLESS
                 )
 
-            elif op_type == MeanDim:
-                # The operator schema is:
-                #  mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
-                keep_dim = try_get_arg(node, 2) or False
-                if keep_dim:
-                    # The operator preserves the rank, so we can handle it as an operator that can use any node format.
-                    self._handle_node_which_can_use_any_node_format(node)
-                else:
-                    # The operator removes dimensions, so the IO must be marked as `formatless` (unless overridden by
-                    #  channels first of course).
-                    self._assign_format_to_node(
-                        self._node_outputs[node][0], DataFormat.FORMATLESS
-                    )
-                    self._assign_format_to_node(
-                        self._node_inputs[node][0], DataFormat.FORMATLESS
-                    )
-
             else:
                 logger.error(
                     f"Node format inference for node type: {op_type} not found!"
                 )
-
         elif node.op != "call_function" or (
             hasattr(node, "target") and node.target in self._known_targets
         ):
diff --git a/backends/nxp/neutron_partitioner.py b/backends/nxp/neutron_partitioner.py
index 9cc174b97e0..d4262b3a9f6 100644
--- a/backends/nxp/neutron_partitioner.py
+++ b/backends/nxp/neutron_partitioner.py
@@ -212,7 +212,6 @@ def tag_qdq_clusters(self, nodes: list[torch.fx.Node]):
     exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter,  # noqa F405
     exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter,  # noqa F405
     exir_ops.edge.aten.convolution.default: ConvolutionConverter,  # noqa F405
-    exir_ops.edge.aten.exp.default: ExpConverter,  # noqa F405
     exir_ops.edge.aten.hardtanh.default: HardTanhConverter,  # noqa F405
     exir_ops.edge.aten.leaky_relu.default: LeakyReluConverter,  # noqa F405
     exir_ops.edge.aten.log.default: LogConverter,  # noqa F405
@@ -437,7 +436,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
 
         graph_module.recompile()
 
-        operators_not_to_delegate = self.delegation_spec[1][4].value.decode().split(",")
+        operators_not_to_delegate = self.delegation_spec[1][3].value.decode().split(",")
         logging.info(f"Operators not to delegate: {operators_not_to_delegate}")
 
         parameters_mapping = EdgeProgramToIRConverter.map_inputs_to_parameters(
diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py
index 1a84a418e92..f28eb34064c 100644
--- a/backends/nxp/nxp_backend.py
+++ b/backends/nxp/nxp_backend.py
@@ -9,9 +9,8 @@
 #
 
 import logging
-import os
 import struct
-from typing import final
+from typing import final, List, Optional
 
 import numpy as np
 import torch
@@ -46,11 +45,10 @@ class NeutronCompileSpecBuilder:
     config: NeutronTargetSpec
 
     def __init__(self):
-        self.compile_spec: list[CompileSpec] = []
+        self.compile_spec: List[CompileSpec] = []
         self.compiler_flags = []
         self.output_format = None
-        self.intermediates_dir = None
-        self.operators_not_to_delegate: list[str] = []
+        self.operators_not_to_delegate: List[str] = []
         self.use_neutron_for_format_conversion = True
         self.fetch_constants_to_sram = False
         self.dump_kernel_selection_code = False
@@ -64,9 +62,8 @@ def _replace_colons(self, operator: str) -> str:
     def neutron_compile_spec(
         self,
         config: str,
-        intermediates_dir: str | None = None,
-        extra_flags: str | None = None,
-        operators_not_to_delegate: list[str] | None = None,
+        extra_flags: Optional[str] = None,
+        operators_not_to_delegate: Optional[List[str]] = None,
         use_neutron_for_format_conversion: bool = True,
         fetch_constants_to_sram: bool = False,
         dump_kernel_selection_code: bool = False,
@@ -74,7 +71,6 @@ def neutron_compile_spec(
         """Generate compile spec for Neutron NPU
 
         :param config: Neutron accelerator configuration, e.g. "imxrt700"
-        :param intermediates_dir: Directory to store intermediate artifact files.
         :param extra_flags: Extra flags for the Neutron compiler
         :param operators_not_to_delegate: List of operators that should not be delegated
         :param use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to
@@ -87,7 +83,6 @@ def neutron_compile_spec(
         """
 
         self.config = NeutronTargetSpec(config)
-        self.intermediates_dir = intermediates_dir
 
         assert (
             self.output_format is None
@@ -118,7 +113,6 @@ def build(self):
                 CompileSpec("output_format", "tflite".encode()),
                 CompileSpec("compile_flags", " ".join(self.compiler_flags).encode()),
                 CompileSpec("target", self.config.get_name().encode()),
-                CompileSpec("intermediates_dir", f"{self.intermediates_dir}".encode()),
                 CompileSpec(
                     "operators_not_to_delegate",
                     ",".join(self.operators_not_to_delegate).encode(),
@@ -142,19 +136,17 @@ def build(self):
 
 def generate_neutron_compile_spec(
     config: str,  # The target platform. For example "imxrt700".
-    system_config: str | None = None,
-    extra_flags: str | None = None,
-    intermediates_dir: str | None = None,
-    operators_not_to_delegate: list[str] | None = None,
+    system_config: Optional[str] = None,
+    extra_flags: Optional[str] = None,
+    operators_not_to_delegate: Optional[List[str]] = None,
     use_neutron_for_format_conversion: bool = True,
     fetch_constants_to_sram: bool = False,
     dump_kernel_selection_code: bool = False,
-) -> list[CompileSpec]:
+) -> List[CompileSpec]:
     return (
         NeutronCompileSpecBuilder()
         .neutron_compile_spec(
             config,
-            intermediates_dir=intermediates_dir,
             extra_flags=extra_flags,
             operators_not_to_delegate=operators_not_to_delegate,
             use_neutron_for_format_conversion=use_neutron_for_format_conversion,
@@ -171,7 +163,7 @@ class NeutronBackend(BackendDetails):
     @staticmethod
     def preprocess(  # noqa C901
         edge_program: ExportedProgram,
-        compile_spec: list[CompileSpec],
+        compile_spec: List[CompileSpec],
     ) -> PreprocessResult:
         logging.info("NeutronBackend::preprocess")
 
@@ -181,7 +173,6 @@ def preprocess(  # noqa C901
         compile_flags = []
         binary = bytes()
         target = ""
-        intermediates_dir = "None"
         use_neutron_for_format_conversion = None
         fetch_constants_to_sram = False
         dump_kernel_selection_code = None
@@ -190,8 +181,6 @@ def preprocess(  # noqa C901
                 output_format = spec.value.decode()
             if spec.key == "target":
                 target = spec.value.decode()
-            if spec.key == "intermediates_dir":
-                intermediates_dir = spec.value.decode()
             if spec.key == "compile_flags":
                 compile_flags.append(spec.value.decode())
             if spec.key == "use_neutron_for_format_conversion":
@@ -205,10 +194,6 @@ def preprocess(  # noqa C901
         if not output_format:
             raise RuntimeError("output format is required")
 
-        # Check if provided intermediates_dir is a correct path (None is decoded to str)
-        if intermediates_dir != "None" and not os.path.isdir(intermediates_dir):
-            raise ValueError("intermediates_dir is not a directory path.")
-
         for node in edge_program.graph.nodes:
             if node.op == "call_function":
                 logging.debug(f"Operator to be processed: {node.target}")
@@ -243,22 +228,16 @@ def preprocess(  # noqa C901
                 fetch_constants_to_sram,
             )
 
-            # Dump the tflite file if intermediates_dir is set
-            if intermediates_dir != "None":
+            # Dump both tflite models to the CWD when DEBUG logging is enabled
+            if logging.root.isEnabledFor(logging.DEBUG):
+                import os
+
                 logging.debug(
-                    f"Serializing converted graph with tag {delegation_tag} to {intermediates_dir}"
+                    f"Serializing converted graph with tag {delegation_tag} to {os.getcwd()}"
                 )
-                with open(
-                    os.path.join(intermediates_dir, f"{delegation_tag}_pure.et.tflite"),
-                    "wb",
-                ) as f:
+                with open(f"{delegation_tag}_pure.et.tflite", "wb") as f:
                     f.write(bytes(tflite_model))
-                with open(
-                    os.path.join(
-                        intermediates_dir, f"{delegation_tag}_neutron.et.tflite"
-                    ),
-                    "wb",
-                ) as f:
+                with open(f"{delegation_tag}_neutron.et.tflite", "wb") as f:
                     f.write(bytes(neutron_model))
 
             binary = PayloadComposer().get_binary_payload(io_formats, neutron_model)
diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py
index 94ee8e8656a..048172ea212 100644
--- a/backends/nxp/quantizer/neutron_quantizer.py
+++ b/backends/nxp/quantizer/neutron_quantizer.py
@@ -25,7 +25,6 @@
     Conv2dPattern,
     ConvTranspose2dPattern,
     DropoutPattern,
-    ExpPattern,
     FlattenPattern,
     HardTanhInPlacePattern,
     HardTanhPattern,
@@ -271,7 +270,6 @@ def __init__(self, neutron_target_spec: NeutronTargetSpec, is_qat: bool = False)
                     ConvTranspose2dPattern(self, is_qat=is_qat), static_qconfig
                 ),
                 OpQuantizer(DropoutPattern(is_qat=is_qat), static_qconfig),
-                OpQuantizer(ExpPattern(is_qat=is_qat), static_qconfig),
                 OpQuantizer(FlattenPattern(is_qat=is_qat), static_qconfig),
                 OpQuantizer(HardTanhPattern(is_qat=is_qat), static_qconfig),
                 OpQuantizer(HardTanhInPlacePattern(is_qat=is_qat), static_qconfig),
diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py
index d6cf1d7e063..9e21e4f1660 100644
--- a/backends/nxp/quantizer/patterns.py
+++ b/backends/nxp/quantizer/patterns.py
@@ -11,10 +11,7 @@
 
 import torch
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.clamp_converter import (
-    ClampConverter,
-)
-from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.hardtanh_converter import (
-    HardTanhConverter,
+    _is_convertible_to_relu,
 )
 from executorch.backends.nxp.quantizer.utils import (
     get_bias_qparams,
@@ -441,7 +438,7 @@ def get_anchors(
     ) -> PartitionAnchors | None:
         node = fused_partition[0].nodes[-1]
 
-        if not ClampConverter._is_convertible_to_relu(node):
+        if not _is_convertible_to_relu(node):
             return SharedSpecPattern.get_shared_spec_anchors(gm, fused_partition)
         else:
             return SingleInputBasicPattern.get_single_input_anchors(gm, fused_partition)
@@ -712,15 +709,6 @@ def partition_types(self):
         return [torch.ops.aten.dropout.default]
 
 
-class ExpPattern(SharedSpecPattern):
-    """
-    Quantizer for Exp operator.
-    """
-
-    def partition_types(self):
-        return [torch.ops.aten.exp.default]
-
-
 class FlattenPattern(SharedSpecPattern):
     """
     Quantizer for Flatten operator.
@@ -738,21 +726,11 @@ class HardTanhPattern(SingleInputBasicPattern):
     def partition_types(self):
         return [torch.ops.aten.hardtanh.default]
 
-    def get_anchors(
-        self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule]
-    ) -> PartitionAnchors | None:
-        node = fused_partition[0].nodes[-1]
-
-        if not HardTanhConverter._is_convertible_to_relu(node):
-            return SharedSpecPattern.get_shared_spec_anchors(gm, fused_partition)
-        else:
-            return SingleInputBasicPattern.get_single_input_anchors(gm, fused_partition)
-
     def replacement_op(self):
         raise AssertionError()
 
 
-class HardTanhInPlacePattern(HardTanhPattern):
+class HardTanhInPlacePattern(SingleInputBasicPattern):
     """
     Quantizer for HardTanh operator with param inplace=True.
     """
@@ -760,6 +738,21 @@ class HardTanhInPlacePattern(HardTanhPattern):
     def partition_types(self):
         return [torch.ops.aten.hardtanh_.default]
 
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule]
+    ) -> PartitionAnchors | None:
+        node = fused_partition[0].nodes[-1]
+
+        return PartitionAnchors(
+            inputs=[(node, NodeArgsIdx(0))],
+            weights=[],
+            biases=[],
+            output=[(node,)],
+        )
+
+    def replacement_op(self):
+        raise AssertionError()
+
 
 class LeakyReluPattern(SingleInputBasicPattern):
     """Quantizer for the `aten.leaky_relu.default` operator."""
diff --git a/backends/nxp/run_unittests.sh b/backends/nxp/run_unittests.sh
index 66e51c39a1d..78e35d2617a 100755
--- a/backends/nxp/run_unittests.sh
+++ b/backends/nxp/run_unittests.sh
@@ -11,6 +11,6 @@ EXECUTORCH_DIR=$(dirname $(dirname $SCRIPT_DIR))
 cd $EXECUTORCH_DIR
 
 # '-c /dev/null' is used to ignore root level pytest.ini.
-pytest -c /dev/null -n "logical" backends/nxp/tests/
+pytest -c /dev/null backends/nxp/tests/
 
 python -m unittest discover -s backends/nxp/tests/ -v
diff --git a/backends/nxp/tests/conftest.py b/backends/nxp/tests/conftest.py
index af2011a8000..34fe343ca6a 100644
--- a/backends/nxp/tests/conftest.py
+++ b/backends/nxp/tests/conftest.py
@@ -35,4 +35,4 @@ def pytest_sessionstart(session):
 
     # Remove all cached test files
     shutil.rmtree(outputs_dir.OUTPUTS_DIR, ignore_errors=True)
-    os.makedirs(outputs_dir.OUTPUTS_DIR, exist_ok=True)
+    os.mkdir(outputs_dir.OUTPUTS_DIR)
diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py
index 44a96010593..5cfcb37c8a8 100644
--- a/backends/nxp/tests/executorch_pipeline.py
+++ b/backends/nxp/tests/executorch_pipeline.py
@@ -180,7 +180,6 @@ def to_quantized_edge_program(
     operators_not_to_delegate: list[str] = None,
     get_calibration_inputs_fn: GetCalibrationInputsFn = get_random_calibration_inputs,
     target: str = "imxrt700",
-    intermediates_dir: str | None = None,
     use_qat: bool = False,
     train_fn: Callable[[torch.fx.GraphModule], None] | None = None,
     remove_quant_io_ops: bool = False,
@@ -218,7 +217,6 @@ def to_quantized_edge_program(
     preserve_ops = [torch.ops.aten.prelu.default]
     compile_spec = generate_neutron_compile_spec(
         target,
-        intermediates_dir=intermediates_dir,
         operators_not_to_delegate=operators_not_to_delegate,
         use_neutron_for_format_conversion=use_neutron_for_format_conversion,
         fetch_constants_to_sram=fetch_constants_to_sram,
@@ -268,7 +266,6 @@ def to_quantized_edge_program(
 def to_quantized_executorch_program(
     model: torch.nn.Module,
     input_spec: Iterable[ModelInputSpec] | tuple[int, ...] | list[tuple[int, ...]],
-    intermediates_dir: str | None = None,
     use_qat: bool = False,
     train_fn: Callable[[torch.fx.GraphModule], None] | None = None,
     use_neutron_for_format_conversion: bool = True,
@@ -290,7 +287,6 @@ def to_quantized_executorch_program(
     edge_program_manager = to_quantized_edge_program(
         model,
         input_spec,
-        intermediates_dir=intermediates_dir,
         use_qat=use_qat,
         train_fn=train_fn,
         use_neutron_for_format_conversion=use_neutron_for_format_conversion,
diff --git a/backends/nxp/tests/generic_tests/test_cifarnet.py b/backends/nxp/tests/generic_tests/test_cifarnet.py
index c874ba24e47..1d795c938fe 100644
--- a/backends/nxp/tests/generic_tests/test_cifarnet.py
+++ b/backends/nxp/tests/generic_tests/test_cifarnet.py
@@ -34,7 +34,7 @@ def cifar_test_files(tmp_path_factory):
 
 
 @pytest.mark.parametrize("channels_last", [False, True])
-def test_cifarnet(mocker, request, cifar_test_files, channels_last):
+def test_cifarnet(mocker, cifar_test_files, channels_last):
     model = (
         CifarNet(
             pth_file=os.path.join(
@@ -64,10 +64,9 @@ def test_cifarnet(mocker, request, cifar_test_files, channels_last):
     lower_run_compare(
         model,
         [input_spec],
-        BaseGraphVerifier(1, non_dlg_nodes),
-        request,
         dataset_creator=CopyDatasetCreator(cifar_test_files),
         output_comparator=comparator,
+        dlg_model_verifier=BaseGraphVerifier(1, non_dlg_nodes),
         mocker=mocker,
         # Run the channels last reference in PyTorch as the ExecuTorch CPU model contains incorrectly
         #  lowered channels last convolution weights, which cause incorrect inference results. The issue
@@ -80,7 +79,7 @@ def test_cifarnet(mocker, request, cifar_test_files, channels_last):
     )
 
 
-def test_cifarnet_qat(mocker, request, cifar_test_files):
+def test_cifarnet_qat(mocker, cifar_test_files):
     model = CifarNet().get_eager_model().eval()
 
     input_shape = (1, 3, 32, 32)
@@ -95,10 +94,9 @@ def test_cifarnet_qat(mocker, request, cifar_test_files):
     lower_run_compare(
         model,
         input_shape,
-        BaseGraphVerifier(1, non_dlg_nodes),
-        request,
         dataset_creator=CopyDatasetCreator(cifar_test_files),
         output_comparator=comparator,
+        dlg_model_verifier=BaseGraphVerifier(1, non_dlg_nodes),
         mocker=mocker,
         use_qat=True,
     )
diff --git a/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py b/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py
index 3415b79a39d..fcd0aae2130 100644
--- a/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py
+++ b/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py
@@ -208,7 +208,7 @@ class TestConvertDivToMul:
         ids=lambda is_scalar: "scalar" if is_scalar else "tensor",
     )
     def test__static__full_pipeline(
-        self, mocker, request, input_shape: tuple[int, ...], is_scalar: bool
+        self, mocker, input_shape: tuple[int, ...], is_scalar: bool
     ):
         if is_scalar:
             divisor = np.random.uniform(0.01, 15)
@@ -231,6 +231,5 @@ def test__static__full_pipeline(
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset_creator,
         )
diff --git a/backends/nxp/tests/generic_tests/test_integration.py b/backends/nxp/tests/generic_tests/test_integration.py
index edefd905dbf..fe157b44c48 100644
--- a/backends/nxp/tests/generic_tests/test_integration.py
+++ b/backends/nxp/tests/generic_tests/test_integration.py
@@ -19,7 +19,7 @@ def test_conv_fc_softmax__to_executorch_program(use_qat):
     model = ConvFCSoftmaxModule()
     input_shape = (1, 4, 5, 5)
 
-    exec_prog = to_quantized_executorch_program(model, input_shape, use_qat=use_qat)
+    exec_prog = to_quantized_executorch_program(model, input_shape, use_qat)
 
     program = exec_prog.exported_program()
     assert (
diff --git a/backends/nxp/tests/generic_tests/test_quantized_input_data.py b/backends/nxp/tests/generic_tests/test_quantized_input_data.py
index a9f9f3e47e6..8b2f6823e8d 100644
--- a/backends/nxp/tests/generic_tests/test_quantized_input_data.py
+++ b/backends/nxp/tests/generic_tests/test_quantized_input_data.py
@@ -17,7 +17,7 @@
 from executorch.backends.nxp.tests.ops_aliases import AvgPool2D, MulTensor
 
 
-def test__single_quantized_inputs(mocker, request):
+def test__single_quantized_inputs(mocker):
     input_spec = ModelInputSpec((2, 4, 6, 7))
     model = AvgPool2dModule(False, 0)
     graph_verifier = DetailedGraphVerifier(
@@ -29,19 +29,19 @@ def test__single_quantized_inputs(mocker, request):
         model,
         [input_spec],
         graph_verifier,
-        request,
         remove_quant_io_ops=True,
     )
 
-    test_name = nsys_testing.get_test_name(request)
-    assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000.bin").exists()
+    assert (
+        OUTPUTS_DIR / "test__single_quantized_inputs" / "dataset_quant" / "0000.bin"
+    ).exists()
 
     # Check outputs are in quantized int8 format
     output_tensor_spec = output_tensor_spec_spy.spy_return
     assert output_tensor_spec[0].dtype == torch.int8
 
 
-def test__single_quantized_inputs_edge_python_reference(mocker, request):
+def test__single_quantized_inputs_edge_python_reference(mocker):
     input_spec = ModelInputSpec((2, 4, 6, 7))
     model = AvgPool2dModule(False, 0)
     graph_verifier = DetailedGraphVerifier(
@@ -53,20 +53,23 @@ def test__single_quantized_inputs_edge_python_reference(mocker, request):
         model,
         [input_spec],
         graph_verifier,
-        request,
         reference_model=ReferenceModel.QUANTIZED_EDGE_PYTHON,
         remove_quant_io_ops=True,
     )
 
-    test_name = nsys_testing.get_test_name(request)
-    assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000.bin").exists()
+    assert (
+        OUTPUTS_DIR
+        / "test__single_quantized_inputs_edge_python_reference"
+        / "dataset_quant"
+        / "0000.bin"
+    ).exists()
 
     # Check outputs are in quantized int8 format
     output_tensor_spec = output_tensor_spec_spy.spy_return
     assert output_tensor_spec[0].dtype == torch.int8
 
 
-def test__multiple_quantized_inputs(mocker, request):
+def test__multiple_quantized_inputs(mocker):
     x_input_spec = ModelInputSpec((1, 4, 8, 8))
     model = MulTensorModule()
     graph_verifier = DetailedGraphVerifier(
@@ -78,19 +81,23 @@ def test__multiple_quantized_inputs(mocker, request):
         model,
         [x_input_spec, x_input_spec],
         graph_verifier,
-        request,
         remove_quant_io_ops=True,
     )
 
-    test_name = nsys_testing.get_test_name(request)
-    assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000" / "00.bin").exists()
+    assert (
+        OUTPUTS_DIR
+        / "test__multiple_quantized_inputs"
+        / "dataset_quant"
+        / "0000"
+        / "00.bin"
+    ).exists()
 
     # Check outputs are in quantized int8 format
     output_tensor_spec = output_tensor_spec_spy.spy_return
     assert output_tensor_spec[0].dtype == torch.int8
 
 
-def test__multiple_quantized_inputs_edge_python_reference(mocker, request):
+def test__multiple_quantized_inputs_edge_python_reference(mocker):
     x_input_spec = ModelInputSpec((1, 4, 8, 8))
     model = MulTensorModule()
     graph_verifier = DetailedGraphVerifier(
@@ -102,13 +109,17 @@ def test__multiple_quantized_inputs_edge_python_reference(mocker, request):
         model,
         [x_input_spec, x_input_spec],
         graph_verifier,
-        request,
         reference_model=ReferenceModel.QUANTIZED_EDGE_PYTHON,
         remove_quant_io_ops=True,
     )
 
-    test_name = nsys_testing.get_test_name(request)
-    assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000" / "00.bin").exists()
+    assert (
+        OUTPUTS_DIR
+        / "test__multiple_quantized_inputs_edge_python_reference"
+        / "dataset_quant"
+        / "0000"
+        / "00.bin"
+    ).exists()
 
     # Check outputs are in quantized int8 format
     output_tensor_spec = output_tensor_spec_spy.spy_return
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py
index d42ef4c6e7d..ebe782c5a98 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py
@@ -68,7 +68,7 @@ def _get_dataset_creator():
         dataset = RandomDatasetCreator(low=low, high=high)
         return dataset
 
-    def test__basic_nsys_inference(self, mocker, request):
+    def test__basic_nsys_inference(self, mocker):
         input_shape = (2, 3, 6, 7)
         model = AbsModule()
         graph_verifier = DetailedGraphVerifier(
@@ -80,11 +80,10 @@ def test__basic_nsys_inference(self, mocker, request):
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset_creator,
         )
 
-    def test__basic_nsys_inference__big(self, mocker, request):
+    def test__basic_nsys_inference__big(self, mocker):
         # some operators have delegation requirement that size must be < 4096
         input_shape = (4097, 1)
         model = AbsModule()
@@ -97,6 +96,5 @@ def test__basic_nsys_inference__big(self, mocker, request):
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset_creator,
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py
index 9646c04a3f2..8b8f2da8c4e 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py
@@ -44,9 +44,7 @@ class TestAdaptiveAvgPool2D:
             ),
         ],
     )
-    def test__basic_nsys_inference(
-        self, mocker, request, use_qat, input_shape, output_size
-    ):
+    def test__basic_nsys_inference(self, mocker, use_qat, input_shape, output_size):
         model = AdaptiveAvgPool2dModule(output_size)
         graph_verifier = DetailedGraphVerifier(
             mocker,
@@ -62,7 +60,6 @@ def test__basic_nsys_inference(
             model,
             input_shape,
             graph_verifier,
-            request,
             RandomDatasetCreator(low=-1, high=1),
             output_comparator=output_comparator,
             use_qat=use_qat,
@@ -72,7 +69,7 @@ def test__basic_nsys_inference(
         strict=True,
         reason="Known Neutron bad compute issue. Will be fixed in Neutron SW 3.1.2.",
     )
-    def test__know_neutron_issue(self, mocker, request):
+    def test__know_neutron_issue(self, mocker):
         input_shape = (2, 3, 10, 15)
         output_size = (5, 5)
         model = AdaptiveAvgPool2dModule(output_size)
@@ -89,12 +86,11 @@ def test__know_neutron_issue(self, mocker, request):
             model,
             input_shape,
             graph_verifier,
-            request,
             RandomDatasetCreator(low=-1, high=1),
             output_comparator=output_comparator,
         )
 
-    def test__kernel_size_and_stride_limit(self, mocker, request):
+    def test__kernel_size_and_stride_limit(self, mocker):
         input_shape = (1, 3, 4, 4096)  # input_size = (1, 4096)
         output_size = (
             2,
@@ -118,7 +114,6 @@ def test__kernel_size_and_stride_limit(self, mocker, request):
             model,
             input_shape,
             graph_verifier,
-            request,
             RandomDatasetCreator(low=-1, high=1),
             output_comparator=output_comparator,
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
index 6ac96e41cd1..3ede2cfaadd 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
@@ -16,9 +16,6 @@
 )
 from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
 from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
-from executorch.backends.nxp.tests.model_output_comparator import (
-    AllCloseOutputComparator,
-)
 from executorch.backends.nxp.tests.models import AddTensorConvModule, AddTensorModule
 from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
 from executorch.backends.nxp.tests.ops_aliases import (
@@ -41,49 +38,67 @@ class TestAddTensor:
         [
             pytest.param((1,), id="1D."),
             pytest.param((6, 5), id="2D."),
-            pytest.param((6, 82), id="2D alt."),
             pytest.param((1, 4, 7), id="3D."),
-            pytest.param((1, 68, 7), id="3D alt."),
             pytest.param((2, 4, 3, 15), id="4D."),
-            pytest.param((1, 4, 9, 11, 4), id="5D."),
+            pytest.param(
+                (6, 82),
+                id="2D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
+            pytest.param(
+                (1, 68, 7),
+                id="3D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
+            pytest.param(
+                (1, 4, 9, 11, 4),
+                id="5D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
         ],
     )
-    def test__basic_nsys_inference(self, mocker, request, x_input_shape):
+    def test__basic_nsys_inference(self, x_input_shape, mocker):
         x_input_spec = ModelInputSpec(x_input_shape)
         model = AddTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
-        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
-            request,
             dataset_creator,
-            comparator,
-            remove_quant_io_ops=True,
         )
 
-    def test__basic_nsys_inference_qat(self, mocker, request):
-        x_input_spec = ModelInputSpec((1, 4, 7))
+    @pytest.mark.parametrize(
+        "x_input_shape",
+        [
+            pytest.param((1,), id="1D."),
+            pytest.param((6, 5), id="2D."),
+            pytest.param((1, 4, 7), id="3D."),
+            pytest.param((2, 4, 3, 15), id="4D."),
+            pytest.param(
+                (1, 4, 9, 11, 4),
+                id="5D.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
+        ],
+    )
+    def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
+        x_input_spec = ModelInputSpec(x_input_shape)
         model = AddTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
-        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
-            request,
             dataset_creator,
-            comparator,
-            remove_quant_io_ops=True,
             use_qat=True,
         )
 
@@ -93,10 +108,6 @@ def test__basic_nsys_inference_qat(self, mocker, request):
             pytest.param(
                 [ModelInputSpec((4, 6)), ModelInputSpec((1, 6))], id="2 inputs 2D."
             ),
-            pytest.param(
-                [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))],
-                id="2 inputs 2D alt.",
-            ),
             pytest.param(
                 [ModelInputSpec((5, 3, 4)), ModelInputSpec((1, 3, 1))],
                 id="2 inputs 3D.",
@@ -104,24 +115,25 @@ def test__basic_nsys_inference_qat(self, mocker, request):
             pytest.param(
                 [ModelInputSpec((4,)), ModelInputSpec((4, 4))], id="2 inputs 1D + 2D."
             ),
+            pytest.param(
+                [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))],
+                id="2 inputs 2D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
         ],
     )
-    def test__broadcast(self, mocker, request, input_spec):
+    def test__broadcast(self, input_spec, mocker):
         model = AddTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
-        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             input_spec,
             graph_verifier,
-            request,
             dataset_creator,
-            comparator,
-            remove_quant_io_ops=True,
         )
 
     @pytest.mark.parametrize(
@@ -160,7 +172,7 @@ def test__broadcast_unsupported(self, input_spec):
             ),
         ],
     )
-    def test__w_conv(self, mocker, request, x_input_shape):
+    def test__w_conv(self, x_input_shape, mocker):
         model = AddTensorConvModule()
 
         n, c, h, w = x_input_shape
@@ -175,11 +187,7 @@ def test__w_conv(self, mocker, request, x_input_shape):
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
 
         lower_run_compare(
-            model,
-            [x_input_spec, y_input_spec],
-            graph_verifier,
-            request,
-            dataset_creator,
+            model, [x_input_spec, y_input_spec], graph_verifier, dataset_creator
         )
 
     @pytest.mark.parametrize(
@@ -190,12 +198,13 @@ def test__w_conv(self, mocker, request, x_input_shape):
                 id="2 inputs 4D + 4D.",
             ),
             pytest.param(
-                [ModelInputSpec((1, 4, 1, 67)), ModelInputSpec((1, 8, 5, 67))],
-                id="2 inputs 4D + 4D same width.",
+                [ModelInputSpec((1, 4, 5, 67)), ModelInputSpec((1, 8, 5, 1))],
+                id="2 inputs 4D + 4D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
             ),
         ],
     )
-    def test__w_conv_broadcast(self, mocker, request, input_spec):
+    def test__w_conv_broadcast(self, input_spec, mocker):
         model = AddTensorConvModule()
 
         graph_verifier = DetailedGraphVerifier(
@@ -204,16 +213,12 @@ def test__w_conv_broadcast(self, mocker, request, input_spec):
             expected_non_delegated_ops={},
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
-        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             input_spec,
             graph_verifier,
-            request,
             dataset_creator,
-            comparator,
-            remove_quant_io_ops=True,
         )
 
     @pytest.mark.parametrize(
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py
index 3db1158d637..120c3899ed4 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py
@@ -41,16 +41,16 @@ def forward(self, x):
 
 
 class TestAvgPool2D:
-    def test__basic_nsys_inference(self, mocker, request):
+    def test__basic_nsys_inference(self, mocker):
         input_shape = (2, 4, 6, 7)
         model = AvgPool2dModule(False, 0)
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shape, graph_verifier, request)
+        lower_run_compare(model, input_shape, graph_verifier)
 
-    def test__basic_nsys_inference_qat(self, mocker, request):
+    def test__basic_nsys_inference_qat(self, mocker):
         input_shape = (2, 9, 6, 15)
         model = AvgPool2dModule(False, 0)
         graph_verifier = DetailedGraphVerifier(
@@ -61,11 +61,10 @@ def test__basic_nsys_inference_qat(self, mocker, request):
             model,
             input_shape,
             graph_verifier,
-            request,
             use_qat=True,
         )
 
-    def test__kernel_size_limit(self, mocker, request):
+    def test__kernel_size_limit(self, mocker):
         kernel_size = (1, 4096)
         input_shape = (1, 4) + kernel_size
         model = AvgPool2dModule(False, 0, kernel_size)
@@ -73,7 +72,7 @@ def test__kernel_size_limit(self, mocker, request):
             mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shape, graph_verifier, request)
+        lower_run_compare(model, input_shape, graph_verifier)
 
     def test__kernel_size_limit_exceeded(self):
         kernel_size = (1, 4097)  # Exceeds the kernel size limit.
@@ -88,7 +87,7 @@ def test__kernel_size_limit_exceeded(self):
         )
         assert graph_contains_any_of_ops(delegated_ep.graph, [AvgPool2D])
 
-    def test__stride_limit(self, mocker, request):
+    def test__stride_limit(self, mocker):
         stride = 4096
         input_shape = (1, 4, 1, 4096)
         model = AvgPool2dModule(False, 0, 1, stride)
@@ -96,7 +95,7 @@ def test__stride_limit(self, mocker, request):
             mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shape, graph_verifier, request)
+        lower_run_compare(model, input_shape, graph_verifier)
 
     def test__stride_limit_exceeded(self):
         stride = 4097  # Exceeds the stride limit.
@@ -115,7 +114,7 @@ def test__stride_limit_exceeded(self):
 class TestAvgPool1D:
 
     # Just a basic test to verify that the operator gets extended to the 2D variant correctly.
-    def test__basic_nsys_inference(self, mocker, request):
+    def test__basic_nsys_inference(self, mocker):
         input_shape = (2, 4, 6)  # The old flow limited the batch size to 1.
         model = AvgPool1DModule()
         graph_verifier = DetailedGraphVerifier(
@@ -124,4 +123,4 @@ def test__basic_nsys_inference(self, mocker, request):
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shape, graph_verifier, request)
+        lower_run_compare(model, input_shape, graph_verifier)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py
index b28a431e3ca..9bb1f30ee60 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py
@@ -56,7 +56,7 @@ def forward(self, *inputs: torch.Tensor):
 
 class TestCat:
 
-    def test__qat(self, mocker, request, use_qat):
+    def test__qat(self, mocker, use_qat):
         input_shape = (2, 3, 5)
         num_inputs = 2
 
@@ -66,11 +66,11 @@ def test__qat(self, mocker, request, use_qat):
             mocker, expected_delegated_ops={Cat: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shapes, graph_verifier, request, use_qat=use_qat)
+        lower_run_compare(model, input_shapes, graph_verifier, use_qat=use_qat)
 
     @pytest.mark.parametrize("dim", list(range(-3, 3)), ids=lambda dim: f"dim={dim}")
     @pytest.mark.parametrize("num_inputs", [2, 5], ids=lambda n: f"n={n}")
-    def test__same_shapes(self, mocker, request, dim, num_inputs):
+    def test__same_shapes(self, mocker, dim, num_inputs):
         input_shape = (2, 3, 5)
         input_shapes = [ModelInputSpec(input_shape) for _ in range(num_inputs)]
 
@@ -79,11 +79,11 @@ def test__same_shapes(self, mocker, request, dim, num_inputs):
             mocker, expected_delegated_ops={Cat: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shapes, graph_verifier, request)
+        lower_run_compare(model, input_shapes, graph_verifier)
 
     @pytest.mark.parametrize("dim", [0, -3, 2, -1], ids=lambda dim: f"dim={dim}")
     @pytest.mark.parametrize("num_inputs", [2, 5], ids=lambda n: f"n={n}")
-    def test__same_shapes__channels_first(self, mocker, request, dim, num_inputs):
+    def test__same_shapes__channels_first(self, mocker, dim, num_inputs):
         input_shape = (2, 3, 4, 5)
         input_shapes = [ModelInputSpec(input_shape) for _ in range(num_inputs)]
 
@@ -94,12 +94,12 @@ def test__same_shapes__channels_first(self, mocker, request, dim, num_inputs):
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shapes, graph_verifier, request)
+        lower_run_compare(model, input_shapes, graph_verifier)
 
     @pytest.mark.parametrize("dim", [0, -1], ids=lambda dim: f"dim={dim}")
     @pytest.mark.parametrize("rank", [2, 3, 4], ids=lambda rank: f"rank={rank}")
     @pytest.mark.parametrize("num_inputs", [2, 3], ids=lambda n: f"n={n}")
-    def test__different_shapes(self, mocker, request, dim, rank, num_inputs):
+    def test__different_shapes(self, mocker, dim, rank, num_inputs):
         # The input shapes can only differ in the `dim` dimension. So we can just assign a different one for each input.
         # e.g. [(2, 3, 4), (3, 3, 4), (4, 3, 4), (5, 3, 4), (6, 3, 4)]
         base_shape = [i + 2 for i in range(rank)]
@@ -113,11 +113,11 @@ def test__different_shapes(self, mocker, request, dim, rank, num_inputs):
             mocker, expected_delegated_ops={Cat: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shapes, graph_verifier, request)
+        lower_run_compare(model, input_shapes, graph_verifier)
 
     @pytest.mark.parametrize("dim", [1, -1], ids=lambda dim: f"dim={dim}")
     @pytest.mark.parametrize("num_inputs", [2, 5], ids=lambda n: f"n={n}")
-    def test__different_shapes__channels_first(self, mocker, request, dim, num_inputs):
+    def test__different_shapes__channels_first(self, mocker, dim, num_inputs):
         # The input shapes can only differ in the `dim` dimension. So we can just assign a different one for each input.
         # e.g. [(1, 3, 4, 5), (2, 3, 4, 5)]
         base_shape = (2, 3, 4, 5)
@@ -133,7 +133,7 @@ def test__different_shapes__channels_first(self, mocker, request, dim, num_input
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shapes, graph_verifier, request)
+        lower_run_compare(model, input_shapes, graph_verifier)
 
     def test__single_input__alone_in_partition__not_delegated(self):
         # The operator is a noop, and there is no other op in the model. The Neutron Converter would produce an empty
@@ -149,7 +149,7 @@ def test__single_input__alone_in_partition__not_delegated(self):
         )
         assert graph_contains_any_of_ops(delegated_ep.graph, [Cat])
 
-    def test__single_input__not_alone_in_partition__delegated(self, mocker, request):
+    def test__single_input__not_alone_in_partition__delegated(self, mocker):
         # The operator is a noop, but there is another op in the model, so they are both delegated.
         input_shape = [ModelInputSpec((2, 3, 4, 5))]
 
@@ -160,4 +160,4 @@ def test__single_input__not_alone_in_partition__delegated(self, mocker, request)
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shape, graph_verifier, request)
+        lower_run_compare(model, input_shape, graph_verifier)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py
index bd296bb856f..e0ae44b61f8 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py
@@ -24,6 +24,9 @@
 )
 from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
 from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
+from executorch.backends.nxp.tests.model_output_comparator import (
+    NumericalStatsOutputComparator,
+)
 from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
 from executorch.backends.nxp.tests.ops_aliases import (
     AddTensor,
@@ -65,35 +68,6 @@ def forward(self, x):
 
 
 class TestClamp:
-
-    @pytest.mark.parametrize(
-        "min, max",
-        [
-            pytest.param(-1, 2, id="min = -1, max = 2 (Max/Min)"),
-            pytest.param(0.0, None, id="min = 0, max = None (Relu)"),
-        ],
-    )
-    def test__qat(self, mocker, request, min, max, use_qat):
-        input_shape = (2, 7, 2)  # Indivisible by num_macs
-        model = AddClampModule(min, max)
-
-        x_input_spec = ModelInputSpec(input_shape)
-        graph_verifier = DetailedGraphVerifier(
-            mocker,
-            expected_delegated_ops={
-                AddTensor: 1,
-                Clamp: 1,
-            },
-            expected_non_delegated_ops={},
-        )
-
-        lower_run_compare(
-            model=model,
-            input_spec=[x_input_spec],
-            request=request,
-            dlg_model_verifier=graph_verifier,
-        )
-
     @pytest.mark.parametrize(
         "min, max",
         [
@@ -116,11 +90,12 @@ def test__qat(self, mocker, request, min, max, use_qat):
             pytest.param(0.0, None, id="min = 0, max = None (Relu)"),
         ],
     )
-    def test_convert_clamp__full_pipeline(self, mocker, request, min, max):
+    def test_convert_clamp__full_pipeline(self, mocker, min, max, use_qat):
         input_shape = (2, 7, 2)  # Indivisible by num_macs
         model = AddClampModule(min, max)
 
         x_input_spec = ModelInputSpec(input_shape)
+        comparator = NumericalStatsOutputComparator()
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={
@@ -134,7 +109,8 @@ def test_convert_clamp__full_pipeline(self, mocker, request, min, max):
             model=model,
             input_spec=[x_input_spec],
             dlg_model_verifier=graph_verifier,
-            request=request,
+            output_comparator=comparator,
+            use_qat=use_qat,
         )
 
     @pytest.mark.parametrize(
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py
index 32bbf93fae4..9ffa69139f6 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py
@@ -34,7 +34,7 @@ class TestConstantPadND:
     """
 
     # noinspection PyMethodMayBeStatic
-    def assert_delegated(self, model, input_shape, mocker, request, use_qat=False):
+    def assert_delegated(self, model, input_shape, mocker, use_qat=False):
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={ConstantPadND: 1},
@@ -45,16 +45,15 @@ def assert_delegated(self, model, input_shape, mocker, request, use_qat=False):
             model,
             input_shape,
             graph_verifier,
-            request,
             use_qat=use_qat,
         )
 
     def assert_delegated_and_output_shape_equals(
-        self, model, input_shape, expected_output_shape, mocker, request
+        self, model, input_shape, expected_output_shape, mocker
     ):
         model_builder_spy = mocker.spy(ModelBuilder, "finish")
 
-        self.assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
         neutron_ir_subgraph = model_builder_spy.call_args[0][0].get_sub_graph()
         assert neutron_ir_subgraph.outputs.tmp_outputs[0].shape.vector == list(
@@ -75,14 +74,12 @@ def assert_delegated_and_output_shape_equals(
             pytest.param((1, 2, 3, 4, 5), tuple(range(4)), id="5D, padding W, D"),
         ],
     )
-    def test__basic_nsys_inference(
-        self, mocker, request, input_shape, paddings, use_qat
-    ):
+    def test__basic_nsys_inference(self, mocker, input_shape, paddings, use_qat):
         # These test cases are also supported by the old flow.
         model = ConstantPadNDModule(paddings)
-        self.assert_delegated(model, input_shape, mocker, request, use_qat)
+        self.assert_delegated(model, input_shape, mocker, use_qat)
 
-    def test__channels_padding(self, mocker, request):
+    def test__channels_padding(self, mocker):
         input_shape = (2, 4, 6)
         # These paddings will be applied to the last dimension, which is the channels as the input is formatless.
         paddings = (1, 1)
@@ -90,25 +87,25 @@ def test__channels_padding(self, mocker, request):
         model = ConstantPadNDModule(paddings)
 
         self.assert_delegated_and_output_shape_equals(
-            model, input_shape, expected_output_shape, mocker, request
+            model, input_shape, expected_output_shape, mocker
         )
 
-    def test__batch_padding(self, mocker, request):
+    def test__batch_padding(self, mocker):
         input_shape = (2, 4, 6)
         paddings = (0, 0, 0, 0, 1, 1)  # Padding applied to the batch dimension.
         expected_output_shape = (4, 4, 6)  # Padded batch.
         model = ConstantPadNDModule(paddings)
 
         self.assert_delegated_and_output_shape_equals(
-            model, input_shape, expected_output_shape, mocker, request
+            model, input_shape, expected_output_shape, mocker
         )
 
     @pytest.mark.parametrize("constant", [0.0, -13.37])
-    def test__specific_constant(self, mocker, request, constant):
+    def test__specific_constant(self, mocker, constant):
         input_shape = (2, 4, 6)
         paddings = (1, 1)
         model = ConstantPadNDModule(paddings, constant)
-        self.assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
     @pytest.mark.parametrize(
         "input_shape, paddings",
@@ -118,7 +115,7 @@ def test__specific_constant(self, mocker, request, constant):
             pytest.param((1, 2, 6, 8), (0, 1, 2, 3, 1, 1), id="4D, padding H, W"),
         ],
     )
-    def test__channels_first(self, mocker, request, input_shape, paddings):
+    def test__channels_first(self, mocker, input_shape, paddings):
         model = ConstantPadNDConvModule(paddings)
         graph_verifier = DetailedGraphVerifier(
             mocker,
@@ -126,4 +123,4 @@ def test__channels_first(self, mocker, request, input_shape, paddings):
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shape, graph_verifier, request)
+        lower_run_compare(model, input_shape, graph_verifier)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py
index 3799aa91623..67d3add978c 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py
@@ -4,31 +4,22 @@
 # LICENSE file in the root directory of this source tree.
 
 import numpy as np
-
-# noinspection PyUnusedImports
 import pytest
 import torch
 
 from executorch.backends.nxp.backend.edge_program_converter import (
     EdgeProgramToIRConverter,
 )
-from executorch.backends.nxp.backend.ir.converter.builder.aten_model_builder_director import (
-    AtenModelBuilderDirector,
-)
-from executorch.backends.nxp.backend.ir.lib.tflite.BuiltinOperator import (
-    BuiltinOperator as Ops,
-)
-from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator
 from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
-from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
-from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
-from executorch.backends.nxp.tests.models import Conv2dWithActivation, HardTanhModule
-from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
-from executorch.backends.nxp.tests.ops_aliases import (
-    Convolution,
-    ExecutorchDelegateCall,
-    HardTanh,
+from executorch.backends.nxp.tests.executors import (
+    convert_run_compare,
+    graph_contains_any_of_ops,
+    ToChannelFirstPreprocess,
+    ToChannelLastPreprocess,
 )
+from executorch.backends.nxp.tests.models import Conv2dWithActivation
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.export import ExportedProgram
 from executorch.backends.nxp.tests.use_qat import *  # noqa F403
 
 
@@ -38,237 +29,91 @@ def reseed_model_per_test_run():
     np.random.seed(23)
 
 
-class AddHardTanhModule(HardTanhModule):
-    def forward(self, x):
-        x = x + x
-        x = super().forward(x)
-        return x
-
-
-class TestHardTanh:
-    # noinspection PyMethodMayBeStatic
-    def assert_delegated(
-        self,
-        model,
-        input_shape,
-        mocker,
-        request,
-        use_qat=False,
-        expected_delegated_ops=None,
-    ):
-        graph_verifier = DetailedGraphVerifier(
-            mocker,
-            expected_delegated_ops=(
-                expected_delegated_ops
-                if expected_delegated_ops is not None
-                else {HardTanh: 1}
-            ),
-            expected_non_delegated_ops={},
-        )
+ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate
+HardTanh = exir_ops.edge.aten.hardtanh.default
+HardTanh_ = exir_ops.edge.aten.hardtanh_.default
 
-        # Create a RandomDatasetCreator that covers also negative numbers to properly test the operator.
-        dataset_creator = RandomDatasetCreator(low=-2, high=2)
 
-        lower_run_compare(
-            model,
-            input_shape,
-            graph_verifier,
-            request,
-            dataset_creator,
-            use_qat=use_qat,
-        )
-
-    @pytest.mark.parametrize(
-        "activation_range",
-        [
-            (-1, 3),
-            (0, float("inf")),
-        ],
-    )
-    @pytest.mark.parametrize(
-        "inplace", [True, False], ids=lambda ip: "Inplace" if ip else "Not inplace"
+@pytest.mark.parametrize("input_shape", [(1, 3, 128, 128)])
+@pytest.mark.parametrize("inplace", [True, False])
+def test_relu6_quant(mocker, input_shape: tuple[int], inplace: bool, use_qat: bool):
+    # torch.nn.ReLU6 inherits from torch.nn.Hardtanh, and hence it is represented as HardTanh in ATen.
+    # This tests the hardtanh op that originates from torch.nn.ReLU6.
+    model = Conv2dWithActivation(
+        activation=torch.nn.ReLU6(inplace=inplace), in_channels=input_shape[1]
     )
-    def test__qat(
-        self, mocker, request, activation_range: tuple[float, float], use_qat, inplace
-    ):
-        input_shape = (23,)
-        model = HardTanhModule(*activation_range, inplace)
 
-        self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat)
+    converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
 
-    @pytest.mark.parametrize(
-        "inplace", [True, False], ids=lambda ip: "Inplace" if ip else "Not inplace"
-    )
-    def test__from_relu6__after_conv(self, mocker, request, inplace: bool):
-        # The torch.nn.Relu6 inherits from torch.nn.Hardtanh, and hence represented as HardTanh in ATen.
-        # Testing the hardtanh originated from torch.nn.Relu6 op.
-        input_shape = (1, 3, 4, 5)
-        model = Conv2dWithActivation(
-            activation=torch.nn.ReLU6(inplace=inplace),
-            in_channels=input_shape[1],
-            out_channels=2,
-        )
+    quantized_program = to_quantized_edge_program(
+        model, input_shape, use_qat=use_qat, use_neutron_for_format_conversion=False
+    ).exported_program()
 
-        self.assert_delegated(
-            model,
-            input_shape,
-            mocker,
-            request,
-            expected_delegated_ops={HardTanh: 1, Convolution: 1},
-        )
+    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    exported_program: ExportedProgram = converter_spy.call_args.args[1]
 
-    @pytest.mark.parametrize(
-        "activation_range",
-        [
-            (0.0, 6.0),
-            (-1.0, 1),
-            (0, 1),
-            (0.0, float("inf")),
-        ],
-    )
-    @pytest.mark.parametrize(
-        "inplace", [True, False], ids=lambda ip: "Inplace" if ip else "Not inplace"
-    )
-    def test__hardtanh__mappable_to_relu__after_conv(
-        self,
-        mocker,
-        request,
-        activation_range: tuple[float, float],
-        inplace: bool,
-    ):
-        input_shape = (1, 3, 4, 5)
-        model = Conv2dWithActivation(
-            activation=torch.nn.Hardtanh(*activation_range, inplace),
-            in_channels=input_shape[1],
-            out_channels=2,
-        )
+    assert not graph_contains_any_of_ops(quantized_program.graph, [HardTanh, HardTanh_])
+    assert graph_contains_any_of_ops(quantized_program.graph, [ExecutorchDelegateCall])
 
-        self.assert_delegated(
-            model,
-            input_shape,
-            mocker,
-            request,
-            expected_delegated_ops={HardTanh: 1, Convolution: 1},
-        )
-
-    @pytest.mark.parametrize(
-        "activation_range",
-        [
-            (-1, 3),
-            (2.27, 3.14),
-            (-0.1, 0),
-            (float("-inf"), 1.23),
-        ],
+    input_data = (np.random.random(input_shape) * 50).astype(np.int8)
+    convert_run_compare(
+        exported_program,
+        tfl_model=tflite_flatbuffers_model,
+        tflite_input_preprocess=ToChannelLastPreprocess(),
+        tflite_output_preprocess=ToChannelFirstPreprocess(),
+        input_data=input_data,
+        atol=2.0,
     )
-    def test__hardtanh__not_mappable_to_relu(
-        self,
-        mocker,
-        request,
-        activation_range: tuple[float, float],
-    ):
-        input_shape = (23,)
-        model = HardTanhModule(*activation_range)
-
-        self.assert_delegated(model, input_shape, mocker, request)
-
-    def test__unsupported_bounds(self):
-        # TODO ONLY WHEN ALONE IN PARTITION
-        input_shape = (2, 7, 2)
-        min_value, max_value = float("-inf"), float("inf")
-        model = HardTanhModule(min_value, max_value)
 
-        delegated_ep = to_quantized_edge_program(model, input_shape).exported_program()
 
-        # Make sure the `hardtanh` was NOT delegated.
-        assert graph_contains_any_of_ops(delegated_ep.graph, [HardTanh])
-
-    @pytest.mark.parametrize(
-        "activation_range",
-        [
-            pytest.param((None, float("inf")), id="min = None, max = inf"),
-            pytest.param((float("inf"), None), id="min = inf, max = None"),
-        ],
-    )
-    def test__invalid_bounds(self, activation_range):
-        # PyTorch doesn't allow these cases, so we cannot test our handling of this edge case.
-        with pytest.raises(TypeError, match="'<=' not supported between instances of"):
-            _ = HardTanhModule(*activation_range)
-
-    @pytest.mark.parametrize(
-        "min, max, expected_neutron_ir_ops",
-        [
-            pytest.param(
-                0.1,
-                0.5,
-                [Ops.ADD, Ops.MAXIMUM, Ops.MINIMUM],
-                id="min = 0.1, max = 0.5 (Max/Min)",
-            ),
-            pytest.param(
-                0.0, 1.0, [Ops.ADD, Ops.RELU_0_TO_1], id="min = 0, max = 1 (Relu0To1)"
-            ),
-            pytest.param(
-                -1.0,
-                1.0,
-                [Ops.ADD, Ops.RELU_N1_TO_1],
-                id="min = -1, max = 1 (ReluN1To1)",
-            ),
-            pytest.param(
-                0.0,
-                float("inf"),
-                [Ops.ADD, Ops.RELU],
-                id="min = 0, max = infinity (Relu)",
-            ),
-            pytest.param(
-                0,
-                1.0,
-                [Ops.ADD, Ops.RELU_0_TO_1],
-                id="min = 0, max = 1 (Relu0To1)",
-            ),
-            pytest.param(
-                0,
-                6.0,
-                [Ops.ADD, Ops.RELU6],
-                id="min = 0, max = 6 (Relu6)",
-            ),
-        ],
+@pytest.mark.parametrize("input_shape", [(1, 3, 16, 16), (1, 3, 32, 32)])
+@pytest.mark.parametrize(
+    "activation_range",
+    [
+        (0.0, 6.0),
+        (-1.0, 1.0),
+        (0.0, 1.0),
+        (0.0, float("inf")),
+        (0, 6),
+        (-1, 1),
+        (0, 1),
+        (0, float("inf")),
+    ],
+)
+@pytest.mark.parametrize("inplace", [True, False])
+def test_custom_hardtanh_quant(
+    mocker,
+    input_shape: tuple[int],
+    activation_range: tuple[float, float],
+    inplace: bool,
+    use_qat: bool,
+):
+    # TODO(13063): This test suffers from non-ideal random quantization testing, because we always use the range <0,1>.
+    #  We should update it (decrease atol) once Conv/Linear + Activation fusion at quantization is in place.
+    min_val, max_val = activation_range
+    model = Conv2dWithActivation(
+        activation=torch.nn.Hardtanh(min_val=min_val, max_val=max_val, inplace=inplace),
+        in_channels=input_shape[1],
     )
-    def test_convert_clamp__relu_vs_maxmin(
-        self, mocker, min, max, expected_neutron_ir_ops
-    ):
-        input_shape = (23,)
-        model = AddHardTanhModule(min, max)
 
-        converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
-        neutron_ir_spy = mocker.spy(AtenModelBuilderDirector, "finish")
+    converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
 
-        delegated_ep = to_quantized_edge_program(
-            model,
-            input_shape,
-        ).exported_program()
+    quantized_program = to_quantized_edge_program(
+        model, input_shape, use_qat=use_qat, use_neutron_for_format_conversion=False
+    ).exported_program()
 
-        # Make sure the `clamp` was delegated.
-        assert graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall])
-        assert not graph_contains_any_of_ops(delegated_ep.graph, [HardTanh])
+    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    exported_program: ExportedProgram = converter_spy.call_args.args[1]
 
-        intermediate_ep = converter_spy.call_args.args[1]
-        quant_node = list(intermediate_ep.graph.nodes)[-2]
-        dequant_node = list(intermediate_ep.graph.nodes)[-4]
-        neutron_ir_internal_ops = [
-            op.builtin_code for op in neutron_ir_spy.spy_return.operator_codes.vector
-        ]
+    assert not graph_contains_any_of_ops(quantized_program.graph, [HardTanh, HardTanh_])
+    assert graph_contains_any_of_ops(quantized_program.graph, [ExecutorchDelegateCall])
 
-        assert graph_contains_any_of_ops(intermediate_ep.graph, [HardTanh])
-        assert (
-            len(neutron_ir_internal_ops) == len(expected_neutron_ir_ops) + 1
-        )  # Transpose
-        assert all(op in neutron_ir_internal_ops for op in expected_neutron_ir_ops)
-
-        if len(expected_neutron_ir_ops) == 3:
-            # Min/Max variant should have same input and output quantization
-            assert all(
-                q == dq for q, dq in zip(quant_node.args[1:], dequant_node.args[1:])
-            )
-        else:
-            assert not all(
-                q == dq for q, dq in zip(quant_node.args[1:], dequant_node.args[1:])
-            )
+    input_data = (np.random.random(input_shape) * 50).astype(np.int8)
+    convert_run_compare(
+        exported_program,
+        tfl_model=tflite_flatbuffers_model,
+        tflite_input_preprocess=ToChannelLastPreprocess(),
+        tflite_output_preprocess=ToChannelFirstPreprocess(),
+        input_data=input_data,
+        atol=2.0,
+    )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py
index 567cf85ebe5..81dbe9aa0fb 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py
@@ -34,7 +34,7 @@ def forward(self, x):
 
 class TestLeakyRelu:
     # noinspection PyMethodMayBeStatic
-    def assert_delegated(self, model, input_shape, mocker, request, use_qat=False):
+    def assert_delegated(self, model, input_shape, mocker, use_qat=False):
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={LeakyRelu: 1},
@@ -48,7 +48,6 @@ def assert_delegated(self, model, input_shape, mocker, request, use_qat=False):
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset_creator,
             use_qat=use_qat,
         )
@@ -64,29 +63,28 @@ def assert_delegated(self, model, input_shape, mocker, request, use_qat=False):
         ],
         ids=lambda shape: f"{len(shape)}D",
     )
-    def test__default_alpha__input_shapes(self, mocker, request, input_shape):
+    def test__default_alpha__input_shapes(self, mocker, input_shape):
         model = LeakyReluModule()
-        self.assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
-    def test__default_alpha__qat(self, mocker, request, use_qat):
+    def test__default_alpha__qat(self, mocker, use_qat):
         model = LeakyReluModule()
         input_shape = (23,)
-        self.assert_delegated(model, input_shape, mocker, request, use_qat)
+        self.assert_delegated(model, input_shape, mocker, use_qat)
 
     @pytest.mark.parametrize(
         "alpha",
         [0.01, 3.14159, 0, 1, float("inf")],
         ids=lambda alpha: f"alpha = {alpha}",
     )
-    def test__specific_alpha(self, mocker, request, alpha):
+    def test__specific_alpha(self, mocker, alpha):
         model = LeakyReluModule(negative_slope=alpha)
-        self.assert_delegated(model, (23,), mocker, request)
+        self.assert_delegated(model, (23,), mocker)
 
-    def test__inplace(self, mocker, request):
+    def test__inplace(self, mocker):
         model = LeakyReluModule(inplace=True)
         self.assert_delegated(
             model,
             (23,),
             mocker,
-            request,
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py
index 0b7fe88cffc..3e1d066103a 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py
@@ -35,7 +35,7 @@ def forward(self, x):
 
 
 class TestLog:
-    def test__basic_nsys_inference(self, mocker, request):
+    def test__basic_nsys_inference(self, mocker):
         # Use 256 elements so that, after quantization to int8, the input can
         # cover the full discrete range [-128, 127].
         # The dataset is generated as a linear float ramp and later quantized,
@@ -49,7 +49,6 @@ def test__basic_nsys_inference(self, mocker, request):
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset_creator=LinearRampDatasetCreator(low=0.0, high=1.0),
         )
 
@@ -61,7 +60,7 @@ def test__basic_nsys_inference(self, mocker, request):
             pytest.param((1, 3, 16, 16), id="4D"),
         ],
     )
-    def test__basic_nsys_inference__qat(self, mocker, request, input_shape, use_qat):
+    def test__basic_nsys_inference__qat(self, mocker, input_shape, use_qat):
         model = LogModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={Log: 1}, expected_non_delegated_ops={}
@@ -70,7 +69,6 @@ def test__basic_nsys_inference__qat(self, mocker, request, input_shape, use_qat)
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset_creator=RandomDatasetCreator(low=1.0, high=10.0),
             use_qat=use_qat,
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py
index 55a47146bfc..c95b3cd3b8d 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py
@@ -51,14 +51,14 @@ def reseed_model_per_test_run():
 
 class TestMaxPool2D:
     # noinspection PyMethodMayBeStatic
-    def assert_delegated(self, model, input_shape, mocker, request):
+    def assert_delegated(self, model, input_shape, mocker):
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={MaxPool2DWithIndices: 1, GetItem: 1},
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shape, graph_verifier, request)
+        lower_run_compare(model, input_shape, graph_verifier)
 
     # noinspection PyMethodMayBeStatic
     def assert_not_delegated(self, model, input_shape):
@@ -70,12 +70,12 @@ def assert_not_delegated(self, model, input_shape):
         )
         assert graph_contains_any_of_ops(delegated_ep.graph, [MaxPool2DWithIndices])
 
-    def test__basic_nsys_inference(self, mocker, request):
+    def test__basic_nsys_inference(self, mocker):
         input_shape = (2, 4, 6, 7)  # The old flow limited the batch size to 1.
         model = MaxPool2dModule()
-        self.assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
-    def test__basic_nsys_inference_qat(self, mocker, request):
+    def test__basic_nsys_inference_qat(self, mocker):
         input_shape = (2, 11, 7, 16)  # The old flow limited the batch size to 1.
         model = MaxPool2dModule()
         graph_verifier = DetailedGraphVerifier(
@@ -88,21 +88,20 @@ def test__basic_nsys_inference_qat(self, mocker, request):
             model,
             input_shape,
             graph_verifier,
-            request,
             use_qat=True,
         )
 
-    def test__large_kernel_size(self, mocker, request):
+    def test__large_kernel_size(self, mocker):
         kernel_size = (1, 5000)
         input_shape = (1, 4) + kernel_size
         model = MaxPool2dModule(kernel_size, stride=1)
-        self.assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
-    def test__stride_limit__no_padding(self, mocker, request):
+    def test__stride_limit__no_padding(self, mocker):
         stride = 4096
         input_shape = (1, 4, 1, 4096)
         model = MaxPool2dModule(1, stride=stride)
-        self.assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
     def test__stride_limit_exceeded__no_padding(self):
         stride = 4097  # Exceeds the stride limit.
@@ -110,12 +109,12 @@ def test__stride_limit_exceeded__no_padding(self):
         model = MaxPool2dModule(1, stride=stride)
         self.assert_not_delegated(model, input_shape)
 
-    def test__stride_limit__padding(self, mocker, request):
+    def test__stride_limit__padding(self, mocker):
         padding = 1
         stride = 4096
         input_shape = (1, 2, 3, stride)
         model = MaxPool2dModule(3, stride=stride, padding=padding)
-        self.assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
     def test__stride_limit_exceeded__padding(self):
         padding = 1
@@ -127,7 +126,7 @@ def test__stride_limit_exceeded__padding(self):
     @pytest.mark.skip(
         reason="Large padding requires large kernel size which results in an extremely slow test."
     )
-    def test__padding_limit(self, mocker, request):
+    def test__padding_limit(self, mocker):
         # As the padding is added wia a `Pad` operator (not the `MaxPool` arguments), there is no limit to the padded
         #  value. But as padding can be at most half of the kernel size (PyTorch requirement) and kernel size is limited
         #  to 4096, padding of 2048 is the limit.
@@ -135,16 +134,16 @@ def test__padding_limit(self, mocker, request):
         kernel_size = padding * 2
         input_shape = (1, 1, 2, 3)
         model = MaxPool2dModule(kernel_size, padding=padding)
-        self.assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
-    def test__padding__max_pool_limit_exceeded(self, mocker, request):
+    def test__padding__max_pool_limit_exceeded(self, mocker):
         # NeutronIR `MaxPool` padding is limited to 32. But as it is added by the `Pad` operator instead, there is no
         #  limit. This tests ensures the `MaxPool` padding limit is not a problem.
         padding = 33
         kernel_size = padding * 2
         input_shape = (1, 2, 3, 4)
         model = MaxPool2dModule(kernel_size, padding=padding)
-        self.assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
     def test__padding_to_kernel_ratio_exceeded(self):
         # Both PyTorch and Neutron require the padding to be at most half of the kernel size.
@@ -161,7 +160,7 @@ def test__padding_to_kernel_ratio_exceeded(self):
 class TestMaxPool1D:
 
     # Just a basic test to verify that the operator gets extended to the 2D variant correctly.
-    def test__basic_nsys_inference__view_not_delegated(self, mocker, request):
+    def test__basic_nsys_inference__view_not_delegated(self, mocker):
         input_shape = (2, 4, 6)  # The old flow limited the batch size to 1.
         model = MaxPool1DModule()
 
@@ -171,4 +170,4 @@ def test__basic_nsys_inference__view_not_delegated(self, mocker, request):
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shape, graph_verifier, request)
+        lower_run_compare(model, input_shape, graph_verifier)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py
index f84471169ea..8195581c0f6 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py
@@ -9,18 +9,6 @@
 import pytest
 import torch
 
-from executorch.backends.nxp.backend.ir.converter.builder.model_builder import (
-    ModelBuilder,
-)
-from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.max_pool_2d_options import (
-    MaxPool2D,
-)
-from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.mean_options import (
-    Mean,
-)
-from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.transpose_options import (
-    Transpose,
-)
 from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator
 from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
 from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
@@ -62,81 +50,71 @@ def forward(self, x):
 
 
 class MaxPoolMeanDimModule(torch.nn.Module):
-    @staticmethod
-    def noop_max_pool_2d(x):
-        """Call `torch.max_pool2d` that is a NoOp, but it enforces the ChannelsFirst format in the `NodeFormatInference`."""
-        return torch.max_pool2d(x, kernel_size=1)
-
     def __init__(self, dim, keepdim):
         super().__init__()
         self.dim, self.keepdim = dim, keepdim
 
     def forward(self, x):
-        x = self.noop_max_pool_2d(x)
-        x = torch.mean(x, dim=self.dim, keepdim=self.keepdim)
-        return x
-
-
-class MeanDimMaxPoolModule(MaxPoolMeanDimModule):
-    def forward(self, x):
-        x = torch.mean(x, dim=self.dim, keepdim=self.keepdim)
-        x = self.noop_max_pool_2d(x)
-        return x
-
-
-def assert_delegated(
-    model,
-    input_shape,
-    mocker,
-    request,
-    use_qat=False,
-    expected_delegated_ops=None,
-):
-    if expected_delegated_ops is None:
-        expected_delegated_ops = {MeanDim: 1}
-
-    graph_verifier = DetailedGraphVerifier(
-        mocker,
-        expected_delegated_ops=expected_delegated_ops,
-        expected_non_delegated_ops={},
-    )
+        x = torch.max_pool2d(
+            x, kernel_size=1
+        )  # NoOp, but it enforces the channels first format.
+        return torch.mean(x, dim=self.dim, keepdim=self.keepdim)
 
-    # Cover also negative values to thoroughly test the operator.
-    dataset_creator = RandomDatasetCreator(low=-2, high=2)
 
-    remove_quant_io_ops = True  # Use quantized dataset.
-    output_comparator = AllCloseOutputComparator(atol=1)  # Allow single bit error.
+class TestMeanDim:
 
-    lower_run_compare(
+    # noinspection PyMethodMayBeStatic
+    def assert_delegated(
+        self,
         model,
         input_shape,
-        graph_verifier,
-        request,
-        dataset_creator,
-        output_comparator,
-        use_qat=use_qat,
-        remove_quant_io_ops=remove_quant_io_ops,
-    )
+        mocker,
+        use_qat=False,
+        atol=None,
+        expected_delegated_ops=None,
+    ):
+        if expected_delegated_ops is None:
+            expected_delegated_ops = {MeanDim: 1}
 
+        graph_verifier = DetailedGraphVerifier(
+            mocker,
+            expected_delegated_ops=expected_delegated_ops,
+            expected_non_delegated_ops={},
+        )
 
-def assert_not_delegated(model, input_shape):
-    delegated_ep = to_quantized_edge_program(model, input_shape).exported_program()
+        # Cover also negative values to thoroughly test the operator.
+        dataset_creator = RandomDatasetCreator(low=-2, high=2)
 
-    # Make sure the `mean` was NOT delegated.
-    assert not graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall])
-    assert graph_contains_any_of_ops(delegated_ep.graph, [MeanDim])
+        kwargs = {"atol": atol} if atol is not None else {}
+        output_comparator = AllCloseOutputComparator(**kwargs)
 
+        lower_run_compare(
+            model,
+            input_shape,
+            graph_verifier,
+            dataset_creator,
+            output_comparator,
+            use_qat=use_qat,
+        )
 
-class TestMeanDim:
+    # noinspection PyMethodMayBeStatic
+    def assert_not_delegated(self, model, input_shape):
+        delegated_ep = to_quantized_edge_program(model, input_shape).exported_program()
+
+        # Make sure the `mean` was NOT delegated.
+        assert not graph_contains_any_of_ops(
+            delegated_ep.graph, [ExecutorchDelegateCall]
+        )
+        assert graph_contains_any_of_ops(delegated_ep.graph, [MeanDim])
 
     @pytest.fixture(params=[True, False], ids=lambda keep_dim: f"keep_dim = {keep_dim}")
     def keep_dim(self, request):
         return request.param
 
-    def test__basic_nsys_inference__qat(self, mocker, request, use_qat, keep_dim):
+    def test__basic_nsys_inference__qat(self, mocker, use_qat, keep_dim):
         input_shape = (23,)
         model = MeanDimModule(0, keep_dim)
-        assert_delegated(model, input_shape, mocker, request, use_qat=use_qat)
+        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat)
 
     @pytest.mark.parametrize(
         "input_shape, dim",
@@ -150,9 +128,12 @@ def test__basic_nsys_inference__qat(self, mocker, request, use_qat, keep_dim):
             pytest.param((3, 1, 4, 1, 5), 0, id="5D, dim = 0."),
         ],
     )
-    def test__single_dims(self, mocker, request, input_shape, dim, keep_dim):
+    def test__single_dims(self, mocker, input_shape, dim, keep_dim):
         model = MeanDimModule(dim, keep_dim)
-        assert_delegated(model, input_shape, mocker, request)
+        # Relatively large error, but it is actually equal to the output scale, so it is a single bit error.
+        # TODO Replace with quantized dataset testing and `atol = 1`.
+        atol = 0.014
+        self.assert_delegated(model, input_shape, mocker, atol=atol)
 
     @pytest.mark.parametrize(
         "input_shape, dim",
@@ -164,9 +145,12 @@ def test__single_dims(self, mocker, request, input_shape, dim, keep_dim):
             pytest.param((3, 1, 4, 1, 5), (3, -5, -4), id="5D, dim = (3, -5 ,-4)."),
         ],
     )
-    def test__tuple_dims(self, mocker, request, input_shape, dim, keep_dim):
+    def test__tuple_dims(self, mocker, input_shape, dim, keep_dim):
         model = MeanDimModule(dim, keep_dim)
-        assert_delegated(model, input_shape, mocker, request)
+        # Relatively large error, but it is actually equal to the output scale, so it is a single bit error.
+        # TODO Replace with quantized dataset testing and `atol = 1`.
+        atol = 0.015
+        self.assert_delegated(model, input_shape, mocker, atol=atol)
 
     @pytest.mark.parametrize(
         "input_shape, dim",
@@ -178,7 +162,7 @@ def test__tuple_dims(self, mocker, request, input_shape, dim, keep_dim):
     def test__noop__only_node__not_delegated(self, input_shape, dim):
         keep_dim = True  # Reduction over a dimension of size `1` with `keep_dim=True` is a no-op.
         model = MeanDimModule(dim, keep_dim)
-        assert_not_delegated(model, input_shape)
+        self.assert_not_delegated(model, input_shape)
 
     @pytest.mark.parametrize(
         "input_shape, dim",
@@ -187,14 +171,13 @@ def test__noop__only_node__not_delegated(self, input_shape, dim):
             pytest.param((3, 1, 4, 1, 5), -2, id="5D, dim = -2."),
         ],
     )
-    def test__noop__not_only_node__delegated(self, mocker, request, input_shape, dim):
+    def test__noop__not_only_node__delegated(self, mocker, input_shape, dim):
         keep_dim = True  # Reduction over a dimension of size `1` with `keep_dim=True` is a no-op.
         model = MeanDimAddModule(dim, keep_dim)
-        assert_delegated(
+        self.assert_delegated(
             model,
             input_shape,
             mocker,
-            request,
             expected_delegated_ops={MeanDim: 1, AddTensor: 1},
         )
 
@@ -203,207 +186,44 @@ def test__noop__not_only_node__delegated(self, mocker, request, input_shape, dim
         [
             pytest.param((3, 1, 4), 1, id="3D, dim = 1."),
             pytest.param((3, 1, 4, 1, 5), -2, id="5D, dim = -2."),
-            pytest.param((1, 7, 3, 3), [0], id="4D, dim = [0]."),
         ],
     )
-    def test__no_reduction__keepdim_false__delegated(
-        self, mocker, request, input_shape, dim
-    ):
+    def test__no_reduction__keepdim_false__delegated(self, mocker, input_shape, dim):
         # These cases reduce over a dimension of size 1.
         # When `keep_dim=True` the node is a noop, and it's not delegated (see `test__noop__only_node__not_delegated`),
         # but with `keep_dim=False` it changes the shape so it's not a noop and is therefore delegated successfully.
         keep_dim = False
         model = MeanDimModule(dim, keep_dim)
-        assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
-    def test__channels_first__keep_dim__true(self, mocker, request):
+    @pytest.mark.parametrize(
+        "input_shape, dim",
+        [((1, 7, 3, 3), 1)],
+        ids=lambda val: f"shape={val}" if isinstance(val, tuple) else f"dim={val}",
+    )
+    @pytest.mark.parametrize(
+        "keep_dim",
+        [
+            pytest.param(True),
+            pytest.param(
+                False,
+                marks=pytest.mark.xfail(
+                    strict=True, reason="Known format inference bug (EIEX-937)."
+                ),
+            ),
+        ],
+        ids=lambda kd: f"keep_dim={kd}",
+    )
+    def test__channels_first__keep_dim__true(self, mocker, input_shape, dim, keep_dim):
         # Just 1 test case to verify correct handling of the `dim`.
         # Most cases fall into the single bit error case, and since this test uses 2 operators, the error accumulates
         #  and the final error is larger. We cannot with 100% certainty say that the error is only caused by the single
         #  bit errors and not related to the format. That's why only this 1 case with no errors is used.
-        input_shape, dim = (1, 7, 3, 3), 1
-        model = MaxPoolMeanDimModule(dim, True)
-        assert_delegated(
+
+        model = MaxPoolMeanDimModule(dim, keep_dim)
+        self.assert_delegated(
             model,
             input_shape,
             mocker,
-            request,
             expected_delegated_ops={MaxPool2DWithIndices: 1, GetItem: 1, MeanDim: 1},
         )
-
-    class TestKeepDimFalseFormatHandling:
-        """When `keep_dim = False`, the `mean.dim` operator changes the rank, so the format have to be explicitly
-        handled. The tests in this class focus on the related edge cases.
-        """
-
-        def _assert_neutron_ir_model_has_ops(
-            self, model_builder_finish_spy, expected_ops
-        ):
-            assert (
-                model_builder_finish_spy.call_count == 1
-            ), "Conversion to Neutron IR happened multiple times."
-
-            neutron_ir_ops = model_builder_finish_spy.spy_return.sub_graphs[
-                0
-            ].operators.vector
-            assert len(neutron_ir_ops) == len(
-                expected_ops
-            ), "Neutron IR model doesn't have the expected number of ops."
-
-            for op, expected_op in zip(neutron_ir_ops, expected_ops, strict=True):
-                assert isinstance(
-                    op.builtin_options, expected_op
-                ), f"Expected {expected_op}, got {op}."
-
-        @pytest.mark.parametrize(
-            "dim",
-            [
-                1,
-                [0, -3],
-                (-4, 1, 2),
-                [-3, 3],
-                [1, 2, 3],
-            ],
-            ids=lambda dim: f"dim={dim}",
-        )
-        def test__channels_first_input__reducing_channels(self, mocker, request, dim):
-            # If the channels dimension is reduced (removed), the `mean` output will always be equal in channels first
-            #  and channels last, so no `Transpose` ops are added.
-            input_shape = (1, 7, 3, 3)
-            model = MaxPoolMeanDimModule(dim, False)
-
-            model_builder_finish_spy = mocker.spy(ModelBuilder, "finish")
-            assert_delegated(
-                model,
-                input_shape,
-                mocker,
-                request,
-                expected_delegated_ops={
-                    MaxPool2DWithIndices: 1,
-                    GetItem: 1,
-                    MeanDim: 1,
-                },
-            )
-            self._assert_neutron_ir_model_has_ops(
-                model_builder_finish_spy,
-                expected_ops=[
-                    Transpose,
-                    MaxPool2D,
-                    Mean,
-                ],
-            )
-
-        @pytest.mark.parametrize(
-            "dim",
-            [
-                (2, 3),
-                [1, -2, 3],
-                [-1, -2, 0],
-            ],
-            ids=lambda dim: f"dim={dim}",
-        )
-        def test__channels_first_input__reducing_all_spatial_dims(
-            self, mocker, request, dim
-        ):
-            # If tall he spatial dimensions are reduced (removed), the `mean` output will always be equal in channels
-            #  first and channels last, so no `Transpose` ops are added.
-            input_shape = (1, 7, 3, 3)
-            model = MaxPoolMeanDimModule(dim, False)
-
-            model_builder_finish_spy = mocker.spy(ModelBuilder, "finish")
-            assert_delegated(
-                model,
-                input_shape,
-                mocker,
-                request,
-                expected_delegated_ops={
-                    MaxPool2DWithIndices: 1,
-                    GetItem: 1,
-                    MeanDim: 1,
-                },
-            )
-            self._assert_neutron_ir_model_has_ops(
-                model_builder_finish_spy,
-                expected_ops=[
-                    Transpose,
-                    MaxPool2D,
-                    Mean,
-                ],
-            )
-
-        @pytest.mark.xfail(strict=True, reason="Known Neutron bug (AIR-14726).")
-        @pytest.mark.parametrize(
-            "dim",
-            [
-                0,
-                (2,),
-                [-1, 0],
-            ],
-            ids=lambda dim: f"dim={dim}",
-        )
-        def test__channels_first_input__not_reducing_channels_or_all_spatial_dims(
-            self, mocker, request, dim
-        ):
-            # If the channels dimension is not reduced, a `Transpose` operator must be added to make the input channels
-            #  first in Neutron IR.
-
-            input_shape = (1, 7, 3, 3)
-            model = MaxPoolMeanDimModule(dim, False)
-
-            model_builder_finish_spy = mocker.spy(ModelBuilder, "finish")
-            assert_delegated(
-                model,
-                input_shape,
-                mocker,
-                request,
-                expected_delegated_ops={
-                    MaxPool2DWithIndices: 1,
-                    GetItem: 1,
-                    MeanDim: 1,
-                },
-            )
-
-            self._assert_neutron_ir_model_has_ops(
-                model_builder_finish_spy,
-                expected_ops=[
-                    Transpose,
-                    MaxPool2D,
-                    Transpose,  # The necessary `Transpose` operator.
-                    Mean,
-                ],
-            )
-
-        @pytest.mark.parametrize(
-            "input_shape, dim",
-            [
-                pytest.param((2, 3, 4, 5, 6), 0, id="dim=0, 5D->4D"),
-                pytest.param((2, 3, 4, 5, 6), [-3], id="dim=[-3], 5D->4D"),
-                pytest.param((1, 2, 3, 4, 5, 6), (1, -1), id="dim=(1, -1), 6D->4D"),
-            ],
-            ids=lambda dim: f"dim={dim}",
-        )
-        def test__channels_first_output(self, mocker, request, input_shape, dim):
-            model = MeanDimMaxPoolModule(dim, False)
-
-            model_builder_finish_spy = mocker.spy(ModelBuilder, "finish")
-            assert_delegated(
-                model,
-                input_shape,
-                mocker,
-                request,
-                expected_delegated_ops={
-                    MaxPool2DWithIndices: 1,
-                    GetItem: 1,
-                    MeanDim: 1,
-                },
-            )
-
-            self._assert_neutron_ir_model_has_ops(
-                model_builder_finish_spy,
-                expected_ops=[
-                    Mean,
-                    Transpose,  # The necessary `Transpose` operator.
-                    MaxPool2D,
-                    Transpose,
-                ],
-            )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py
index d112ff1e1ac..897c3efd850 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py
@@ -41,7 +41,7 @@ class TestMulTensor:
             pytest.param((1, 4, 8, 8), id="4D."),
         ],
     )
-    def test__basic_nsys_inference(self, mocker, request, x_input_shape):
+    def test__basic_nsys_inference(self, x_input_shape, mocker):
         x_input_spec = ModelInputSpec(x_input_shape)
         model = MulTensorModule()
         graph_verifier = DetailedGraphVerifier(
@@ -52,7 +52,6 @@ def test__basic_nsys_inference(self, mocker, request, x_input_shape):
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
-            request,
         )
 
     @pytest.mark.parametrize(
@@ -62,7 +61,7 @@ def test__basic_nsys_inference(self, mocker, request, x_input_shape):
             pytest.param((1, 4, 8, 8), id="4D."),
         ],
     )
-    def test__basic_nsys_inference_qat(self, mocker, request, x_input_shape):
+    def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
         x_input_spec = ModelInputSpec(x_input_shape)
         model = MulTensorModule()
         graph_verifier = DetailedGraphVerifier(
@@ -73,7 +72,6 @@ def test__basic_nsys_inference_qat(self, mocker, request, x_input_shape):
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
-            request,
             use_qat=True,
         )
 
@@ -92,13 +90,13 @@ def test__basic_nsys_inference_qat(self, mocker, request, x_input_shape):
             ),
         ],
     )
-    def test__correct_broadcast(self, input_spec, mocker, request):
+    def test__correct_broadcast(self, input_spec, mocker):
         model = MulTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={MulTensor: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_spec, graph_verifier, request)
+        lower_run_compare(model, input_spec, graph_verifier)
 
     @pytest.mark.parametrize(
         "input_spec",
@@ -136,7 +134,7 @@ def test__incorrect_broadcast(self, input_spec):
             ),
         ],
     )
-    def test__w_conv(self, mocker, request, x_input_shape):
+    def test__w_conv(self, x_input_shape, mocker):
         model = MulTensorConvModule()
 
         n, c, h, w = x_input_shape
@@ -153,7 +151,6 @@ def test__w_conv(self, mocker, request, x_input_shape):
             model,
             [x_input_spec, y_input_spec],
             graph_verifier,
-            request,
         )
 
     @pytest.mark.parametrize(
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py
index bdfd1e9da25..31436a3f200 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py
@@ -77,13 +77,7 @@ def forward(self, x):
 class TestPermuteCopy:
     # noinspection PyMethodMayBeStatic
     def assert_delegated(
-        self,
-        model,
-        input_shape,
-        mocker,
-        request,
-        expected_delegated_ops=None,
-        use_qat=False,
+        self, model, input_shape, mocker, expected_delegated_ops=None, use_qat=False
     ):
         graph_verifier = DetailedGraphVerifier(
             mocker,
@@ -95,7 +89,6 @@ def assert_delegated(
             model,
             input_shape,
             graph_verifier,
-            request,
             use_qat=use_qat,
         )
 
@@ -122,18 +115,18 @@ def _special_4d_permutations() -> list[ParameterSet]:
             pytest.param((3, 2, 1, 0), id="reverse"),
         ]
 
-    def test__qat(self, mocker, request, use_qat):
+    def test__qat(self, mocker, use_qat):
         input_shape = (2, 3, 5, 7)
         permutation = (0, 2, 3, 1)  # NCHW -> NHWC
         model = PermuteModule(permutation)
-        self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat)
+        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat)
 
     @pytest.mark.parametrize(
         "permutation",
         _all_permutations_for_rank(3),
         ids=lambda perm: f"permutation = {perm}",
     )
-    def test__all_permutations__3d(self, mocker, request, permutation: tuple[int]):
+    def test__all_permutations__3d(self, mocker, permutation: tuple[int]):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5)
         model = PermuteModule(permutation)
@@ -142,14 +135,14 @@ def test__all_permutations__3d(self, mocker, request, permutation: tuple[int]):
             #  would result in an empty graph, which is not allowed. Therefore, it's not delegated.
             self.assert_not_delegated(model, input_shape)
         else:
-            self.assert_delegated(model, input_shape, mocker, request)
+            self.assert_delegated(model, input_shape, mocker)
 
     @pytest.mark.parametrize(
         "permutation",
         _all_permutations_for_rank(4),
         ids=lambda perm: f"permutation = {perm}",
     )
-    def test__all_permutations__4d(self, mocker, request, permutation: tuple[int]):
+    def test__all_permutations__4d(self, mocker, permutation: tuple[int]):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5, 7)
         model = PermuteModule(permutation)
@@ -158,55 +151,43 @@ def test__all_permutations__4d(self, mocker, request, permutation: tuple[int]):
             #  would result in an empty graph, which is not allowed. Therefore, it's not delegated.
             self.assert_not_delegated(model, input_shape)
         else:
-            self.assert_delegated(model, input_shape, mocker, request)
+            self.assert_delegated(model, input_shape, mocker)
 
     @pytest.mark.parametrize("permutation", _special_4d_permutations())
     def test__all_permutations__4d__channels_first_input(
-        self, mocker, request, permutation: tuple[int]
+        self, mocker, permutation: tuple[int]
     ):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5, 7)
         model = MaxPoolPermuteModule(permutation)
         expected_delegated_ops = {MaxPool2DWithIndices: 1, GetItem: 1, PermuteCopy: 1}
         self.assert_delegated(
-            model,
-            input_shape,
-            mocker,
-            request,
-            expected_delegated_ops=expected_delegated_ops,
+            model, input_shape, mocker, expected_delegated_ops=expected_delegated_ops
         )
 
     @pytest.mark.parametrize("permutation", _special_4d_permutations())
     def test__all_permutations__4d__channels_first_output(
-        self, mocker, request, permutation: tuple[int]
+        self, mocker, permutation: tuple[int]
     ):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5, 7)
         model = PermuteMaxPoolModule(permutation)
         expected_delegated_ops = {MaxPool2DWithIndices: 1, GetItem: 1, PermuteCopy: 1}
         self.assert_delegated(
-            model,
-            input_shape,
-            mocker,
-            request,
-            expected_delegated_ops=expected_delegated_ops,
+            model, input_shape, mocker, expected_delegated_ops=expected_delegated_ops
         )
 
     @pytest.mark.parametrize("perm1", _special_4d_permutations())
     @pytest.mark.parametrize("perm2", _special_4d_permutations())
     def test__all_permutations__4d__channels_first_io(
-        self, mocker, request, perm1: tuple[int], perm2: tuple[int]
+        self, mocker, perm1: tuple[int], perm2: tuple[int]
     ):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5, 7)
         model = PermuteMaxPoolPermuteModule(perm1, perm2)
         expected_delegated_ops = {MaxPool2DWithIndices: 1, GetItem: 1, PermuteCopy: 2}
         self.assert_delegated(
-            model,
-            input_shape,
-            mocker,
-            request,
-            expected_delegated_ops=expected_delegated_ops,
+            model, input_shape, mocker, expected_delegated_ops=expected_delegated_ops
         )
 
     @pytest.mark.parametrize(
@@ -219,7 +200,7 @@ def test__all_permutations__4d__channels_first_io(
             pytest.param((4, 2, 3, 0, 1), id="perm = (4, 2, 3, 0, 1)"),
         ],
     )
-    def test__5d(self, mocker, request, permutation):
+    def test__5d(self, mocker, permutation):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5, 3, 5)
         model = PermuteModule(permutation)
@@ -228,4 +209,4 @@ def test__5d(self, mocker, request, permutation):
             #  would result in an empty graph, which is not allowed. Therefore, it's not delegated.
             self.assert_not_delegated(model, input_shape)
         else:
-            self.assert_delegated(model, input_shape, mocker, request)
+            self.assert_delegated(model, input_shape, mocker)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py
index ca2abd18f32..ab42560f075 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py
@@ -62,7 +62,7 @@ def forward(self, x):
         return self.relu(x)
 
 
-class TestReLU:
+class TestReLUNewNeutronFlow:
     @pytest.mark.parametrize(
         ["model", "input_shape"],
         [
@@ -98,7 +98,7 @@ class TestReLU:
             ),
         ],
     )
-    def test_relu_conversion__full_pipeline(self, mocker, request, model, input_shape):
+    def test_relu_conversion__full_pipeline(self, mocker, model, input_shape):
         model = model()  # Avoid model creation at import time
         is_conv_module = not hasattr(model, "linear")
 
@@ -108,20 +108,19 @@ def test_relu_conversion__full_pipeline(self, mocker, request, model, input_shap
                 {Convolution: 1, Relu: 1} if is_conv_module else {AddMm: 1, Relu: 1}
             ),
             expected_non_delegated_ops={},
-            ops_to_ignore={
+            ops_to_ignore=[
                 PermuteCopy,
                 ViewCopy,
                 QuantizePerTensor,
                 DequantizePerTensor,
                 DequantizePerChannel,
-            },
+            ],
         )
 
         lower_run_compare(
             model,
             input_shape,
             graph_verifier,
-            request,
         )
 
     @pytest.mark.parametrize(
@@ -137,9 +136,7 @@ def test_relu_conversion__full_pipeline(self, mocker, request, model, input_shap
             ),
         ],
     )
-    def test_relu_conversion__non_delegated_with_old_flow(
-        self, mocker, request, input_shape
-    ):
+    def test_relu_conversion__non_delegated_with_old_flow(self, mocker, input_shape):
         verifier = DetailedGraphVerifier(
             mocker=mocker,
             expected_delegated_ops={Relu: 1},
@@ -149,9 +146,8 @@ def test_relu_conversion__non_delegated_with_old_flow(
         lower_run_compare(
             ReLUModule(),
             input_shape,
-            verifier,
-            request,
-            RandomDatasetCreator(low=-1, high=1),
+            dlg_model_verifier=verifier,
+            dataset_creator=RandomDatasetCreator(low=-1, high=1),
         )
 
     @pytest.mark.parametrize(
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py
index bdd41d1eab0..75a32254a1d 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py
@@ -30,9 +30,7 @@ def reseed_model_per_test_run():
 
 class TestSigmoid:
     # noinspection PyMethodMayBeStatic
-    def assert_delegated(
-        self, model, input_shape, mocker, request, use_qat=False, atol=None
-    ):
+    def assert_delegated(self, model, input_shape, mocker, use_qat=False, atol=None):
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={Sigmoid: 1},
@@ -49,16 +47,15 @@ def assert_delegated(
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset_creator,
             output_comparator,
             use_qat=use_qat,
         )
 
-    def test__basic_nsys_inference__qat(self, mocker, request, use_qat):
+    def test__basic_nsys_inference__qat(self, mocker, use_qat):
         input_shape = (23,)
         model = nn.Sigmoid()
-        self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat)
+        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat)
 
     @pytest.mark.parametrize(
         "input_shape",
@@ -71,13 +68,13 @@ def test__basic_nsys_inference__qat(self, mocker, request, use_qat):
         ],
         ids=lambda shape: f"{len(shape)}D",
     )
-    def test__input_shapes(self, mocker, request, input_shape):
+    def test__input_shapes(self, mocker, input_shape):
         model = nn.Sigmoid()
 
         output_scale = 1.0 / 256.0
         lowering_spy = mocker.spy(NeutronPartitioner, "partition")
         self.assert_delegated(
-            model, input_shape, mocker, request, atol=output_scale
+            model, input_shape, mocker, atol=output_scale
         )  # Allow single bit error.
 
         # Verify that the `atol` is indeed equal to the output scale.
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py
index 98cc924ee85..cb0ec09bcce 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py
@@ -41,9 +41,7 @@ def _slice_id(prefix, input_shape, dims, starts, ends):
         return f"{prefix}rank={len(input_shape)}_dims={str(dims)}_starts={str(starts)}_ends={str(ends)}"
 
     @staticmethod
-    def assert_delegated_and_correct(
-        model, input_shape, num_slices, mocker, request, use_qat
-    ):
+    def assert_delegated_and_correct(model, input_shape, num_slices, mocker, use_qat):
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={SliceCopy: num_slices},
@@ -56,7 +54,6 @@ def assert_delegated_and_correct(
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset,
             comparator,
             use_qat=use_qat,
@@ -185,14 +182,12 @@ def assert_not_delegated(model, input_shape):
             ),
         ],
     )
-    def test_nsys_inference__basic(
-        self, input_shape, dims, starts, ends, mocker, request
-    ):
+    def test_nsys_inference__basic(self, input_shape, dims, starts, ends, mocker):
         model = SliceTensorModule(dims, starts, ends)
 
         num_slices = len(dims)
         self.assert_delegated_and_correct(
-            model, input_shape, num_slices, mocker, request, use_qat=False
+            model, input_shape, num_slices, mocker, use_qat=False
         )
 
     @pytest.mark.parametrize(
@@ -214,9 +209,7 @@ def test_nsys_inference__basic(
             ),
         ],
     )
-    def test_nsys_inference__reduction(
-        self, input_shape, dims, starts, ends, mocker, request
-    ):
+    def test_nsys_inference__reduction(self, input_shape, dims, starts, ends, mocker):
         model = SliceTensorModule(dims, starts, ends)
 
         slice_lengths = [e - s for s, e in zip(starts, ends)]
@@ -226,7 +219,7 @@ def test_nsys_inference__reduction(
         else:
             num_slices = len(dims)
             self.assert_delegated_and_correct(
-                model, input_shape, num_slices, mocker, request, use_qat=False
+                model, input_shape, num_slices, mocker, use_qat=False
             )
 
     @pytest.mark.parametrize(
@@ -248,14 +241,12 @@ def test_nsys_inference__reduction(
             ),
         ],
     )
-    def test_nsys_inference__clipped(
-        self, input_shape, dims, starts, ends, mocker, request
-    ):
+    def test_nsys_inference__clipped(self, input_shape, dims, starts, ends, mocker):
         model = SliceTensorModule(dims, starts, ends)
 
         num_slices = len(dims)
         self.assert_delegated_and_correct(
-            model, input_shape, num_slices, mocker, request, use_qat=False
+            model, input_shape, num_slices, mocker, use_qat=False
         )
 
     @pytest.mark.parametrize(
@@ -278,13 +269,13 @@ def test_nsys_inference__clipped(
         ],
     )
     def test_nsys_inference__normalization(
-        self, input_shape, dims, starts, ends, mocker, request
+        self, input_shape, dims, starts, ends, mocker
     ):
         model = SliceTensorModule(dims, starts, ends)
 
         num_slices = len(dims)
         self.assert_delegated_and_correct(
-            model, input_shape, num_slices, mocker, request, use_qat=False
+            model, input_shape, num_slices, mocker, use_qat=False
         )
 
     @pytest.mark.parametrize(
@@ -313,14 +304,12 @@ def test_nsys_inference__normalization(
             ),
         ],
     )
-    def test_nsys_inference__big(
-        self, input_shape, dims, starts, ends, mocker, request
-    ):
+    def test_nsys_inference__big(self, input_shape, dims, starts, ends, mocker):
         model = SliceTensorModule(dims, starts, ends)
 
         num_slices = len(dims)
         self.assert_delegated_and_correct(
-            model, input_shape, num_slices, mocker, request, use_qat=False
+            model, input_shape, num_slices, mocker, use_qat=False
         )
 
     @pytest.mark.parametrize(
@@ -347,7 +336,7 @@ def test_nsys_inference__identity(self, input_shape, dims, starts, ends):
 
         self.assert_model_without_slices(model, input_shape)
 
-    def test_nsys_inference__with_conv(self, mocker, request):
+    def test_nsys_inference__with_conv(self, mocker):
         input_shape = (11, 13, 5, 7)
         in_channels = input_shape[1]
         out_channels = 19
@@ -371,13 +360,12 @@ def test_nsys_inference__with_conv(self, mocker, request):
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset,
             comparator,
             use_qat=False,
         )
 
-    def test_nsys_inference__qat(self, mocker, request):
+    def test_nsys_inference__qat(self, mocker):
         input_shape = (7, 13, 7, 9)
         dims = (0, 1, 2, 3)
         starts = (1, 2, 3, 2)
@@ -387,5 +375,5 @@ def test_nsys_inference__qat(self, mocker, request):
 
         num_slices = len(dims)
         self.assert_delegated_and_correct(
-            model, input_shape, num_slices, mocker, request, use_qat=True
+            model, input_shape, num_slices, mocker, use_qat=True
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
index e71ff7e8af5..9638f8fe0ec 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
@@ -16,9 +16,6 @@
 )
 from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
 from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
-from executorch.backends.nxp.tests.model_output_comparator import (
-    AllCloseOutputComparator,
-)
 from executorch.backends.nxp.tests.models import SubTensorConvModule, SubTensorModule
 from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
 from executorch.backends.nxp.tests.ops_aliases import (
@@ -41,50 +38,76 @@ class TestSubTensor:
         [
             pytest.param((1,), id="1D."),
             pytest.param((6, 5), id="2D."),
-            pytest.param((6, 82), id="2D alt."),
             pytest.param((1, 4, 7), id="3D."),
-            pytest.param((1, 68, 7), id="3D alt."),
-            pytest.param((2, 4, 3, 15), id="4D."),
-            pytest.param((1, 4, 9, 11, 4), id="5D."),
+            pytest.param(
+                (6, 82),
+                id="2D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
+            pytest.param(
+                (1, 68, 7),
+                id="3D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
+            pytest.param(
+                (2, 4, 3, 15),
+                id="4D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
+            pytest.param(
+                (1, 4, 9, 11, 4),
+                id="5D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
         ],
     )
-    def test__basic_nsys_inference(self, mocker, request, x_input_shape):
+    def test__basic_nsys_inference(self, x_input_shape, mocker):
         x_input_spec = ModelInputSpec(x_input_shape)
         model = SubTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
-        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
-            request,
             dataset_creator,
-            comparator,
-            remove_quant_io_ops=True,
         )
 
-    def test__basic_nsys_inference_qat(self, mocker, request):
-        x_input_spec = ModelInputSpec((2, 4, 3, 15))
+    @pytest.mark.parametrize(
+        "x_input_shape",
+        [
+            pytest.param((1,), id="1D."),
+            pytest.param((6, 5), id="2D."),
+            pytest.param((2, 4, 3, 15), id="4D."),
+            pytest.param(
+                (1, 4, 7),
+                id="3D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
+            pytest.param(
+                (1, 4, 9, 11, 4),
+                id="5D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
+        ],
+    )
+    def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
+        x_input_spec = ModelInputSpec(x_input_shape)
         model = SubTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
-        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
-            request,
             dataset_creator,
-            comparator,
             use_qat=True,
-            remove_quant_io_ops=True,
         )
 
     @pytest.mark.parametrize(
@@ -93,35 +116,33 @@ def test__basic_nsys_inference_qat(self, mocker, request):
             pytest.param(
                 [ModelInputSpec((4, 6)), ModelInputSpec((1, 6))], id="2 inputs 2D."
             ),
-            pytest.param(
-                [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))],
-                id="2 inputs 2D alt.",
-            ),
             pytest.param(
                 [ModelInputSpec((4,)), ModelInputSpec((4, 4))], id="2 inputs 1D + 2D."
             ),
             pytest.param(
                 [ModelInputSpec((5, 3, 4)), ModelInputSpec((1, 3, 1))],
-                id="2 inputs 3D.",
+                id="2 inputs 3D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+            ),
+            pytest.param(
+                [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))],
+                id="2 inputs 2D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
             ),
         ],
     )
-    def test__broadcast(self, mocker, request, input_spec):
+    def test__broadcast(self, input_spec, mocker):
         model = SubTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
-        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             input_spec,
             graph_verifier,
-            request,
             dataset_creator,
-            comparator,
-            remove_quant_io_ops=True,
         )
 
     @pytest.mark.parametrize(
@@ -160,7 +181,7 @@ def test__broadcast_unsupported(self, input_spec):
             ),
         ],
     )
-    def test__w_conv(self, mocker, request, x_input_shape):
+    def test__w_conv(self, x_input_shape, mocker):
         model = SubTensorConvModule()
 
         n, c, h, w = x_input_shape
@@ -178,7 +199,6 @@ def test__w_conv(self, mocker, request, x_input_shape):
             model,
             [x_input_spec, y_input_spec],
             graph_verifier,
-            request,
             dataset_creator,
         )
 
@@ -191,11 +211,12 @@ def test__w_conv(self, mocker, request, x_input_shape):
             ),
             pytest.param(
                 [ModelInputSpec((1, 4, 5, 5)), ModelInputSpec((1, 8, 5, 1))],
-                id="2 inputs 4D + 4D same height.",
+                id="2 inputs 4D + 4D incorrect.",
+                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
             ),
         ],
     )
-    def test__w_conv_broadcast(self, mocker, request, input_spec):
+    def test__w_conv_broadcast(self, input_spec, mocker):
         model = SubTensorConvModule()
         graph_verifier = DetailedGraphVerifier(
             mocker,
@@ -203,16 +224,12 @@ def test__w_conv_broadcast(self, mocker, request, input_spec):
             expected_non_delegated_ops={},
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
-        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             input_spec,
             graph_verifier,
-            request,
             dataset_creator,
-            comparator,
-            remove_quant_io_ops=True,
         )
 
     @pytest.mark.parametrize(
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py
index 51b7ee484a7..6336308e40b 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py
@@ -36,7 +36,6 @@ def assert_delegated(
         model,
         input_shape,
         mocker,
-        request,
         use_qat=False,
         expected_delegated_ops=None,
     ):
@@ -56,7 +55,6 @@ def assert_delegated(
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset_creator,
             use_qat=use_qat,
         )
@@ -65,10 +63,10 @@ def assert_delegated(
     def inplace(self, request):
         return request.param
 
-    def test__qat__inplace(self, mocker, request, use_qat, inplace):
+    def test__qat__inplace(self, mocker, use_qat, inplace):
         shape = (23,)
         model = TanhModule(inplace)
-        self.assert_delegated(model, shape, mocker, request, use_qat=use_qat)
+        self.assert_delegated(model, shape, mocker, use_qat=use_qat)
 
     @pytest.mark.parametrize(
         "shape",
@@ -81,20 +79,16 @@ def test__qat__inplace(self, mocker, request, use_qat, inplace):
         ],
         ids=lambda shape: f"{len(shape)}D",
     )
-    def test__shapes(self, mocker, request, shape):
+    def test__shapes(self, mocker, shape):
         model = TanhModule()
-        self.assert_delegated(model, shape, mocker, request)
+        self.assert_delegated(model, shape, mocker)
 
-    def test__with_convolution(self, mocker, request):
+    def test__with_convolution(self, mocker):
         input_shape = (1, 3, 12, 16)
         channels = input_shape[1]
         model = Conv2dWithActivation(
             activation=torch.tanh, in_channels=channels, out_channels=channels
         )
         self.assert_delegated(
-            model,
-            input_shape,
-            mocker,
-            request,
-            expected_delegated_ops={Tanh: 1, Convolution: 1},
+            model, input_shape, mocker, expected_delegated_ops={Tanh: 1, Convolution: 1}
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py b/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py
index f9b2269751f..c4a698f4bfb 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py
@@ -59,7 +59,6 @@ def assert_delegated(
         model,
         input_shape,
         mocker,
-        request,
         use_qat=False,
         atol=None,
         expected_delegated_ops=None,
@@ -83,7 +82,6 @@ def assert_delegated(
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset_creator,
             output_comparator,
             use_qat=use_qat,
@@ -98,25 +96,21 @@ def assert_not_delegated(self, model, input_shape):
         )
         assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleBilinear2D])
 
-    def test__qat__align_corners(self, mocker, request, use_qat):
+    def test__qat__align_corners(self, mocker, use_qat):
         align_corners = True
         input_shape = (1, 2, 3, 4)
         output_size = (5, 7)
         model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
         atol = 0.015  # ~= output scale -> single bit error.
-        self.assert_delegated(
-            model, input_shape, mocker, request, use_qat=use_qat, atol=atol
-        )
+        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol)
 
-    def test__qat__not_align_corners(self, mocker, request, use_qat):
+    def test__qat__not_align_corners(self, mocker, use_qat):
         align_corners = False
         input_shape = (1, 2, 3, 4)
         output_size = (6, 8)
         model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
         atol = 0.015  # ~= output scale -> single bit error.
-        self.assert_delegated(
-            model, input_shape, mocker, request, use_qat=use_qat, atol=atol
-        )
+        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol)
 
     @pytest.mark.parametrize(
         "input_shape, output_size",
@@ -131,13 +125,11 @@ def test__qat__not_align_corners(self, mocker, request, use_qat):
             pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"),
         ],
     )
-    def test__not_align_corners__output_size(
-        self, mocker, request, input_shape, output_size
-    ):
+    def test__not_align_corners__output_size(self, mocker, input_shape, output_size):
         align_corners = False
         model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
         atol = 0.016  # ~= output scale -> single bit error.
-        self.assert_delegated(model, input_shape, mocker, request, atol=atol)
+        self.assert_delegated(model, input_shape, mocker, atol=atol)
 
     def test__not_align_corners__output_size__unsupported(self):
         align_corners = False
@@ -159,11 +151,11 @@ def test__not_align_corners__output_size__unsupported(self):
             pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"),
         ],
     )
-    def test__not_align_corners__scales(self, mocker, request, input_shape, scale):
+    def test__not_align_corners__scales(self, mocker, input_shape, scale):
         align_corners = False
         model = UpsampleBilinearModule(scale=scale, align_corners=align_corners)
         atol = 0.016  # ~= output scale -> single bit error.
-        self.assert_delegated(model, input_shape, mocker, request, atol=atol)
+        self.assert_delegated(model, input_shape, mocker, atol=atol)
 
     def test__not_align_corners__scales__unsupported(self):
         align_corners = False
@@ -191,13 +183,11 @@ def test__not_align_corners__scales__unsupported(self):
             ),
         ],
     )
-    def test__align_corners__output_size(
-        self, mocker, request, input_shape, output_size
-    ):
+    def test__align_corners__output_size(self, mocker, input_shape, output_size):
         align_corners = True
         model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
         atol = 0.016  # ~= output scale -> single bit error.
-        self.assert_delegated(model, input_shape, mocker, request, atol=atol)
+        self.assert_delegated(model, input_shape, mocker, atol=atol)
 
     def test__align_corners__output_size__unsupported(self):
         align_corners = True
@@ -250,11 +240,11 @@ def test__align_corners__output_size__input_size_equal_to_one(self):
             ),
         ],
     )
-    def test__align_corners__scales(self, mocker, request, input_shape, scale):
+    def test__align_corners__scales(self, mocker, input_shape, scale):
         align_corners = True
         model = UpsampleBilinearModule(scale=scale, align_corners=align_corners)
         atol = 0.016  # ~= output scale -> single bit error.
-        self.assert_delegated(model, input_shape, mocker, request, atol=atol)
+        self.assert_delegated(model, input_shape, mocker, atol=atol)
 
     def test__align_corners__scales__unsupported(self):
         align_corners = True
@@ -269,7 +259,7 @@ def test__noop__alone_in_partition__not_delegated(self):
         model = UpsampleBilinearModule(scale=scale)
         self.assert_not_delegated(model, input_shape)
 
-    def test__noop__not_alone_in_partition__delegated(self, mocker, request):
+    def test__noop__not_alone_in_partition__delegated(self, mocker):
         input_shape = (1, 2, 3, 4)
         scale = 1
         model = UpsampleBilinearAddModule(scale=scale)
@@ -277,6 +267,5 @@ def test__noop__not_alone_in_partition__delegated(self, mocker, request):
             model,
             input_shape,
             mocker,
-            request,
             expected_delegated_ops={UpsampleBilinear2D: 1, AddTensor: 1},
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py b/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py
index b3e28a7b2f8..438a580f6e8 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py
@@ -53,7 +53,6 @@ def assert_delegated(
         model,
         input_shape,
         mocker,
-        request,
         use_qat=False,
         expected_delegated_ops=None,
     ):
@@ -73,7 +72,6 @@ def assert_delegated(
             model,
             input_shape,
             graph_verifier,
-            request,
             dataset_creator,
             use_qat=use_qat,
         )
@@ -87,11 +85,11 @@ def assert_not_delegated(self, model, input_shape):
         )
         assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleNearest2D])
 
-    def test__qat(self, mocker, request, use_qat):
+    def test__qat(self, mocker, use_qat):
         input_shape = (1, 2, 3, 4)
         output_size = (6, 8)
         model = UpsampleNearestModule(size=output_size)
-        self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat)
+        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat)
 
     @pytest.mark.parametrize(
         "input_shape, output_size",
@@ -107,9 +105,9 @@ def test__qat(self, mocker, request, use_qat):
             pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"),
         ],
     )
-    def test__output_size(self, mocker, request, input_shape, output_size):
+    def test__output_size(self, mocker, input_shape, output_size):
         model = UpsampleNearestModule(size=output_size)
-        self.assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
     def test__output_size__unsupported(self):
         input_shape = (1, 2, 3, 4)
@@ -133,9 +131,9 @@ def test__output_size__unsupported(self):
             pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"),
         ],
     )
-    def test__scales(self, mocker, request, input_shape, scale):
+    def test__scales(self, mocker, input_shape, scale):
         model = UpsampleNearestModule(scale=scale)
-        self.assert_delegated(model, input_shape, mocker, request)
+        self.assert_delegated(model, input_shape, mocker)
 
     def test__scales__unsupported(self):
         input_shape = (1, 2, 3, 4)
@@ -149,7 +147,7 @@ def test__noop__alone_in_partition__not_delegated(self):
         model = UpsampleNearestModule(scale=scale)
         self.assert_not_delegated(model, input_shape)
 
-    def test__noop__not_alone_in_partition__delegated(self, mocker, request):
+    def test__noop__not_alone_in_partition__delegated(self, mocker):
         input_shape = (1, 2, 3, 4)
         scale = 1
         model = UpsampleNearestAddModule(scale=scale)
@@ -157,6 +155,5 @@ def test__noop__not_alone_in_partition__delegated(self, mocker, request):
             model,
             input_shape,
             mocker,
-            request,
             expected_delegated_ops={UpsampleNearest2D: 1, AddTensor: 1},
         )
diff --git a/backends/nxp/tests/model_output_comparator.py b/backends/nxp/tests/model_output_comparator.py
index 5563703ae20..f0dd7cd2d60 100644
--- a/backends/nxp/tests/model_output_comparator.py
+++ b/backends/nxp/tests/model_output_comparator.py
@@ -4,7 +4,6 @@
 # LICENSE file in the root directory of this source tree.
 
 import abc
-import logging
 import os
 from abc import abstractmethod
 from pathlib import Path
@@ -16,7 +15,6 @@
 from executorch.backends.nxp.backend.ir.converter.conversion.translator import (
     torch_type_to_numpy_type,
 )
-from executorch.backends.nxp.tests.utils import archive_test_dir, store_txt_input_tensor
 
 
 class BaseOutputComparator(abc.ABC):
@@ -37,11 +35,6 @@ def compare_results(self, cpu_results_dir, npu_results_dir, output_tensor_spec):
         :param npu_results_dir: Path to directory with NPU (delegated) results.
         :param output_tensor_spec: List of output tensor specifications.
         """
-        if logging.root.isEnabledFor(logging.DEBUG):
-            diff_cpu_npu_results_dir = os.path.join(
-                os.path.dirname(cpu_results_dir), "diff_cpu_npu_results"
-            )
-
         sample_dirs = [
             os.path.join(cpu_results_dir, file) for file in os.listdir(cpu_results_dir)
         ]
@@ -72,28 +65,7 @@ def compare_results(self, cpu_results_dir, npu_results_dir, output_tensor_spec):
                 )
                 npu_output_tensors.append((output_tensor_name, npu_tensor))
 
-                if logging.root.isEnabledFor(logging.DEBUG):
-                    # Store diff results if logging level is enabled
-                    diff_cpu_npu_tensor = np.abs(cpu_tensor - npu_tensor)
-                    os.makedirs(
-                        os.path.join(diff_cpu_npu_results_dir, sample_dir),
-                        exist_ok=True,
-                    )
-                    diff_cpu_npu_tensor_path = os.path.join(
-                        diff_cpu_npu_results_dir, sample_dir, output_tensor_name
-                    )
-                    diff_cpu_npu_tensor.tofile(diff_cpu_npu_tensor_path)
-
-                    # Store text tensor results
-                    store_txt_input_tensor(cpu_tensor_path, tensor_spec)
-                    store_txt_input_tensor(npu_tensor_path, tensor_spec)
-                    store_txt_input_tensor(diff_cpu_npu_tensor_path, tensor_spec)
-
-        # We need to archive the test_dir before comparison, as comparison can cause AssertionError exception
-        test_dir = os.path.dirname(cpu_results_dir)
-        if logging.root.isEnabledFor(logging.DEBUG):
-            archive_test_dir(test_dir)
-        self.compare_sample(sample_dir, cpu_output_tensors, npu_output_tensors)
+            self.compare_sample(sample_dir, cpu_output_tensors, npu_output_tensors)
 
     @abstractmethod
     def compare_sample(
diff --git a/backends/nxp/tests/nsys_testing.py b/backends/nxp/tests/nsys_testing.py
index d5ff3680f38..7631ee20ca1 100644
--- a/backends/nxp/tests/nsys_testing.py
+++ b/backends/nxp/tests/nsys_testing.py
@@ -3,22 +3,19 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import datetime
 import functools
+import inspect
 import logging
 import os.path
-import re
 import shutil
 import subprocess
 from copy import deepcopy
 from enum import Enum
-from importlib.metadata import version
 from os import environ, mkdir
 from typing import Callable, Iterable
 
 import numpy as np
 import torch
-import yaml
 from executorch.backends.nxp.backend.edge_helper import is_channels_last_dim_order
 from executorch.backends.nxp.backend.ir.converter.conversion import translator
 from executorch.backends.nxp.backend.ir.converter.conversion.translator import (
@@ -43,11 +40,10 @@
     AllCloseOutputComparator,
 )
 from executorch.backends.nxp.tests.outputs_dir_importer import outputs_dir
-from executorch.backends.nxp.tests.utils import save_pte_program, store_txt_input_tensor
+from executorch.backends.nxp.tests.utils import save_pte_program
 from executorch.devtools.visualization.visualization_utils import (
     visualize_with_clusters,
 )
-from pytest import FixtureRequest
 from pytest_mock import MockerFixture
 from torch.export import ExportedProgram
 from torch.fx import GraphModule
@@ -59,7 +55,6 @@
 NSYS_CONFIG_PATH = test_config.NSYS_CONFIG_PATH
 NSYS_FIRMWARE_PATH = test_config.NSYS_FIRMWARE_PATH
 NEUTRON_TEST_PATH = test_config.NEUTRON_TEST_PATH
-PROJECT_DIR = test_config.PROJECT_DIR
 
 
 class ReferenceModel(Enum):
@@ -124,7 +119,6 @@ def wrapper(*args, **kwargs):
         delegated_program = to_quantized_executorch_program(
             model,
             input_spec,
-            intermediates_dir=test_dir,
             dataset_dir=calibration_dataset_dir,
             delegate_to_npu=True,
             use_qat=use_qat,
@@ -132,7 +126,6 @@ def wrapper(*args, **kwargs):
             operators_not_to_delegate=operators_not_to_delegate,
             remove_quant_io_ops=remove_quant_io_ops,
         )
-
     except RuntimeError as e:
         if "Model converted with neutron-converter has" in str(e) and hasattr(
             dlg_model_verifier, "check_num_delegated_nodes"
@@ -398,7 +391,6 @@ def lower_run_compare(
     model: torch.nn.Module,
     input_spec: Iterable[ModelInputSpec] | tuple[int, ...],
     dlg_model_verifier: GraphVerifier,
-    request: FixtureRequest,
     dataset_creator=None,
     output_comparator=None,
     mocker: MockerFixture = None,
@@ -416,12 +408,11 @@ def lower_run_compare(
     :param model: Executed PyTorch model.
     :param input_spec: Model input specification. Can be either tuple of ints - single float32 input model - or Iterable
         of ModelInputSpec.
-    :param dlg_model_verifier: Graph verifier instance.
-    :param request: PyTest request needed for correct test name extraction.
     :param dataset_creator: Creator that should fill provided `dataset_dir` with model input samples.
     :param output_comparator: Comparator of results produced by NPU and CPU runs of the program.
-    :param mocker: Mocker instance used by visualizer.
+    :param dlg_model_verifier: Graph verifier instance.
     :param reference_model: Version of the model which will be run to obtain reference output data.
+    :param mocker: Mocker instance used by visualizer.
     :param use_qat: If True, applies quantization-aware training before conversion (without the QAT training).
     :param train_fn: Train/finetune function for QAT training. Is used only when `use_qat=True`.
     :param operators_not_to_delegate: list of operators not to delegate.
@@ -439,7 +430,7 @@ def lower_run_compare(
     model_to_delegate = model
     model_to_not_delegate = deepcopy(model)
 
-    test_name = get_test_name(request)
+    test_name = _get_caller_name()
     test_dir = os.path.join(OUTPUTS_DIR, test_name)
 
     shutil.rmtree(test_dir, ignore_errors=True)
@@ -547,11 +538,6 @@ def lower_run_compare(
 
     output_tensor_spec = _get_program_output_spec(delegated_program)
 
-    if logging.root.isEnabledFor(logging.DEBUG):
-        _generate_txt_test_data(
-            calibration_dataset_dir, testing_dataset_dir, list(input_spec)
-        )
-        dump_debug_test_summary(test_name, test_dir)
     npu_results_dir = os.path.join(test_dir, "results_npu")
     cpu_results_dir = os.path.join(test_dir, "results_cpu")
     output_comparator.compare_results(
@@ -563,12 +549,10 @@ def lower_run_compare_ptq_qat(
     model: torch.nn.Module,
     input_spec: list[ModelInputSpec] | tuple,
     dlg_model_verifier: GraphVerifier,
-    request: FixtureRequest,
     train_fn: Callable[[torch.fx.GraphModule], None],
     dataset_creator=None,
     output_comparator=None,
     mocker: MockerFixture = None,
-    operators_not_to_delegate: list[str] = None,
 ):
     """
     Run provided program twice and compare it's results.
@@ -578,12 +562,10 @@ def lower_run_compare_ptq_qat(
     :param input_spec: Model input specification. Can be either tuple - single float32 input model - or list
         of ModelInputSpec.
     :param dlg_model_verifier: Graph verifier instance.
-    :param request: PyTest request needed for correct test name extraction.
     :param train_fn: Train/finetune function for QAT training.
     :param dataset_creator: Creator that should fill provided `dataset_dir` with model input samples.
     :param output_comparator: Comparator of results produced by NPU and CPU runs of the program.
     :param mocker: Mocker instance used by visualizer.
-    :param operators_not_to_delegate: list of operators not to delegate.
     """
     assert_NSYS()
 
@@ -595,7 +577,7 @@ def lower_run_compare_ptq_qat(
     model_ptq = model
     model_qat = deepcopy(model)
 
-    test_name = get_test_name(request)
+    test_name = _get_caller_name()
     test_dir = os.path.join(OUTPUTS_DIR, test_name)
 
     shutil.rmtree(test_dir, ignore_errors=True)
@@ -624,7 +606,6 @@ def lower_run_compare_ptq_qat(
         ptq_results_dir,
         mocker,
         use_qat=False,
-        operators_not_to_delegate=operators_not_to_delegate,
     )
 
     _ = _run_delegated_executorch_program(
@@ -639,14 +620,10 @@ def lower_run_compare_ptq_qat(
         mocker,
         use_qat=True,
         train_fn=train_fn,
-        operators_not_to_delegate=operators_not_to_delegate,
     )
 
     output_tensor_spec = _get_program_output_spec(delegated_program_ptq)
 
-    if logging.root.isEnabledFor(logging.DEBUG):
-        dump_debug_test_summary(test_name, test_dir)
-        shutil.make_archive(test_dir, "zip", test_dir)
     ptq_results_dir = os.path.join(test_dir, "results_ptq")
     qat_results_dir = os.path.join(test_dir, "results_qat")
     output_comparator.compare_results(
@@ -680,13 +657,13 @@ def _parse_input_quant_params(
     return q_params
 
 
-def get_test_name(request):
-    # PyTest request is available, extract correct name including test class and params
-    test_name = request.node.nodeid.lstrip(":")
-    # Escape unacceptable characters from test name to make sure it is a valid filesystem directory name
-    test_name = re.sub(r'[<>:"/\\|?* ,()`]', "_", test_name)
-    test_name = test_name.strip(" .")
-    return test_name
+def _get_caller_name():
+    test_function_names = ["lower_run_compare", "lower_run_compare_ptq_qat"]
+    for idx, frame in enumerate(inspect.stack()):
+        if frame.function in test_function_names:
+            # Look one index above to get caller
+            return inspect.stack()[idx + 1].function
+    return None
 
 
 def execute_cmd(cmd, cwd="."):
@@ -748,60 +725,3 @@ def _get_program_output_spec(exported_program) -> list[torch.Tensor]:
     output_tensors_spec = list(exported_program.graph.output_node().meta["val"])
 
     return output_tensors_spec
-
-
-def get_executorch_git_info() -> dict[str, str]:
-    git_branch_cmd = f"git -C {PROJECT_DIR} branch --show-current"
-    git_branch, _, _ = execute_cmd(git_branch_cmd)
-    git_commit_cmd = f"git -C {PROJECT_DIR} rev-parse --short HEAD"
-    git_commit, _, _ = execute_cmd(git_commit_cmd)
-    return {"git_branch": git_branch, "git_commit": git_commit}
-
-
-def dump_debug_test_summary(test_name: str, test_dir: str):
-    git_info = get_executorch_git_info()
-
-    summary = {
-        "test_name": test_name,
-        "date_time": datetime.datetime.now().isoformat(),
-        "git_branch": git_info["git_branch"],
-        "git_commit": git_info["git_commit"],
-        "eiq_neutron_sdk_version": version("eiq_neutron_sdk"),
-        "eiq_nsys_version": version("eiq_nsys"),
-    }
-    with open(os.path.join(test_dir, "summary.yaml"), "w") as f:
-        yaml.dump(summary, f)
-
-
-def _generate_txt_test_data(
-    calibration_dataset_dir: str,
-    testing_dataset_dir: str,
-    input_tensor_spec: list[ModelInputSpec],
-):
-    # Generates txt tensor variants for input datasets
-    # Testing dataset can point to calibration dataset
-    dataset_paths = (
-        [calibration_dataset_dir, testing_dataset_dir]
-        if calibration_dataset_dir != testing_dataset_dir
-        else [testing_dataset_dir]
-    )
-    for d_path in dataset_paths:
-        quant_dataset = d_path.endswith("dataset_quant")
-
-        # For multiple input tests, list each sample dir, for single input tests the input files are in d_path
-        sample_dirs = [os.path.join(d_path, file) for file in os.listdir(d_path)]
-        sample_dirs = [file for file in sample_dirs if os.path.isdir(file)]
-        # Single input dataset has tensor directly in dataset path
-        if len(sample_dirs) == 0:
-            for input_tensor_name in sorted(os.listdir(d_path)):
-                input_tensor_path = os.path.join(d_path, input_tensor_name)
-                tensor_spec = input_tensor_spec[0]
-                store_txt_input_tensor(input_tensor_path, tensor_spec, quant_dataset)
-        else:
-            for sample_dir in sample_dirs:
-                for idx, input_tensor_name in enumerate(os.listdir(sample_dir)):
-                    input_tensor_path = os.path.join(sample_dir, input_tensor_name)
-                    tensor_spec = input_tensor_spec[idx]
-                    store_txt_input_tensor(
-                        input_tensor_path, tensor_spec, quant_dataset
-                    )
diff --git a/backends/nxp/tests/ops_aliases.py b/backends/nxp/tests/ops_aliases.py
index da50d4dc0d9..46002ba8883 100644
--- a/backends/nxp/tests/ops_aliases.py
+++ b/backends/nxp/tests/ops_aliases.py
@@ -26,13 +26,11 @@
 DequantizePerChannel = exir_ops.edge.quantized_decomposed.dequantize_per_channel.default
 DequantizePerTensor = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
 ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate
-Exp = exir_ops.edge.aten.exp.default
 GetItem = operator.getitem
 HardTanh = exir_ops.edge.aten.hardtanh.default
 HardTanh_ = exir_ops.edge.aten.hardtanh_.default
 LeakyRelu = exir_ops.edge.aten.leaky_relu.default
 Log = exir_ops.edge.aten.log.default
-MaxPool2D = exir_ops.edge.aten.max_pool2d.default
 MaxPool2DWithIndices = exir_ops.edge.aten.max_pool2d_with_indices.default
 MeanDim = exir_ops.edge.aten.mean.dim
 MulTensor = exir_ops.edge.aten.mul.Tensor
diff --git a/backends/nxp/tests/utils.py b/backends/nxp/tests/utils.py
index 00b7c364a31..c210d9db8bc 100644
--- a/backends/nxp/tests/utils.py
+++ b/backends/nxp/tests/utils.py
@@ -7,19 +7,11 @@
 
 import logging
 import os
-import shutil
 
-import numpy as np
-
-from executorch.backends.nxp.backend.ir.converter.conversion.translator import (
-    torch_type_to_numpy_type,
-)
-from executorch.backends.nxp.tests.executorch_pipeline import ModelInputSpec
 from executorch.devtools.visualization.visualization_utils import (
     visualize_with_clusters,
 )
 from executorch.exir import ExecutorchProgramManager
-from torch._subclasses import FakeTensor
 
 
 def save_pte_program(
@@ -40,27 +32,3 @@ def save_pte_program(
 
     visualize_with_clusters(prog.exported_program(), visualize_file_name, False)
     return filename
-
-
-def change_filepath_extension(path: str, extension: str) -> str:
-    base, _ = os.path.splitext(path)
-    return base + "." + extension
-
-
-def store_txt_input_tensor(
-    input_tensor_path: str,
-    tensor_spec: ModelInputSpec | FakeTensor,
-    quant_dataset: bool = False,
-):
-    dtype = np.int8 if quant_dataset else torch_type_to_numpy_type(tensor_spec.dtype)
-    input_tensor = np.fromfile(input_tensor_path, dtype=dtype)
-    int__max = np.iinfo(np.int32).max
-
-    with open(change_filepath_extension(input_tensor_path, "txt"), "w") as f:
-        f.write("Flattened tensor shape:" + str(input_tensor.shape))
-        f.write("\nOriginal tensor shape:" + str(list(tensor_spec.shape)) + "\n")
-        f.write(np.array2string(input_tensor, threshold=int__max))
-
-
-def archive_test_dir(test_dir: str):
-    shutil.make_archive(test_dir, "zip", test_dir)
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
index 3336a394510..ca853de6f86 100644
--- a/backends/qualcomm/_passes/__init__.py
+++ b/backends/qualcomm/_passes/__init__.py
@@ -14,7 +14,6 @@
 from .convert_mha_to_sha import ConvertMhaToSha
 from .convert_square_to_pow import ConvertSquareToPow
 from .decompose_acos import DecomposeAcos
-from .decompose_addmm import DecomposeAddmm
 from .decompose_any import DecomposeAny
 from .decompose_atan2 import DecomposeAtan2
 from .decompose_binary_alpha import DecomposeBinaryAlpha
@@ -27,7 +26,6 @@
 from .decompose_floor_divide import DecomposeFloorDivide
 from .decompose_glu import DecomposeGlu
 from .decompose_hardsigmoid import DecomposeHardsigmoid
-from .decompose_hyperbolic_variants import DecomposeHyperbolicVariants
 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm
 from .decompose_log_variants import DecomposeLogVariants
 from .decompose_maxpool3d import DecomposeMaxPool3d
@@ -78,7 +76,6 @@
     ConvertMhaToSha,
     ConvertSquareToPow,
     DecomposeAcos,
-    DecomposeAddmm,
     DecomposeAny,
     DecomposeAtan2,
     DecomposeBinaryAlpha,
@@ -90,7 +87,6 @@
     DecomposeFill,
     DecomposeFloorDivide,
     DecomposeGlu,
-    DecomposeHyperbolicVariants,
     DecomposeHardsigmoid,
     DecomposeLinalgVectorNorm,
     DecomposeLogVariants,
diff --git a/backends/qualcomm/_passes/decompose_acos.py b/backends/qualcomm/_passes/decompose_acos.py
index d546cf6d92d..f83b18f11fc 100644
--- a/backends/qualcomm/_passes/decompose_acos.py
+++ b/backends/qualcomm/_passes/decompose_acos.py
@@ -9,7 +9,7 @@
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from executorch.exir.pass_base import ExportPass, PassResult
 
-from .utils import copy_meta, create_const_node
+from .utils import copy_meta, get_const_node
 
 
 class DecomposeAcos(ExportPass):
@@ -52,7 +52,7 @@ def call(self, graph_module: torch.fx.GraphModule):
             )
 
             if is_edge and pi_half_node is None:
-                pi_half_node = create_const_node(
+                pi_half_node = get_const_node(
                     graph, graph_module, "_pi_half_constant", pi_half, node
                 )
 
diff --git a/backends/qualcomm/_passes/decompose_atan2.py b/backends/qualcomm/_passes/decompose_atan2.py
index a411f997b61..0f54e555e03 100644
--- a/backends/qualcomm/_passes/decompose_atan2.py
+++ b/backends/qualcomm/_passes/decompose_atan2.py
@@ -9,7 +9,7 @@
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from executorch.exir.pass_base import ExportPass, PassResult
 
-from .utils import copy_meta, create_const_node, create_node
+from .utils import copy_meta, create_node, get_const_node
 
 
 class DecomposeAtan2(ExportPass):
@@ -68,7 +68,7 @@ def _get_constants(self, graph, graph_module, node, is_edge, const_cache):
 
             def make_const(name, val):
                 if name not in const_cache:
-                    const_cache[name] = create_const_node(
+                    const_cache[name] = get_const_node(
                         graph, graph_module, name, val, node
                     )
                 return const_cache[name]
diff --git a/backends/qualcomm/_passes/decompose_log_variants.py b/backends/qualcomm/_passes/decompose_log_variants.py
index 904900dd205..2b394806b68 100644
--- a/backends/qualcomm/_passes/decompose_log_variants.py
+++ b/backends/qualcomm/_passes/decompose_log_variants.py
@@ -11,7 +11,7 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 
-from .utils import copy_meta, create_const_node
+from .utils import copy_meta, get_const_node
 
 
 class DecomposeLogVariants(ExportPass):
@@ -50,7 +50,7 @@ def _decompose_log_n(self, node, graph, graph_module, const_cache, n):
             div_op = exir_ops.edge.aten.div.Tensor
             attr_name = f"_log_base_{n}_constant"
             if attr_name not in const_cache:
-                const_cache[attr_name] = create_const_node(
+                const_cache[attr_name] = get_const_node(
                     graph, graph_module, attr_name, math.log(n), node
                 )
             div_arg = const_cache[attr_name]
@@ -81,7 +81,7 @@ def _decompose_log_p(self, node, graph, graph_module, const_cache, p):
             log_op = exir_ops.edge.aten.log.default
             attr_name = f"_log1p_addend_{p}_constant"
             if attr_name not in const_cache:
-                const_cache[attr_name] = create_const_node(
+                const_cache[attr_name] = get_const_node(
                     graph, graph_module, attr_name, p, node
                 )
             add_arg = const_cache[attr_name]
diff --git a/backends/qualcomm/_passes/decompose_remainder.py b/backends/qualcomm/_passes/decompose_remainder.py
index a6c260d217b..4e5ea739856 100644
--- a/backends/qualcomm/_passes/decompose_remainder.py
+++ b/backends/qualcomm/_passes/decompose_remainder.py
@@ -10,7 +10,7 @@
 from executorch.exir.pass_base import ExportPass, PassResult
 from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix
 
-from .utils import copy_meta, create_const_node
+from .utils import copy_meta, get_const_node
 
 
 class DecomposeRemainder(ExportPass):
@@ -69,7 +69,7 @@ def call(self, graph_module: torch.fx.GraphModule):
                         attr_name = get_new_attr_name_with_prefix("_remainder_const_")(
                             graph_module
                         )
-                        const_cache[x_arg] = create_const_node(
+                        const_cache[x_arg] = get_const_node(
                             graph, graph_module, attr_name, x_arg, node
                         )
                     x_node = const_cache[x_arg]
@@ -82,7 +82,7 @@ def call(self, graph_module: torch.fx.GraphModule):
                         attr_name = get_new_attr_name_with_prefix("_remainder_const_")(
                             graph_module
                         )
-                        const_cache[y_arg] = create_const_node(
+                        const_cache[y_arg] = get_const_node(
                             graph, graph_module, attr_name, y_arg, node
                         )
                     y_node = const_cache[y_arg]
diff --git a/backends/qualcomm/_passes/decompose_var.py b/backends/qualcomm/_passes/decompose_var.py
index c89929fa50e..923fae4977f 100644
--- a/backends/qualcomm/_passes/decompose_var.py
+++ b/backends/qualcomm/_passes/decompose_var.py
@@ -10,7 +10,7 @@
 from executorch.exir.pass_base import ExportPass, PassResult
 from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix
 
-from .utils import copy_meta, create_const_node
+from .utils import copy_meta, get_const_node
 
 
 class DecomposeVar(ExportPass):
@@ -155,7 +155,7 @@ def call(self, graph_module: torch.fx.GraphModule):
                                 attr_name = get_new_attr_name_with_prefix(
                                     "_var_scale_const_"
                                 )(graph_module)
-                                const_cache[cache_key] = create_const_node(
+                                const_cache[cache_key] = get_const_node(
                                     graph, graph_module, attr_name, scale, node
                                 )
                             scale_node = const_cache[cache_key]
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
index 7efb4a293e1..e3e4b8c8e51 100644
--- a/backends/qualcomm/_passes/qnn_pass_manager.py
+++ b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -20,7 +20,6 @@
     ConvertMhaToSha,
     ConvertSquareToPow,
     DecomposeAcos,
-    DecomposeAddmm,
     DecomposeAny,
     DecomposeAtan2,
     DecomposeBinaryAlpha,
@@ -32,7 +31,6 @@
     DecomposeFill,
     DecomposeFloorDivide,
     DecomposeGlu,
-    DecomposeHyperbolicVariants,
     DecomposeLinalgVectorNorm,
     DecomposeLogVariants,
     DecomposeMaxPool3d,
@@ -124,14 +122,12 @@ def get_default_pass_activations(cls):
             (AnnotateUnbind, True),
             (ConvertBmmToMatmul, False),
             (DecomposeAcos, True),
-            (DecomposeAddmm, True),
             (DecomposeAny, True),
             (DecomposeAtan2, True),
             (DecomposeColIm, True),
             (DecomposeCDist, True),
             (DecomposeDivMode, True),
             (DecomposeFill, True),
-            (DecomposeHyperbolicVariants, True),
             (DecomposeLogVariants, True),
             (DecomposeMaxPool3d, True),
             (DecomposeMinMaxDim, True),
@@ -164,7 +160,6 @@ def get_annotation_passes(cls):
             RecomposeRmsNorm,
             ReplaceArangeArgs,
             DecomposeAcos,
-            DecomposeAddmm,
             DecomposeAtan2,
             DecomposeBinaryAlpha,
             DecomposeCDist,
@@ -184,7 +179,6 @@ def get_annotation_passes(cls):
             DecomposeExpM1,
             DecomposeFill,
             DecomposeGlu,
-            DecomposeHyperbolicVariants,
             DecomposeRemainder,
             DecomposeSelectScatter,
             DecomposeLinalgVectorNorm,
@@ -281,14 +275,12 @@ def get_passes_dependency_for_capture_program(cls):
             AnnotateUnbind: [RemoveRedundancy],
             ConvertBmmToMatmul: [RecomposePixelUnshuffle],
             DecomposeAcos: [RemoveRedundancy],
-            DecomposeAddmm: [RemoveRedundancy],
             DecomposeAny: [RemoveRedundancy],
             DecomposeAtan2: [RemoveRedundancy],
             DecomposeColIm: [FoldQDQ],
             DecomposeCDist: [RemoveRedundancy],
             DecomposeDivMode: [RemoveRedundancy],
             DecomposeFill: [RemoveRedundancy],
-            DecomposeHyperbolicVariants: [RemoveRedundancy],
             DecomposeLinalgVectorNorm: [RemoveRedundancy],
             DecomposeLogVariants: [RemoveRedundancy],
             DecomposeMaxPool3d: [RemoveRedundancy],
diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py
index 2a580ab11a4..92a75703bbd 100755
--- a/backends/qualcomm/_passes/utils.py
+++ b/backends/qualcomm/_passes/utils.py
@@ -343,7 +343,7 @@ def append_qdq(
     return dq_node
 
 
-def create_const_node(
+def get_const_node(
     graph: torch.fx.Graph,
     graph_module: torch.fx.GraphModule,
     attr_name: str,
diff --git a/backends/qualcomm/aot/wrappers/targets.bzl b/backends/qualcomm/aot/wrappers/targets.bzl
index 1ea7e6679d5..89f8efdea3e 100644
--- a/backends/qualcomm/aot/wrappers/targets.bzl
+++ b/backends/qualcomm/aot/wrappers/targets.bzl
@@ -1,7 +1,6 @@
 load(
     "@fbsource//tools/build_defs:default_platform_defs.bzl",
     "ANDROID",
-    "CXX",
 )
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbsource//xplat/executorch/backends/qualcomm/third-party:third_party_libs.bzl", "qnn_third_party_dep")
@@ -21,7 +20,7 @@ def define_common_targets():
             "*.h",
         ]),
         define_static_target = True,
-        platforms = [ANDROID, CXX],
+        platforms = [ANDROID],
         visibility = ["PUBLIC"],
         deps = [
             qnn_third_party_dep("api"),
diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md
index b8d86b9d6da..7b7f4ef8139 100644
--- a/backends/qualcomm/builders/README.md
+++ b/backends/qualcomm/builders/README.md
@@ -498,16 +498,11 @@ The following PyTorch operators are supported through decomposition or annotatio
 | PyTorch Op | Decomposition Pass |
 |---|---|
 | `aten.acos` | `DecomposeAcos` |
-| `aten.acosh` | `DecomposeHyperbolicVariants` |
-| `aten.addmm` | `DecomposeAddmm` |
 | `aten.adaptive_avg_pool1d`, `aten.avg_pool1d` | `AnnotateAvgPool1D` |
 | `aten.any` | `DecomposeAny` |
-| `aten.asinh` | `DecomposeHyperbolicVariants` |
 | `aten.atan2.default`, `aten.atan2.out` | `DecomposeAtan2` |
-| `aten.atanh` | `DecomposeHyperbolicVariants` |
 | `aten.add` (with alpha), `aten.sub` (with alpha) | `DecomposeBinaryAlpha` |
 | `aten.cdist`, `aten._cdist_forward` | `DecomposeCDist` |
-| `aten.cosh` | `DecomposeHyperbolicVariants` |
 | `aten.div.Tensor_mode` | `DecomposeDivMode` |
 | `aten.div.Scalar_mode` | `LiftConstantScalarOperands` → `DecomposeDivMode` |
 | `aten.im2col`, `aten.col2im` | `DecomposeColIm` |
@@ -527,7 +522,6 @@ The following PyTorch operators are supported through decomposition or annotatio
 | `aten.roll` | `DecomposeRoll` |
 | `aten.select_scatter` | `DecomposeSelectScatter` |
 | `aten.silu` | `DecomposeSilu` |
-| `aten.sinh` | `DecomposeHyperbolicVariants` |
 | `aten.tan` | `DecomposeTan` |
 | `aten.threshold` | `DecomposeThreshold` |
 | `aten.triu` | `DecomposeTriu` |
diff --git a/backends/qualcomm/debugger/README.md b/backends/qualcomm/debugger/README.md
index 09b4c1918df..8300920d1d5 100644
--- a/backends/qualcomm/debugger/README.md
+++ b/backends/qualcomm/debugger/README.md
@@ -156,8 +156,6 @@ After `build_executorch_binary()`, the debugger holds:
 
 Ensure `dump_intermediate_outputs` is enabled in your `QnnConfig` (or pass `--dump_intermediate_outputs` via CLI). Only run **one inference** for debugging — multiple executions are not supported.
 
-**Note:** Intermediate tensor dumping is not currently supported in direct mode on HTP/LPAI backends.
-
 ```python
 from executorch.examples.qualcomm.utils import SimpleADB
 
@@ -268,7 +266,7 @@ python -m examples.qualcomm.util_scripts.qnn_intermediate_debugger_demo -b build
 3. Does not support graphs with partitions (partial delegation).
 4. Does not support LLM models.
 5. Does not support graphs with multiple methods.
-6. Intermediate tensor dumping is not currently supported in direct mode on HTP/LPAI backends.
+
 
 ## ExecuTorch QNN HTP Heap Profiling
 
diff --git a/backends/qualcomm/export_utils.py b/backends/qualcomm/export_utils.py
index bcba08ecc5a..28b7952ef33 100644
--- a/backends/qualcomm/export_utils.py
+++ b/backends/qualcomm/export_utils.py
@@ -276,10 +276,6 @@ def __init__(
         self.skip_push = qnn_config.skip_push
         self.backend_library_paths = {}
 
-        if self.direct_build_folder and self.dump_intermediate_outputs:
-            raise ValueError(
-                "Per-tensor dumping is currently not supported in direct mode."
-            )
         if self.direct_build_folder:
             direct_general_artifacts = [
                 f"{self.build_path}/examples/qualcomm/direct_executor_runner/libqnn_executorch_stub.so",
@@ -441,8 +437,9 @@ def execute(
                         f"--input_list_path {self.input_list_filename}",
                         f"--etdump_path {self.etdump_path}",
                         "--shared_buffer" if self.shared_buffer else "",
+                        f"--debug_output_path {self.debug_output_path}",
                         (
-                            f"--debug_output_path {self.debug_output_path} --dump_intermediate_outputs"
+                            "--dump_intermediate_outputs"
                             if self.dump_intermediate_outputs
                             else ""
                         ),
diff --git a/backends/qualcomm/quantizer/annotators/htp_rules.py b/backends/qualcomm/quantizer/annotators/htp_rules.py
index ca8abb246bf..0c5be07fcdc 100644
--- a/backends/qualcomm/quantizer/annotators/htp_rules.py
+++ b/backends/qualcomm/quantizer/annotators/htp_rules.py
@@ -1077,11 +1077,7 @@ def annotate(node: Node, quantization_config: QuantizationConfig) -> None:
 
 
 @register_annotator(
-    [
-        torch.ops.aten.bmm.default,
-        torch.ops.aten.matmul.default,
-        torch.ops.aten.mm.default,
-    ],
+    [torch.ops.aten.bmm.default, torch.ops.aten.matmul.default],
     QnnConstants.OpMatMul.op_name,
 )
 class MatMul(GeneralOpDef):
diff --git a/backends/qualcomm/quantizer/annotators/lpai_rules.py b/backends/qualcomm/quantizer/annotators/lpai_rules.py
index 6e5b343c5c7..2623e4a6524 100644
--- a/backends/qualcomm/quantizer/annotators/lpai_rules.py
+++ b/backends/qualcomm/quantizer/annotators/lpai_rules.py
@@ -601,11 +601,7 @@ def annotate(node: Node, quantization_config: QuantizationConfig) -> None:
 
 
 @register_annotator(
-    [
-        torch.ops.aten.bmm.default,
-        torch.ops.aten.matmul.default,
-        torch.ops.aten.mm.default,
-    ],
+    [torch.ops.aten.bmm.default, torch.ops.aten.matmul.default],
     QnnConstants.OpMatMul.op_name,
 )
 class MatMul(GeneralOpDef):
diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl
index 5ad312020be..335f4a5c4cb 100644
--- a/backends/qualcomm/runtime/targets.bzl
+++ b/backends/qualcomm/runtime/targets.bzl
@@ -1,7 +1,6 @@
 load(
     "@fbsource//tools/build_defs:default_platform_defs.bzl",
     "ANDROID",
-    "CXX",
 )
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbsource//xplat/executorch/backends/qualcomm/third-party:third_party_libs.bzl", "qnn_third_party_dep")
@@ -22,7 +21,7 @@ def define_common_targets():
             "Logging.h",
         ],
         define_static_target = True,
-        platforms = [ANDROID, CXX],
+        platforms = [ANDROID],
         visibility = ["PUBLIC"],
         deps = [
             qnn_third_party_dep("api"),
@@ -92,7 +91,7 @@ def define_common_targets():
             ),
             define_static_target = True,
             link_whole = True,  # needed for executorch/examples/models/llama:main to register QnnBackend
-            platforms = [ANDROID, CXX],
+            platforms = [ANDROID],
             visibility = ["PUBLIC"],
             resources = ({
                 "qnn_lib": qnn_third_party_dep("qnn_offline_compile_libs"),
diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl
index c98a8bc83ac..a53e5823aff 100644
--- a/backends/qualcomm/targets.bzl
+++ b/backends/qualcomm/targets.bzl
@@ -1,7 +1,6 @@
 load(
     "@fbsource//tools/build_defs:default_platform_defs.bzl",
     "ANDROID",
-    "CXX",
 )
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbsource//xplat/executorch/backends/qualcomm/third-party:third_party_libs.bzl", "qnn_third_party_dep")
@@ -70,7 +69,7 @@ def define_common_targets():
         },
         exported_external_deps = ["flatbuffers-api"],
         define_static_target = True,
-        platforms = [ANDROID, CXX],
+        platforms = [ANDROID],
     )
 
     runtime.cxx_library(
@@ -88,32 +87,5 @@ def define_common_targets():
         exported_deps = [
             ":schema",
         ],
-        platforms = [ANDROID, CXX],
-    )
-
-    # Host-side AOT variant of qnn_executorch_backend. Pulls in the QNN
-    # offline-compile libraries as a Buck resource (via :runtime, which
-    # itself depends on qnn_third_party_dep("qnn_offline_compile_libs")),
-    # so a host-side gtest or runner can dlopen the QNN libraries
-    # without a manual path setup.
-    #
-    # Mirrors qnn_executorch_backend's structure but swaps the on-device
-    # runtime_android_build dep for the host runtime which bundles the
-    # x86 simulator libraries as a Buck resource.
-    runtime.cxx_library(
-        name = "qnn_executorch_backend_aot",
-        srcs = [],
-        headers = [],
-        define_static_target = True,
-        visibility = ["PUBLIC"],
-        deps = [
-            qnn_third_party_dep("api"),
-            "//executorch/runtime/backend:interface",
-            "//executorch/runtime/core:core",
-            "//executorch/backends/qualcomm/runtime:runtime",
-        ],
-        exported_deps = [
-            ":schema",
-        ],
-        platforms = [ANDROID, CXX],
+        platforms = [ANDROID],
     )
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
index 0201edb6dee..9f043ea56a9 100644
--- a/backends/qualcomm/tests/models.py
+++ b/backends/qualcomm/tests/models.py
@@ -49,14 +49,6 @@ def forward(self, x):
         return torch.acos(x)
 
 
-class Acosh(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x):
-        return torch.acosh(x)
-
-
 class AcosMultiNode(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -152,16 +144,6 @@ def forward(self, x):
         return 10 + x
 
 
-class AddMM(torch.nn.Module):
-    def __init__(self, alpha=1, beta=1):
-        super().__init__()
-        self.alpha = alpha
-        self.beta = beta
-
-    def forward(self, bias, input, mat2):
-        return torch.addmm(bias, input, mat2, alpha=self.alpha, beta=self.beta)
-
-
 class Any(torch.nn.Module):
     def __init__(self, dim=None, keepdim=False):
         super().__init__()
@@ -265,14 +247,6 @@ def forward(self, x, y):
         return squeeze_out, conv_out
 
 
-class Asinh(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x):
-        return torch.asinh(x)
-
-
 class Asin(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -305,14 +279,6 @@ def forward(self, x1, y1, x2, y2):
         return torch.atan2(x1, y1), torch.atan2(x2, y2)
 
 
-class Atanh(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x):
-        return torch.atanh(x)
-
-
 class AvgPool1D(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -1023,14 +989,6 @@ def forward(self, x):
         return torch.cos(x)
 
 
-class Cosh(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x):
-        return torch.cosh(x)
-
-
 class CumSum(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -2332,14 +2290,6 @@ def forward(self, x):
         return torch.sin(x)
 
 
-class Sinh(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x):
-        return torch.sinh(x)
-
-
 class SimpleModel(torch.nn.Module):
     def __init__(self, kernel_size=3):
         super().__init__()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index fcb365292ee..914afa077e4 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -150,11 +150,6 @@ def test_qnn_backend_acos(self):
                         index += 1
                         self.lower_module_and_test_output(module, sample_input)
 
-    def test_qnn_backend_acosh(self):
-        module = Acosh()  # noqa: F405
-        sample_input = (torch.tensor([1.0, 1.5, 2.0, 3.0, 5.0, 10.0]).reshape(2, 3),)
-        self.lower_module_and_test_output(module, sample_input)
-
     def test_qnn_backend_adaptive_avg_pool1d(self):
         module = AdaptiveAvgPool1D()  # noqa: F405
         sample_input = (torch.randn(1, 512, 7),)
@@ -195,30 +190,6 @@ def test_qnn_backend_adaptive_max_pool2d(self):
             with self.subTest(i=i):
                 self.lower_module_and_test_output(module, sample_input)
 
-    def test_qnn_backend_addmm(self):
-        test_comb = [
-            {
-                QCOM_MODULE: [AddMM()],  # noqa: F405
-                QCOM_SAMPLE_INPUTS: [
-                    (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)),
-                ],
-            },
-            {
-                QCOM_MODULE: [AddMM(alpha=2, beta=3)],  # noqa: F405
-                QCOM_SAMPLE_INPUTS: [
-                    (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)),
-                ],
-            },
-        ]
-
-        index = 0
-        for comb in test_comb:
-            for module in comb[QCOM_MODULE]:
-                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
-                    with self.subTest(i=index):
-                        index += 1
-                        self.lower_module_and_test_output(module, sample_input)
-
     def test_qnn_backend_alias(self):
         module = Alias()  # noqa: F405
         sample_input = (torch.randn(1, 10),)
@@ -329,11 +300,6 @@ def test_qnn_backend_argmin(self):
                     case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS]
                 )
 
-    def test_qnn_backend_asinh(self):
-        module = Asinh()  # noqa: F405
-        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
-        self.lower_module_and_test_output(module, sample_input)
-
     @unittest.expectedFailure
     def test_qnn_backend_asin(self):
         sample_input = (torch.rand(3, 4) * 2 - 1,)
@@ -385,11 +351,6 @@ def test_qnn_backend_atan2(self):
                         index += 1
                         self.lower_module_and_test_output(module, sample_input)
 
-    def test_qnn_backend_atanh(self):
-        module = Atanh()  # noqa: F405
-        sample_input = (torch.tensor([-0.9, -0.5, -0.1, 0.1, 0.5, 0.9]).reshape(2, 3),)
-        self.lower_module_and_test_output(module, sample_input)
-
     def test_qnn_backend_avg_pool1d(self):
         module = AvgPool1D()  # noqa: F405
         sample_input = (torch.randn(1, 512, 7),)
@@ -652,11 +613,6 @@ def test_qnn_backend_cos(self):
         sample_input = (torch.randn(2, 5, 1, 3),)
         self.lower_module_and_test_output(module, sample_input)
 
-    def test_qnn_backend_cosh(self):
-        module = Cosh()  # noqa: F405
-        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
-        self.lower_module_and_test_output(module, sample_input)
-
     def test_qnn_backend_cumsum(self):
         sample_input = ()
         test_comb = [
@@ -2191,11 +2147,6 @@ def test_qnn_backend_sin(self):
         sample_input = (torch.randn(2, 5, 1, 3),)
         self.lower_module_and_test_output(module, sample_input)
 
-    def test_qnn_backend_sinh(self):
-        module = Sinh()  # noqa: F405
-        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
-        self.lower_module_and_test_output(module, sample_input)
-
     def test_qnn_backend_select_copy(self):
         module = SelectCopy()  # noqa: F405
         sample_input = (torch.randn([1, 3, 3, 3]),)
@@ -2974,12 +2925,6 @@ def test_qnn_backend_acos(self):
                         module = self.get_qdq_module(module, sample_input)
                         self.lower_module_and_test_output(module, sample_input)
 
-    def test_qnn_backend_acosh(self):
-        module = Acosh()  # noqa: F405
-        sample_input = (torch.tensor([1.0, 1.5, 2.0, 3.0, 5.0, 10.0]).reshape(2, 3),)
-        module = self.get_qdq_module(module, sample_input)
-        self.lower_module_and_test_output(module, sample_input)
-
     def test_qnn_backend_adaptive_avg_pool1d(self):
         module = AdaptiveAvgPool1D()  # noqa: F405
         sample_input = (torch.randn(1, 512, 7),)
@@ -3024,31 +2969,6 @@ def test_qnn_backend_adaptive_max_pool2d(self):
                 module_one = self.get_qdq_module(module, sample_input)
                 self.lower_module_and_test_output(module_one, sample_input)
 
-    def test_qnn_backend_addmm(self):
-        test_comb = [
-            {
-                QCOM_MODULE: [AddMM()],  # noqa: F405
-                QCOM_SAMPLE_INPUTS: [
-                    (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)),
-                ],
-            },
-            {
-                QCOM_MODULE: [AddMM(alpha=2, beta=3)],  # noqa: F405
-                QCOM_SAMPLE_INPUTS: [
-                    (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)),
-                ],
-            },
-        ]
-
-        index = 0
-        for comb in test_comb:
-            for module in comb[QCOM_MODULE]:
-                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
-                    with self.subTest(i=index):
-                        index += 1
-                        qdq_module = self.get_qdq_module(module, sample_input)
-                        self.lower_module_and_test_output(qdq_module, sample_input)
-
     def test_qnn_backend_alias(self):
         module = Alias()  # noqa: F405
         sample_input = (torch.randn(1, 10),)
@@ -3173,12 +3093,6 @@ def test_qnn_backend_asin(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
-    def test_qnn_backend_asinh(self):
-        module = Asinh()  # noqa: F405
-        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
-        module = self.get_qdq_module(module, sample_input)
-        self.lower_module_and_test_output(module, sample_input)
-
     def test_qnn_backend_atan(self):
         sample_input = (torch.randn(3, 4),)
         module = Atan()  # noqa: F405
@@ -3218,12 +3132,6 @@ def test_qnn_backend_atan2(self):
                         qdq_module = self.get_qdq_module(module, sample_input)
                         self.lower_module_and_test_output(qdq_module, sample_input)
 
-    def test_qnn_backend_atanh(self):
-        module = Atanh()  # noqa: F405
-        sample_input = (torch.tensor([-0.9, -0.5, -0.1, 0.1, 0.5, 0.9]).reshape(2, 3),)
-        module = self.get_qdq_module(module, sample_input)
-        self.lower_module_and_test_output(module, sample_input)
-
     def test_qnn_backend_avg_pool1d(self):
         module = AvgPool1D()  # noqa: F405
         sample_input = (torch.randn(1, 512, 7),)
@@ -3570,12 +3478,6 @@ def test_qnn_backend_cos(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
-    def test_qnn_backend_cosh(self):
-        module = Cosh()  # noqa: F405
-        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
-        module = self.get_qdq_module(module, sample_input)
-        self.lower_module_and_test_output(module, sample_input)
-
     def test_qnn_backend_cumsum(self):
         module = CumSum()  # noqa: F405
         sample_input = (torch.randn(4),)
@@ -5360,12 +5262,6 @@ def test_qnn_backend_sin(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
-    def test_qnn_backend_sinh(self):
-        module = Sinh()  # noqa: F405
-        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
-        module = self.get_qdq_module(module, sample_input)
-        self.lower_module_and_test_output(module, sample_input)
-
     def test_qnn_backend_slice_copy(self):
         modules = [
             SliceCopyDefaultParameter(),  # noqa: F405
@@ -6222,10 +6118,6 @@ def test_qnn_backend_dump_intermediate_outputs_topk(self):
         )
 
     def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
-        if self.direct_build_folder:
-            self.skipTest(
-                "Direct mode does not support per-tensor dumping (HTP/LPAI backends)."
-            )
         backend_options = generate_htp_compiler_spec(use_fp16=True)
         TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset_table[TestQNN.soc_model],
@@ -6956,38 +6848,20 @@ def output_callback(log_msg):
         )
 
     def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
-        # TODO: LPAI direct mode support per-tensor dumping.
-        if self.direct_build_folder:
-            self.skipTest(
-                "Direct mode does not support per-tensor dumping (HTP/LPAI backends)."
-            )
-        match get_backend_type(self.backend):
-            case QnnExecuTorchBackendType.kHtpBackend:
-                backend_options = generate_htp_compiler_spec(use_fp16=False)
-                expected_compared_events = 14
-            case QnnExecuTorchBackendType.kLpaiBackend:
-                backend_options = generate_lpai_compiler_spec(
-                    target_env=self.get_lpai_target_env()
-                )
-                # I/O q/dq nodes fall back to CPU via FoldQDQ LPAI workaround
-                # and are excluded from QNN etdump; update after first LPAI run
-                expected_compared_events = 17
-            case _:
-                raise ValueError("Backend is not implemented yet")
+        backend_options = generate_htp_compiler_spec(use_fp16=False)
         TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset_table[TestQNN.soc_model],
             backend_options=backend_options,
             dump_intermediate_outputs=True,
         )
         module = SimpleModel()  # noqa: F405
-        torch.manual_seed(8)
         sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
-        qdq_module = self.get_qdq_module(module, sample_input)
+        module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(
-            qdq_module,
+            module,
             sample_input,
             expected_partitions=1,
-            expected_compared_events=expected_compared_events,
+            expected_compared_events=14,
         )
 
     def test_qnn_backend_dump_intermediate_outputs_topk(self):
@@ -8199,6 +8073,7 @@ def test_static_llm_model(self):  # noqa: C901
             "1024",
             "--max_context_len",
             "1024",
+            "--skip_user_prompt_calibration",
         ]
 
         match self.static_llm_eval_method:
@@ -8248,17 +8123,10 @@ def test_static_llm_model(self):  # noqa: C901
                     ]
                 )
             case _:
+                cmds.remove("--skip_user_prompt_calibration")
                 logging.warning(
                     "No llm eval method chosen. Only generate model output."
                 )
-                cmds.extend(
-                    [
-                        "--calib_tasks",
-                        "wikitext",
-                        "--calib_limit",
-                        "1",
-                    ]
-                )
 
         if is_llama_model:
             cmds.extend(
@@ -8431,10 +8299,6 @@ def test_codegen2_1b(self):
             "128",
             "--max_context_len",
             "128",
-            "--calib_tasks",
-            "wikitext",
-            "--calib_limit",
-            "1",
         ]
         self.add_default_cmds(cmds)
 
@@ -8496,10 +8360,6 @@ def test_llama_stories_260k(self):
             "128",
             "--max_context_len",
             "128",
-            "--calib_tasks",
-            "wikitext",
-            "--calib_limit",
-            "1",
         ]
         self.add_default_cmds(cmds)
 
@@ -8563,10 +8423,6 @@ def test_llama_stories_110m(self):
             "128",
             "--max_context_len",
             "128",
-            "--calib_tasks",
-            "wikitext",
-            "--calib_limit",
-            "1",
         ]
         if self.use_fp16:
             cmds.append("--use_fp16")
@@ -8720,7 +8576,7 @@ class VLMSpecs(MLLMSpecs):
     def setUp(self):
         self.alm_specs = {
             "granite_speech_3_3-2b": TestExampleMultimodalityScript.ALMSpecs(
-                max_seq_len=1024,
+                max_seq_len=512,
                 sm8650_token_rate=5,
                 sm8750_token_rate=8,
                 encoder_pte_size=900_000_000,  # 900MB
@@ -8732,7 +8588,7 @@ def setUp(self):
         }
         self.vlm_specs = {
             "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs(
-                max_seq_len=1024,
+                max_seq_len=128,
                 sm8650_token_rate=50,
                 sm8750_token_rate=55,
                 encoder_pte_size=110_000_000,  # 110MB
@@ -8742,7 +8598,7 @@ def setUp(self):
                 golden_image_feature="city",
             ),
             "internvl3_1b": TestExampleMultimodalityScript.VLMSpecs(
-                max_seq_len=1024,
+                max_seq_len=320,
                 sm8650_token_rate=11,
                 sm8750_token_rate=13,
                 encoder_pte_size=425_000_000,  # 425MB
@@ -8794,8 +8650,6 @@ def test_static_asr(self):
             "kv",
             "--max_seq_len",
             f"{alm_specs.max_seq_len}",
-            "--calib_samples",
-            "./examples/qualcomm/oss_scripts/llama/assets/samples/audio.json",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
@@ -8879,8 +8733,6 @@ def test_static_vlm(self):
             "kv",
             "--max_seq_len",
             f"{vlm_specs.max_seq_len}",
-            "--calib_samples",
-            "./examples/qualcomm/oss_scripts/llama/assets/samples/vision.json",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
diff --git a/backends/transforms/postpone_permute_below_squeeze_view.py b/backends/transforms/postpone_permute_below_squeeze_view.py
index e0e9a3ec198..f676e19fb65 100644
--- a/backends/transforms/postpone_permute_below_squeeze_view.py
+++ b/backends/transforms/postpone_permute_below_squeeze_view.py
@@ -1,12 +1,12 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
 
+import copy
 from typing import cast, List
 
 import torch
@@ -108,7 +108,7 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool:
                 # view_node_shape is almost same as permute_node_shape
                 # except it has one more dim somewhere
                 # and the extra dim has value of 1.
-                new_view_shape = list(pred_shape)
+                new_view_shape = copy.deepcopy(pred_shape)
                 new_view_shape.insert(index, 1)
                 new_permute_dims = [x + 1 if x >= index else x for x in permute_dims]
                 new_permute_dims.insert(index, index)
@@ -132,7 +132,7 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool:
                 # and the extra dim has value of 1.
                 # Convert permute_dims to list of ints
                 index_to_remove = permute_dims[index]
-                new_view_shape = list(pred_shape)
+                new_view_shape = copy.deepcopy(pred_shape)
                 del new_view_shape[index_to_remove]
                 new_permute_dims = [
                     x - 1 if x > index_to_remove else x for x in permute_dims
diff --git a/backends/transforms/test/test_permute_optimization_passes.py b/backends/transforms/test/test_permute_optimization_passes.py
index 550446da562..dd356aad8a2 100644
--- a/backends/transforms/test/test_permute_optimization_passes.py
+++ b/backends/transforms/test/test_permute_optimization_passes.py
@@ -1,6 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -26,8 +25,6 @@
 from executorch.backends.transforms.replace_nop_transpose_or_permute_with_view import (
     ReplaceNopTransposeOrPermuteWithViewPass,
 )
-
-from executorch.exir import EdgeCompileConfig, to_edge
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import PassResult
 from torch.utils import _pytree as pytree
@@ -480,38 +477,6 @@ def test_permute4_view3_chains(self) -> None:
             "PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView",
         )
 
-    def test_postpone_permute_with_symbolic_shapes(self) -> None:
-        class DynamicPermuteViewModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                y = x.view(x.shape[0], 12, 64)
-                y = y.permute(1, 0, 2)
-                y = y.view(1, 12, x.shape[0], 64)
-                return y.permute(0, 1, 3, 2)
-
-        exported_program = torch.export.export(
-            DynamicPermuteViewModule(),
-            (torch.randn(3, 1, 768),),
-            dynamic_shapes={"x": {0: torch.export.Dim("batch", min=1, max=8)}},
-        )
-        edge_program = to_edge(
-            exported_program,
-            compile_config=EdgeCompileConfig(_check_ir_validity=False),
-        )
-        graph_module = edge_program.exported_program().graph_module
-
-        result = cast(
-            PassResult,
-            PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView().call(graph_module),
-        )
-
-        self.assertTrue(result.modified)
-        self.assertEqual(
-            count_node(result.graph_module, exir_ops.edge.aten.view_copy.default), 2
-        )
-        self.assertEqual(
-            count_node(result.graph_module, exir_ops.edge.aten.permute_copy.default), 2
-        )
-
     def test_negative_not_squeeze_like(self) -> None:
         """View that reshapes (not just squeeze/unsqueeze) should NOT be reordered."""
         builder = GraphBuilder()
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
index 44fbc4bc8f6..05bdd9431c8 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
@@ -9,36 +9,10 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
-//
-// Resize
-//
-
-// resize_args = { block_config_ref } (unused here)
-//
-// Elementwise binary with broadcasting: output = broadcast(in_a, in_b). Without
-// this the DynamicDispatchNode freezes the output at the build-time upper
-// bound. Mirrors the fp32 resize_binary_op_node (same arg-group layout: inputs
-// are args[1].refs[0] and [1]).
-void resize_q8ta_binary_node(
-    ComputeGraph* graph,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  (void)resize_args;
-  const ValueRef out = args.at(0).refs.at(0);
-  const ValueRef in_a = args.at(1).refs.at(0);
-  const ValueRef in_b = args.at(1).refs.at(1);
-
-  const std::vector<int64_t> a_sizes = graph->sizes_of(in_a);
-  const std::vector<int64_t> b_sizes = graph->sizes_of(in_b);
-  graph->virtual_resize(
-      out, calculate_broadcasted_output_size(a_sizes, b_sizes));
-}
-
 //
 // Dispatch nodes
 //
@@ -137,7 +111,7 @@ void add_q8ta_binary_node(
       // Resize args
       {block_config_ref},
       // Resizing Logic
-      resize_q8ta_binary_node));
+      nullptr));
 }
 
 //
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
index b9f17021ea0..f6e89bef03d 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
@@ -13,7 +13,6 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
@@ -219,51 +218,6 @@ ValueRef prepack_quantized_conv2d_weight(
   return packed_weight;
 }
 
-//
-// Resize
-//
-
-// resize_args = { input, kernel_size, stride, padding, dilation }
-//
-// The q8ta_conv2d output is statically allocated at the build-time upper-bound
-// shape. Without this resize function the DynamicDispatchNode would never
-// virtual_resize the output on trigger_resize(), so a dynamic-shape graph would
-// freeze the conv output at its upper bound — feeding e.g. a 238-row input into
-// a 241-row buffer leaves garbage rows that GroupNorm's global statistics then
-// smear across the whole tensor. Recompute H/W from the current input (N and C
-// are shape-independent and stay as currently allocated).
-void resize_q8ta_conv2d_node(
-    ComputeGraph* graph,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  const ValueRef out = args.at(0).refs.at(0);
-  const ValueRef in = resize_args.at(0);
-  const ValueRef kernel_size = resize_args.at(1);
-  const ValueRef stride = resize_args.at(2);
-  const ValueRef padding = resize_args.at(3);
-  const ValueRef dilation = resize_args.at(4);
-
-  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
-
-  // H/W from the current input via the shared conv-output helper. kernel dims
-  // come from the kernel_size IntList (kernel_size_only=true); the args[3] slot
-  // is consulted only as an optional ceil_mode and dilation (non-bool) resolves
-  // it to false. transposed=false.
-  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
-      *graph,
-      in_sizes,
-      kernel_size,
-      /*kernel_size_only=*/true,
-      {stride, padding, dilation, dilation},
-      /*transposed=*/false);
-
-  std::vector<int64_t> new_sizes = graph->sizes_of(out);
-  const size_t ndim = new_sizes.size();
-  new_sizes.at(ndim - 2) = out_hw.at(0);
-  new_sizes.at(ndim - 1) = out_hw.at(1);
-  graph->virtual_resize(out, new_sizes);
-}
-
 //
 // Dispatch nodes
 //
@@ -373,10 +327,8 @@ void add_q8ta_conv2d_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args: { input, kernel_size, stride, padding, dilation }
-      {packed_int8_input, kernel_size, stride, padding, dilation},
-      // Resize function: propagate dynamic H/W to the output.
-      resize_q8ta_conv2d_node));
+      // Resize args
+      {}));
 }
 
 //
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h
index 5d16cb3b78c..f463589c50a 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h
@@ -123,12 +123,7 @@ void add_q8ta_conv2d_pw_node(
     const ValueRef packed_bias,
     const uint32_t activation_type,
     const ValueRef packed_int8_output,
-    const int32_t groups = 1,
-    const ValueRef conv_input = kDummyValueRef,
-    const ValueRef kernel_size = kDummyValueRef,
-    const ValueRef stride = kDummyValueRef,
-    const ValueRef padding = kDummyValueRef,
-    const ValueRef dilation = kDummyValueRef);
+    const int32_t groups = 1);
 
 std::vector<int64_t> calculate_q8ta_im2col_sizes(
     ComputeGraph* graph,
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp
index 914ca1a23ef..e690ff435a8 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp
@@ -12,7 +12,6 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
@@ -173,45 +172,6 @@ ValueRef prepack_quantized_conv2d_dw_weight(
   return packed_weight;
 }
 
-//
-// Resize
-//
-
-// resize_args = { input, kernel_size, stride, padding, dilation }
-//
-// Depthwise conv output H/W follows the same formula as a regular conv (channel
-// count is unchanged: groups == in_channels == out_channels). Without this the
-// DynamicDispatchNode freezes the output at the build-time upper bound. N/C are
-// shape-independent and stay as currently allocated. Mirrors the regular q8ta
-// conv resize (resize_q8ta_conv2d_node).
-void resize_q8ta_conv2d_dw_node(
-    ComputeGraph* graph,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  const ValueRef out = args.at(0).refs.at(0);
-  const ValueRef in = resize_args.at(0);
-  const ValueRef kernel_size = resize_args.at(1);
-  const ValueRef stride = resize_args.at(2);
-  const ValueRef padding = resize_args.at(3);
-  const ValueRef dilation = resize_args.at(4);
-
-  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
-
-  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
-      *graph,
-      in_sizes,
-      kernel_size,
-      /*kernel_size_only=*/true,
-      {stride, padding, dilation, dilation},
-      /*transposed=*/false);
-
-  std::vector<int64_t> new_sizes = graph->sizes_of(out);
-  const size_t ndim = new_sizes.size();
-  new_sizes.at(ndim - 2) = out_hw.at(0);
-  new_sizes.at(ndim - 1) = out_hw.at(1);
-  graph->virtual_resize(out, new_sizes);
-}
-
 //
 // Dispatch nodes
 //
@@ -298,10 +258,10 @@ void add_conv2d_dw_q8ta_q8csw_q8to_4w4c_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args: { input, kernel_size, stride, padding, dilation }
-      {packed_int8_input, kernel_size, stride, padding, dilation},
+      // Resize args
+      {},
       // Resizing Logic
-      resize_q8ta_conv2d_dw_node));
+      nullptr));
 }
 
 void add_q8ta_conv2d_dw_node(
@@ -403,10 +363,8 @@ void add_q8ta_conv2d_dw_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args: { input, kernel_size, stride, padding, dilation }
-      {packed_int8_input, kernel_size, stride, padding, dilation},
-      // Resizing Logic
-      resize_q8ta_conv2d_dw_node));
+      // Resize args
+      {}));
 }
 
 //
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp
index 9aa6e7b05d1..b43fe9eacc6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp
@@ -13,7 +13,6 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
@@ -96,59 +95,6 @@ std::vector<int64_t> calculate_q8ta_im2col_sizes(
   return {K, H, W};
 }
 
-//
-// Resize
-//
-
-// resize_args = { input, kernel_size, stride, padding, dilation, groups }
-//
-// The im2col scratch tensor is [K, H_out, align_up_4(W_out)] where K (the
-// flattened conv window, channel/kernel-derived) is shape-independent and
-// H_out/W_out are the conv output spatial dims. The downstream PW GEMM that
-// consumes this scratch is resized separately (it preserves H/W). Without this,
-// the scratch freezes at the build-time upper bound and feeds garbage rows into
-// the GEMM. Recompute H_out/W_out from the CURRENT input (NOT the conv output
-// tensor, which may itself still be frozen at this point in the resize order).
-void resize_q8ta_im2col_node(
-    ComputeGraph* graph,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  const ValueRef im2col_out = args.at(0).refs.at(0);
-  const ValueRef in = resize_args.at(0);
-  const ValueRef kernel_size = resize_args.at(1);
-  const ValueRef stride = resize_args.at(2);
-  const ValueRef padding = resize_args.at(3);
-  const ValueRef dilation = resize_args.at(4);
-  const ValueRef groups = resize_args.at(5);
-
-  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
-
-  // Conv output H/W from the current input.
-  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
-      *graph,
-      in_sizes,
-      kernel_size,
-      /*kernel_size_only=*/true,
-      {stride, padding, dilation, dilation},
-      /*transposed=*/false);
-  const int64_t out_height = out_hw.at(0);
-  const int64_t out_width = out_hw.at(1);
-
-  // K (flattened conv window) is shape-independent — recompute from channels +
-  // kernel exactly as calculate_q8ta_im2col_sizes does.
-  const int64_t in_channels = utils::val_at(-3, in_sizes);
-  const int64_t groups_val = graph->extract_scalar<int64_t>(groups);
-  const int64_t in_channels_per_group = in_channels / groups_val;
-  const auto kernel_size_list = graph->get_int_list(kernel_size);
-  const int64_t flattened_kernel_len = utils::align_up_4(
-      in_channels_per_group * kernel_size_list->at(0) *
-      kernel_size_list->at(1));
-  const int64_t K = flattened_kernel_len * groups_val;
-  const int64_t W = utils::align_up_4(out_width);
-
-  graph->virtual_resize(im2col_out, {K, out_height, W});
-}
-
 //
 // Dispatch nodes
 //
@@ -222,11 +168,10 @@ void add_q8ta_im2col_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args: { input, kernel_size, stride, padding, dilation, groups }
-      {packed_int8_input, kernel_size, stride, padding, dilation, groups},
-      // Resizing Logic: recompute the im2col scratch dims from the current
-      // input
-      resize_q8ta_im2col_node));
+      // Resize args
+      {},
+      // Resizing Logic
+      nullptr));
 }
 
 //
@@ -327,14 +272,7 @@ void q8ta_conv2d_im2col(
       packed_bias,
       activation_type_val,
       packed_int8_output,
-      groups_val,
-      // Original activation + conv geometry so the PW output H/W is recomputed
-      // from the true conv result, not the width-padded im2col scratch.
-      packed_int8_input,
-      kernel_size,
-      stride,
-      padding,
-      dilation);
+      groups_val);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp
index 4fb7f0fa775..7a2380f728a 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp
@@ -11,7 +11,6 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
@@ -182,69 +181,6 @@ ValueRef prepack_quantized_conv2d_pw_weight(
   return packed_weight;
 }
 
-//
-// Resize
-//
-
-// resize_args = { input }
-//
-// Standalone 1x1 pointwise conv: stride 1, padding 0, dilation 1, so the output
-// H/W equals the input activation H/W. Without this resize the output would
-// freeze at the build-time upper bound. N/C are shape-independent and stay as
-// currently allocated.
-void resize_q8ta_conv2d_pw_node(
-    ComputeGraph* graph,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  const ValueRef out = args.at(0).refs.at(0);
-  const ValueRef in = resize_args.at(0);
-
-  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
-  std::vector<int64_t> new_sizes = graph->sizes_of(out);
-  const size_t out_ndim = new_sizes.size();
-  const size_t in_ndim = in_sizes.size();
-  // Copy H (dim -2) and W (dim -1) from the input; keep output N/C.
-  new_sizes.at(out_ndim - 2) = in_sizes.at(in_ndim - 2);
-  new_sizes.at(out_ndim - 1) = in_sizes.at(in_ndim - 1);
-  graph->virtual_resize(out, new_sizes);
-}
-
-// resize_args = { conv_input, kernel_size, stride, padding, dilation }
-//
-// im2col-path PW conv. Here the PW node's bound input is the im2col scratch
-// tensor sized {K, H_out, align_up_4(W_out)} — its width is rounded up to a
-// multiple of 4 for texel alignment, so it must NOT be used to size the output.
-// Recompute the TRUE conv H_out/W_out from the ORIGINAL activation + conv
-// geometry, exactly as resize_q8ta_conv2d_node does. N/C are shape-independent
-// and stay as currently allocated.
-void resize_q8ta_conv2d_pw_im2col_node(
-    ComputeGraph* graph,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  const ValueRef out = args.at(0).refs.at(0);
-  const ValueRef conv_input = resize_args.at(0);
-  const ValueRef kernel_size = resize_args.at(1);
-  const ValueRef stride = resize_args.at(2);
-  const ValueRef padding = resize_args.at(3);
-  const ValueRef dilation = resize_args.at(4);
-
-  const std::vector<int64_t> in_sizes = graph->sizes_of(conv_input);
-
-  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
-      *graph,
-      in_sizes,
-      kernel_size,
-      /*kernel_size_only=*/true,
-      {stride, padding, dilation, dilation},
-      /*transposed=*/false);
-
-  std::vector<int64_t> new_sizes = graph->sizes_of(out);
-  const size_t ndim = new_sizes.size();
-  new_sizes.at(ndim - 2) = out_hw.at(0);
-  new_sizes.at(ndim - 1) = out_hw.at(1);
-  graph->virtual_resize(out, new_sizes);
-}
-
 //
 // Dispatch nodes
 //
@@ -263,12 +199,7 @@ void add_q8ta_conv2d_pw_node(
     const ValueRef packed_bias,
     const uint32_t activation_type,
     const ValueRef packed_int8_output,
-    const int32_t groups,
-    const ValueRef conv_input,
-    const ValueRef kernel_size,
-    const ValueRef stride,
-    const ValueRef padding,
-    const ValueRef dilation) {
+    const int32_t groups) {
   VK_CHECK_COND(q8ta_conv2d_check_4w4c_packed_dim_info(
       graph.packed_dim_info_of(packed_int8_input)));
   VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info(
@@ -320,21 +251,6 @@ void add_q8ta_conv2d_pw_node(
       graph.hashed_layout_of(packed_int8_input),
   };
 
-  // The im2col path passes the original activation + conv geometry so the
-  // output H/W can be recomputed from the true conv result (the bound input is
-  // the width-padded im2col scratch and must not size the output). The
-  // standalone 1x1 PW conv passes only its real activation input, whose H/W the
-  // output matches directly.
-  std::vector<ValueRef> resize_args;
-  ExecuteNode::ResizeFunction resize_fn;
-  if (conv_input == kDummyValueRef) {
-    resize_args = {packed_int8_input};
-    resize_fn = resize_q8ta_conv2d_pw_node;
-  } else {
-    resize_args = {conv_input, kernel_size, stride, padding, dilation};
-    resize_fn = resize_q8ta_conv2d_pw_im2col_node;
-  }
-
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
@@ -350,8 +266,7 @@ void add_q8ta_conv2d_pw_node(
       param_buffers,
       push_constants,
       spec_constants,
-      resize_args,
-      resize_fn));
+      {}));
 }
 
 //
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp
index 7e3c4166e3c..bdbdaa14fec 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp
@@ -13,50 +13,10 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
-// resize_args = { input, kernel_size, stride, padding, dilation, output_padding
-// }
-//
-// Transposed conv output H/W uses the transposed formula
-//   out = (in - 1) * stride - 2 * padding + dilation * (kernel - 1)
-//         + output_padding + 1
-// (computed by calc_out_sizes_hw's transposed=true path, where the 4th args
-// slot is output_padding). Channels stay as allocated. Without this the
-// DynamicDispatchNode freezes the output at the build-time upper bound. Mirrors
-// the fp32 transposed path of resize_conv2d_node.
-void resize_q8ta_conv2d_transposed_node(
-    ComputeGraph* graph,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  const ValueRef out = args.at(0).refs.at(0);
-  const ValueRef in = resize_args.at(0);
-  const ValueRef kernel_size = resize_args.at(1);
-  const ValueRef stride = resize_args.at(2);
-  const ValueRef padding = resize_args.at(3);
-  const ValueRef dilation = resize_args.at(4);
-  const ValueRef output_padding = resize_args.at(5);
-
-  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
-
-  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
-      *graph,
-      in_sizes,
-      kernel_size,
-      /*kernel_size_only=*/true,
-      {stride, padding, dilation, output_padding},
-      /*transposed=*/true);
-
-  std::vector<int64_t> new_sizes = graph->sizes_of(out);
-  const size_t ndim = new_sizes.size();
-  new_sizes.at(ndim - 2) = out_hw.at(0);
-  new_sizes.at(ndim - 1) = out_hw.at(1);
-  graph->virtual_resize(out, new_sizes);
-}
-
 // Dedicated workgroup size functions for transposed convolution.
 // Unlike regular conv2d, transposed conv with stride > 1 causes branch
 // divergence along the height dimension (different rows have different
@@ -123,7 +83,6 @@ void add_q8ta_conv2d_transposed_node(
     const ValueRef stride,
     const ValueRef padding,
     const ValueRef dilation,
-    const ValueRef output_padding,
     const ValueRef groups,
     const uint32_t activation_type,
     const ValueRef packed_int8_output) {
@@ -216,16 +175,8 @@ void add_q8ta_conv2d_transposed_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args: { input, kernel_size, stride, padding, dilation,
-      // output_padding }
-      {packed_int8_input,
-       kernel_size,
-       stride,
-       padding,
-       dilation,
-       output_padding},
-      // Resizing Logic
-      resize_q8ta_conv2d_transposed_node));
+      // Resize args
+      {}));
 }
 
 void q8ta_conv2d_transposed(
@@ -244,9 +195,7 @@ void q8ta_conv2d_transposed(
   const ValueRef kernel_size = args.at(idx++);
   const ValueRef stride = args.at(idx++);
   const ValueRef padding = args.at(idx++);
-  // output_padding does not affect the shader, but it IS needed to compute the
-  // transposed-conv output H/W on resize (dynamic shapes).
-  const ValueRef output_padding = args.at(idx++);
+  args.at(idx++); // output_padding: only affects output size, not shader
   const ValueRef dilation = args.at(idx++);
   const ValueRef groups = args.at(idx++);
   const ValueRef activation = args.at(idx++);
@@ -306,7 +255,6 @@ void q8ta_conv2d_transposed(
       stride,
       padding,
       dilation,
-      output_padding,
       groups,
       activation_type_val,
       packed_int8_output);
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp
index 92daf9d8ac5..210bd0cd78b 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp
@@ -63,34 +63,6 @@ utils::uvec3 q8ta_linear_local_wg_size(
       graph, shader, global_workgroup_size, args, resize_args);
 }
 
-//
-// Resize
-//
-
-// resize_args = {}
-//
-// Quantized linear/matmul: output = [*input.shape[:-1], out_features]. The
-// leading/M dims follow the input; out_features (the last dim) is
-// weight-derived and shape-independent, so it stays as currently allocated.
-// Without this the DynamicDispatchNode freezes the output (incl. the M dim) at
-// the build-time upper bound. Mirrors the fp32 resize_linear_qw_node shape
-// logic, generalized to arbitrary input rank.
-void resize_q8ta_linear_node(
-    ComputeGraph* graph,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  (void)resize_args;
-  const ValueRef out = args.at(0).refs.at(0);
-  const ValueRef in = args.at(1).refs.at(0);
-
-  std::vector<int64_t> new_sizes = graph->sizes_of(in);
-  const std::vector<int64_t> out_sizes = graph->sizes_of(out);
-  // Keep out_features (last dim, weight-derived); take all leading dims from
-  // in.
-  new_sizes.at(new_sizes.size() - 1) = out_sizes.at(out_sizes.size() - 1);
-  graph->virtual_resize(out, new_sizes);
-}
-
 //
 // Dispatch node
 //
@@ -163,7 +135,7 @@ void add_q8ta_linear_node(
       // Resize args
       {},
       // Resizing Logic
-      resize_q8ta_linear_node));
+      nullptr));
 }
 
 //
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp
index fb0ffcab14c..bca36444725 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp
@@ -13,21 +13,6 @@
 
 namespace vkcompute {
 
-// quantize / dequantize are elementwise: output shape == input shape. Without a
-// resize function the DynamicDispatchNode freezes the output at the build-time
-// upper bound, so on a dynamic-shape graph (e.g. a 238-row input fed to a
-// 241-allocated graph) the FIRST quantize_per_tensor freezes everything
-// downstream at 241. Propagate the input's current sizes to the output.
-void resize_q8ta_qdq_node(
-    ComputeGraph* graph,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  (void)resize_args;
-  const ValueRef out = args.at(0).refs.at(0);
-  const ValueRef in = args.at(1).refs.at(0);
-  graph->virtual_resize(out, graph->sizes_of(in));
-}
-
 void add_q8ta_quantize_node(
     ComputeGraph& graph,
     const ValueRef fp_input,
@@ -95,9 +80,7 @@ void add_q8ta_quantize_node(
        inp_block_config.as_packed_int(),
        outp_block_config.as_packed_int()},
       // Resize args
-      {block_config_ref},
-      // Resize function: output shape == input shape (elementwise).
-      resize_q8ta_qdq_node));
+      {block_config_ref}));
 }
 
 void add_q8ta_dequantize_node(
@@ -167,9 +150,7 @@ void add_q8ta_dequantize_node(
        outp_block_config.as_packed_int(),
        inp_block_config.as_packed_int()},
       // Resize args
-      {block_config_ref},
-      // Resize function: output shape == input shape (elementwise).
-      resize_q8ta_qdq_node));
+      {block_config_ref}));
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
index 8709e4bdc2c..f7454b6b93a 100644
--- a/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
+++ b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
@@ -53,13 +53,7 @@ static std::string pick_conv2d_dw_shader_with_selector(
   if (is_3x3) {
     kernel_name += "_output_tile_3x3";
     if (impl_selector == "b1x1") {
-      // The _b1x1 batch-tile variant exists only for the non-sned family;
-      // sned (stride != dilation) shaders are not batch-tiled. Match
-      // pick_conv2d_dw_shader and only append it when stride == dilation,
-      // otherwise fall back to the un-suffixed sned shader.
-      if (stride_equals_dilation) {
-        kernel_name += "_b1x1";
-      }
+      kernel_name += "_b1x1";
     } else if (impl_selector == "b4x2") {
       // b4x2 is the default (no suffix)
     } else {
diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp
index a282ebfb0ff..12d4ed61b76 100644
--- a/backends/vulkan/test/custom_ops/utils.cpp
+++ b/backends/vulkan/test/custom_ops/utils.cpp
@@ -1366,10 +1366,6 @@ ComputeGraph setup_compute_graph(
     int op_invocations_per_execute) {
   GraphConfig config;
   config.enable_querypool = true;
-  // Default-on (opt-out via TestCase::set_force_resize(false)): force every
-  // DynamicDispatchNode to run its resize function on each execute(),
-  // exercising the op's resize formula even when input shapes are unchanged.
-  config.force_resize = test_case.get_force_resize();
   ComputeGraph graph(config);
 
   std::vector<ValueRef> input_values;
diff --git a/backends/vulkan/test/custom_ops/utils.h b/backends/vulkan/test/custom_ops/utils.h
index 81bad5e9df0..d8fc36a5142 100644
--- a/backends/vulkan/test/custom_ops/utils.h
+++ b/backends/vulkan/test/custom_ops/utils.h
@@ -603,22 +603,6 @@ class TestCase {
     return target_execute_time_us_;
   }
 
-  // When true, the ComputeGraph built for this test case sets
-  // GraphConfig::force_resize, so every DynamicDispatchNode runs its resize
-  // function on each execute() even when no input shape changed. Because the
-  // output is already allocated at the swept shape, the resize must recompute
-  // the same shape from the current input — a wrong resize formula resizes the
-  // output to a mismatched shape and surfaces as a test failure. Default true
-  // (opt-out): every custom_ops test exercises its resize formulas across the
-  // swept shapes. Call set_force_resize(false) for the rare op whose resize fn
-  // is intentionally not shape-preserving under a fixed output allocation.
-  void set_force_resize(bool force_resize) {
-    force_resize_ = force_resize;
-  }
-  bool get_force_resize() const {
-    return force_resize_;
-  }
-
   void add_input_spec(const ValueSpec& spec) {
     inputs_.push_back(spec);
   }
@@ -664,7 +648,6 @@ class TestCase {
     shader_filter_ = kDefaultShaderFilter;
     op_invocations_per_execute_ = 0;
     target_execute_time_us_ = kDefaultTargetExecuteTimeUs;
-    force_resize_ = true;
   }
 
  private:
@@ -677,7 +660,6 @@ class TestCase {
   std::vector<std::string> shader_filter_;
   int op_invocations_per_execute_ = 0; // 0 = adaptive
   int target_execute_time_us_ = kDefaultTargetExecuteTimeUs;
-  bool force_resize_ = true;
 };
 
 //
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index f7cd85f9758..957862935a4 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -38,10 +38,6 @@ set(WEBGPU_SRCS
     runtime/ops/sdpa/Sdpa.cpp
     runtime/ops/select_as_symint/SelectAsSymint.cpp
     runtime/ops/quantized_linear/QuantizedLinear.cpp
-    runtime/ops/mul/BinaryOp.cpp
-    runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp
-    runtime/ops/rope/RotaryEmbedding.cpp
-    runtime/ops/prepack/Prepack.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
@@ -142,6 +138,7 @@ endfunction()
 
 if(EXECUTORCH_BUILD_WEBGPU_TEST)
   add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp)
+  add_webgpu_native_test(webgpu_rms_norm_test test/native/test_rms_norm.cpp)
   add_webgpu_native_test(
     webgpu_dispatch_order_test test/native/test_dispatch_order.cpp
   )
@@ -151,38 +148,4 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
   add_webgpu_native_test(
     webgpu_update_cache_test test/native/test_update_cache.cpp
   )
-
-  # Manifest-driven op-test framework: a generic gtest driver (webgpu_op_test) +
-  # its device-free util unit test. GTest needs -DEXECUTORCH_BUILD_TESTS=ON.
-  if(NOT TARGET GTest::gtest)
-    find_package(GTest QUIET)
-  endif()
-  if(TARGET GTest::gtest)
-    # Reuse add_webgpu_native_test for the backend link + frameworks + flags;
-    # add only driver_util, GTest, and the header-only nlohmann/json include.
-    add_webgpu_native_test(webgpu_op_test test/op_tests/op_test_driver.cpp)
-    target_sources(webgpu_op_test PRIVATE test/op_tests/driver_util.cpp)
-    target_link_libraries(webgpu_op_test PRIVATE GTest::gtest)
-    target_include_directories(
-      webgpu_op_test
-      PRIVATE "${EXECUTORCH_ROOT}/third-party/json/single_include"
-    )
-
-    # Device-free util unit test: no backend/Dawn link (pure manifest/tolerance
-    # helpers), so it does NOT use the native-test helper.
-    add_executable(
-      webgpu_op_test_util_test test/op_tests/test_driver_util.cpp
-                               test/op_tests/driver_util.cpp
-    )
-    target_include_directories(
-      webgpu_op_test_util_test
-      PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
-              "${EXECUTORCH_ROOT}/third-party/json/single_include"
-    )
-    target_link_libraries(
-      webgpu_op_test_util_test PRIVATE GTest::gtest GTest::gtest_main
-    )
-    target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions)
-    set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17)
-  endif()
 endif()
diff --git a/backends/webgpu/runtime/WebGPUBackend.cpp b/backends/webgpu/runtime/WebGPUBackend.cpp
index ceca89d1710..aed769da4a4 100644
--- a/backends/webgpu/runtime/WebGPUBackend.cpp
+++ b/backends/webgpu/runtime/WebGPUBackend.cpp
@@ -98,21 +98,20 @@ Error WebGPUBackend::execute(
   const size_t num_outputs = graph->output_ids().size();
 
   // Copy inputs from EValue tensors to GPU buffers
-  std::vector<InputData> inputs;
+  std::vector<std::pair<const void*, size_t>> inputs;
   inputs.reserve(num_inputs);
   for (size_t i = 0; i < num_inputs; i++) {
     const auto& tensor = args[i]->toTensor();
-    const bool host_is_int64 =
-        tensor.scalar_type() == executorch::aten::ScalarType::Long;
-    inputs.push_back({tensor.const_data_ptr(), tensor.nbytes(), host_is_int64});
+    inputs.emplace_back(tensor.const_data_ptr(), tensor.nbytes());
   }
+  graph->copy_inputs(inputs);
+
   // Fail loud as a runtime Error so a throw never crosses the backend boundary.
   try {
-    graph->copy_inputs(inputs);
     graph->update_symints_from_inputs(inputs);
     graph->propagate_resize();
   } catch (const std::exception& e) {
-    ET_LOG(Error, "WebGPU input copy / symint refresh failed: %s", e.what());
+    ET_LOG(Error, "WebGPU symint refresh/resize failed: %s", e.what());
     return Error::Internal;
   }
 
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index b7fb4313400..1c977d130dd 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -26,10 +26,6 @@ namespace executorch::backends::webgpu {
 
 namespace {
 
-// Op name the AOT exporter emits for a prepacked constant (must match the
-// serialized schema); compared in the prepack pre-scan below.
-constexpr const char* kPrepackOpName = "et_vk.prepack.default";
-
 size_t vk_datatype_size(vkgraph::VkDataType dtype) {
   switch (dtype) {
     case vkgraph::VkDataType::BOOL:
@@ -49,19 +45,6 @@ size_t vk_datatype_size(vkgraph::VkDataType dtype) {
   }
 }
 
-bool vk_datatype_is_int(vkgraph::VkDataType dtype) {
-  switch (dtype) {
-    case vkgraph::VkDataType::BOOL:
-    case vkgraph::VkDataType::UINT8:
-    case vkgraph::VkDataType::INT8:
-    case vkgraph::VkDataType::INT32:
-    case vkgraph::VkDataType::INT64:
-      return true;
-    default:
-      return false;
-  }
-}
-
 } // namespace
 
 WebGPUGraph::WebGPUGraph() = default;
@@ -78,7 +61,7 @@ WGPUBuffer WebGPUGraph::create_scratch_buffer(size_t nbytes) {
 }
 
 void WebGPUGraph::update_symints_from_inputs(
-    const std::vector<InputData>& inputs) {
+    const std::vector<std::pair<const void*, size_t>>& inputs) {
   for (const auto& src : symint_sources_) {
     int pos = -1;
     for (size_t i = 0; i < input_ids_.size(); i++) {
@@ -117,8 +100,8 @@ void WebGPUGraph::update_symints_from_inputs(
     // Reads the [0,..,index,..,0] element; symint sources are scalar-ish.
     const int64_t offset = static_cast<int64_t>(index) * stride;
     // elem_size back-derived from build-time numel (sources are static-shaped).
-    const void* host = inputs[pos].data;
-    const size_t elem_size = inputs[pos].nbytes / static_cast<size_t>(numel);
+    const void* host = inputs[pos].first;
+    const size_t elem_size = inputs[pos].second / static_cast<size_t>(numel);
     int32_t val;
     if (elem_size == sizeof(int64_t)) {
       val = static_cast<int32_t>(static_cast<const int64_t*>(host)[offset]);
@@ -234,10 +217,6 @@ void WebGPUGraph::build(
 
   const auto* graph = vkgraph::GetVkGraph(flatbuffer_data);
 
-  // .pte byte sources for prepack-time constant materialization (build-only).
-  constant_data_ = constant_data;
-  named_data_map_ = named_data_map;
-
   // Phase 1: Create all values
   const auto* values = graph->values();
   const int num_vals = values ? values->size() : 0;
@@ -247,42 +226,6 @@ void WebGPUGraph::build(
   ints_.resize(num_vals, 0);
   doubles_.resize(num_vals, 0.0);
   bools_.resize(num_vals, false);
-  value_lists_.resize(num_vals);
-
-  // Pre-scan the op chain: a constant may be DEFERRED (no eager GPU buffer; the
-  // prepack node materializes it once) only if it is a prepack source AND never
-  // a direct arg of a non-prepack op. ValueList args are expanded so a constant
-  // reached through a list still counts as a direct use.
-  std::unordered_set<int> prepack_src_ids;
-  std::unordered_set<int> direct_use_ids;
-  const auto* chain_prescan = graph->chain();
-  if (chain_prescan) {
-    for (unsigned ci = 0; ci < chain_prescan->size(); ci++) {
-      const auto* oc = chain_prescan->Get(ci);
-      const bool is_prepack = oc->name()->str() == kPrepackOpName;
-      const auto* a = oc->args();
-      if (!a) {
-        continue;
-      }
-      for (unsigned j = 0; j < a->size(); j++) {
-        int id = static_cast<int>(a->Get(j));
-        if (is_prepack && j == 0) {
-          prepack_src_ids.insert(id);
-        } else if (!is_prepack) {
-          direct_use_ids.insert(id);
-          const auto* v = values ? values->Get(id) : nullptr;
-          if (v && v->value_type() == vkgraph::GraphTypes::ValueList) {
-            const auto* items = v->value_as_ValueList()->items();
-            if (items) {
-              for (unsigned k = 0; k < items->size(); k++) {
-                direct_use_ids.insert(static_cast<int>(items->Get(k)));
-              }
-            }
-          }
-        }
-      }
-    }
-  }
 
   for (int i = 0; i < num_vals; i++) {
     const auto* val = values->Get(i);
@@ -305,57 +248,56 @@ void WebGPUGraph::build(
             numel *= dims->Get(j);
           }
         }
-        tensor.elem_size = vk_datatype_size(vk_tensor->datatype());
-        tensor.is_int = vk_datatype_is_int(vk_tensor->datatype());
-        tensor.nbytes = numel * tensor.elem_size;
+        tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype());
 
         int constant_id = vk_tensor->constant_id();
         int mem_obj_id = vk_tensor->mem_obj_id();
 
-        // Constants are dedicated. Every constant is recorded as a
-        // ConstantSource and materialized via materialize_constant (one
-        // CPU->GPU write); a constant consumed ONLY via prepack is deferred
-        // (no eager buffer -- its prepack node performs that one write).
+        // Constants (any mem_obj_id) and unshared tensors get dedicated buffers
         if (constant_id >= 0 || mem_obj_id < 0) {
           tensor_mem_obj_ids_[i] = -1;
-
-          if (constant_id >= 0) {
+          WGPUBufferDescriptor buf_desc = {};
+          buf_desc.size = std::max(tensor.nbytes, size_t(4));
+          buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+              WGPUBufferUsage_CopySrc;
+          buf_desc.mappedAtCreation = false;
+          tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
+
+          if (constant_id >= 0 && constant_data && tensor.nbytes > 0) {
             const auto* constants = graph->constants();
-            if (!constants ||
-                constant_id >= static_cast<int>(constants->size())) {
-              throw std::runtime_error(
-                  "WebGPU: constant_id set but the constants table is missing "
-                  "or the id is out of range");
-            }
-            const auto* vk_bytes = constants->Get(constant_id);
-            ConstantSource cs;
-            cs.nbytes = tensor.nbytes;
-            if (vk_bytes->offset() != UINT64_MAX) {
-              cs.inline_offset = vk_bytes->offset();
-            } else if (vk_bytes->named_key() != nullptr) {
-              cs.named_key = vk_bytes->named_key()->str();
-            } else {
-              throw std::runtime_error(
-                  "WebGPU: constant has no inline offset and no named-data key");
-            }
-            constant_sources_[i] = std::move(cs);
-          }
-
-          // Defer constants consumed solely via prepack: skip the eager buffer.
-          const bool defer = constant_id >= 0 &&
-              prepack_src_ids.count(i) != 0 && direct_use_ids.count(i) == 0;
-          if (!defer) {
-            WGPUBufferDescriptor buf_desc = {};
-            buf_desc.size = std::max(tensor.nbytes, size_t(4));
-            buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
-                WGPUBufferUsage_CopySrc;
-            buf_desc.mappedAtCreation = false;
-            tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
-
-            // Same single CPU->GPU write the prepack node uses (no
-            // duplication).
-            if (constant_id >= 0) {
-              materialize_constant(i, tensor.buffer);
+            if (constants &&
+                constant_id < static_cast<int>(constants->size())) {
+              const auto* vk_bytes = constants->Get(constant_id);
+              if (vk_bytes->offset() != UINT64_MAX) {
+                const uint8_t* src = constant_data + vk_bytes->offset();
+                wgpuQueueWriteBuffer(
+                    queue_, tensor.buffer, 0, src, tensor.nbytes);
+              } else if (
+                  vk_bytes->named_key() != nullptr &&
+                  named_data_map != nullptr) {
+                // Constant stored in the PTE named-data map.
+                auto buf =
+                    named_data_map->get_data(vk_bytes->named_key()->c_str());
+                if (!buf.ok()) {
+                  throw std::runtime_error(
+                      std::string("WebGPU: named constant '") +
+                      vk_bytes->named_key()->c_str() +
+                      "' not found in NamedDataMap");
+                }
+                if (buf->size() < tensor.nbytes) {
+                  throw std::runtime_error(
+                      std::string("WebGPU: named constant '") +
+                      vk_bytes->named_key()->c_str() + "' undersized: have " +
+                      std::to_string(buf->size()) + " bytes, need " +
+                      std::to_string(tensor.nbytes));
+                }
+                wgpuQueueWriteBuffer(
+                    queue_, tensor.buffer, 0, buf->data(), tensor.nbytes);
+                buf->Free();
+              } else {
+                throw std::runtime_error(
+                    "WebGPU: constant has no inline offset and no named-data key");
+              }
             }
           }
         } else {
@@ -406,16 +348,6 @@ void WebGPUGraph::build(
         add_uniform_buffer_bytes(kSymIntUniformBytes);
         break;
       }
-      case vkgraph::GraphTypes::ValueList: {
-        value_types_[i] = ValueType::ValueList;
-        const auto* items = val->value_as_ValueList()->items();
-        if (items) {
-          for (unsigned j = 0; j < items->size(); j++) {
-            value_lists_[i].push_back(static_cast<int>(items->Get(j)));
-          }
-        }
-        break;
-      }
       default:
         value_types_[i] = ValueType::Null;
         break;
@@ -492,47 +424,6 @@ void WebGPUGraph::build(
       webgpu_operator_registry().get_op_fn(op_name)(*this, args);
     }
   }
-
-  // Prepack nodes (Phase 3) materialized their constants directly into the
-  // consumer buffers via materialize_constant; no separate copy pass needed.
-  // The .pte bytes are freed right after build() returns (WebGPUBackend
-  // processed->Free()), so clear the build-only source pointers.
-  constant_data_ = nullptr;
-  named_data_map_ = nullptr;
-}
-
-void WebGPUGraph::materialize_constant(int const_value_id, WGPUBuffer dst) {
-  auto it = constant_sources_.find(const_value_id);
-  if (it == constant_sources_.end()) {
-    throw std::runtime_error(
-        "WebGPU: no source recorded for constant id " +
-        std::to_string(const_value_id));
-  }
-  const ConstantSource& cs = it->second;
-  if (cs.nbytes == 0) {
-    return;
-  }
-  if (cs.inline_offset != UINT64_MAX) {
-    if (constant_data_ == nullptr) {
-      throw std::runtime_error("WebGPU: inline constant data is null");
-    }
-    wgpuQueueWriteBuffer(
-        queue_, dst, 0, constant_data_ + cs.inline_offset, cs.nbytes);
-  } else if (!cs.named_key.empty() && named_data_map_ != nullptr) {
-    auto buf = named_data_map_->get_data(cs.named_key.c_str());
-    if (!buf.ok()) {
-      throw std::runtime_error(
-          "WebGPU: named constant '" + cs.named_key + "' not found");
-    }
-    if (buf->size() < cs.nbytes) {
-      throw std::runtime_error(
-          "WebGPU: named constant '" + cs.named_key + "' undersized");
-    }
-    wgpuQueueWriteBuffer(queue_, dst, 0, buf->data(), cs.nbytes);
-    buf->Free();
-  } else {
-    throw std::runtime_error("WebGPU: constant has no source");
-  }
 }
 
 WGPUShaderModule WebGPUGraph::get_or_create_shader(
@@ -593,47 +484,16 @@ WGPUBindGroupLayout WebGPUGraph::get_or_create_bgl(
   return bgl;
 }
 
-void WebGPUGraph::copy_inputs(const std::vector<InputData>& inputs) {
+void WebGPUGraph::copy_inputs(
+    const std::vector<std::pair<const void*, size_t>>& inputs) {
   for (size_t i = 0; i < inputs.size() && i < input_ids_.size(); i++) {
-    const InputData& in = inputs[i];
-    if (in.nbytes == 0) {
+    if (inputs[i].second == 0) {
       continue;
     }
     int tid = input_ids_[i];
     const auto& tensor = tensors_[tid];
-
-    // Fast path: host and GPU element types match byte-for-byte.
-    if (in.nbytes == tensor.nbytes) {
-      wgpuQueueWriteBuffer(queue_, tensor.buffer, 0, in.data, tensor.nbytes);
-      continue;
-    }
-
-    // Narrow int64 host indices into the int32 buffer (mirrors Vulkan).
-    const bool buffer_is_int32 = tensor.is_int && tensor.elem_size == 4;
-    if (in.host_is_int64 && buffer_is_int32 && in.nbytes == tensor.nbytes * 2) {
-      const size_t numel = tensor.nbytes / 4;
-      const int64_t* src = static_cast<const int64_t*>(in.data);
-      std::vector<int32_t> narrowed(numel);
-      for (size_t e = 0; e < numel; e++) {
-#ifndef NDEBUG
-        // Index tensors (tokens/positions) are far below int32 range in
-        // practice; assert in debug that the narrowing is lossless.
-        if (static_cast<int32_t>(src[e]) != src[e]) {
-          throw std::runtime_error("WebGPU: int64 index overflows int32");
-        }
-#endif
-        narrowed[e] = static_cast<int32_t>(src[e]);
-      }
-      wgpuQueueWriteBuffer(
-          queue_, tensor.buffer, 0, narrowed.data(), tensor.nbytes);
-      continue;
-    }
-
-    throw std::runtime_error(
-        "WebGPU: unsupported input copy for input " + std::to_string(i) +
-        " (host " + std::to_string(in.nbytes) + " bytes" +
-        (in.host_is_int64 ? " int64" : "") + " vs buffer " +
-        std::to_string(tensor.nbytes) + " bytes)");
+    wgpuQueueWriteBuffer(
+        queue_, tensor.buffer, 0, inputs[i].first, inputs[i].second);
   }
 }
 
@@ -855,11 +715,10 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const {
   for (size_t i = 0; i < value_types_.size(); i++) {
     if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
       stats.num_tensors++;
-      // Shared tensors are tracked via shared_buffer_sizes_; a deferred
-      // prepack-routed constant has no buffer (no GPU memory) -> not counted.
+      // Shared tensors are tracked via shared_buffer_sizes_
       bool is_shared =
           i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0;
-      if (!is_shared && tensors_[i].buffer != nullptr) {
+      if (!is_shared) {
         stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes;
       }
     }
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index 3572f751a06..3cff09ecb6d 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -25,16 +25,6 @@ struct WebGPUTensor {
   WGPUBuffer buffer = nullptr;
   std::vector<int64_t> dims;
   size_t nbytes = 0;
-  // Serialized (GPU-side) element type, used to narrow wider host inputs.
-  size_t elem_size = 0;
-  bool is_int = false;
-};
-
-// Host-side view of one graph input, passed to copy_inputs.
-struct InputData {
-  const void* data = nullptr;
-  size_t nbytes = 0;
-  bool host_is_int64 = false;
 };
 
 struct WebGPUDispatch {
@@ -50,15 +40,6 @@ struct OutputCopy {
   size_t nbytes = 0;
 };
 
-// CPU-side record for a prepack-routed constant; mirrors Vulkan's TensorRef
-// (sizes + a data reference, not a live GPU tensor). The prepack node is the
-// sole materialization, so the constant needs no eager GPU buffer.
-struct ConstantSource {
-  uint64_t inline_offset = UINT64_MAX; // offset into constant_data_; else key
-  std::string named_key; // non-empty => fetch from named_data_map_
-  size_t nbytes = 0;
-};
-
 struct ExecuteConfig {
   size_t chunk_size = 0;
   size_t initial_chunk_size = 0;
@@ -94,7 +75,7 @@ class WebGPUGraph {
       const executorch::runtime::NamedDataMap* named_data_map = nullptr);
 
   // Copy input tensor data from host pointers into GPU buffers.
-  void copy_inputs(const std::vector<InputData>& inputs);
+  void copy_inputs(const std::vector<std::pair<const void*, size_t>>& inputs);
 
   // Execute all recorded dispatches.
   void execute();
@@ -128,10 +109,6 @@ class WebGPUGraph {
   bool get_bool(int id) const {
     return bools_[id];
   }
-  // Member value ids of a serialized ValueList (op multi-output list).
-  const std::vector<int>& get_value_list(int id) const {
-    return value_lists_[id];
-  }
 
   // Live-scalar (SymInt) API; mirrors the Vulkan SymInt/ParamsBuffer UBO.
   // set_symint writes the buffer + marks dirty only if the value changed.
@@ -161,7 +138,8 @@ class WebGPUGraph {
   }
 
   // Execute-time select_as_symint read; mirrors Vulkan select_as_symint_impl.
-  void update_symints_from_inputs(const std::vector<InputData>& inputs);
+  void update_symints_from_inputs(
+      const std::vector<std::pair<const void*, size_t>>& inputs);
 
   // Per-SymInt resize hook; mirrors Vulkan DynamicDispatchNode::trigger_resize.
   void add_resize_hook(int symint_id, std::function<void(WebGPUGraph&)> fn) {
@@ -189,11 +167,6 @@ class WebGPUGraph {
     dispatches_.push_back(dispatch);
   }
 
-  // Materialize a recorded prepack-routed constant into dst via one CPU->GPU
-  // transfer. Build-time only (the .pte bytes are freed after build()).
-  // Mirrors Vulkan prepack_standard.
-  void materialize_constant(int const_value_id, WGPUBuffer dst);
-
   void add_uniform_buffer_bytes(size_t bytes) {
     uniform_buffer_bytes_ += bytes;
   }
@@ -233,16 +206,7 @@ class WebGPUGraph {
     return static_cast<int>(value_types_.size());
   }
 
-  enum class ValueType {
-    Tensor,
-    Int,
-    Double,
-    Bool,
-    Null,
-    String,
-    SymInt,
-    ValueList
-  };
+  enum class ValueType { Tensor, Int, Double, Bool, Null, String, SymInt };
 
   ValueType get_value_type(int id) const {
     return value_types_[id];
@@ -260,7 +224,6 @@ class WebGPUGraph {
   std::vector<int64_t> ints_;
   std::vector<double> doubles_;
   std::vector<bool> bools_;
-  std::vector<std::vector<int>> value_lists_;
 
   // SymInt (live scalar): id -> {live Uniform buffer, current value}, sparse.
   struct SymIntSlot {
@@ -300,13 +263,6 @@ class WebGPUGraph {
 
   std::vector<WebGPUDispatch> dispatches_;
 
-  // Prepack-routed constant sources (offset/named-key + size); the prepack node
-  // materializes these once. constant_data_/named_data_map_ point at the .pte
-  // bytes and are valid only during build().
-  const uint8_t* constant_data_ = nullptr;
-  const executorch::runtime::NamedDataMap* named_data_map_ = nullptr;
-  std::unordered_map<int, ConstantSource> constant_sources_;
-
   ExecuteConfig execute_config_;
 
   // Caches for reusing GPU objects across dispatches.
diff --git a/backends/webgpu/runtime/WebGPUUtils.h b/backends/webgpu/runtime/WebGPUUtils.h
index 39eb3caa28b..690ea72ebf7 100644
--- a/backends/webgpu/runtime/WebGPUUtils.h
+++ b/backends/webgpu/runtime/WebGPUUtils.h
@@ -12,7 +12,6 @@
 
 #include <algorithm>
 #include <cstdint>
-#include <cstring>
 #include <stdexcept>
 #include <string>
 
@@ -49,25 +48,4 @@ inline uint32_t compute_1d_workgroup_count(
   return count;
 }
 
-// Create a uniform buffer mapped-at-creation, copy `size` bytes in, and unmap.
-inline WGPUBuffer
-make_uniform(WGPUDevice device, const void* data, size_t size) {
-  WGPUBufferDescriptor desc = {};
-  desc.size = size;
-  desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
-  desc.mappedAtCreation = true;
-  WGPUBuffer buf = wgpuDeviceCreateBuffer(device, &desc);
-  if (!buf) {
-    throw std::runtime_error("make_uniform: buffer creation failed");
-  }
-  void* ptr = wgpuBufferGetMappedRange(buf, 0, size);
-  if (!ptr) {
-    wgpuBufferRelease(buf);
-    throw std::runtime_error("make_uniform: mapped range is null");
-  }
-  std::memcpy(ptr, data, size);
-  wgpuBufferUnmap(buf);
-  return buf;
-}
-
 } // namespace executorch::backends::webgpu::utils
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
index 84b5349ef2d..28d4e8fef91 100644
--- a/backends/webgpu/scripts/test_webgpu_native_ci.sh
+++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -8,9 +8,8 @@
 # Build + run the WebGPU native test executables on Dawn (Tint) + SwiftShader.
 # This is the substantive op-coverage gate: unlike the python operators suite
 # (which only delegates add.Tensor to WebGPU, the rest CPU-fallback), these
-# executables run quantized_linear / SDPA / update_cache / multi-dispatch
-# ordering / scratch through the real WebGPU backend on Dawn. (Simple ops —
-# add / rms_norm / the misc ops — run through the cases.py op-test framework.)
+# executables run rms_norm / quantized_linear / update_cache / multi-dispatch
+# ordering / scratch through the real WebGPU backend on Dawn.
 #
 # Assumes the Dawn env is already sourced (Dawn_DIR + VK_ICD_FILENAMES +
 # LD_LIBRARY_PATH) via .ci/scripts/setup-webgpu-linux-deps.sh. For local runs:
@@ -18,9 +17,9 @@
 #   bash backends/webgpu/scripts/test_webgpu_native_ci.sh
 #
 # Builds whatever native test targets are present in the landed tree (NOT a fixed
-# list): webgpu_native_test (base) + webgpu_dispatch_order_test,
-# webgpu_scratch_buffer_test (D107576199) + webgpu_update_cache_test
-# (D107547307). SDPA executables join once they land.
+# list). This stack lands: webgpu_native_test, webgpu_rms_norm_test (base) +
+# webgpu_dispatch_order_test, webgpu_scratch_buffer_test (D107576199) +
+# webgpu_update_cache_test (D107547307). SDPA executables join once they land.
 
 set -e
 
@@ -38,31 +37,22 @@ fi
 cd "${EXECUTORCH_ROOT}"
 
 # ── Exports for the model-driven executables (best-effort) ───────────────────
-# native_test (quantized_linear/SDPA/update_cache) + dispatch_order read .pte/
-# golden inputs via env/dir and self-skip if absent; scratch is standalone.
-# native_test itself is gated below on the executorch wheel being importable.
+# native_test + rms_norm + dispatch_order read .pte/golden inputs via env/dir and
+# self-skip if absent; scratch is standalone (generates its own inputs).
+PTE_MODEL="/tmp/webgpu_add_test.pte"
+PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
+RMS_NORM_DIR="/tmp/rmsn"
+RMS_NORM_OK=1
 DISPATCH_ORDER_DIR="/tmp/dispatch_order"
 DISPATCH_ORDER_OK=1
 UPDATE_CACHE_DIR="/tmp/update_cache"
 UPDATE_CACHE_OK=1
-EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte"
-EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin"
-EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin"
-EMBEDDING_LLAMA1B_MODEL="/tmp/webgpu_embedding_q4gsw_llama1b.pte"
-EMBEDDING_LLAMA1B_INDICES="/tmp/webgpu_embedding_q4gsw_llama1b_indices.bin"
-EMBEDDING_LLAMA1B_GOLDEN="/tmp/webgpu_embedding_q4gsw_llama1b_golden.bin"
-ROPE_MODEL="/tmp/webgpu_rope.pte"
-ROPE_XQ_GOLDEN="/tmp/webgpu_rope_xq_golden.bin"
-ROPE_XK_GOLDEN="/tmp/webgpu_rope_xk_golden.bin"
-ROPE_DECODE_MODEL="/tmp/webgpu_rope_decode.pte"
-ROPE_DECODE_XQ_GOLDEN="/tmp/webgpu_rope_decode_xq_golden.bin"
-ROPE_DECODE_XK_GOLDEN="/tmp/webgpu_rope_decode_xk_golden.bin"
-PREPACK_MODEL="/tmp/webgpu_prepack.pte"
-PREPACK_GOLDEN="/tmp/webgpu_prepack_golden.bin"
-PREPACK2_MODEL="/tmp/webgpu_prepack_two_const.pte"
-PREPACK2_GOLDEN="/tmp/webgpu_prepack_two_const_golden.bin"
-PREPACK_TIED_MODEL="/tmp/webgpu_prepack_tied_const.pte"
-PREPACK_TIED_GOLDEN="/tmp/webgpu_prepack_tied_const_golden.bin"
+
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
+export_add_model('${PTE_MODEL}')
+export_chained_add_model('${PTE_CHAINED_MODEL}')
+" || echo "WARN: add export failed; webgpu_native_test self-skips models whose .pte is absent"
 
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import export_all_quantized_linear_models
@@ -70,23 +60,9 @@ export_all_quantized_linear_models('/tmp')
 " || echo "WARN: q4gsw export failed; required configs will FAIL in webgpu_native_test"
 
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.embedding_q4gsw.test_embedding_q4gsw import export_embedding_q4gsw_model
-export_embedding_q4gsw_model('${EMBEDDING_MODEL}', '${EMBEDDING_GOLDEN}', '${EMBEDDING_INDICES}')
-export_embedding_q4gsw_model('${EMBEDDING_LLAMA1B_MODEL}', '${EMBEDDING_LLAMA1B_GOLDEN}', '${EMBEDDING_LLAMA1B_INDICES}', 'llama1b')
-" || echo "WARN: embedding_q4gsw export failed; embedding configs will FAIL in webgpu_native_test"
-
-$PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.rope.test_rope import export_rope_model
-export_rope_model('${ROPE_MODEL}', '${ROPE_XQ_GOLDEN}', '${ROPE_XK_GOLDEN}')
-export_rope_model('${ROPE_DECODE_MODEL}', '${ROPE_DECODE_XQ_GOLDEN}', '${ROPE_DECODE_XK_GOLDEN}', 'decode')
-" || echo "WARN: rope export failed; apply_rotary_emb configs will FAIL in webgpu_native_test"
-
-$PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.prepack.test_prepack import export_prepack_model, export_prepack_two_const_model, export_prepack_tied_const_model
-export_prepack_model('${PREPACK_MODEL}', '${PREPACK_GOLDEN}')
-export_prepack_two_const_model('${PREPACK2_MODEL}', '${PREPACK2_GOLDEN}')
-export_prepack_tied_const_model('${PREPACK_TIED_MODEL}', '${PREPACK_TIED_GOLDEN}')
-" || echo "WARN: prepack export failed; prepack configs will FAIL in webgpu_native_test"
+from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
+export_rms_norm_cases('${RMS_NORM_DIR}')
+" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; }
 
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
@@ -136,7 +112,7 @@ cmake \
     "${EXECUTORCH_ROOT}"
 
 # ── Build + run every native test target that exists in this tree ────────────
-TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test)
+TARGETS=(webgpu_native_test webgpu_rms_norm_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test)
 BIN_DIR="${BUILD_DIR}/backends/webgpu"
 
 # Which targets are defined depends on which diffs are landed (native_test +
@@ -165,35 +141,20 @@ for t in "${TARGETS[@]}"; do
 done
 
 echo "=== Run native tests on Dawn + SwiftShader ==="
-# webgpu_native_test hosts the quantized_linear / SDPA / update_cache / symint
-# sweeps. Gate on the executorch wheel being importable (the proxy for "the
-# exports above ran"): CI has the wheel so they ran; a bare local run without it
-# skips here rather than hard-failing the required-config guards.
-if [[ -x "${BIN_DIR}/webgpu_native_test" ]] &&
-  "${PYTHON_EXECUTABLE}" -c "import executorch" 2>/dev/null; then
-  env WEBGPU_TEST_SDPA_DIR=/tmp/ \
+# native_test is model-driven; only run it if the export produced its .pte (CI
+# has the executorch wheel; a bare local run without it skips here rather than
+# failing on load). The chained model is passed only if its .pte also exists.
+if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then
+  env WEBGPU_TEST_MODEL="${PTE_MODEL}" \
+      WEBGPU_TEST_CHAINED_MODEL="$([[ -f "${PTE_CHAINED_MODEL}" ]] && echo "${PTE_CHAINED_MODEL}")" \
+      WEBGPU_TEST_SDPA_DIR=/tmp/ \
       WEBGPU_TEST_QUANTIZED_LINEAR_DIR=/tmp/ \
-      WEBGPU_TEST_EMBEDDING_Q4GSW_MODEL="${EMBEDDING_MODEL}" \
-      WEBGPU_TEST_EMBEDDING_Q4GSW_INDICES="${EMBEDDING_INDICES}" \
-      WEBGPU_TEST_EMBEDDING_Q4GSW_GOLDEN="${EMBEDDING_GOLDEN}" \
-      WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_MODEL="${EMBEDDING_LLAMA1B_MODEL}" \
-      WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_INDICES="${EMBEDDING_LLAMA1B_INDICES}" \
-      WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_GOLDEN="${EMBEDDING_LLAMA1B_GOLDEN}" \
-      WEBGPU_TEST_ROPE_MODEL="${ROPE_MODEL}" \
-      WEBGPU_TEST_ROPE_XQ_GOLDEN="${ROPE_XQ_GOLDEN}" \
-      WEBGPU_TEST_ROPE_XK_GOLDEN="${ROPE_XK_GOLDEN}" \
-      WEBGPU_TEST_ROPE_DECODE_MODEL="${ROPE_DECODE_MODEL}" \
-      WEBGPU_TEST_ROPE_DECODE_XQ_GOLDEN="${ROPE_DECODE_XQ_GOLDEN}" \
-      WEBGPU_TEST_ROPE_DECODE_XK_GOLDEN="${ROPE_DECODE_XK_GOLDEN}" \
-      WEBGPU_TEST_PREPACK_MODEL="${PREPACK_MODEL}" \
-      WEBGPU_TEST_PREPACK_GOLDEN="${PREPACK_GOLDEN}" \
-      WEBGPU_TEST_PREPACK2_MODEL="${PREPACK2_MODEL}" \
-      WEBGPU_TEST_PREPACK2_GOLDEN="${PREPACK2_GOLDEN}" \
-      WEBGPU_TEST_PREPACK_TIED_MODEL="${PREPACK_TIED_MODEL}" \
-      WEBGPU_TEST_PREPACK_TIED_GOLDEN="${PREPACK_TIED_GOLDEN}" \
       "${BIN_DIR}/webgpu_native_test"
 else
-  echo "(skipping webgpu_native_test: executorch wheel absent — exports did not run)"
+  echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)"
+fi
+if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then
+  "${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}"
 fi
 if [[ "${UPDATE_CACHE_OK}" == "1" && -x "${BIN_DIR}/webgpu_update_cache_test" ]]; then
   "${BIN_DIR}/webgpu_update_cache_test" "${UPDATE_CACHE_DIR}"
@@ -204,25 +165,3 @@ fi
 [[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"
 
 echo "=== WebGPU native tests on Dawn: all run targets passed ==="
-
-# ── Op-test codegen framework: generate manifest → build → run (Dawn+SwiftShader) ──
-# Reconfigure the SAME build dir adding GTest (EXECUTORCH_BUILD_TESTS=ON), then run
-# every op in cases.py against its torch golden. Self-skips if the generator can't run.
-OP_TEST_DIR="/tmp/webgpu_op_tests"
-if $PYTHON_EXECUTABLE -m executorch.backends.webgpu.test.op_tests.generate_op_tests \
-    --output "${OP_TEST_DIR}"; then
-  echo "=== Reconfigure with GTest + build/run op-test framework ==="
-  cmake -DEXECUTORCH_BUILD_TESTS=ON -B "${BUILD_DIR}" "${EXECUTORCH_ROOT}"
-  OP_DEFINED="$(cmake --build "${BUILD_DIR}" --target help 2>/dev/null || true)"
-  if printf '%s\n' "${OP_DEFINED}" | grep -qw webgpu_op_test_util_test; then
-    cmake --build "${BUILD_DIR}" --target webgpu_op_test_util_test -j"${NPROC}"
-    "${BIN_DIR}/webgpu_op_test_util_test"
-  fi
-  if printf '%s\n' "${OP_DEFINED}" | grep -qw webgpu_op_test; then
-    cmake --build "${BUILD_DIR}" --target webgpu_op_test -j"${NPROC}"
-    "${BIN_DIR}/webgpu_op_test" --manifest "${OP_TEST_DIR}/manifest.json"
-  fi
-  echo "=== WebGPU op-test framework on Dawn: passed ==="
-else
-  echo "WARN: op-test manifest generation failed (needs the executorch wheel); skipping"
-fi
diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh
index 5ea465e853b..6681499c055 100755
--- a/backends/webgpu/test/test_build_webgpu.sh
+++ b/backends/webgpu/test/test_build_webgpu.sh
@@ -26,18 +26,36 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/test_wgsl_codegen.py" -v
 
 echo "=== Step 1: Run Python export tests ==="
 $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v
-$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v
+# Non-fatal: a rms_norm pytest failure skips the rms_norm native test below
+# rather than aborting the whole run.
+RMS_NORM_PYTEST_OK=1
+$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v \
+    || RMS_NORM_PYTEST_OK=0
 
 # ── Step 2: Export .pte model ─────────────────────────────────────────────────
 
 echo "=== Step 2: Export test models ==="
+PTE_MODEL="/tmp/webgpu_add_test.pte"
+PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
+RMS_NORM_DIR="/tmp/rmsn"
 DISPATCH_ORDER_DIR="/tmp/dispatch_order"
 PTE_UPDATE_CACHE_MODEL="/tmp/webgpu_update_cache_test.pte"
 cd "${EXECUTORCH_ROOT}"
 $PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
+export_add_model('${PTE_MODEL}')
+export_chained_add_model('${PTE_CHAINED_MODEL}')
+"
+$PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
 export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
 "
+if [[ "${RMS_NORM_PYTEST_OK}" == "1" ]]; then
+  $PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
+export_rms_norm_cases('${RMS_NORM_DIR}')
+" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_PYTEST_OK=0; }
+fi
 
 echo "=== Export update_cache model ==="
 UPDATE_CACHE_OK=1
@@ -95,6 +113,7 @@ cmake \
     "${EXECUTORCH_ROOT}"
 
 cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC}
+cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_rms_norm_test -j${NPROC}
 cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_dispatch_order_test -j${NPROC}
 cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_scratch_buffer_test -j${NPROC}
 
@@ -106,10 +125,18 @@ else
   echo "(skipping update_cache native test: export did not complete)"
 fi
 env \
+    WEBGPU_TEST_MODEL="${PTE_MODEL}" \
+    WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
     ${UPDATE_CACHE_ENV_VAR} \
     WEBGPU_TEST_SDPA_DIR=/tmp/ \
     "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test"
 
+if [[ "${RMS_NORM_PYTEST_OK}" == "1" ]]; then
+  "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_rms_norm_test" "${RMS_NORM_DIR}"
+else
+  echo "(skipping rms_norm native test: pytest or export did not complete)"
+fi
+
 "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
 "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_scratch_buffer_test"
 
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index ad7ad2f2fc2..ef643d33482 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -24,6 +24,118 @@ using namespace executorch::backends::webgpu;
 using namespace executorch::extension;
 using namespace executorch::runtime;
 
+// Single-add .pte on 1024x1024 ramps; true iff out == a + b within 1e-3.
+static bool test_single_add(const std::string& model_path) {
+  printf("\n--- Test: single add (1024x1024) ---\n");
+  Module module(model_path);
+  auto err = module.load_forward();
+  if (err != Error::Ok) {
+    printf("FAIL: could not load forward method (error %d)\n", (int)err);
+    return false;
+  }
+  printf("Model loaded: %s\n", model_path.c_str());
+  constexpr int dim = 1024;
+  constexpr int size = dim * dim;
+  // Integer ramps: a, b and a + b stay below 2^24, so the fp32 sum is exact.
+  std::vector<float> a_data(size);
+  std::vector<float> b_data(size);
+  for (int i = 0; i < size; i++) {
+    a_data[i] = static_cast<float>(i) * 1.0f;
+    b_data[i] = static_cast<float>(i) * 2.0f;
+  }
+  auto a = make_tensor_ptr({dim, dim}, std::vector<float>(a_data));
+  auto b = make_tensor_ptr({dim, dim}, std::vector<float>(b_data));
+
+  auto result = module.forward({EValue(a), EValue(b)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+  const auto& out_tensor = outputs[0].toTensor();
+  if (out_tensor.numel() != size) { // guards the out_data reads below
+    printf("FAIL: output numel %zu != %d\n", (size_t)out_tensor.numel(), size);
+    return false;
+  }
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  // Check every element; !(e <= tol) also counts NaN, which std::max drops.
+  float max_error = 0.0f;
+  int bad = 0;
+  for (int i = 0; i < size; i++) {
+    float error = std::abs(out_data[i] - (a_data[i] + b_data[i]));
+    max_error = std::max(max_error, error);
+    bad += !(error <= 1e-3f);
+  }
+  printf("Max error: %e (checked %d elements)\n", max_error, size);
+  if (bad > 0) {
+    printf("FAIL: %d elements exceed tolerance 1e-3 (or are NaN)\n", bad);
+    return false;
+  }
+  printf("PASS: single add test\n");
+  return true;
+}
+
+// Chained-add .pte (5 adds) on 1024x1024 inputs; true iff out == 3x + 3y.
+static bool test_chained_add(const std::string& model_path) {
+  printf("\n--- Test: chained add (1024x1024, 5 ops) ---\n");
+  Module module(model_path);
+  auto err = module.load_forward();
+  if (err != Error::Ok) {
+    printf("FAIL: could not load forward method (error %d)\n", (int)err);
+    return false;
+  }
+  printf("Model loaded: %s\n", model_path.c_str());
+  constexpr int dim = 1024;
+  constexpr int size = dim * dim;
+  std::vector<float> x_data(size);
+  std::vector<float> y_data(size);
+  for (int i = 0; i < size; i++) {
+    x_data[i] = static_cast<float>(i % 100) * 0.01f;
+    y_data[i] = static_cast<float>(i % 50) * 0.02f;
+  }
+  auto x = make_tensor_ptr({dim, dim}, std::vector<float>(x_data));
+  auto y = make_tensor_ptr({dim, dim}, std::vector<float>(y_data));
+  auto result = module.forward({EValue(x), EValue(y)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+  const auto& out_tensor = outputs[0].toTensor();
+  if (out_tensor.numel() != size) { // guards the out_data reads below
+    printf("FAIL: output numel %zu != %d\n", (size_t)out_tensor.numel(), size);
+    return false;
+  }
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y
+  // !(e <= tol) also counts NaN, which std::max silently drops.
+  float max_error = 0.0f;
+  int bad = 0;
+  for (int i = 0; i < size; i++) {
+    float expected = 3.0f * x_data[i] + 3.0f * y_data[i];
+    float error = std::abs(out_data[i] - expected);
+    max_error = std::max(max_error, error);
+    bad += !(error <= 1e-3f);
+  }
+  printf("Max error: %e (checked %d elements)\n", max_error, size);
+  if (bad > 0) {
+    printf("FAIL: %d elements exceed tolerance 1e-3 (or are NaN)\n", bad);
+    return false;
+  }
+  printf("PASS: chained add test\n");
+  return true;
+}
+
 #ifdef WGPU_BACKEND_ENABLE_PROFILING
 // Capacity-overrun must throw; runs without a device or TimestampQuery.
 static bool test_query_pool_overrun_throws() {
@@ -295,112 +407,7 @@ static float q4gsw_ramp(int i) {
   return static_cast<float>((i % 17) - 8) / 16.0f;
 }
 
-// Fwd decl of the per-element abs-OR-rel tolerance helper (defined below).
-static bool quant_within_tol(
-    const float* out,
-    const float* golden,
-    int n,
-    float atol,
-    float rtol,
-    float* ma,
-    float* mr);
-
-static std::vector<int32_t> load_indices(
-    const std::string& path,
-    size_t numel) {
-  // Load raw little-endian int32 indices written by the export .py.
-  std::vector<int32_t> g(numel);
-  FILE* f = std::fopen(path.c_str(), "rb");
-  if (!f) {
-    return {};
-  }
-  size_t n = std::fread(g.data(), sizeof(int32_t), numel, f);
-  std::fclose(f);
-  if (n != numel) {
-    return {};
-  }
-  return g;
-}
-
-static bool test_embedding_q4gsw(
-    const std::string& model_path,
-    const std::string& indices_path,
-    const std::string& golden_path,
-    int num_indices,
-    int embed,
-    const char* label) {
-  // q4gsw embedding-gather vs torch golden; shapes per test_embedding_q4gsw.py.
-  const int out_numel = num_indices * embed;
-  printf(
-      "\n--- Test: embedding_q4gsw (%s: indices=%d, embed=%d) ---\n",
-      label,
-      num_indices,
-      embed);
-
-  Module module(model_path);
-  auto err = module.load_forward();
-  if (err != Error::Ok) {
-    printf("FAIL: could not load forward method (error %d)\n", (int)err);
-    return false;
-  }
-  printf("Model loaded: %s\n", model_path.c_str());
-
-  std::vector<int32_t> idx32 = load_indices(indices_path, num_indices);
-  std::vector<float> golden = load_golden(golden_path, out_numel);
-  if (idx32.empty() || golden.empty()) {
-    printf(
-        "FAIL: could not load indices %s / golden %s\n",
-        indices_path.c_str(),
-        golden_path.c_str());
-    return false;
-  }
-
-  // int64 at the program boundary; copy_inputs narrows to the int32 buffer.
-  std::vector<int64_t> idx64(idx32.begin(), idx32.end());
-  auto idx = make_tensor_ptr({num_indices}, std::move(idx64));
-
-  auto result = module.forward({EValue(idx)});
-  if (!result.ok()) {
-    printf("FAIL: forward failed (error %d)\n", (int)result.error());
-    return false;
-  }
-  const auto& outputs = result.get();
-  if (outputs.empty() || !outputs[0].isTensor()) {
-    printf("FAIL: no tensor output\n");
-    return false;
-  }
-  const auto& out_tensor = outputs[0].toTensor();
-  if (out_tensor.numel() != out_numel) {
-    printf(
-        "FAIL: output numel %zu != expected %d\n",
-        (size_t)out_tensor.numel(),
-        out_numel);
-    return false;
-  }
-  const float* out_data = out_tensor.const_data_ptr<float>();
-
-  float max_abs_err = 0.0f, max_rel_err = 0.0f;
-  const bool pass = quant_within_tol(
-      out_data,
-      golden.data(),
-      out_numel,
-      1e-3f,
-      1e-3f,
-      &max_abs_err,
-      &max_rel_err);
-  printf(
-      "Max abs error: %e   Max rel error: %e (checked %d elements)\n",
-      max_abs_err,
-      max_rel_err,
-      out_numel);
-  if (!pass) {
-    printf("FAIL: embedding_q4gsw exceeds tolerance 1e-3 (abs AND rel)\n");
-    return false;
-  }
-  printf("PASS: embedding_q4gsw test\n");
-  return true;
-}
-
+// Per-element dual tolerance (abs OR rel), parameterized like sdpa_within_tol.
 static bool quant_within_tol(
     const float* out,
     const float* golden,
@@ -425,185 +432,6 @@ static bool quant_within_tol(
   return ok;
 }
 
-static bool test_rope(
-    const std::string& model_path,
-    const std::string& xq_golden_path,
-    const std::string& xk_golden_path,
-    int S,
-    int NH,
-    int NKV,
-    int HD,
-    const char* label) {
-  // Llama interleaved RoPE vs torch goldens; shapes/ramps per test_rope.py.
-  const int xq_numel = S * NH * HD;
-  const int xk_numel = S * NKV * HD;
-  const int freqs_numel = S * (HD / 2);
-  printf(
-      "\n--- Test: apply_rotary_emb (%s: S=%d,NH=%d,NKV=%d,HD=%d) ---\n",
-      label,
-      S,
-      NH,
-      NKV,
-      HD);
-
-  Module module(model_path);
-  auto err = module.load_forward();
-  if (err != Error::Ok) {
-    printf("FAIL: could not load forward method (error %d)\n", (int)err);
-    return false;
-  }
-  printf("Model loaded: %s\n", model_path.c_str());
-
-  // ((i % mod) - off) / 16: exact in fp32, matches test_rope.py::_ramp.
-  auto ramp = [](int i, int mod, int off) {
-    return static_cast<float>((i % mod) - off) / 16.0f;
-  };
-  std::vector<float> xq(xq_numel), xk(xk_numel), fc(freqs_numel),
-      fs(freqs_numel);
-  for (int i = 0; i < xq_numel; i++) {
-    xq[i] = ramp(i, 17, 8);
-  }
-  for (int i = 0; i < xk_numel; i++) {
-    xk[i] = ramp(i, 13, 6);
-  }
-  for (int i = 0; i < freqs_numel; i++) {
-    fc[i] = ramp(i, 11, 5);
-    fs[i] = ramp(i, 7, 3);
-  }
-
-  auto xqt = make_tensor_ptr({1, S, NH, HD}, std::vector<float>(xq));
-  auto xkt = make_tensor_ptr({1, S, NKV, HD}, std::vector<float>(xk));
-  auto fct = make_tensor_ptr({S, HD / 2}, std::vector<float>(fc));
-  auto fst = make_tensor_ptr({S, HD / 2}, std::vector<float>(fs));
-
-  auto result =
-      module.forward({EValue(xqt), EValue(xkt), EValue(fct), EValue(fst)});
-  if (!result.ok()) {
-    printf("FAIL: forward failed (error %d)\n", (int)result.error());
-    return false;
-  }
-  const auto& outputs = result.get();
-
-  // Outputs in graph order [0]=xq_out, [1]=xk_out (positional; the numel check
-  // below guards a swap, since NH != NKV under GQA).
-  if (outputs.size() < 2 || !outputs[0].isTensor() || !outputs[1].isTensor()) {
-    printf("FAIL: expected 2 tensor outputs, got %zu\n", outputs.size());
-    return false;
-  }
-  const auto& xq_t = outputs[0].toTensor();
-  const auto& xk_t = outputs[1].toTensor();
-  if (xq_t.numel() != xq_numel || xk_t.numel() != xk_numel) {
-    printf(
-        "FAIL: output shapes [%zu,%zu] != expected [%d,%d]\n",
-        (size_t)xq_t.numel(),
-        (size_t)xk_t.numel(),
-        xq_numel,
-        xk_numel);
-    return false;
-  }
-  const float* xq_out = xq_t.const_data_ptr<float>();
-  const float* xk_out = xk_t.const_data_ptr<float>();
-
-  std::vector<float> gq = load_golden(xq_golden_path, xq_numel);
-  std::vector<float> gk = load_golden(xk_golden_path, xk_numel);
-  if (gq.empty() || gk.empty()) {
-    printf(
-        "FAIL: could not load goldens %s / %s\n",
-        xq_golden_path.c_str(),
-        xk_golden_path.c_str());
-    return false;
-  }
-
-  // Per-element abs-OR-rel on xq and xk (shared helper, defined above).
-  float maq = 0.0f, mrq = 0.0f, mak = 0.0f, mrk = 0.0f;
-  const bool pass_q =
-      quant_within_tol(xq_out, gq.data(), xq_numel, 1e-3f, 1e-3f, &maq, &mrq);
-  const bool pass_k =
-      quant_within_tol(xk_out, gk.data(), xk_numel, 1e-3f, 1e-3f, &mak, &mrk);
-  const float max_abs_err = std::max(maq, mak);
-  const float max_rel_err = std::max(mrq, mrk);
-
-  printf(
-      "Max abs error: %e   Max rel error: %e (checked %d elements)\n",
-      max_abs_err,
-      max_rel_err,
-      xq_numel + xk_numel);
-  if (!(pass_q && pass_k)) {
-    printf("FAIL: apply_rotary_emb exceeds tolerance 1e-3 (abs AND rel)\n");
-    return false;
-  }
-  printf("PASS: apply_rotary_emb test\n");
-  return true;
-}
-
-static bool test_prepack(
-    const std::string& model_path,
-    const std::string& golden_path,
-    const std::string& label = "x + const w") {
-  // et_vk.prepack copy vs golden; unrun copy leaves zeros. See test_prepack.py.
-  constexpr int n = 4;
-  constexpr int numel = n * n;
-  printf("\n--- Test: prepack (%s, %dx%d) ---\n", label.c_str(), n, n);
-
-  Module module(model_path);
-  auto err = module.load_forward();
-  if (err != Error::Ok) {
-    printf("FAIL: could not load forward method (error %d)\n", (int)err);
-    return false;
-  }
-  printf("Model loaded: %s\n", model_path.c_str());
-
-  std::vector<float> golden = load_golden(golden_path, numel);
-  if (golden.empty()) {
-    printf("FAIL: could not load golden %s\n", golden_path.c_str());
-    return false;
-  }
-
-  // ((i % 13) - 6) / 16: exact in fp32, matches test_prepack.py::_inputs.
-  std::vector<float> x_data(numel);
-  for (int i = 0; i < numel; i++) {
-    x_data[i] = static_cast<float>((i % 13) - 6) / 16.0f;
-  }
-  auto x = make_tensor_ptr({n, n}, std::vector<float>(x_data));
-
-  auto result = module.forward({EValue(x)});
-  if (!result.ok()) {
-    printf("FAIL: forward failed (error %d)\n", (int)result.error());
-    return false;
-  }
-  const auto& outputs = result.get();
-  if (outputs.empty() || !outputs[0].isTensor()) {
-    printf("FAIL: no tensor output\n");
-    return false;
-  }
-  const auto& out_tensor = outputs[0].toTensor();
-  if (out_tensor.numel() != numel) {
-    printf(
-        "FAIL: output numel %zu != expected %d\n",
-        (size_t)out_tensor.numel(),
-        numel);
-    return false;
-  }
-  const float* out_data = out_tensor.const_data_ptr<float>();
-
-  float max_abs_err = 0.0f, max_rel_err = 0.0f;
-  // Per-element abs-OR-rel (quant_within_tol): a global rel gate spuriously
-  // fails near-zero outputs where rel error explodes.
-  const bool within = quant_within_tol(
-      out_data, golden.data(), numel, 1e-3f, 1e-3f, &max_abs_err, &max_rel_err);
-  printf(
-      "Max abs error: %e   Max rel error: %e (checked %d elements)\n",
-      max_abs_err,
-      max_rel_err,
-      numel);
-  if (!within) {
-    printf("FAIL: prepack exceeds tolerance 1e-3\n");
-    return false;
-  }
-  printf("PASS: prepack test\n");
-  return true;
-}
-
 // Reconstruct _ramp_input bit-for-bit, run the op, compare to the fp64 golden.
 static bool test_q4gsw_config(
     const Q4gswConfig& cfg,
@@ -1612,6 +1440,19 @@ static bool test_resize_hook(const std::string& blob_path) {
 }
 
 int main(int argc, char** argv) {
+  std::string model_path = "webgpu_add_test.pte";
+  if (argc > 1) {
+    model_path = argv[1];
+  }
+  if (const char* env = std::getenv("WEBGPU_TEST_MODEL")) {
+    model_path = env;
+  }
+
+  std::string chained_model_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) {
+    chained_model_path = env;
+  }
+
   std::string update_cache_model_path;
   if (const char* env = std::getenv("WEBGPU_TEST_UPDATE_CACHE_MODEL")) {
     update_cache_model_path = env;
@@ -1626,86 +1467,6 @@ int main(int argc, char** argv) {
     }
   }
 
-  // embedding_q4gsw on-GPU configs: small + llama1b (env-gated,
-  // run-if-present).
-  struct EmbConfig {
-    const char* name;
-    const char* model_env;
-    const char* indices_env;
-    const char* golden_env;
-    int num_indices;
-    int embed;
-  };
-  const EmbConfig emb_configs[] = {
-      {"small",
-       "WEBGPU_TEST_EMBEDDING_Q4GSW_MODEL",
-       "WEBGPU_TEST_EMBEDDING_Q4GSW_INDICES",
-       "WEBGPU_TEST_EMBEDDING_Q4GSW_GOLDEN",
-       4,
-       64},
-      {"llama1b",
-       "WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_MODEL",
-       "WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_INDICES",
-       "WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_GOLDEN",
-       4,
-       2048},
-  };
-
-  // apply_rotary_emb on-GPU configs: multi + decode (env-gated,
-  // run-if-present).
-  struct RopeConfig {
-    const char* name;
-    const char* model_env;
-    const char* xq_env;
-    const char* xk_env;
-    int S;
-    int NH;
-    int NKV;
-    int HD;
-  };
-  const RopeConfig rope_configs[] = {
-      {"multi",
-       "WEBGPU_TEST_ROPE_MODEL",
-       "WEBGPU_TEST_ROPE_XQ_GOLDEN",
-       "WEBGPU_TEST_ROPE_XK_GOLDEN",
-       5,
-       8,
-       2,
-       64},
-      {"decode",
-       "WEBGPU_TEST_ROPE_DECODE_MODEL",
-       "WEBGPU_TEST_ROPE_DECODE_XQ_GOLDEN",
-       "WEBGPU_TEST_ROPE_DECODE_XK_GOLDEN",
-       1,
-       32,
-       8,
-       64},
-  };
-
-  std::string prepack_model_path, prepack_golden_path;
-  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_MODEL")) {
-    prepack_model_path = env;
-  }
-  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_GOLDEN")) {
-    prepack_golden_path = env;
-  }
-
-  std::string prepack2_model_path, prepack2_golden_path;
-  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_MODEL")) {
-    prepack2_model_path = env;
-  }
-  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_GOLDEN")) {
-    prepack2_golden_path = env;
-  }
-
-  std::string prepack_tied_model_path, prepack_tied_golden_path;
-  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_TIED_MODEL")) {
-    prepack_tied_model_path = env;
-  }
-  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_TIED_GOLDEN")) {
-    prepack_tied_golden_path = env;
-  }
-
   // SDPA sweep: configs self-discover their sdpa_<name>.pte/.golden.bin under
   // this directory (default "" = the embedded-file root / cwd). Set
   // WEBGPU_TEST_SDPA_DIR to point at the exported .pte directory (e.g. /tmp/).
@@ -1733,6 +1494,12 @@ int main(int argc, char** argv) {
   ok = test_query_pool_overrun_throws() && ok;
   ok = test_query_pool_roundtrip(ctx) && ok;
 #endif // WGPU_BACKEND_ENABLE_PROFILING
+  ok = test_single_add(model_path) && ok;
+
+  if (!chained_model_path.empty()) {
+    ok = test_chained_add(chained_model_path) && ok;
+  }
+
   if (!update_cache_model_path.empty()) {
     ok = test_update_cache(update_cache_model_path) && ok;
   }
@@ -1753,42 +1520,6 @@ int main(int argc, char** argv) {
     ok = false;
   }
 
-  for (const auto& c : emb_configs) {
-    const char* m = std::getenv(c.model_env);
-    const char* ip = std::getenv(c.indices_env);
-    const char* g = std::getenv(c.golden_env);
-    if (m && ip && g && *m && *ip && *g) {
-      ok = test_embedding_q4gsw(m, ip, g, c.num_indices, c.embed, c.name) && ok;
-    }
-  }
-
-  for (const auto& c : rope_configs) {
-    const char* m = std::getenv(c.model_env);
-    const char* xq = std::getenv(c.xq_env);
-    const char* xk = std::getenv(c.xk_env);
-    if (m && xq && xk && *m && *xq && *xk) {
-      ok = test_rope(m, xq, xk, c.S, c.NH, c.NKV, c.HD, c.name) && ok;
-    }
-  }
-
-  if (!prepack_model_path.empty() && !prepack_golden_path.empty()) {
-    ok = test_prepack(prepack_model_path, prepack_golden_path) && ok;
-  }
-
-  if (!prepack2_model_path.empty() && !prepack2_golden_path.empty()) {
-    ok = test_prepack(
-             prepack2_model_path, prepack2_golden_path, "x + w1 + w2") &&
-        ok;
-  }
-
-  if (!prepack_tied_model_path.empty() && !prepack_tied_golden_path.empty()) {
-    ok = test_prepack(
-             prepack_tied_model_path,
-             prepack_tied_golden_path,
-             "x + w + w (tied weights, shared key)") &&
-        ok;
-  }
-
   bool sdpa_ran = false;
   bool sdpa_ok = test_sdpa_sweep(sdpa_dir, &sdpa_ran);
   if (sdpa_ran) {
diff --git a/codegen/api/et_cpp.py b/codegen/api/et_cpp.py
index a144128368c..88f1eb83fe0 100644
--- a/codegen/api/et_cpp.py
+++ b/codegen/api/et_cpp.py
@@ -40,6 +40,7 @@
     tensorT,
 )
 
+
 if TYPE_CHECKING:
     from collections.abc import Sequence
 
@@ -277,7 +278,7 @@ def default_expr(d: str, t: Type) -> str:
 
     if isinstance(t, OptionalType):
         if d == "None":
-            return "std::nullopt"
+            return "torch::executor::nullopt"
 
         return default_expr(d, t.elem)
 
diff --git a/codegen/api/types/types.py b/codegen/api/types/types.py
index dd80daebb33..712d7e5e341 100644
--- a/codegen/api/types/types.py
+++ b/codegen/api/types/types.py
@@ -16,6 +16,7 @@
 )
 from torchgen.model import BaseTy
 
+
 halfT = BaseCppType("torch::executor", "Half")
 bfloat16T = BaseCppType("torch::executor", "BFloat16")
 stringT = BaseCppType("torch::executor", "string_view")
@@ -58,7 +59,7 @@ class OptionalCType(CType):
 
     def cpp_type(self, *, strip_ref: bool = False) -> str:
         # Do not pass `strip_ref` recursively.
-        return f"std::optional<{self.elem.cpp_type()}>"
+        return f"torch::executor::optional<{self.elem.cpp_type()}>"
 
     def remove_const_ref(self) -> CType:
         return OptionalCType(self.elem.remove_const_ref())
diff --git a/devtools/bundled_program/schema/README.md b/devtools/bundled_program/schema/README.md
index c161958f189..096ab10fb57 100644
--- a/devtools/bundled_program/schema/README.md
+++ b/devtools/bundled_program/schema/README.md
@@ -4,13 +4,3 @@ and other useful info together for verifying the correctness of ExecuTorch progr
 
 ## Rules to ensure forward/backward compatibility
 Please check the rules in [here](../../../schema/README.md) for more info.
-
-
-## Regenerating generated code
-
-Schema changes require regenerating the Python bindings in
-`devtools/bundled_program/serialize/generated` and committing the updated files. From the repo root:
-
-```sh
-python devtools/bundled_program/serialize/generate_bundled_program.py
-```
\ No newline at end of file
diff --git a/devtools/bundled_program/serialize/BUCK b/devtools/bundled_program/serialize/BUCK
index 89a8122503c..ae920d1e4c2 100644
--- a/devtools/bundled_program/serialize/BUCK
+++ b/devtools/bundled_program/serialize/BUCK
@@ -9,7 +9,7 @@ fbcode_target(_kind = runtime.python_library,
     name = "lib",
     srcs = [
         "__init__.py",
-    ] + glob(["generated/**/*.py"]),
+    ],
     resources = {
         "//executorch/devtools/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs",
         "//executorch/devtools/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs",
@@ -19,7 +19,6 @@ fbcode_target(_kind = runtime.python_library,
     # Please ask before changing this.
     visibility = ["PUBLIC"],
     deps = [
-        "fbsource//third-party/pypi/flatbuffers:flatbuffers",
         "fbsource//third-party/pypi/setuptools:setuptools",
         "//executorch/devtools/bundled_program/schema:bundled_program_schema_py",
         "//executorch/exir/_serialize:lib",
diff --git a/devtools/bundled_program/serialize/__init__.py b/devtools/bundled_program/serialize/__init__.py
index 50c6b5768ce..ceba7670910 100644
--- a/devtools/bundled_program/serialize/__init__.py
+++ b/devtools/bundled_program/serialize/__init__.py
@@ -1,6 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2025-2026 Arm Limited and/or its affiliates.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -9,62 +9,23 @@
 
 # TODO(T138924864): Refactor to unify the serialization for bundled program and executorch program.
 
-import functools
 import importlib.resources as _resources
 import json
 import os
-import re
 import tempfile
-from typing import Any
 
 import executorch.devtools.bundled_program.schema as bp_schema
 
 import executorch.devtools.bundled_program.serialize as serialization_package
-
-import flatbuffers  # pyre-ignore[21]
 from executorch.devtools.bundled_program.core import BundledProgram
-from executorch.devtools.bundled_program.serialize.generated.bundled_program_flatbuffer import (
-    Bool as _Bool,
-    BundledMethodTestCase as _BundledMethodTestCase,
-    BundledMethodTestSuite as _BundledMethodTestSuite,
-    BundledProgram as _BundledProgram,
-    Double as _Double,
-    Int as _Int,
-    Tensor as _Tensor,
-    Value as _Value,
-    ValueUnion as _ValueUnion,
-)
 from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass
 from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile
-from executorch.exir._serialize._flatbuffer_program import (
-    _coerce_bytes,
-    _create_aligned_byte_vector,
-)
 
 # The prefix of schema files used for bundled program
 BUNDLED_PROGRAM_SCHEMA_NAME = "bundled_program_schema"
 SCALAR_TYPE_SCHEMA_NAME = "scalar_type"
 
 
-@functools.lru_cache(maxsize=1)
-def _bundled_program_file_identifier() -> bytes:
-    schema = _resources.read_binary(
-        serialization_package, f"{BUNDLED_PROGRAM_SCHEMA_NAME}.fbs"
-    )
-    match = re.search(rb'file_identifier\s+"([^"]+)"', schema)
-    if match is None:
-        raise ValueError(
-            f"Missing file_identifier in {BUNDLED_PROGRAM_SCHEMA_NAME}.fbs"
-        )
-    file_identifier = match.group(1)
-    if len(file_identifier) != 4:
-        raise ValueError(
-            f"Invalid file_identifier length {len(file_identifier)} "
-            f"in {BUNDLED_PROGRAM_SCHEMA_NAME}.fbs"
-        )
-    return file_identifier
-
-
 def write_schema(d: str, schema_name: str) -> None:
     schema_path = os.path.join(d, "{}.fbs".format(schema_name))
     with open(schema_path, "wb") as schema_file:
@@ -117,145 +78,6 @@ def convert_from_flatbuffer(program_flatbuffer: bytes) -> bytes:
             return output_file.read()
 
 
-def _pack_tensor(self: Any, builder: Any) -> int:
-    if self.sizes is not None:
-        _Tensor.TensorStartSizesVector(builder, len(self.sizes))
-        for i in reversed(range(len(self.sizes))):
-            builder.PrependInt32(self.sizes[i])
-        sizes = builder.EndVector()
-    if self.data is not None:
-        data = _create_aligned_byte_vector(builder, _coerce_bytes(self.data), 16)
-    if self.dimOrder is not None:
-        dim_order = _create_aligned_byte_vector(
-            builder, _coerce_bytes(self.dimOrder), 1
-        )
-
-    _Tensor.TensorStart(builder)
-    _Tensor.TensorAddScalarType(builder, self.scalarType)
-    if self.sizes is not None:
-        _Tensor.TensorAddSizes(builder, sizes)
-    if self.data is not None:
-        _Tensor.TensorAddData(builder, data)
-    if self.dimOrder is not None:
-        _Tensor.TensorAddDimOrder(builder, dim_order)
-    return _Tensor.TensorEnd(builder)
-
-
-def _pack_bundled_program(self: Any, builder: Any) -> int:
-    if self.methodTestSuites is not None:
-        method_test_suites_list = [
-            method_test_suite.Pack(builder)
-            for method_test_suite in self.methodTestSuites
-        ]
-        _BundledProgram.BundledProgramStartMethodTestSuitesVector(
-            builder, len(self.methodTestSuites)
-        )
-        for i in reversed(range(len(self.methodTestSuites))):
-            builder.PrependUOffsetTRelative(method_test_suites_list[i])
-        method_test_suites = builder.EndVector()
-    if self.program is not None:
-        program = _create_aligned_byte_vector(builder, _coerce_bytes(self.program), 32)
-
-    _BundledProgram.BundledProgramStart(builder)
-    _BundledProgram.BundledProgramAddVersion(builder, self.version)
-    if self.methodTestSuites is not None:
-        _BundledProgram.BundledProgramAddMethodTestSuites(builder, method_test_suites)
-    if self.program is not None:
-        _BundledProgram.BundledProgramAddProgram(builder, program)
-    return _BundledProgram.BundledProgramEnd(builder)
-
-
-@functools.lru_cache(maxsize=1)
-def _install_fast_packers() -> None:
-    _Tensor.TensorT.Pack = _pack_tensor
-    _BundledProgram.BundledProgramT.Pack = _pack_bundled_program
-
-
-def _convert_tensor(val: bp_schema.Tensor) -> Any:
-    result = _Tensor.TensorT()
-    result.scalarType = int(val.scalar_type)
-    result.sizes = list(val.sizes)
-    result.data = _coerce_bytes(val.data)
-    result.dimOrder = _coerce_bytes(val.dim_order)
-    return result
-
-
-def _convert_int(val: bp_schema.Int) -> Any:
-    result = _Int.IntT()
-    result.intVal = val.int_val
-    return result
-
-
-def _convert_bool(val: bp_schema.Bool) -> Any:
-    result = _Bool.BoolT()
-    result.boolVal = val.bool_val
-    return result
-
-
-def _convert_double(val: bp_schema.Double) -> Any:
-    result = _Double.DoubleT()
-    result.doubleVal = val.double_val
-    return result
-
-
-def _convert_value_union(val: bp_schema.ValueUnion) -> tuple[int, Any]:
-    if isinstance(val, bp_schema.Tensor):
-        return _ValueUnion.ValueUnion.Tensor, _convert_tensor(val)
-    if isinstance(val, bp_schema.Int):
-        return _ValueUnion.ValueUnion.Int, _convert_int(val)
-    if isinstance(val, bp_schema.Bool):
-        return _ValueUnion.ValueUnion.Bool, _convert_bool(val)
-    if isinstance(val, bp_schema.Double):
-        return _ValueUnion.ValueUnion.Double, _convert_double(val)
-    return _ValueUnion.ValueUnion.NONE, None
-
-
-def _convert_value(val: bp_schema.Value) -> Any:
-    result = _Value.ValueT()
-    result.valType, result.val = _convert_value_union(val.val)
-    return result
-
-
-def _convert_method_test_case(val: bp_schema.BundledMethodTestCase) -> Any:
-    result = _BundledMethodTestCase.BundledMethodTestCaseT()
-    result.inputs = [_convert_value(value) for value in val.inputs]
-    result.expectedOutputs = [_convert_value(value) for value in val.expected_outputs]
-    return result
-
-
-def _convert_method_test_suite(val: bp_schema.BundledMethodTestSuite) -> Any:
-    result = _BundledMethodTestSuite.BundledMethodTestSuiteT()
-    result.methodName = val.method_name
-    result.testCases = [
-        _convert_method_test_case(test_case) for test_case in val.test_cases
-    ]
-    return result
-
-
-def _convert_bundled_program(val: bp_schema.BundledProgram) -> Any:
-    result = _BundledProgram.BundledProgramT()
-    result.version = val.version
-    result.methodTestSuites = [
-        _convert_method_test_suite(suite) for suite in val.method_test_suites
-    ]
-    result.program = _coerce_bytes(val.program)
-    return result
-
-
-def _bundled_program_schema_to_flatbuffer(
-    bundled_program: bp_schema.BundledProgram,
-) -> bytes:
-    _install_fast_packers()
-    bundled_program_t = _convert_bundled_program(bundled_program)
-    builder = flatbuffers.Builder()
-    bundled_program_offset = bundled_program_t.Pack(builder)
-    builder.Finish(
-        bundled_program_offset,
-        file_identifier=_bundled_program_file_identifier(),
-    )
-    return bytes(builder.Output())
-
-
 # from bundled program to flatbuffer
 def serialize_from_bundled_program_to_flatbuffer(
     bundled_program: BundledProgram,
@@ -272,7 +94,9 @@ def serialize_from_bundled_program_to_flatbuffer(
 
     bundled_program_in_schema = bundled_program.serialize_to_schema()
 
-    return _bundled_program_schema_to_flatbuffer(bundled_program_in_schema)
+    return convert_to_flatbuffer(
+        serialize_from_bundled_program_to_json(bundled_program_in_schema)
+    )
 
 
 # From flatbuffer to bundled program in schema.
diff --git a/docs/source/backends/nxp/op-support.csv b/docs/source/backends/nxp/op-support.csv
index fb67f47bf62..8a250dce88d 100644
--- a/docs/source/backends/nxp/op-support.csv
+++ b/docs/source/backends/nxp/op-support.csv
@@ -13,7 +13,6 @@ aten.constant_pad_nd.default,int8,static int8,"H or W padding only"
 aten.convolution.default,int8,static int8,"1D or 2D convolution, constant weights, groups=1 or groups=channels_count (depthwise)"
 aten.dim_order_ops._clone_dim_order.default,,, "See aten.clone.default"
 aten.div.Tensor,int8,static int8,"divisor - static tensor or scalar value, one dimension must satisfy %8 = 0 or scalar division (all dims = 1)"
-aten.exp.default,int8,static int8,
 aten.hardtanh.default,int8,static int8,"supported ranges: <0,6>, <-1, 1>, <0,1>, <0,inf>"
 aten.leaky_relu.default,int8,static int8,
 aten.log.default,int8,static int8,
diff --git a/examples/arm/executor_runner/arm_memory_allocator.cpp b/examples/arm/executor_runner/arm_memory_allocator.cpp
index d3337b6005e..de670df29ae 100644
--- a/examples/arm/executor_runner/arm_memory_allocator.cpp
+++ b/examples/arm/executor_runner/arm_memory_allocator.cpp
@@ -26,7 +26,7 @@ static void asan_unpoison_buffer(void* base, size_t size) {
 #endif
 
 ArmMemoryAllocator::ArmMemoryAllocator(uint32_t size, uint8_t* base_address)
-    : MemoryAllocator(size, base_address) {
+    : MemoryAllocator(size, base_address), used_(0) {
 #if defined(EXECUTORCH_ENABLE_ADDRESS_SANITIZER)
   asan_poison_buffer(base_address, size);
 #endif
@@ -34,16 +34,35 @@ ArmMemoryAllocator::ArmMemoryAllocator(uint32_t size, uint8_t* base_address)
 
 void* ArmMemoryAllocator::allocate(size_t size, size_t alignment) {
   void* ret = executorch::runtime::MemoryAllocator::allocate(size, alignment);
-#if defined(EXECUTORCH_ENABLE_ADDRESS_SANITIZER)
   if (ret != nullptr) {
+#if defined(EXECUTORCH_ENABLE_ADDRESS_SANITIZER)
     asan_unpoison_buffer(ret, size);
-  }
 #endif
+    // Keep used_ in step with the base allocator's cursor. The allocation
+    // occupies [ret, ret + size) inside the buffer, so the bytes consumed so
+    // far, including any padding inserted to satisfy `alignment`, are the
+    // distance from base_address() to the end of this allocation. Whether
+    // `size` itself is a multiple of `alignment` says nothing about how much
+    // padding was inserted, so the count is taken from the returned pointer.
+    // NOTE(review): assumes the base allocator hands out addresses in
+    // increasing order from base_address() -- confirm against MemoryAllocator.
+    const uint8_t* const end = static_cast<const uint8_t*>(ret) + size;
+    used_ = static_cast<size_t>(end - base_address());
+  }
   return ret;
 }
 
+size_t ArmMemoryAllocator::used_size() const {
+  return used_;
+}
+
+size_t ArmMemoryAllocator::free_size() const {
+  return executorch::runtime::MemoryAllocator::size() - used_;
+}
+
 void ArmMemoryAllocator::reset() {
   executorch::runtime::MemoryAllocator::reset();
+  used_ = 0;
 #if defined(EXECUTORCH_ENABLE_ADDRESS_SANITIZER)
   asan_poison_buffer(base_address(), size());
 #endif
diff --git a/examples/arm/executor_runner/arm_memory_allocator.h b/examples/arm/executor_runner/arm_memory_allocator.h
index 3c82f72c44b..1d7bbdecb4c 100644
--- a/examples/arm/executor_runner/arm_memory_allocator.h
+++ b/examples/arm/executor_runner/arm_memory_allocator.h
@@ -10,14 +10,21 @@ using executorch::runtime::MemoryAllocator;
 
 #pragma once
 
-// Custom allocator that poisons/unpoisons its buffer for AddressSanitizer. The
-// used and free byte counts are reported by the base MemoryAllocator's
-// used_size() / free_size().
+// Custom allocator that tracks the used and free bytes of its buffer and, in
+// AddressSanitizer builds, poisons/unpoisons that buffer.
 class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator {
  public:
   ArmMemoryAllocator(uint32_t size, uint8_t* base_address);
 
   void* allocate(size_t size, size_t alignment = kDefaultAlignment) override;
 
-  void reset() override;
+  // Returns the used size of the allocator's memory buffer.
+  size_t used_size() const;
+
+  // Returns the free size of the allocator's memory buffer.
+  size_t free_size() const;
+  void reset();
+
+ private:
+  size_t used_;
 };
diff --git a/examples/espressif/README.md b/examples/espressif/README.md
index a76e794030c..025bdf94094 100644
--- a/examples/espressif/README.md
+++ b/examples/espressif/README.md
@@ -44,6 +44,8 @@ examples/espressif/
 ├── executor_runner/
 │   ├── CMakeLists.txt           # Component/standalone CMake build
 │   ├── esp_executor_runner.cpp  # Main executor runner
+│   ├── esp_memory_allocator.h   # Custom memory allocator
+│   ├── esp_memory_allocator.cpp
 │   ├── esp_perf_monitor.h       # Performance monitoring
 │   ├── esp_perf_monitor.cpp
 │   └── pte_to_header.py         # Convert .pte to C header
diff --git a/examples/espressif/executor_runner/CMakeLists.txt b/examples/espressif/executor_runner/CMakeLists.txt
index 2a26c53d5e0..a103a1ddc8c 100644
--- a/examples/espressif/executor_runner/CMakeLists.txt
+++ b/examples/espressif/executor_runner/CMakeLists.txt
@@ -28,6 +28,7 @@ if(ESP_PLATFORM)
     SRCS
     "esp_executor_runner.cpp"
     "esp_pal.cpp"
+    "esp_memory_allocator.cpp"
     "esp_perf_monitor.cpp"
     INCLUDE_DIRS
     "."
@@ -282,7 +283,7 @@ else()
   add_executable(esp_executor_runner)
   target_sources(
     esp_executor_runner PRIVATE esp_executor_runner.cpp esp_pal.cpp
-                                esp_perf_monitor.cpp
+                                esp_perf_monitor.cpp esp_memory_allocator.cpp
   )
 
   target_link_libraries(
diff --git a/examples/espressif/executor_runner/esp_executor_runner.cpp b/examples/espressif/executor_runner/esp_executor_runner.cpp
index c2f1fa34dde..9260e6b88a0 100644
--- a/examples/espressif/executor_runner/esp_executor_runner.cpp
+++ b/examples/espressif/executor_runner/esp_executor_runner.cpp
@@ -73,6 +73,7 @@
 #include <executorch/runtime/platform/runtime.h>
 
 #include "esp_executor_runner.h"
+#include "esp_memory_allocator.h"
 #include "esp_perf_monitor.h"
 
 #if defined(ESP_PLATFORM)
@@ -477,8 +478,8 @@ struct RunnerContext {
   bool bundle_io = false;
   Box<BufferDataLoader> loader;
   Box<Program> program;
-  Box<MemoryAllocator> method_allocator;
-  Box<MemoryAllocator> temp_allocator;
+  Box<EspMemoryAllocator> method_allocator;
+  Box<EspMemoryAllocator> temp_allocator;
   std::vector<Span<uint8_t>> planned_spans;
   Box<HierarchicalAllocator> planned_memory;
   Box<MemoryManager> memory_manager;
@@ -1019,7 +1020,7 @@ bool et_runner_init(void) {
     return false;
   }
 #endif
-  MemoryAllocator file_allocator(
+  EspMemoryAllocator file_allocator(
       method_allocation_pool_size, method_allocation_pool);
   auto [buffer, buffer_size] =
       load_file_from_fs("/spiffs/model.pte", file_allocator);
@@ -1246,4 +1247,4 @@ size_t et_runner_outputs_size(void) {
   ET_CHECK_MSG(model_ok == true, "Problem running model");
 
   ET_LOG(Info, "Program complete.");
-}
+}
\ No newline at end of file
diff --git a/examples/models/BUCK b/examples/models/BUCK
index ed72a16e05f..a2b6789a95e 100644
--- a/examples/models/BUCK
+++ b/examples/models/BUCK
@@ -33,9 +33,6 @@ fbcode_target(_kind = python_library,
         "//executorch/examples/models/phi_4_mini:phi_4_mini",  # @manual
         "//executorch/examples/models/smollm2:smollm2",  # @manual
         "//executorch/examples/models/smollm3:smollm3",  # @manual
-        "//executorch/examples/models/smolvlm:smolvlm",  # @manual
-        "//executorch/examples/models/whisper:whisper",  # @manual
-        "//executorch/examples/models/yolo26:yolo26",  # @manual
     ],
 )
 
diff --git a/examples/models/__init__.py b/examples/models/__init__.py
index d50554006bd..241a5cc366e 100644
--- a/examples/models/__init__.py
+++ b/examples/models/__init__.py
@@ -45,10 +45,6 @@ class Model(str, Enum):
     MobileNetV1025 = "mobilenet_v1_025"
     ResNet8 = "resnet8"
     Sdpa = "sdpa"
-    Qwen3 = "qwen3"
-    SmolVLM = "smolvlm"
-    YOLO26 = "yolo26"
-    Whisper = "whisper"
 
     def __str__(self) -> str:
         return self.value
@@ -109,10 +105,6 @@ def __str__(self) -> str:
     ),
     str(Model.ResNet8): ("mlperf_tiny.resnet8", "ResNet8Model"),
     str(Model.Sdpa): ("toy_model", "SdpaModule"),
-    str(Model.Qwen3): ("qwen3", "Qwen3Model"),
-    str(Model.SmolVLM): ("smolvlm", "SmolVLMModel"),
-    str(Model.YOLO26): ("yolo26", "YOLO26Model"),
-    str(Model.Whisper): ("whisper", "WhisperModel"),
 }
 
 __all__ = [
diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md
index 482f64083a0..ae3bcb24c19 100644
--- a/examples/models/gemma4_31b/README.md
+++ b/examples/models/gemma4_31b/README.md
@@ -93,31 +93,14 @@ method with dynamic sequence length and host-side sampling.
 
 Writes `model.pte` (and optionally `model.ptd`) into `--output-dir`.
 
-#### TurboQuant KV cache (long context, CUDA + MLX)
+#### TurboQuant KV cache (long context, MLX only)
 
 For long-context inference, add `--turboquant` to swap the full-attention
 layers' KV cache for a TurboQuant TQ4 cache (4-bit codebook + nibble pack).
 This gives ~3.8× cache memory savings on the full-attention layers and lets
-you fit context lengths that wouldn't fit in bf16. Sliding-window layers are
-unaffected. Supported on both the CUDA and MLX backends.
-
-**Long context requires BOTH flags**: `--turboquant` *and* a larger
-`--max-seq-len`. Raising `--max-seq-len` alone keeps a bf16 KV cache, which does
-not fit at long context. On CUDA, `--turboquant` is what enables 128k: Gemma4-31B
-at `--max-seq-len 131072` runs within ~27 GiB at runtime (fits a 32 GB card).
-
-```bash
-# CUDA — 128k context (TQ4 KV)
-python examples/models/gemma4_31b/export.py \
-    --gguf ./gemma-4-31B-it-Q4_K_M.gguf \
-    --output-dir ./gemma4_31b_exports_128k \
-    --max-seq-len 131072 \
-    --backend cuda \
-    --turboquant
-```
+you fit context lengths that wouldn't fit in bf16. Sliding-window layers are unaffected.
 
 ```bash
-# MLX (Apple Silicon)
 python examples/models/gemma4_31b/export.py \
     --prequantized ./gemma4_31b_int4 \
     --output-dir ./gemma4_31b_exports_mlx_tq \
diff --git a/examples/models/gemma4_31b/cuda_source_transformations.py b/examples/models/gemma4_31b/cuda_source_transformations.py
index 666d0c44e9d..f9b383cf224 100644
--- a/examples/models/gemma4_31b/cuda_source_transformations.py
+++ b/examples/models/gemma4_31b/cuda_source_transformations.py
@@ -77,15 +77,9 @@ def _turboquant_attention_forward(
     # uncompressed K/V is never materialized.
     k_packed, k_norms, v_packed, v_norms = self.kv_cache.update(input_pos, k, v)
 
-    # Number of valid (filled) KV positions = input_pos[0] + T. Passing this to
-    # tq4_sdpa bounds its KV loop to the actual context instead of the full
-    # pre-allocated buffer (max_seq_len for global layers), making attention
-    # O(context) instead of O(max_seq_len). Kept as a GPU scalar (no ``.item()``)
-    # so the bound is captured correctly by the decode CUDA graph. Decode: T=1 ->
-    # input_pos+1; prefill chunk: T -> chunk_end.
-    # NOTE: this call-site argument was dropped during a rebase, which silently
-    # disabled the O(context) bound and forced a full max_seq_len sweep every
-    # step (catastrophic at 128k: ~2.7 tok/s decode vs ~37+ when bounded).
+    # Number of valid (filled) KV positions = input_pos[0] + T. Bounds tq4_sdpa's
+    # KV loop to the actual context (O(context), not O(max_seq_len)) and enables
+    # the split-K decode path. GPU scalar (no .item()) so it's CUDA-graph-safe.
     kv_len = input_pos[0] + input_pos.shape[0]
 
     # ``scale=self.scaling`` (= 1.0 for Gemma 4) — overrides tq4_sdpa's
@@ -100,7 +94,7 @@ def _turboquant_attention_forward(
         self.kv_cache.centroids,
         self.kv_cache.rotation,
         attn_mask,
-        False,  # is_causal: attn_mask already encodes causal masking
+        False,  # is_causal — attn_mask already encodes causal masking
         self.scaling,
         kv_len,
         True,  # mask_is_causal: Gemma full-attention mask is standard causal
diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py
index d9e16bc34df..59be23020f2 100644
--- a/examples/models/gemma4_31b/export.py
+++ b/examples/models/gemma4_31b/export.py
@@ -171,7 +171,6 @@ def _export_cuda(
     )
     from executorch.exir.backend.compile_spec_schema import CompileSpec
     from executorch.exir.passes import MemoryPlanningPass
-    from executorch.exir.passes.propagate_device_pass import PropagateDeviceConfig
     from torch.export import Dim, export
 
     inductor_config.coordinate_descent_tuning = False
@@ -271,14 +270,6 @@ def _export_cuda(
                 alloc_graph_input=False,
             ),
             emit_mutable_buffer_names=True,
-            # Keep method inputs/outputs device-resident so the CUDA backend
-            # does not insert boundary H2D/D2H copies: the runner stages inputs
-            # in CUDA memory and reads the sampled token back with a single
-            # small D2H. CUDA-only (no effect on the MLX path).
-            propagate_device_config=PropagateDeviceConfig(
-                skip_h2d_for_method_inputs=True,
-                skip_d2h_for_method_outputs=True,
-            ),
         ),
     )
 
diff --git a/examples/models/gemma4_31b/gguf_loader.py b/examples/models/gemma4_31b/gguf_loader.py
index 90839ea6f6a..e95581dc95d 100644
--- a/examples/models/gemma4_31b/gguf_loader.py
+++ b/examples/models/gemma4_31b/gguf_loader.py
@@ -17,12 +17,9 @@
   linear and embedding. ``embed_tokens`` and ``lm_head`` stay tied -- they share
   the one quantized tensor.
 * **CUDA**: Q4_K -> ``Int4Tensor``, Q6_K -> ``CudaDp4aPlanarInt6Tensor`` (a genuine
-  6-bit packed weight, lossless, symmetric). ``embed_tokens`` and ``lm_head`` are
-  untied: ``lm_head`` keeps a packed (int6/int4) matmul weight, while the token
-  embedding becomes a gatherable ``IntxUnpackedToInt8Tensor`` (int8) -- the truly
-  packed int4/int6 tensors can't gather. For the Q6_K tied weight the decode is
-  done once and shared between the two, avoiding a whole-tensor bf16 dequant and
-  a second decode (see ``_untie_embed_lm_head``).
+  6-bit packed weight, lossless, symmetric); ``lm_head`` keeps the quantized
+  tensor but the token embedding is dequantized to bf16 (the packed tensors can't
+  gather), so they are untied.
 
 Usage:
     model, config = load_gguf_model("model.gguf", backend="cuda")
@@ -119,55 +116,6 @@ def _resolve_tied_lm_head(model, lm_head_weight, packers):
         )
 
 
-def _untie_embed_lm_head(model, gtensor, weight, backend):
-    """Untie the GGUF token-embed / lm_head weight, returning ``(embed, lm_head)``.
-
-    GGUF ties ``embed_tokens`` and ``lm_head`` to one quantized weight. The
-    returned ``lm_head`` is packed into ``model.lm_head`` after the streaming loop
-    (``_resolve_tied_lm_head``), or is ``None`` when this function already
-    assigned it.
-
-    * **MLX**: keep both tied on the raw ``ExportableGGUFTensor``.
-    * **CUDA** (Q6_K or Q4_K): untie so ``lm_head`` keeps a packed low-bit matmul
-      weight while the token embedding becomes a gatherable int8
-      ``IntxUnpackedToInt8Tensor`` -- the truly packed int4/int6 tensors can't
-      gather. Instead of dequantizing the whole ~1.4 B-element weight to bf16
-      (2 B/elem), decode it once to int8 (1 B/elem; the decode is lossless so the
-      result is numerically identical), halving the embedding's host + GPU-constant
-      footprint. The token embedding (Q4_K for the Gemma checkpoint) is the single
-      biggest weight, so this is the dominant saving vs the bf16 path. ``lm_head``:
-        - Q6_K -> ``CudaDp4aPlanarInt6Tensor`` from the *same* int8 decode and
-          assigned here (``pack_linear_for_cuda`` would mis-route an int8 tensor to
-          the int8 path), so the post-loop resolve is a no-op.
-        - Q4_K -> kept as the native ``Int4Tensor`` and returned, so
-          ``_resolve_tied_lm_head`` packs it to ``CudaCoalescedInt4Tensor`` (same
-          as a regular Q4_K linear).
-    * **CUDA, other types**: fall back to the bf16 embedding.
-    """
-    if backend == "mlx":
-        return weight, gtensor
-
-    if gtensor.ggml_type in ("q6_k", "q4_k"):
-        intx = gtensor.to_intx_unpacked_to_int8_tensor()
-        if gtensor.ggml_type == "q6_k":
-            import torch.nn as nn
-            from executorch.backends.cuda.dp4a_planar_int6_tensor import (
-                CudaDp4aPlanarInt6Tensor,
-            )
-
-            model.lm_head.weight = nn.Parameter(
-                CudaDp4aPlanarInt6Tensor._from_intx_int8(intx), requires_grad=False
-            )
-            return intx, None
-        # Q4_K: ``weight`` is the native Int4Tensor; let _resolve_tied_lm_head
-        # pack it to CudaCoalescedInt4Tensor. Only the embedding switches to int8.
-        return intx, weight
-
-    from executorch.examples.models.gemma4_31b.quant import dequantize_weight
-
-    return dequantize_weight(weight, torch.bfloat16), weight
-
-
 def load_gguf_model(
     gguf_path: str,
     max_seq_len: int = 4096,
@@ -192,7 +140,7 @@ def load_gguf_model(
         Gemma4_31BConfig,
         materialize_runtime_buffers,
     )
-    from executorch.examples.models.gemma4_31b.quant import pack_one
+    from executorch.examples.models.gemma4_31b.quant import dequantize_weight, pack_one
     from executorch.extension.llm.export.gguf import ExportableGGUFTensor, iter_gguf
 
     if backend == "cuda":
@@ -213,7 +161,7 @@ def load_gguf_model(
     with torch.device("meta"):
         model = Gemma4_31B(config)
 
-    lm_head_weight = None  # tied weight resolved into lm_head after the loop
+    lm_head_weight = None  # weight reused for a tied lm_head
     n_processed = 0
 
     print(f"Streaming GGUF from {gguf_path}...")
@@ -225,9 +173,11 @@ def load_gguf_model(
         if isinstance(value, ExportableGGUFTensor):
             weight = _convert_weight(model, model_key, value, backend)
             if model_key == "embed_tokens.weight":
-                weight, lm_head_weight = _untie_embed_lm_head(
-                    model, value, weight, backend
-                )
+                # Tied lm_head reuses the embedding weight: MLX wants the raw
+                # ExportableGGUFTensor (linear pattern), CUDA the quant tensor.
+                lm_head_weight = value if backend == "mlx" else weight
+                if backend == "cuda":
+                    weight = dequantize_weight(weight, torch.bfloat16)
             value = weight
         elif value.dtype == torch.float32:
             value = value.to(torch.bfloat16)
diff --git a/examples/models/gemma4_31b/main.cpp b/examples/models/gemma4_31b/main.cpp
index 3d9970b1610..83d1f639e75 100644
--- a/examples/models/gemma4_31b/main.cpp
+++ b/examples/models/gemma4_31b/main.cpp
@@ -23,11 +23,8 @@
 #include <executorch/extension/llm/sampler/util.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor.h>
-#include <executorch/extension/tensor/tensor_ptr.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/backend/options.h>
-#include <executorch/runtime/core/portable_type/device.h>
-#include <executorch/runtime/platform/assert.h>
 #include <executorch/runtime/platform/log.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 
@@ -79,29 +76,25 @@ DEFINE_bool(
     cuda_graph,
     false,
     "Enable CUDA graph capture for the decode method. CUDA only.");
+DEFINE_bool(
+    ignore_eos,
+    false,
+    "Do not stop at EOS; always generate exactly max_new_tokens. For "
+    "benchmarking decode throughput at a fixed token count (mirrors "
+    "llama.cpp --ignore-eos).");
 
 namespace llm = ::executorch::extension::llm;
 using ::executorch::extension::from_blob;
-using ::executorch::extension::make_tensor_ptr;
 using ::executorch::extension::Module;
-using ::executorch::extension::TensorPtr;
 using ::executorch::runtime::Error;
 using ::executorch::runtime::EValue;
-#ifdef EXECUTORCH_BUILD_CUDA
-using ::executorch::extension::clone_tensor_ptr_to;
-#endif
 
 using SizesType = executorch::aten::SizesType;
 
-// Read a sampled token ID from a scalar int64 output (CUDA path).
-//
-// The model now emits the sampled token as int64 (see sampler.py), matching
-// the decode method's int64 token input so the on-device output buffer can be
-// aliased directly as the next step's input. We still copy the 8-byte scalar
-// back to the host here for EOS detection and detokenization.
+// Read a sampled token ID from a scalar float output (CUDA path).
 static uint64_t read_token(const executorch::aten::Tensor& output) {
   const void* ptr = output.const_data_ptr();
-  int64_t val = 0;
+  float val = 0.0f;
 
 #ifdef EXECUTORCH_BUILD_CUDA
   cudaPointerAttributes attrs{};
@@ -109,7 +102,7 @@ static uint64_t read_token(const executorch::aten::Tensor& output) {
       attrs.type == cudaMemoryTypeDevice;
   if (on_device) {
     cudaError_t err =
-        cudaMemcpy(&val, ptr, sizeof(int64_t), cudaMemcpyDeviceToHost);
+        cudaMemcpy(&val, ptr, sizeof(float), cudaMemcpyDeviceToHost);
     if (err != cudaSuccess) {
       ET_LOG(
           Error,
@@ -118,13 +111,13 @@ static uint64_t read_token(const executorch::aten::Tensor& output) {
       return 0;
     }
   } else {
-    memcpy(&val, ptr, sizeof(int64_t));
+    memcpy(&val, ptr, sizeof(float));
   }
 #else
-  memcpy(&val, ptr, sizeof(int64_t));
+  memcpy(&val, ptr, sizeof(float));
 #endif
 
-  return static_cast<uint64_t>(val);
+  return static_cast<uint64_t>(llrintf(val));
 }
 
 int main(int argc, char** argv) {
@@ -194,8 +187,6 @@ int main(int argc, char** argv) {
       FLAGS_temperature <= 0.0 ? 1e-6f : static_cast<float>(FLAGS_temperature);
 
 #ifdef EXECUTORCH_BUILD_CUDA
-  const auto cuda_device =
-      executorch::aten::Device(executorch::aten::DeviceType::CUDA, 0);
   if (FLAGS_cuda_graph) {
     executorch::runtime::BackendOptions<2> cuda_opts;
     cuda_opts.set_option("enable_cuda_graph_for_method", "decode");
@@ -232,9 +223,8 @@ int main(int argc, char** argv) {
     ET_LOG(Error, "Failed to load decode method");
     return 1;
   }
-  auto temp_tensor = clone_tensor_ptr_to(
-      from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float),
-      cuda_device);
+  auto temp_tensor =
+      from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float);
 #else
   if (FLAGS_cuda_graph) {
     ET_LOG(Info, "--cuda_graph ignored on non-CUDA build");
@@ -302,12 +292,6 @@ int main(int argc, char** argv) {
   // ---------------------------------------------------------------
   uint64_t cur_token = 0;
   int64_t prefill_pos = 0;
-#ifdef EXECUTORCH_BUILD_CUDA
-  // Alias of the most recent forward's on-device int64 output token. The last
-  // prefill chunk's output seeds the first decode step (no token H2D); each
-  // decode step then re-aliases its own output for the next step.
-  TensorPtr device_out_token;
-#endif
   while (prefill_pos < num_prompt_tokens) {
     int64_t chunk_len =
         std::min(num_prompt_tokens - prefill_pos, max_prefill_chunk);
@@ -326,12 +310,6 @@ int main(int argc, char** argv) {
     auto pos_tensor = from_blob(
         pos_data.data(), {S(chunk_len)}, executorch::aten::ScalarType::Long);
 
-#ifdef EXECUTORCH_BUILD_CUDA
-    // skip_h2d: prefill/decode method inputs must already live in CUDA memory.
-    tokens_tensor = clone_tensor_ptr_to(tokens_tensor, cuda_device);
-    pos_tensor = clone_tensor_ptr_to(pos_tensor, cuda_device);
-#endif
-
     std::vector<EValue> inputs;
     inputs.push_back(EValue(tokens_tensor));
     inputs.push_back(EValue(pos_tensor));
@@ -350,11 +328,7 @@ int main(int argc, char** argv) {
     }
 
 #ifdef EXECUTORCH_BUILD_CUDA
-    const auto& out_tensor = result.get()[0].toTensor();
-    cur_token = read_token(out_tensor);
-    // Keep the sampled token on device: alias the output buffer so it feeds
-    // straight into the next forward as the int64 token input (zero copy).
-    device_out_token = make_tensor_ptr(out_tensor);
+    cur_token = read_token(result.get()[0].toTensor());
 #else
     cur_token = static_cast<uint64_t>(
         llm::logits_to_token(result.get()[0].toTensor(), temp_val));
@@ -386,69 +360,22 @@ int main(int argc, char** argv) {
   // Decode loop
   // ---------------------------------------------------------------
   int64_t pos = num_prompt_tokens;
-  std::vector<int64_t> decode_pos_data = {pos};
-  auto decode_pos_cpu = from_blob(
-      decode_pos_data.data(), {1}, executorch::aten::ScalarType::Long);
-#ifdef EXECUTORCH_BUILD_CUDA
-  // Fixed device-resident position input slot: the decode method always reads
-  // the position from this same address every step (cuda-graph-safe). Seeded
-  // once here with a one-time H2D; refreshed each step by an on-device D2D.
-  auto decode_pos = clone_tensor_ptr_to(decode_pos_cpu, cuda_device);
-  // Upload the FULL decode position array to device ONCE (a single H2D - the
-  // one-time copy we keep). Each step copies its position from here into the
-  // fixed slot with a device-to-device copy, so there is NO per-round pos H2D.
-  std::vector<int64_t> pos_seq_data(FLAGS_max_new_tokens);
-  for (int32_t i = 0; i < FLAGS_max_new_tokens; i++) {
-    pos_seq_data[i] = num_prompt_tokens + i;
-  }
-  auto pos_seq_dev = clone_tensor_ptr_to(
-      from_blob(
-          pos_seq_data.data(),
-          {S(FLAGS_max_new_tokens)},
-          executorch::aten::ScalarType::Long),
-      cuda_device);
-  auto* pos_seq_dev_ptr =
-      static_cast<int64_t*>(pos_seq_dev->mutable_data_ptr());
-  auto* decode_pos_slot_ptr =
-      static_cast<int64_t*>(decode_pos->mutable_data_ptr());
-#else
-  // Non-CUDA (MLX) path: keep host token/pos buffers; the backend stages them
-  // and the host samples from the returned logits.
   std::vector<int64_t> decode_token_data = {static_cast<int64_t>(cur_token)};
+  std::vector<int64_t> decode_pos_data = {pos};
   auto decode_tokens = from_blob(
       decode_token_data.data(), {1, 1}, executorch::aten::ScalarType::Long);
-  auto decode_pos = decode_pos_cpu;
-#endif
+  auto decode_pos = from_blob(
+      decode_pos_data.data(), {1}, executorch::aten::ScalarType::Long);
 
   uint64_t prev_token = cur_token;
-  bool hit_eos = eos_ids.find(cur_token) != eos_ids.end();
+  bool hit_eos =
+      !FLAGS_ignore_eos && eos_ids.find(cur_token) != eos_ids.end();
   for (int32_t step = 0; step < FLAGS_max_new_tokens && !hit_eos; step++) {
-#ifdef EXECUTORCH_BUILD_CUDA
-    // No per-round H2D: copy this step's position from the pre-uploaded device
-    // position array into the fixed position slot with an on-device D2D. With
-    // the token aliased on device (Option A) and the position staged via D2D,
-    // the per-round HtoD count is zero (independent of decode length).
-    // cudaMemcpy D2D is host-synchronous, so the slot is updated before the
-    // decode kernels read it; with cuda graph enabled this becomes a captured
-    // cudaMemcpyAsync on the decode stream into this same fixed slot.
-    ET_CHECK_MSG(
-        cudaMemcpy(
-            decode_pos_slot_ptr,
-            pos_seq_dev_ptr + step,
-            sizeof(int64_t),
-            cudaMemcpyDeviceToDevice) == cudaSuccess,
-        "Failed to copy decode position D2D");
-#else
-    decode_pos_data[0] = pos;
     decode_token_data[0] = static_cast<int64_t>(cur_token);
-#endif
+    decode_pos_data[0] = pos;
 
     std::vector<EValue> inputs;
-#ifdef EXECUTORCH_BUILD_CUDA
-    inputs.push_back(EValue(device_out_token));
-#else
     inputs.push_back(EValue(decode_tokens));
-#endif
     inputs.push_back(EValue(decode_pos));
 
 #ifdef EXECUTORCH_BUILD_CUDA
@@ -465,10 +392,7 @@ int main(int argc, char** argv) {
 
     prev_token = cur_token;
 #ifdef EXECUTORCH_BUILD_CUDA
-    const auto& out_tensor = result.get()[0].toTensor();
-    cur_token = read_token(out_tensor);
-    // Alias this step's on-device output token as the next step's token input.
-    device_out_token = make_tensor_ptr(out_tensor);
+    cur_token = read_token(result.get()[0].toTensor());
 #else
     cur_token = static_cast<uint64_t>(
         llm::logits_to_token(result.get()[0].toTensor(), temp_val));
@@ -481,7 +405,7 @@ int main(int argc, char** argv) {
       fflush(stdout);
     }
 
-    hit_eos = eos_ids.find(cur_token) != eos_ids.end();
+    hit_eos = !FLAGS_ignore_eos && eos_ids.find(cur_token) != eos_ids.end();
   }
   printf("\n");
 
diff --git a/examples/models/gemma4_31b/model.py b/examples/models/gemma4_31b/model.py
index d953541a244..bfaa73a754b 100644
--- a/examples/models/gemma4_31b/model.py
+++ b/examples/models/gemma4_31b/model.py
@@ -484,7 +484,7 @@ def forward(
             temperature: 1-D float tensor for Gumbel-max sampling.
 
         Returns:
-            (B, 1) sampled token IDs as int64.
+            (B, 1) sampled token IDs as float.
         """
         x = self.embed_tokens(tokens) * self.embed_normalizer
 
diff --git a/examples/models/gemma4_31b/sampler.py b/examples/models/gemma4_31b/sampler.py
index 2ce428224a2..690344fd2e4 100644
--- a/examples/models/gemma4_31b/sampler.py
+++ b/examples/models/gemma4_31b/sampler.py
@@ -26,12 +26,9 @@ def sample(
             temperature still works ("near-greedy").
 
     Returns:
-        ``[B, 1]`` int64 token IDs (``argmax(logits/T + gumbel_noise)``).
-        Emitting int64 (rather than casting to float) lets the runner alias the
-        on-device output token directly as the next decode step's int64 token
-        input — no D2H/H2D round-trip and no dtype cast.
+        ``[B, 1]`` float32 token IDs (``argmax(logits/T + gumbel_noise)``).
     """
     logits = logits / temperature.clamp(min=1e-6)
     noise = torch.rand_like(logits)
     gumbel = -torch.log(-torch.log(noise + 1e-20) + 1e-20)
-    return (logits + gumbel).argmax(dim=-1, keepdim=True).to(torch.int64)
+    return (logits + gumbel).argmax(dim=-1, keepdim=True).float()
diff --git a/examples/models/gemma4_31b/tests/test_cuda_pipeline.py b/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
index c346c1d2f82..caf0a44e03b 100644
--- a/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
+++ b/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
@@ -246,14 +246,13 @@ def _load(self, tmp):
 
     def test_load_converts_weights(self):
         """GGUF -> CUDA: Q4_K -> CudaCoalescedInt4Tensor, Q6_K -> CudaDp4aPlanarInt6Tensor,
-        embedding int8 (gatherable)."""
+        embedding bf16."""
         from executorch.backends.cuda.coalesced_int4_tensor import (
             CudaCoalescedInt4Tensor,
         )
         from executorch.backends.cuda.dp4a_planar_int6_tensor import (
             CudaDp4aPlanarInt6Tensor,
         )
-        from torchao.quantization import IntxUnpackedToInt8Tensor
 
         with tempfile.TemporaryDirectory() as tmp:
             model, _ = self._load(tmp)
@@ -264,49 +263,11 @@ def test_load_converts_weights(self):
         self.assertIsInstance(
             model.layers[0].mlp.down_proj.weight.data, CudaDp4aPlanarInt6Tensor
         )
-        # Tied lm_head keeps a packed int6 matmul weight.
+        # Tied lm_head is repacked to int6 by pack_cuda (it keeps quantization,
+        # unlike the token embedding which is dequantized for the gather).
         self.assertIsInstance(model.lm_head.weight.data, CudaDp4aPlanarInt6Tensor)
-        # Token embedding is decoded to a gatherable int8 tensor (not bf16): the
-        # Q6_K decode is lossless and shared with lm_head. Keeping it int8 (vs
-        # bf16) avoids a ~5.6 GB fp32 dequant transient and ~1.4 GB resident at
-        # export time.
-        self.assertIsInstance(model.embed_tokens.weight.data, IntxUnpackedToInt8Tensor)
-
-    def test_int8_embedding_matches_bf16(self):
-        """Guard the bf16 -> int8 token-embedding switch.
-
-        The embedding is now loaded as a gatherable int8 ``IntxUnpackedToInt8Tensor``
-        instead of being dequantized to bf16. Its gathered rows must match the bf16
-        dequant of the *source* GGUF token embedding -- i.e. exactly what the old
-        ``dequantize_weight(..., bf16)`` path returned. The GGUF decode is lossless,
-        so they agree to bf16 precision.
-        """
-        from executorch.examples.models.gemma4_31b.gguf_loader import gguf_to_model_key
-        from executorch.extension.llm.export.gguf import ExportableGGUFTensor, iter_gguf
-        from torchao.quantization import IntxUnpackedToInt8Tensor
-
-        with tempfile.TemporaryDirectory() as tmp:
-            path = os.path.join(tmp, "tiny.gguf")
-            build_gguf_checkpoint(path)
-            # Reference = bf16 dequant of the source GGUF token embedding (the
-            # tensor the previous bf16 embedding path materialized).
-            ref_bf16 = None
-            for name, val in iter_gguf(path):
-                if gguf_to_model_key(name) == "embed_tokens.weight":
-                    self.assertIsInstance(val, ExportableGGUFTensor)
-                    ref_bf16 = val.dequantize(torch.bfloat16)
-                    break
-            self.assertIsNotNone(ref_bf16, "token_embd.weight not found in GGUF")
-            model, _ = load_gguf_model(path, backend="cuda", config=GGUF_CONFIG)
-
-        self.assertIsInstance(model.embed_tokens.weight.data, IntxUnpackedToInt8Tensor)
-
-        ids = torch.tensor([0, 1, 7, GGUF_CONFIG.vocab_size - 1])
-        out = model.embed_tokens(ids)  # int8 gather + dequant
-        ref = ref_bf16[ids]
-        self.assertEqual(out.shape, ref.shape)
-        rel_err = (out.float() - ref.float()).abs().mean() / ref.float().abs().mean()
-        self.assertLess(rel_err.item(), 0.02)
+        # Token embedding is dequantized to bf16 (Int4/packed-int6 can't gather).
+        self.assertEqual(model.embed_tokens.weight.dtype, torch.bfloat16)
 
     def test_generate(self):
         """GGUF -> CUDA -> eager generate produces valid tokens (inference.py)."""
diff --git a/examples/models/parakeet/CMakeLists.txt b/examples/models/parakeet/CMakeLists.txt
index a2b798de557..810f2815abd 100644
--- a/examples/models/parakeet/CMakeLists.txt
+++ b/examples/models/parakeet/CMakeLists.txt
@@ -109,49 +109,32 @@ if(EXECUTORCH_BUILD_VULKAN)
   executorch_target_link_options_shared_lib(vulkan_backend)
 endif()
 
-set(parakeet_shared_sources parakeet_transcriber.cpp timestamp_utils.cpp
-                            tokenizer_utils.cpp
-)
-
-set(parakeet_common_include_directories
-    ${_common_include_directories} ${EXECUTORCH_ROOT}/third-party/json/include
-)
-
-add_executable(parakeet_runner main.cpp ${parakeet_shared_sources})
-add_executable(
-  parakeet_helper parakeet_helper.cpp parakeet_helper_protocol.cpp
-                  ${parakeet_shared_sources}
-)
-
-foreach(parakeet_target parakeet_runner parakeet_helper)
-  if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
-    target_link_options_gc_sections(${parakeet_target})
-    if(NOT APPLE AND NOT MSVC)
-      target_link_options(${parakeet_target} PRIVATE "LINKER:-s")
-    endif()
+add_executable(parakeet_runner main.cpp timestamp_utils.cpp tokenizer_utils.cpp)
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(parakeet_runner)
+  if(NOT APPLE AND NOT MSVC)
+    target_link_options(parakeet_runner PRIVATE "LINKER:-s")
   endif()
+endif()
 
-  if(TARGET mlxdelegate)
-    executorch_target_copy_mlx_metallib(${parakeet_target})
-  endif()
+# Copy MLX metallib for runtime if MLX delegate is enabled
+if(TARGET mlxdelegate)
+  executorch_target_copy_mlx_metallib(parakeet_runner)
+endif()
 
-  target_include_directories(
-    ${parakeet_target} PUBLIC ${parakeet_common_include_directories}
-  )
-  target_link_libraries(${parakeet_target} PUBLIC ${link_libraries})
-  target_compile_options(${parakeet_target} PUBLIC ${_common_compile_options})
-endforeach()
+target_include_directories(
+  parakeet_runner PUBLIC ${_common_include_directories}
+)
+target_link_libraries(parakeet_runner PUBLIC ${link_libraries})
+target_compile_options(parakeet_runner PUBLIC ${_common_compile_options})
 
 # On Windows, copy required DLLs to the executable directory
 if(MSVC AND EXECUTORCH_BUILD_CUDA)
-  foreach(parakeet_target parakeet_runner parakeet_helper)
-    add_custom_command(
-      TARGET ${parakeet_target}
-      POST_BUILD
-      COMMAND
-        ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:aoti_cuda_shims>
-        $<TARGET_FILE_DIR:${parakeet_target}>
-      COMMENT "Copying aoti_cuda_shims.dll to ${parakeet_target} directory"
-    )
-  endforeach()
+  add_custom_command(
+    TARGET parakeet_runner
+    POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:aoti_cuda_shims>
+            $<TARGET_FILE_DIR:parakeet_runner>
+    COMMENT "Copying aoti_cuda_shims.dll to parakeet_runner directory"
+  )
 endif()
diff --git a/examples/models/parakeet/CMakePresets.json b/examples/models/parakeet/CMakePresets.json
index 90a90fbbdf5..87ace61e315 100644
--- a/examples/models/parakeet/CMakePresets.json
+++ b/examples/models/parakeet/CMakePresets.json
@@ -89,42 +89,42 @@
             "displayName": "Build Parakeet runner (CPU)",
             "configurePreset": "parakeet-cpu",
             "configuration": "Release",
-            "targets": ["parakeet_runner", "parakeet_helper"]
+            "targets": ["parakeet_runner"]
         },
         {
             "name": "parakeet-cuda",
             "displayName": "Build Parakeet runner (CUDA)",
             "configurePreset": "parakeet-cuda",
             "configuration": "Release",
-            "targets": ["parakeet_runner", "parakeet_helper"]
+            "targets": ["parakeet_runner"]
         },
         {
             "name": "parakeet-cuda-debug",
             "displayName": "Build Parakeet runner (CUDA, Debug)",
             "configurePreset": "parakeet-cuda-debug",
             "configuration": "Debug",
-            "targets": ["parakeet_runner", "parakeet_helper"]
+            "targets": ["parakeet_runner"]
         },
         {
             "name": "parakeet-metal",
             "displayName": "Build Parakeet runner (Metal)",
             "configurePreset": "parakeet-metal",
             "configuration": "Release",
-            "targets": ["parakeet_runner", "parakeet_helper"]
+            "targets": ["parakeet_runner"]
         },
         {
             "name": "parakeet-mlx",
             "displayName": "Build Parakeet runner (MLX)",
             "configurePreset": "parakeet-mlx",
             "configuration": "Release",
-            "targets": ["parakeet_runner", "parakeet_helper"]
+            "targets": ["parakeet_runner"]
         },
         {
             "name": "parakeet-vulkan",
             "displayName": "Build Parakeet runner (Vulkan)",
             "configurePreset": "parakeet-vulkan",
             "configuration": "Release",
-            "targets": ["parakeet_runner", "parakeet_helper"]
+            "targets": ["parakeet_runner"]
         }
     ],
     "workflowPresets": [
diff --git a/examples/models/parakeet/README.md b/examples/models/parakeet/README.md
index e2f09f8aa99..62cec6a9cc4 100644
--- a/examples/models/parakeet/README.md
+++ b/examples/models/parakeet/README.md
@@ -242,11 +242,6 @@ make parakeet-cuda
 make parakeet-mlx
 ```
 
-Each Parakeet build now produces both:
-
-- `parakeet_runner` for one-shot CLI transcription from an audio file
-- `parakeet_helper` for long-lived host integrations that keep the model warm and stream PCM requests over stdin/stdout
-
 On Windows (PowerShell), use CMake workflow presets directly:
 
 ```powershell
@@ -315,26 +310,6 @@ If your generator is single-config, the runner may be at `.\cmake-out\examples\m
 | `--data_path` | Path to data file (.ptd) for delegate data (required for CUDA/CUDA-Windows) |
 | `--timestamps`     | Timestamp output mode: `none\|token\|word\|segment\|all` (default: `segment`) |
 
-### Persistent Helper
-
-The helper binary uses the same Parakeet transcription stack as `parakeet_runner`,
-but keeps the model loaded across multiple requests so host apps can avoid repeated
-startup and model load overhead.
-
-Example:
-
-```bash
-# Metal
-DYLD_LIBRARY_PATH=/usr/lib ./cmake-out/examples/models/parakeet/parakeet_helper \
-  --model_path examples/models/parakeet/parakeet_metal/model.pte \
-  --tokenizer_path examples/models/parakeet/parakeet_metal/tokenizer.model
-```
-
-The helper accepts framed requests over stdin, validates 16 kHz mono float32 PCM
-payloads, and returns status/result messages over stdout. It is intended for app
-integrations such as the macOS `ExecuWhisper` frontend in the separate
-`executorch-examples` repository.
-
 ### Mobile App
 
 Check out a [demo Android app](https://github.com/meta-pytorch/executorch-examples/tree/main/parakeet/android/ParakeetApp) for Parakeet in the separate `executorch-examples` repository.
diff --git a/examples/models/parakeet/main.cpp b/examples/models/parakeet/main.cpp
index 410ba6cea62..b8a052004e4 100644
--- a/examples/models/parakeet/main.cpp
+++ b/examples/models/parakeet/main.cpp
@@ -6,14 +6,25 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <gflags/gflags.h>
-
+#include <cmath>
+#include <cstdint>
 #include <exception>
 #include <iostream>
 #include <string>
+#include <unordered_set>
+#include <vector>
 
-#include "parakeet_transcriber.h"
+#include <gflags/gflags.h>
 
+#include "timestamp_utils.h"
+#include "tokenizer_utils.h"
+#include "types.h"
+
+#include <executorch/extension/asr/runner/transducer_runner.h>
+#include <executorch/extension/llm/runner/wav_loader.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor_ptr_maker.h>
+#include <executorch/runtime/core/evalue.h>
 #include <executorch/runtime/platform/log.h>
 #ifdef ET_BUILD_METAL
 #include <executorch/backends/apple/metal/runtime/stats.h>
@@ -33,17 +44,69 @@ DEFINE_string(
     timestamps,
     "segment",
     "Timestamp output mode: none|token|word|segment|all");
-DEFINE_bool(
-    runtime_profile,
-    false,
-    "Print a detailed runtime profile for preprocessor, encoder, and decode-loop execution.");
+
+using ::executorch::extension::from_blob;
+using ::executorch::extension::Module;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::EValue;
+
+using ::parakeet::TextWithOffsets;
+using ::parakeet::TokenWithTextInfo;
+
+namespace {
+// TDT duration values for Parakeet models; main() copies them into
+// TransducerConfig::durations before constructing the runner.
+const std::vector<int> DURATIONS = {0, 1, 2, 3, 4};
+
+// Selects which timestamp granularities main() prints after transcription.
+struct TimestampOutputMode {
+  bool token = false;
+  bool word = false;
+  bool segment = false;
+
+  // True when at least one granularity was requested.
+  bool enabled() const {
+    return token || word || segment;
+  }
+};
+
+// Returns a copy of `s` with every character passed through std::tolower.
+std::string to_lower_ascii(std::string s) {
+  for (char& ch : s) {
+    // Cast first: std::tolower requires an unsigned-char-range value.
+    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+  }
+  return s;
+}
+
+// Parses the --timestamps flag value (case-insensitive) into a
+// TimestampOutputMode. Accepts none, token, word, segment, or all.
+// Throws std::invalid_argument when the value is empty or unrecognized.
+TimestampOutputMode parse_timestamp_output_mode(const std::string& raw_arg) {
+  // Shared suffix so both errors list every accepted value, incl. "none".
+  const std::string expected = " Expected: none, token, word, segment, all.";
+  if (raw_arg.empty()) {
+    throw std::invalid_argument(
+        "Invalid --timestamps value (empty)." + expected);
+  }
+  const std::string mode = to_lower_ascii(raw_arg);
+  const bool all = mode == "all";
+  const TimestampOutputMode parsed{
+      all || mode == "token", all || mode == "word", all || mode == "segment"};
+  if (parsed.enabled() || mode == "none") {
+    return parsed;
+  }
+  throw std::invalid_argument(
+      "Invalid --timestamps value '" + raw_arg + "'." + expected);
+}
+} // namespace
 
 int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
-  parakeet::TimestampOutputMode timestamp_mode;
+  TimestampOutputMode timestamp_mode;
   try {
-    timestamp_mode = parakeet::parse_timestamp_output_mode(FLAGS_timestamps);
+    timestamp_mode = parse_timestamp_output_mode(FLAGS_timestamps);
   } catch (const std::invalid_argument& e) {
     ET_LOG(Error, "%s", e.what());
     return 1;
@@ -54,57 +117,162 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  try {
-    parakeet::ParakeetTranscriber transcriber(
-        FLAGS_model_path, FLAGS_tokenizer_path, FLAGS_data_path);
-    const auto result = transcriber.transcribe_wav_path(
-        FLAGS_audio_path,
-        parakeet::TranscribeConfig{timestamp_mode, FLAGS_runtime_profile});
-
-    std::cout << "Transcribed text: " << result.text << std::endl;
-    if (!result.stats_json.empty()) {
-      std::cout << "PyTorchObserver " << result.stats_json << std::endl;
-    }
-    if (result.runtime_profile_report.has_value()) {
-      std::cout << *result.runtime_profile_report;
-    }
+  // --- Build config and runner ---
+  executorch::extension::asr::TransducerConfig config;
+  config.durations = DURATIONS;
 
-#ifdef ET_BUILD_METAL
-    executorch::backends::metal::print_metal_backend_stats();
-#endif
+  std::optional<std::string> data_path_opt;
+  if (!FLAGS_data_path.empty()) {
+    data_path_opt = FLAGS_data_path;
+  }
 
-    if (timestamp_mode.segment) {
-      std::cout << "\nSegment timestamps:" << std::endl;
-      for (const auto& segment : result.segment_offsets) {
-        const double start = segment.start_offset * result.frame_to_seconds;
-        const double end = segment.end_offset * result.frame_to_seconds;
-        std::cout << start << "s - " << end << "s : " << segment.text
-                  << std::endl;
-      }
-    }
+  executorch::extension::asr::TransducerRunner runner(
+      FLAGS_model_path, FLAGS_tokenizer_path, config, data_path_opt);
 
-    if (timestamp_mode.word) {
-      std::cout << "\nWord timestamps:" << std::endl;
-      for (const auto& word : result.word_offsets) {
-        const double start = word.start_offset * result.frame_to_seconds;
-        const double end = word.end_offset * result.frame_to_seconds;
-        std::cout << start << "s - " << end << "s : " << word.text << std::endl;
-      }
-    }
+  auto load_err = runner.load();
+  if (load_err != Error::Ok) {
+    ET_LOG(Error, "Failed to load model.");
+    return 1;
+  }
 
-    if (timestamp_mode.token) {
-      std::cout << "\nToken timestamps:" << std::endl;
-      for (const auto& token : result.token_offsets) {
-        const double start = token.start_offset * result.frame_to_seconds;
-        const double end = token.end_offset * result.frame_to_seconds;
-        std::cout << start << "s - " << end << "s : " << token.decoded_text
-                  << std::endl;
-      }
-    }
+  // --- Load and preprocess audio ---
+  ET_LOG(Info, "Loading audio from: %s", FLAGS_audio_path.c_str());
+  std::vector<float> audio_data =
+      ::executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path);
+  ET_LOG(Info, "Loaded %zu audio samples", audio_data.size());
+
+  auto audio_tensor = from_blob(
+      audio_data.data(),
+      {static_cast<::executorch::aten::SizesType>(audio_data.size())},
+      ::executorch::aten::ScalarType::Float);
+
+  ET_LOG(Info, "Running preprocessor...");
+  auto preprocess_result = runner.preprocess(audio_tensor);
+  if (!preprocess_result.ok()) {
+    ET_LOG(Error, "Preprocessing failed.");
+    return 1;
+  }
+  auto preprocess_out = preprocess_result.get();
 
+  // --- Transcribe ---
+  ET_LOG(Info, "Running TDT greedy decode...");
+  auto result = runner.transcribe(
+      preprocess_out.features,
+      [](const std::string& piece) { std::cout << piece << std::flush; },
+      preprocess_out.length);
+
+  if (!result.ok()) {
+    ET_LOG(Error, "Transcription failed.");
+    return 1;
+  }
+
+  auto& decoded_tokens = result.get();
+  ET_LOG(Info, "Decoded %zu tokens", decoded_tokens.size());
+
+  // Use the runner's tokenizer for text decoding and timestamps
+  const auto* tokenizer = runner.tokenizer();
+  if (!tokenizer || !tokenizer->is_loaded()) {
+    ET_LOG(Error, "Tokenizer not available.");
+    return 1;
+  }
+
+  // Print full transcribed text
+  std::string text = parakeet::tokenizer_utils::decode_token_sequence(
+      decoded_tokens, *tokenizer);
+  std::cout << "\nTranscribed text: " << text << std::endl;
+
+#ifdef ET_BUILD_METAL
+  executorch::backends::metal::print_metal_backend_stats();
+#endif // ET_BUILD_METAL
+
+  if (!timestamp_mode.enabled()) {
     return 0;
+  }
+
+  // --- Timestamps ---
+  // Query timestamp-related metadata from the model.
+  // These are Parakeet-specific constants, not part of TransducerRunner.
+  std::unique_ptr<Module> meta_module;
+  if (data_path_opt) {
+    meta_module = std::make_unique<Module>(
+        FLAGS_model_path, *data_path_opt, Module::LoadMode::Mmap);
+  } else {
+    meta_module =
+        std::make_unique<Module>(FLAGS_model_path, Module::LoadMode::Mmap);
+  }
+  auto meta_load_err = meta_module->load();
+  if (meta_load_err != Error::Ok) {
+    ET_LOG(Error, "Failed to load model for timestamp metadata.");
+    return 1;
+  }
+
+  std::vector<::executorch::runtime::EValue> empty_inputs;
+  auto window_stride_result =
+      meta_module->execute("window_stride", empty_inputs);
+  auto encoder_subsampling_factor_result =
+      meta_module->execute("encoder_subsampling_factor", empty_inputs);
+
+  if (!window_stride_result.ok() || !encoder_subsampling_factor_result.ok()) {
+    ET_LOG(
+        Error,
+        "Failed to query timestamp metadata (window_stride, encoder_subsampling_factor).");
+    return 1;
+  }
+
+  double window_stride = window_stride_result.get()[0].toDouble();
+  int64_t encoder_subsampling_factor =
+      encoder_subsampling_factor_result.get()[0].toInt();
+  meta_module.reset();
+
+  ET_LOG(Info, "Computing timestamps...");
+  std::unordered_set<std::string> supported_punctuation =
+      parakeet::tokenizer_utils::derive_supported_punctuation(*tokenizer);
+
+  std::vector<TokenWithTextInfo> tokens_with_text_info;
+  try {
+    tokens_with_text_info =
+        parakeet::timestamp_utils::get_tokens_with_text_info(
+            decoded_tokens, *tokenizer, supported_punctuation);
   } catch (const std::exception& e) {
-    ET_LOG(Error, "%s", e.what());
+    ET_LOG(Error, "Failed to get tokens with text info: %s", e.what());
     return 1;
   }
+  const auto word_offsets = parakeet::timestamp_utils::get_words_offsets(
+      tokens_with_text_info, *tokenizer, supported_punctuation);
+  const auto segment_offsets =
+      parakeet::timestamp_utils::get_segment_offsets(word_offsets);
+
+  const double frame_to_seconds =
+      window_stride * static_cast<double>(encoder_subsampling_factor);
+
+  if (timestamp_mode.segment) {
+    std::cout << "\nSegment timestamps:" << std::endl;
+    for (const auto& segment : segment_offsets) {
+      const double start = segment.start_offset * frame_to_seconds;
+      const double end = segment.end_offset * frame_to_seconds;
+      std::cout << start << "s - " << end << "s : " << segment.text
+                << std::endl;
+    }
+  }
+
+  if (timestamp_mode.word) {
+    std::cout << "\nWord timestamps:" << std::endl;
+    for (const auto& word : word_offsets) {
+      const double start = word.start_offset * frame_to_seconds;
+      const double end = word.end_offset * frame_to_seconds;
+      std::cout << start << "s - " << end << "s : " << word.text << std::endl;
+    }
+  }
+
+  if (timestamp_mode.token) {
+    std::cout << "\nToken timestamps:" << std::endl;
+    for (const auto& token : tokens_with_text_info) {
+      const double start = token.start_offset * frame_to_seconds;
+      const double end = token.end_offset * frame_to_seconds;
+      std::cout << start << "s - " << end << "s : " << token.decoded_text
+                << std::endl;
+    }
+  }
+
+  return 0;
 }
diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt
index 726657a3779..e1b54d644b2 100644
--- a/examples/models/qwen3_5_moe/CMakeLists.txt
+++ b/examples/models/qwen3_5_moe/CMakeLists.txt
@@ -54,14 +54,9 @@ elseif(EXECUTORCH_BUILD_CUDA)
   list(APPEND link_libraries aoti_cuda_backend)
   executorch_target_link_options_shared_lib(aoti_cuda_backend)
   add_compile_definitions(EXECUTORCH_BUILD_CUDA)
-elseif(TARGET mlxdelegate)
-  list(APPEND link_libraries mlxdelegate mlx)
-  executorch_target_link_options_shared_lib(mlxdelegate)
-  add_compile_definitions(EXECUTORCH_BUILD_MLX)
 else()
   message(
-    FATAL_ERROR
-      "Set EXECUTORCH_BUILD_CUDA=ON, EXECUTORCH_BUILD_METAL=ON, or EXECUTORCH_BUILD_MLX=ON"
+    FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_METAL=ON"
   )
 endif()
 
@@ -74,21 +69,9 @@ target_include_directories(
 )
 target_link_libraries(qwen3_5_moe_runner PUBLIC ${link_libraries})
 
-add_executable(qwen3_5_moe_worker qwen35_moe_worker.cpp qwen35_moe_engine.cpp)
-target_include_directories(
-  qwen3_5_moe_worker PUBLIC ${_common_include_directories} ${_json_include}
-)
-target_link_libraries(qwen3_5_moe_worker PUBLIC ${link_libraries})
-
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(qwen3_5_moe_runner)
   target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s")
-  target_link_options_gc_sections(qwen3_5_moe_worker)
-  target_link_options(qwen3_5_moe_worker PRIVATE "LINKER:-s")
-endif()
-
-if(TARGET mlxdelegate)
-  executorch_target_copy_mlx_metallib(qwen3_5_moe_runner)
 endif()
 
 if(EXECUTORCH_BUILD_CUDA)
diff --git a/examples/models/qwen3_5_moe/CMakePresets.json b/examples/models/qwen3_5_moe/CMakePresets.json
index 276c2116148..0d6de7f60eb 100644
--- a/examples/models/qwen3_5_moe/CMakePresets.json
+++ b/examples/models/qwen3_5_moe/CMakePresets.json
@@ -36,42 +36,19 @@
                 "type": "equals",
                 "rhs": "Darwin"
             }
-        },
-        {
-            "name": "qwen3-5-moe-mlx",
-            "displayName": "Qwen3.5 MoE runner (MLX)",
-            "inherits": ["qwen3-5-moe-base"],
-            "cacheVariables": {
-                "EXECUTORCH_BUILD_MLX": "ON"
-            },
-            "condition": {
-                "type": "equals",
-                "lhs": "${hostSystemName}",
-                "rhs": "Darwin"
-            }
         }
     ],
     "buildPresets": [
         {
             "name": "qwen3-5-moe-cuda",
-            "displayName": "Build Qwen3.5 MoE runner, worker, and no-bleed test (CUDA)",
+            "displayName": "Build Qwen3.5 MoE runner + no-bleed test (CUDA)",
             "configurePreset": "qwen3-5-moe-cuda",
-            "targets": [
-                "qwen3_5_moe_runner",
-                "qwen3_5_moe_worker",
-                "test_qwen35_moe_nobleed"
-            ]
+            "targets": ["qwen3_5_moe_runner", "test_qwen35_moe_nobleed"]
         },
         {
             "name": "qwen3-5-moe-metal",
-            "displayName": "Build Qwen3.5 MoE runner and worker (Metal)",
+            "displayName": "Build Qwen3.5 MoE runner (Metal)",
             "configurePreset": "qwen3-5-moe-metal",
-            "targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"]
-        },
-        {
-            "name": "qwen3-5-moe-mlx",
-            "displayName": "Build Qwen3.5 MoE runner (MLX)",
-            "configurePreset": "qwen3-5-moe-mlx",
             "targets": ["qwen3_5_moe_runner"]
         }
     ],
@@ -103,20 +80,6 @@
                     "name": "qwen3-5-moe-metal"
                 }
             ]
-        },
-        {
-            "name": "qwen3-5-moe-mlx",
-            "displayName": "Configure and build Qwen3.5 MoE runner (MLX)",
-            "steps": [
-                {
-                    "type": "configure",
-                    "name": "qwen3-5-moe-mlx"
-                },
-                {
-                    "type": "build",
-                    "name": "qwen3-5-moe-mlx"
-                }
-            ]
         }
     ]
 }
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
index c275641bfd7..e3f13cc77d6 100644
--- a/examples/models/qwen3_5_moe/README.md
+++ b/examples/models/qwen3_5_moe/README.md
@@ -147,56 +147,6 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
 `--cuda_graph` is intentionally single-session only. CUDA graph replay captures
 device pointers, so it is not combined with per-session mutable-state rebinding.
 
-## OpenAI-compatible serving
-
-The CUDA build also produces `qwen3_5_moe_worker`, a C++ model-execution worker
-used by the generic `examples/llm_server` control plane. The Qwen launcher wires
-in the model's Hugging Face chat template and Qwen XML tool-call parser:
-
-```bash
-python -m executorch.examples.models.qwen3_5_moe.serve \
-    --model-path qwen35_moe_exports/model.pte \
-    --data-path qwen35_moe_exports/aoti_cuda_blob.ptd \
-    --tokenizer-path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
-    --hf-tokenizer ~/models/Qwen3.5-35B-A3B \
-    --model-id qwen3.5-moe \
-    --max-context 4096 \
-    --max-sessions 4 \
-    --no-think
-```
-
-`--max-sessions` controls how many isolated sessions the worker can host on one
-weight load. One slot is reserved for anonymous requests; clients should send a
-stable `session_id` (or session-affinity header) to get per-conversation
-isolation and warm append-only resume.
-
-### Use from pi
-
-Point pi at the Qwen server via `~/.pi/agent/models.json`:
-
-```json
-{
-  "providers": {
-    "executorch": {
-      "baseUrl": "http://127.0.0.1:8000/v1",
-      "api": "openai-completions",
-      "apiKey": "x",
-      "models": [
-        {
-          "id": "qwen3.5-moe",
-          "compat": { "sendSessionAffinityHeaders": true }
-        }
-      ]
-    }
-  }
-}
-```
-
-The model id must match `--model-id`. `sendSessionAffinityHeaders` lets pi route
-each conversation or subagent to a stable server session; without it, requests
-use the anonymous scratch session and do not get per-conversation isolation or
-warm resume.
-
 ### CUDA no-bleed test
 
 The CUDA build also produces `test_qwen35_moe_nobleed`, which validates that two
@@ -261,38 +211,7 @@ python export.py \
 | `--qembedding` | (none) | Embedding quantization: `8w` |
 | `--tiny-test` | off | Build tiny model with random weights for CI testing |
 
-### Build (MLX)
-
-Like the CUDA/Metal builds, the `make` target builds ExecuTorch core with the
-MLX backend and the runner binary. Requires Apple Silicon (Darwin).
-
-```bash
-make qwen3_5_moe-mlx
-```
-
-This builds ExecuTorch with MLX support, then the runner binary at
-`cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner` (with `mlx.metallib`
-copied next to it). Unlike CUDA, the MLX `.pte` is self-contained — no `.ptd`
-data file is produced or needed.
-
-### Run (MLX, C++ runner)
-
-The C++ runner requires a local HuggingFace `tokenizer.json` (the MLX `.pte` and
-a `tokenizer.json`; no `--data_path`):
-
-```bash
-cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
-    --model_path ./qwen35_moe_mlx/model.pte \
-    --tokenizer_path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
-    --prompt "What is the capital of France?" \
-    --max_new_tokens 50
-```
-
-The MLX export emits a single dynamic-seq `forward` method; the runner loads and
-calls it for both prefill and decode (sampling on host), matching the Python
-runner. See the [Run](#run) section above for the full flag list.
-
-### Run (MLX, Python)
+### Run (MLX)
 
 ```bash
 python -m executorch.examples.models.qwen3_5_moe.run \
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
index 566d61e6cfc..d7e7d9ca293 100644
--- a/examples/models/qwen3_5_moe/export.py
+++ b/examples/models/qwen3_5_moe/export.py
@@ -768,16 +768,10 @@ def _export_mlx(model, config, args):
     gc.collect()
 
     print("Lowering to ExecuTorch with MLX backend...")
-    # Largest prefill chunk the runner may submit in one forward call. The MLX
-    # runner chunks long prompts to cap peak memory; bound it by the compiled
-    # dynamic max (max_seq_len - 1) so a chunk can never exceed what `forward`
-    # was compiled for.
-    max_prefill_chunk = min(1024, config.max_seq_len - 1)
     metadata = {
         "get_max_seq_len": config.max_seq_len,
         "get_vocab_size": config.vocab_size,
         "get_n_layers": config.num_hidden_layers,
-        "get_max_prefill_chunk": max_prefill_chunk,
         "use_kv_cache": True,
         "use_sdpa_with_kv_cache": False,
         "enable_dynamic_shape": True,
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
index 713f6211330..3c5b2eec439 100644
--- a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
+++ b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
@@ -19,8 +19,6 @@
 #include <cmath>
 #include <cstring>
 
-#include <algorithm>
-
 #ifdef EXECUTORCH_BUILD_CUDA
 #include <cuda_runtime.h>
 #include <executorch/backends/cuda/runtime/cuda_mutable_state.h>
@@ -41,22 +39,6 @@ using SizesType = executorch::aten::SizesType;
 
 namespace {
 
-#ifdef EXECUTORCH_BUILD_MLX
-// The MLX export emits a single dynamic-seq `forward` method that handles both
-// prefill (T>=2) and decode (T=1). Mirror gemma4_31b's MLX runner, which loads
-// and calls `forward` for both phases.
-constexpr const char* kPrefillMethod = "forward";
-constexpr const char* kDecodeMethod = "forward";
-#else
-// CUDA/Metal exports emit two separate methods.
-constexpr const char* kPrefillMethod = "prefill";
-constexpr const char* kDecodeMethod = "decode";
-#endif
-
-// Constant method exported by the MLX .pte giving the largest prefill chunk the
-// `forward` method was compiled for. Read into the metadata map in create().
-constexpr const char* kMaxPrefillChunk = "get_max_prefill_chunk";
-
 Result<uint64_t> read_sampled_token(
     const executorch::aten::Tensor& output,
     float temperature) {
@@ -116,10 +98,8 @@ Result<std::unique_ptr<Module>> build_qwen_module(
   }
 #endif
 
-  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kPrefillMethod));
-  if (std::string(kDecodeMethod) != std::string(kPrefillMethod)) {
-    ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kDecodeMethod));
-  }
+  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("prefill"));
+  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("decode"));
   return module;
 }
 
@@ -260,63 +240,34 @@ class Qwen35MoESession : public LLMSession {
     }
 
     stop_.store(false, std::memory_order_relaxed);
-
-    // On MLX, run prefill in fixed-size chunks (caps peak memory and the
-    // compiled prefill shape). Other backends prefill the whole prompt in one
-    // pass. Only the final chunk's sampled token is kept; the recurrence/KV
-    // state from earlier chunks persists via pos_ advancement.
-#ifdef EXECUTORCH_BUILD_MLX
-    // Chunk size: default to the compiled max (kMaxSeqLen - 1), overridden by
-    // the exported get_max_prefill_chunk constant when present (mirrors
-    // gemma4_31b). Falls back to T (single pass) if no metadata is available at
-    // all.
-    int64_t chunk_size = T;
-    if (auto it = metadata_.find(kMaxSeqLen);
-        it != metadata_.end() && it->second > 1) {
-      chunk_size = it->second - 1;
-    }
-    if (auto it = metadata_.find(kMaxPrefillChunk);
-        it != metadata_.end() && it->second > 0) {
-      chunk_size = it->second;
+    std::vector<int64_t> token_data(tokens.begin(), tokens.end());
+    std::vector<int64_t> pos_data(T);
+    for (int64_t i = 0; i < T; ++i) {
+      pos_data[i] = pos_ + i;
     }
-#else
-    const int64_t chunk_size = T;
-#endif
-
-    uint64_t sampled_token = 0;
-    for (int64_t off = 0; off < T; off += chunk_size) {
-      const int64_t len = std::min(chunk_size, T - off);
-      std::vector<int64_t> token_data(
-          tokens.begin() + off, tokens.begin() + off + len);
-      std::vector<int64_t> pos_data(len);
-      for (int64_t i = 0; i < len; ++i) {
-        pos_data[i] = pos_ + i;
-      }
-      auto tokens_tensor = from_blob(
-          token_data.data(),
-          {1, static_cast<SizesType>(len)},
-          executorch::aten::ScalarType::Long);
-      auto pos_tensor = from_blob(
-          pos_data.data(),
-          {static_cast<SizesType>(len)},
-          executorch::aten::ScalarType::Long);
-
-      const char* method = (len >= 2) ? kPrefillMethod : kDecodeMethod;
-      std::vector<EValue> inputs;
-      inputs.push_back(tokens_tensor);
-      inputs.push_back(pos_tensor);
+    auto tokens_tensor = from_blob(
+        token_data.data(),
+        {1, static_cast<SizesType>(T)},
+        executorch::aten::ScalarType::Long);
+    auto pos_tensor = from_blob(
+        pos_data.data(),
+        {static_cast<SizesType>(T)},
+        executorch::aten::ScalarType::Long);
+
+    const char* method = (T >= 2) ? "prefill" : "decode";
+    std::vector<EValue> inputs;
+    inputs.push_back(tokens_tensor);
+    inputs.push_back(pos_tensor);
 #ifdef EXECUTORCH_BUILD_CUDA
-      set_temp(first_token_temp);
-      inputs.push_back(EValue(temp_tensor_));
+    set_temp(first_token_temp);
+    inputs.push_back(EValue(temp_tensor_));
 #endif
-      auto sampled =
-          run_locked(method, inputs, first_token_temp, /*sync_after=*/true);
-      ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
-      sampled_token = sampled.get();
-      pos_ += len;
-    }
-    pending_ = sampled_token;
+    auto sampled =
+        run_locked(method, inputs, first_token_temp, /*sync_after=*/true);
+    ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
+    pending_ = sampled.get();
     prev_decode_token_.reset();
+    pos_ += T;
     return Error::Ok;
   }
 
@@ -383,7 +334,7 @@ class Qwen35MoESession : public LLMSession {
     inputs.push_back(EValue(temp_tensor_));
 #endif
     auto sampled =
-        run_locked(kDecodeMethod, inputs, temperature_, /*sync_after=*/false);
+        run_locked("decode", inputs, temperature_, /*sync_after=*/false);
     ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
     pending_ = sampled.get();
     prev_decode_token_ = token;
@@ -506,14 +457,6 @@ Result<std::unique_ptr<Qwen35MoEEngine>> Qwen35MoEEngine::create(
     ET_LOG(Error, "Qwen35MoEEngine: failed to read metadata");
     return metadata_result.error();
   }
-#ifdef EXECUTORCH_BUILD_MLX
-  // Surface the compiled max prefill chunk (a constant method get_llm_metadata
-  // doesn't harvest) into the metadata map so the session can chunk long
-  // prompts within the shape `forward` was compiled for.
-  if (auto mpc = meta_module->get(kMaxPrefillChunk); mpc.ok()) {
-    metadata_result.get()[kMaxPrefillChunk] = mpc->toScalar().to<int64_t>();
-  }
-#endif
   auto eos_ids = get_eos_ids(tokenizer.get(), meta_module.get());
   // This export's metadata doesn't carry the chat-turn EOS (config.json has no
   // eos_token_id and the .pte exports no get_eos_ids method), so get_eos_ids()
diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md
index 4a5d1fd023d..8e0dc70bbb5 100644
--- a/examples/qualcomm/oss_scripts/llama/README.md
+++ b/examples/qualcomm/oss_scripts/llama/README.md
@@ -130,12 +130,12 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL
 Default example using hybrid mode.
 ```bash
 python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_tasks wikitext --calib_limit 1
-
+```
 
 #### Codegen2
 Default example using kv mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model codegen2_1b --model_mode kv --max_seq_len 1024 --prompt "def hello_world():" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/text.json
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model codegen2_1b --model_mode kv --max_seq_len 1024 --prompt "def hello_world():"
 ```
 
 #### Gemma 2B
@@ -210,17 +210,7 @@ Default example using hybrid mode.
 python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm3-3b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_tasks wikitext --calib_limit 1
 ```
 
-#### Using custom calibration samples for LLMs
-
-Instead of `--calib_tasks`, you can supply your own conversation JSON files via `--calib_samples`. The samples are fed into the quantization calibration pass to collect activation observer statistics — they do not affect the inference prompt. This is useful when you want to calibrate on domain-specific or instruct-format data rather than a generic lm_eval task.
-
-```bash
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm2_135m --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/text.json
-```
-
-You can also provide both `--calib_tasks` and `--calib_samples` at the same time; the pipeline concatenates both data sources for calibration.
-
-
+## Multimodal Support
 
 ### Overview
 
@@ -278,7 +268,7 @@ pip install soundfile
 
 Default example using hybrid mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model granite_speech_3_3-2b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "can you transcribe the speech into a written format?" --audio_path "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/audio.json
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model granite_speech_3_3-2b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "can you transcribe the speech into a written format?" --audio_path "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true"
 ```
 
 ### Specifying Custom Audio
@@ -291,6 +281,9 @@ You can specify a custom audio file for ALM models using the `--audio_path` flag
 - **Local file paths**: Absolute or relative paths to `.wav` files on your system
   - Example: `"/path/to/your/audio.wav"`
 
+**Default behavior:**
+If `--audio_path` is not specified, the default audio URL (`audio_url`) defined in the model's encoder configuration (`encoder/encoder_config.py`) is used.
+
 #### Audio Preprocessing
 
 The audio encoder configuration is defined in `encoder/encoder_config.py`:
@@ -301,6 +294,7 @@ The audio encoder configuration is defined in `encoder/encoder_config.py`:
 class GraniteSpeechEncoder(AudioModalityConfig):
     encoder_class = GraniteSpeechCTCEncoderWrapper
     audio_seq_len = 171
+    audio_url = "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true"  # Default audio (content: "After his nap, ...")
     quant_recipe = GraniteSpeechEncoderQuantRecipe
 ```
 
@@ -357,13 +351,13 @@ Vision-Language Models (VLMs) combine computer vision and natural language proce
 #### SmolVLM 500M
 Default example using hybrid mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode hybrid --prefill_ar_len 16 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode hybrid --prefill_ar_len 16 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
 ```
 
 #### InternVL 1B
 Default example using hybrid mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model internvl3_1b --model_mode hybrid --prefill_ar_len 32 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model internvl3_1b --model_mode hybrid --prefill_ar_len 32 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "http://images.cocodataset.org/val2017/000000039769.jpg"
 ```
 
 ### Specifying Custom Image
@@ -376,6 +370,9 @@ Take a example image of Statue-of-Liberty in New York Bay
 - **Local file paths**: Absolute or relative paths to image files on your system
   - Example: [`./examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png`](assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png)
 
+**Default behavior:**
+If `--image_path` is not specified, the default image URL (`img_url`) defined in the model's encoder configuration (`encoder/encoder_config.py`) is used.
+
 #### Image Preprocessing
 
 Each VLM model has specific preprocessing requirements defined in its configuration:
@@ -388,6 +385,7 @@ class SmolVLMEncoder(VisionModalityConfig):
     img_seq_len = 64
     img_resized_h = 512
     img_resized_w = 512
+    img_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"  # Default image
     quant_recipe = SmolVLMEncoderQuantRecipe
 ```
 
@@ -429,7 +427,7 @@ PROMPT2="Answer the question: What's the main object in first image?"
 PROMPT3="<image>Caption this image."
 
 # Execute the multi-turn conversation
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL"
 ```
 
 **How it works:**
@@ -455,19 +453,16 @@ The VLM inference pipeline consists of:
    - Special tokens (e.g., `<image>`, `<|fake_token_around_image|>`, `<fake_token_around_image>`) mark modality boundaries (see [tokenizer.py](tokenizer.py))
 
    ```python
-   # Token fields on each encoder config subclass (encoder/encoder_config.py)
-   @dataclass(init=False, frozen=True)
-   class SmolVLMEncoder(VisionModalityConfig):
-       img_token = "<image>"
-       fake_wrap_start = "<fake_token_around_image>"
-       fake_wrap_end = "<fake_token_around_image>"
-       global_img_token = "<global-img>"
-
-   @dataclass(init=False, frozen=True)
-   class InternVL3Encoder(VisionModalityConfig):
-       img_token = "<IMG_CONTEXT>"
-       fake_wrap_start = "<img>"
-       fake_wrap_end = "</img>"
+   # Special tokens for Vision-Language Model
+   VLM_SPECIAL_TOKENS = {
+       "smolvlm_500m_instruct": {
+           "image_token": "<image>",
+           "global_img": "<global-img>",
+           "fake_wrap_start": "<fake_token_around_image>",
+           "fake_wrap_end": "<fake_token_around_image>",
+       },
+       ...
+   }
    ```
    - Final fused sequence: `[batch, img_seq_len + text_seq_len, hidden_dim]`
 
@@ -550,13 +545,16 @@ From the example script above, 1 wikitext sample is used to evaluate all 3 phase
 Example:
 ```bash
 # 1st run to compile with --calib_limit 1
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --calib_tasks wikitext --calib_limit 1 -a ${FOLDER_TO_PRE_GEN_PTE} --compile_only
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --calib_tasks wikitext --calib_limit 1 --compile_only
 ```
 ```bash
 # 2nd run to perform QNN device execution with --eval_limit 3
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --eval_tasks wikitext --eval_limit 3 --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE}
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --eval_tasks wikitext --eval_limit 3 --pre_gen_pte ${PATH_TO_ARTIFACT_IN_1ST_RUN} --quant_attrs_path ${PATH_TO_ARTIFACT_IN_1ST_RUN}/kv_llama_qnn_quant_attrs.json
 ```
 
+#### Tasks quantization calibration
+If `--calib_tasks ${TASK}` is not provided, the program will use `--prompt ${PROMPT}` as the dataset for quantization calibration.
+`--calib_tasks` and `--eval_tasks` are independent flags. `--calib_tasks` controls which tasks are used for quantization calibration, while `--eval_tasks` controls which tasks are used for perplexity evaluation. They can be set to different tasks or limits as needed.
 
 #### SQNR Evalution
 To evaluate QNN's output logits against the golden logits from `nn.Module`, users can provide the flag `--sqnr_eval`. Please note that SQNR evaluation will only compare the logits of the user's prompt and will not compare the new tokens generated by the model.
@@ -565,52 +563,6 @@ Example:
 python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods sqnr_eval
 ```
 
-
-
-#### Quantization
-
-The calibration data is independent from the runtime evaluation set, and only affects quantization quality, not the inference output.
-
-Calibration data is required for compilation. There are two ways to supply it:
-
-1. **`--calib_tasks`** — calibrate on one or more lm_eval tasks (tune with `--calib_limit` and `--calib_num_fewshot`). LLM-only.
-2. **`--calib_samples`** — calibrate on custom conversation samples provided as JSON files (see format below). Required for multimodal models (VLM/ALM).
-
-For LLMs, provide at least one of the two; for multimodal models, `--calib_samples` is mandatory.
-
-Calibration and runtime evaluation use separate flag sets and can target different tasks or limits as needed:
-
-| Purpose | Flags |
-|---|---|
-| Calibration data (lm_eval tasks) | `--calib_tasks`, `--calib_limit`, `--calib_num_fewshot` |
-| Calibration data (custom samples) | `--calib_samples` (JSON files, HuggingFace message format) |
-
-##### Custom calibration samples (`--calib_samples`)
-
-`--calib_samples` accepts one or more JSON files. Each file is a flat list of sample objects. Each sample has a `messages` field following the HuggingFace chat template, and an optional `files` field for media inputs (local paths or URLs):
-
-```json
-[
-  {
-    "files": ["path/or/url/to/files"],
-    "messages": [
-      {"role": "user",    "content": "..." },
-      {"role": "assistant", "content": "..."}
-    ]
-  }
-]
-```
-
-`files` is only required for multimodal models (VLM: image paths/URLs, ALM: audio paths/URLs). For LLM-only models, `files` can be omitted. `content` can be a plain string or a list of HuggingFace content blocks (e.g. `[{"type": "image"}, {"type": "text", "text": "..."}]` for vision inputs).
-
-Ready-to-use examples for each model type are provided under `assets/samples/`:
-
-| Model type | Example file |
-|---|---|
-| LLM | [assets/samples/text.json](assets/samples/text.json) |
-| ALM (audio) | [assets/samples/audio.json](assets/samples/audio.json) |
-| VLM (vision) | [assets/samples/vision.json](assets/samples/vision.json) |
-
 #### Quantization Guidance
 
 To automatically identify sensitive layers and generate a mixed-precision recipe suggestion, add the `--quant_recipe_suggestion` flag. During calibration, the analyzer compares FP32 and QDQ intermediate outputs layer-by-layer using SQNR, then writes two files to the working directory:
diff --git a/examples/qualcomm/oss_scripts/llama/TARGETS b/examples/qualcomm/oss_scripts/llama/TARGETS
index c00525d6fe7..30b86eabb01 100644
--- a/examples/qualcomm/oss_scripts/llama/TARGETS
+++ b/examples/qualcomm/oss_scripts/llama/TARGETS
@@ -23,6 +23,17 @@ runtime.python_library(
     ],
 )
 
+runtime.python_library(
+    name = "decoder_utils",
+    srcs = [
+        "decoder_utils.py",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/examples/models/llama:eval_library",
+    ],
+)
+
 runtime.python_library(
     name = "masking_utils",
     srcs = [
@@ -70,112 +81,19 @@ runtime.python_library(
     srcs = [
         "tokenizer.py",
     ],
-    deps = [
-        ":decoder_constants",
-        ":static_llama",
-        "//caffe2:torch",
-        "fbsource//third-party/pypi/transformers:transformers",
-    ],
-)
-
-runtime.python_library(
-    name = "utils",
-    srcs = [
-        "utils.py",
-    ],
     deps = [
         "//caffe2:torch",
-        "//executorch/exir:lib",
-    ],
-)
-
-runtime.python_library(
-    name = "inference",
-    srcs = [
-        "inference/__init__.py",
-        "inference/decoder.py",
-        "inference/encoder.py",
-        "inference/model.py",
-    ],
-    deps = [
-        ":masking_utils",
-        "//caffe2:torch",
     ],
 )
 
 runtime.python_library(
     name = "dataset",
     srcs = [
-        "dataset/__init__.py",
-        "dataset/builders.py",
-        "dataset/collators.py",
-        "dataset/config.py",
-        "dataset/datasets.py",
-        "dataset/loaders.py",
-        "dataset/preprocessors.py",
-        "dataset/schema.py",
-    ],
-    deps = [
-        ":decoder_constants",
-        ":encoder",
-        ":masking_utils",
-        ":tokenizer",
-        "//caffe2:torch",
-        "//executorch/examples/models/llama:eval_library",
-        "fbsource//third-party/pypi/lm-eval:lm-eval",
-        "fbsource//third-party/pypi/transformers:transformers",
-    ],
-)
-
-runtime.python_library(
-    name = "quantize",
-    srcs = [
-        "quantize/__init__.py",
-        "quantize/ptq.py",
-        "quantize/strategy.py",
-    ],
-    deps = [
-        ":decoder_constants",
-        ":inference",
-        ":utils",
-        "//caffe2:torch",
-        "//executorch/backends/qualcomm/_passes:passes",
-    ],
-)
-
-runtime.python_library(
-    name = "mix_precision_analyzer",
-    srcs = [
-        "mix_precision_analyzer.py",
-    ],
-    deps = [
-        ":inference",
-        "//caffe2:torch",
-        "//executorch/backends/qualcomm/quantizer:quantizer",
-        "//executorch/devtools:lib",
-        "//executorch/exir:lib",
-        "//pytorch/ao:torchao",
-    ],
-)
-
-runtime.python_library(
-    name = "evaluator",
-    srcs = [
-        "evaluator/__init__.py",
-        "evaluator/device_evaluator.py",
-        "evaluator/lm_eval_adapter.py",
+        "dataset.py",
     ],
     deps = [
-        ":dataset",
-        ":decoder_constants",
-        ":inference",
         ":tokenizer",
-        ":utils",
         "//caffe2:torch",
-        "//executorch/backends/qualcomm:export_utils",
-        "//executorch/examples/models/llama:eval_library",
-        "//pytorch/ao:torchao",
-        "fbsource//third-party/pypi/lm-eval:lm-eval",
     ],
 )
 
@@ -188,16 +106,10 @@ runtime.python_library(
         "wrappers/llm_wrappers.py",
     ],
     deps = [
-        ":dataset",
         ":decoder_constants",
         ":encoder",
-        ":evaluator",
-        ":inference",
-        ":mix_precision_analyzer",
-        ":quantize",
         ":static_llama",
         ":static_llm_quant_recipe",
-        ":tokenizer",
         "//caffe2:torch",
         "//executorch/backends/qualcomm:export_utils",
         "//executorch/backends/qualcomm/_passes:passes",
@@ -217,11 +129,10 @@ runtime.python_library(
     deps = [
         ":dataset",
         ":decoder_constants",
+        ":decoder_utils",
         ":encoder",
-        ":evaluator",
         ":masking_utils",
         ":static_llm_quant_recipe",
-        ":tokenizer",
         ":wrappers",
         "//executorch/examples/models/llama:source_transformation",
         "//caffe2:torch",
@@ -269,6 +180,22 @@ python_binary(
     ],
 )
 
+python_binary(
+    name = "eval_llama_qnn",
+    srcs = ["eval_llama_qnn.py"],
+    main_function = "executorch.examples.qualcomm.oss_scripts.llama.eval_llama_qnn.main",
+    preload_deps = [
+        "//executorch/extension/llm/custom_ops:model_sharding_py",
+    ],
+    deps = [
+        ":llama_lib",
+        "//executorch/examples/models/llama:eval_library",
+        "//executorch/examples/qualcomm/oss_scripts/llama:range_setting_pt2e",
+        "fbsource//third-party/pypi/lm-eval:lm-eval",
+    ],
+    keep_gpu_sections = True,
+)
+
 runtime.command_alias(
     name = "llama_qnn",
     env = {
diff --git a/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py b/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py
index b53f4bda689..9ed44f6f3e0 100644
--- a/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py
+++ b/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py
@@ -49,10 +49,12 @@ class AudioModalityConfig(MultiModalityConfig):
 
     Attributes:
         audio_seq_len: Number of audio tokens in the sequence.
+        audio_url: Default audio URL for validation and calibration.
     """
 
     audio_seq_len: int
     n_bins: int
+    audio_url: str
 
     def create_encoder(self, config):
         return self.encoder_class(config, n_bins=self.n_bins)
@@ -69,11 +71,13 @@ class VisionModalityConfig(MultiModalityConfig):
         img_seq_len: Number of image tokens/patches in the sequence.
         img_resized_h: Target height for image resizing (pixels).
         img_resized_w: Target width for image resizing (pixels).
+        img_url: Default image URL for validation and calibration.
     """
 
     img_seq_len: int
     img_resized_h: int
     img_resized_w: int
+    img_url: str
 
     def create_encoder(self, config):
         return self.encoder_class(
@@ -90,6 +94,7 @@ class GraniteSpeechEncoder(AudioModalityConfig):
     encoder_class = GraniteSpeechCTCEncoderWrapper
     audio_seq_len = 171
     n_bins = 844
+    audio_url = "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true"
     quant_recipe = GraniteSpeechEncoderQuantRecipe
     num_sharding = 8
 
@@ -104,6 +109,7 @@ class SmolVLMEncoder(VisionModalityConfig):
     img_seq_len = 64
     img_resized_h = 512
     img_resized_w = 512
+    img_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
     quant_recipe = SmolVLMEncoderQuantRecipe
 
 
@@ -117,4 +123,5 @@ class InternVL3Encoder(VisionModalityConfig):
     img_seq_len = 256
     img_resized_h = 448
     img_resized_w = 448
+    img_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
     quant_recipe = InternVL3EncoderQuantRecipe
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index d3d4a475288..ea09451a697 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -12,7 +12,7 @@
 import os
 import sys
 from multiprocessing.connection import Client
-from typing import Dict, List
+from typing import Dict
 
 import torch
 from executorch.backends.qualcomm.export_utils import (
@@ -30,11 +30,7 @@
     LLMModelConfig,
     SUPPORTED_LLM_MODELS,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.dataset import (
-    DataConfig,
-    DatasetBuilder,
-    MessageSample,
-)
+from executorch.examples.qualcomm.oss_scripts.llama.dataset import DatasetBuilder
 from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
     ATTENTION_SINK_EVICTOR,
     AUDIO_ENCODER,
@@ -50,7 +46,7 @@
     TOK_EMBEDDING_GRAPH_NAMES,
     VISION_ENCODER,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.evaluator.device_evaluator import (
+from executorch.examples.qualcomm.oss_scripts.llama.decoder_runtime_evaluator import (
     DefaultEval,
     SqnrEval,
     TaskEval,
@@ -100,9 +96,10 @@ def compile(
     args,
     decoder_model_config: LLMModelConfig,
     pte_filenames: Dict[str, str],
-    tokenizer_wrapper,
+    tokenizer,
+    calibration_data,
     is_multimodal,
-) -> Dict[str, List]:
+):
     os.makedirs(args.artifact, exist_ok=True)
     multi_modal_mgr = MultiModalManager(control_args=args, config=decoder_model_config)
 
@@ -188,8 +185,9 @@ def compile(
 
     # perform ptq
     multi_modal_mgr.quantize(
-        tokenizer_wrapper=tokenizer_wrapper,
+        calibration_data=calibration_data,
         skip_quantize=skip_quantize,
+        tokenizer=tokenizer,
         backend=get_backend_type(args.backend),
         soc_model=args.soc_model,
     )
@@ -206,14 +204,15 @@ def inference(
     args,
     decoder_model_config: LLMModelConfig,
     runtime_tokenizer_path,
-    tokenizer_wrapper: TokenizerWrapper,
+    tokenizer,
+    chat_template,
     text_decoder_pte_path: str,
     encoder_pte_paths: Dict[str, str],
     tok_embedding_pte_path: str,
     attention_sink_evictor_pte_path: str,
+    calibration_data,
     is_multimodal,
 ):
-    tokenizer = tokenizer_wrapper.tokenizer
 
     assert args.model_mode in EVAL_MODE, f"Unknown model_mode: {args.model_mode}."
 
@@ -251,35 +250,15 @@ def inference(
                 {modality: encoder_pte_path},
             )
 
-    multi_modal_mgr = MultiModalManager(control_args=args, config=decoder_model_config)
-    audio_encoder = multi_modal_mgr.audio_encoder.model
-    vision_encoder = multi_modal_mgr.vision_encoder.model
-    tok_embedding = multi_modal_mgr.text_decoder.calibration_prefill.tok_embedding
-    source_model = multi_modal_mgr.text_decoder.calibration_prefill.decoder
-    audio_token_id = multi_modal_mgr.text_decoder.calibration_prefill.meta.get(
-        "audio_token_id", None
-    )
-    image_token_id = multi_modal_mgr.text_decoder.calibration_prefill.meta.get(
-        "image_token_id", None
-    )
-    dataset_builder = DatasetBuilder(
-        DataConfig.from_args(args),
-        decoder_model_config,
-        tokenizer_wrapper,
-        attn_mask=source_model.get_example_inputs()[1],
-    )
     if PROMPT_EVAL in args.eval_methods:
         prompt_evaluator = DefaultEval(
             args=args,
-            decoder_model_config=decoder_model_config,
             pte_paths=pte_paths,
             runtime_tokenizer_path=runtime_tokenizer_path,
             is_multimodal=is_multimodal,
-            dataset_builder=dataset_builder,
-        )
-        output_prompt = prompt_evaluator.run(
-            prompt=args.prompt, audio_paths=args.audio_path, image_paths=args.image_path
+            modality_inputs=calibration_data,
         )
+        output_prompt = prompt_evaluator.run(prompt=args.prompt)
         eval_results.update(
             {
                 "inference_speed": prompt_evaluator.inference_speed,
@@ -291,31 +270,31 @@ def inference(
 
     if SQNR_EVAL in args.eval_methods:
         assert not is_multimodal, "Modality Model does not support SQNR_EVAL."
-        runtime_message = tokenizer_wrapper.prepare_messages(args.prompt)[0]
-        message = MessageSample(
-            files=runtime_message["files_path"],
-            messages=tokenizer_wrapper.make_chat_template(
-                runtime_message["text"], args.system_prompt
-            ),
+        tokenizer_wrapper = TokenizerWrapper(
+            args,
+            decoder_model_config,
+        )
+        prompt = (
+            tokenizer_wrapper.apply_prompt_template(
+                chat_template, args.prompt[0], args.system_prompt
+            )
+            if chat_template is not None
+            else args.prompt[0]
         )
+        multi_modal_mgr = MultiModalManager(
+            control_args=args, config=decoder_model_config
+        )
+        source_model = multi_modal_mgr.text_decoder.decode.decoder
         sqnr_evaluator = SqnrEval(
             source_model=source_model,
             get_example_inputs=source_model.get_example_inputs,
             args=args,
             pte_paths=pte_paths,
-            tokenizer_wrapper=tokenizer_wrapper,
-            decoder_model_config=decoder_model_config,
+            tokenizer=tokenizer,
             runtime_tokenizer_path=runtime_tokenizer_path,
             is_multimodal=is_multimodal,
-            dataset_builder=dataset_builder,
-            encoder=audio_encoder or vision_encoder,
-            tok_embedding=tok_embedding,
-            audio_token_id=audio_token_id,
-            image_token_id=image_token_id,
-        )
-        sqnr, golden_logits, _ = sqnr_evaluator.run(
-            message, audio_paths=args.audio_path, image_paths=args.image_path
         )
+        sqnr, golden_logits, _ = sqnr_evaluator.run(prompt=prompt)
         logging.info(f"SQNR Eval Score between FP32 nn.Module and QNN: {sqnr}")
         eval_results.update(
             {
@@ -336,19 +315,11 @@ def inference(
                 get_example_inputs=source_model.get_example_inputs,
                 args=args,
                 pte_paths=pte_paths,
-                tokenizer_wrapper=tokenizer_wrapper,
-                decoder_model_config=decoder_model_config,
+                tokenizer=tokenizer,
                 runtime_tokenizer_path=runtime_tokenizer_path,
                 is_multimodal=is_multimodal,
-                dataset_builder=dataset_builder,
-                encoder=audio_encoder or vision_encoder,
-                tok_embedding=tok_embedding,
-                audio_token_id=audio_token_id,
-                image_token_id=image_token_id,
-            )
-            qdq_sqnr, cpu_qdq_logits, _ = qdq_sqnr_evaluator.run(
-                message, audio_paths=args.audio_path, image_paths=args.image_path
             )
+            qdq_sqnr, cpu_qdq_logits, _ = qdq_sqnr_evaluator.run(prompt=prompt)
             eval_results["qdq_sqnr"] = qdq_sqnr
             logging.info(f"SQNR Eval Score between CPU QDQ and QNN: {qdq_sqnr}")
             logging.info(
@@ -364,7 +335,6 @@ def inference(
         # Generate the eval wrapper
         ppl_evaluator = TaskEval(
             args=args,
-            decoder_model_config=decoder_model_config,
             pte_paths=pte_paths,
             tokenizer=tokenizer,
             runtime_tokenizer_path=runtime_tokenizer_path,
@@ -440,7 +410,7 @@ def _build_parser():
 
     parser.add_argument(
         "--prompt",
-        help="User prompts used during runtime inference only (not compilation or calibration). When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.",
+        help="User prompts for Llama. When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.",
         required=True,
         type=str,
         nargs="+",
@@ -536,7 +506,7 @@ def _build_parser():
 
     parser.add_argument(
         "--audio_path",
-        help="Path to the audio file used during runtime inference only (not compilation or calibration). For multimodal language models (MLLM). If not specified, the default audio from encoder/encoder_config.py will be used. The audio should be preprocessed and saved in raw binary format.",
+        help="Path to the audio file for multimodal language models (MLLM). If not specified, the default audio from encoder/encoder_config.py will be used. The audio should be preprocessed and saved in raw binary format.",
         default=[],
         type=str,
         nargs="+",
@@ -544,7 +514,7 @@ def _build_parser():
 
     parser.add_argument(
         "--image_path",
-        help="Path to the image file used during runtime inference only (not compilation or calibration). For multimodal language models (MLLM). If not specified, the default image from encoder/encoder_config.py will be used. The image should be preprocessed and saved in raw binary format.",
+        help="Path to the image file for multimodal language models (MLLM). If not specified, the default image from encoder/encoder_config.py will be used. The image should be preprocessed and saved in raw binary format.",
         default=[],
         type=str,
         nargs="+",
@@ -558,7 +528,7 @@ def _build_parser():
         help="Choose eval methods(default: prompt_eval). Users can provide more than 1 eval methods. For example: --eval_methods tasks_eval sqnr_eval."
         "Following eval methods are supported:"
         "1) prompt_eval: Model will generate the output response based on the provided prompt through the flag --prompt."
-        "2) tasks_eval: This will eval the tasks provided through the flag --eval_tasks."
+        "2) tasks_eval: This will eval the tasks provided through the flag --tasks."
         "3) sqnr_eval: This will eval the sqnr between between QNN's output logit V.S. Static Llama nn.Module's output logit. Eval is based on the provided prompt through the --prompt flag. Please note that sqnr will only eval the prompt's logit but not the new generated token's logit.",
     )
 
@@ -576,7 +546,6 @@ def _build_parser():
         default=1,
         help="number of samples to evalulate. If not set, evaluate all samples",
     )
-
     parser.add_argument(
         "--eval_num_fewshot",
         type=int,
@@ -608,19 +577,6 @@ def _build_parser():
         help="Number of examples to calibrate in few-shot context",
     )
 
-    parser.add_argument(
-        "--calib_samples",
-        nargs="+",
-        type=str,
-        default=None,
-        help="One or more paths to calibration sample JSON files. Only JSON format is supported. "
-        "Each file must be a flat list of sample objects: "
-        '[{"files": ["path_or_url", ...], "messages": [{"role": "user"|"assistant", "content": "..." | [...]}]}]. '
-        '"files" is optional and only required for multimodal models (VLM: image paths/URLs, ALM: audio paths/URLs). '
-        '"messages" follows the HuggingFace chat template; "content" can be a plain string or a list of content blocks. '
-        "Multiple files are merged.",
-    )
-
     parser.add_argument(
         "-F",
         "--use_fp16",
@@ -631,16 +587,31 @@ def _build_parser():
 
     parser.add_argument("-v", "--verbose", action="store_true")
 
+    parser.add_argument(
+        "--calibration_num_threads",
+        type=int,
+        default=0,
+        help="Thread count for calibration forward passes. 0 = auto-tune (default).",
+    )
+
     parser.add_argument(
         "--quant_recipe_suggestion",
         action="store_true",
         help="Enable automatic quant recipe suggestion in PTQ",
     )
 
+    parser.add_argument(
+        "--skip_user_prompt_calibration",
+        action="store_true",
+        help="Skip using user prompt for calibration. Useful when only dataset-based calibration is desired.",
+    )
+
     return parser
 
 
 def export_llama(args) -> None:
+    if args.calibration_num_threads < 0:
+        raise ValueError("--calibration_num_threads must be >= 0")
     if args.compile_only and args.pre_gen_pte:
         raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true")
     if (TASKS_EVAL or SQNR_EVAL) in args.eval_methods and args.model_mode not in {
@@ -651,12 +622,6 @@ def export_llama(args) -> None:
             "Eval device perplexity is only supported for KV mode. Hybrid mode will only use KV mode when evaluating tasks/sqnr."
         )
     if TASKS_EVAL in args.eval_methods and args.eval_tasks is None:
-        if args.calib_tasks is None:
-            logging.warning(
-                "--eval_tasks is set but --calib_tasks is not; quantization "
-                "calibration will use --prompt instead of a task dataset. "
-                "Pass --calib_tasks to match the previous --tasks behavior."
-            )
         raise RuntimeError("Please provide --eval_tasks to eval perplexity")
     assert (
         args.decoder_model in SUPPORTED_LLM_MODELS
@@ -709,9 +674,17 @@ def export_llama(args) -> None:
         args,
         decoder_model_config,
     )
-    runtime_tokenizer_path = tokenizer_wrapper.runtime_tokenizer_path
+    runtime_tokenizer_path, tokenizer, chat_template = (
+        tokenizer_wrapper.get_runtime_tokenizer(
+            args.tokenizer_model, args.tokenizer_bin
+        )
+    )
 
     # Prepare dataset
+    dataset_builder = DatasetBuilder(args, decoder_model_config, tokenizer_wrapper)
+    calibration_data = dataset_builder.prepare_calibration_dataset(
+        args.prompt, chat_template
+    )
     text_decoder_pte_path = f"{args.artifact}/{pte_filenames[TEXT_DECODER]}.pte"
     attention_sink_evictor_pte_path = f"{args.artifact}/{ATTENTION_SINK_EVICTOR}.pte"
     tok_embedding_pte_path = f"{args.artifact}/{pte_filenames[TOK_EMBEDDING]}.pte"
@@ -728,26 +701,13 @@ def export_llama(args) -> None:
             hasattr(decoder_model_config, AUDIO_ENCODER),
         ]
     )
-    if is_multimodal:
-        # TODO: Implement attention sink support for multimodal models (vision/audio).
-        if args.use_attention_sink is not None:
-            raise ValueError(
-                "Multimodal models currently do not support attention sink feature."
-            )
-        if args.eval_tasks is not None:
-            raise ValueError("Multimodal models do not support --eval_tasks.")
-
-    if not args.pre_gen_pte:
-        if is_multimodal and args.calib_samples is None:
-            raise ValueError(
-                "For MLLMs calibration data is required for compilation. "
-                "Provide --calib_samples with a vision/audio JSON file."
-            )
-        if not is_multimodal and not any((args.calib_tasks, args.calib_samples)):
-            raise ValueError(
-                "For LLMs calibration data is required for compilation. "
-                "Provide --calib_tasks or --calib_samples."
-            )
+    # TODO: Implement attention sink support for multimodal models (vision/audio).
+    assert (
+        not is_multimodal or args.use_attention_sink is None
+    ), "Multimodal models currently do not support attention sink feature."
+    assert (
+        not is_multimodal or not args.skip_user_prompt_calibration
+    ), "--skip_user_prompt_calibration is not supported for multimodal models (VLM/ALM) as they do not support task-based calibration yet."
 
     if args.pre_gen_pte:
         text_decoder_pte_path = f"{args.pre_gen_pte}/{pte_filenames[TEXT_DECODER]}.pte"
@@ -775,11 +735,13 @@ def export_llama(args) -> None:
             args,
             decoder_model_config,
             runtime_tokenizer_path,
-            tokenizer_wrapper,
+            tokenizer,
+            chat_template,
             text_decoder_pte_path,
             encoder_pte_paths,
             tok_embedding_pte_path,
             attention_sink_evictor_pte_path,
+            calibration_data,
             is_multimodal,
         )
         print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}")
@@ -789,7 +751,8 @@ def export_llama(args) -> None:
         args,
         decoder_model_config,
         pte_filenames,
-        tokenizer_wrapper,
+        tokenizer,
+        calibration_data,
         is_multimodal,
     )
     if args.use_attention_sink:
@@ -834,11 +797,13 @@ def export_llama(args) -> None:
         args,
         decoder_model_config,
         runtime_tokenizer_path,
-        tokenizer_wrapper,
+        tokenizer,
+        chat_template,
         text_decoder_pte_path,
         encoder_pte_paths,
         tok_embedding_pte_path,
         attention_sink_evictor_pte_path,
+        calibration_data,
         is_multimodal,
     )
 
diff --git a/examples/qualcomm/oss_scripts/llama/masking_utils.py b/examples/qualcomm/oss_scripts/llama/masking_utils.py
index a09cdf1240f..7725b7589e1 100644
--- a/examples/qualcomm/oss_scripts/llama/masking_utils.py
+++ b/examples/qualcomm/oss_scripts/llama/masking_utils.py
@@ -5,12 +5,10 @@
 # LICENSE file in the root directory of this source tree.
 
 from abc import ABC, abstractmethod
-from typing import List, Tuple, Union
+from typing import List, Union
 
 import torch
 
-PADDING_MASK_VALUE = -255.0
-
 
 def create_causal_attn_mask(max_batch_size: int, ar_len: int, max_context_len: int):
     """
@@ -23,14 +21,14 @@ def create_causal_attn_mask(max_batch_size: int, ar_len: int, max_context_len: i
 
     ● = activate (can attend), ○ = inactivate (masked)
     """
-    mask = torch.full((ar_len, ar_len), PADDING_MASK_VALUE)
+    mask = torch.full((ar_len, ar_len), -255.0)
     mask_cond = torch.arange(ar_len)
     mask.masked_fill_(mask_cond.view(1, ar_len) <= mask_cond.view(ar_len, 1), 0)
 
     if max_context_len != ar_len:
         mask = torch.cat(
             [
-                torch.ones(ar_len, max_context_len - ar_len) * PADDING_MASK_VALUE,
+                torch.ones(ar_len, max_context_len - ar_len) * -255.0,
                 mask,
             ],
             dim=-1,
@@ -52,7 +50,7 @@ def create_sliding_window_attn_mask(
 
     ● = activate (can attend), ○ = inactivate (masked)
     """
-    mask = torch.full((ar_len, ar_len), PADDING_MASK_VALUE)
+    mask = torch.full((ar_len, ar_len), -255.0)
     mask_cond = torch.arange(ar_len)
     mask.masked_fill_(
         (mask_cond.view(1, ar_len) <= mask_cond.view(ar_len, 1))
@@ -63,7 +61,7 @@ def create_sliding_window_attn_mask(
     if max_context_len != ar_len:
         mask = torch.cat(
             [
-                torch.ones(ar_len, max_context_len - ar_len) * PADDING_MASK_VALUE,
+                torch.ones(ar_len, max_context_len - ar_len) * -255.0,
                 mask,
             ],
             dim=-1,
@@ -98,6 +96,7 @@ def mask(self) -> torch.Tensor:
     def smart_mask_init(self, pos):
         """
         Initialize the attention mask by smart mask initialization method after model forward.
+
         Args:
             pos (int): Current position in the sequence.
         """
@@ -115,17 +114,6 @@ def smart_mask_update(self, pos, n_updates, lade_pos_offset):
         """
         pass
 
-    def _extra_init_kwargs(self) -> dict:
-        return {}
-
-    def _mask_padding_positions(
-        self, input_ids: List[List[int]], max_seq_length: int
-    ) -> None:
-        """Mask positions beyond each sequence's actual length."""
-        actual_lens = torch.tensor([len(seq) for seq in input_ids])
-        pad_rows = torch.arange(max_seq_length).unsqueeze(0) >= actual_lens.unsqueeze(1)
-        self.mask.masked_fill_(pad_rows.unsqueeze(-1), PADDING_MASK_VALUE)
-
 
 class CausalAttentionMask(BaseAttentionMask):
     def __init__(self, max_batch_size: int, ar_len: int, max_context_len: int):
@@ -146,22 +134,28 @@ def smart_mask_init(self, pos):
     def smart_mask_update(self, pos, n_updates, _):
         """
         Smart Mask mechanism for attention mask updating
+
         Initial mask(5x15) layout (before any updates):
             Each row represents a query token in the autoregressive context.
             ● = activate (can attend), ○ = inactivate (masked)
+
             0 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ○ ○ ○ ○
             1 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ○ ○ ○
             2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○
             3 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ● ○
             4 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ● ●
+
         After 1st update (e.g., pos=0, n_updates=5, sliding_window=3):
             Newly added tokens are unmasked (set to 0).
+
             0 ● ● ● ● ● ○ ○ ○ ○ ○ ● ○ ○ ○ ○
             1 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ○ ○ ○
             2 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ● ○ ○
             3 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ● ● ○
             4 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ● ● ●
+
         After 2nd update (e.g., pos=5, n_updates=5):
+
             0 ● ● ● ● ● ● ● ● ● ● ● ○ ○ ○ ○
             1 ● ● ● ● ● ● ● ● ● ● ● ● ○ ○ ○
             2 ● ● ● ● ● ● ● ● ● ● ● ● ● ○ ○
@@ -172,16 +166,6 @@ def smart_mask_update(self, pos, n_updates, _):
         end_pos = pos + n_updates
         self.mask[:, :, start_pos:end_pos] = 0
 
-    @classmethod
-    def from_input_ids(
-        cls, input_ids: List[List[int]], max_seq_length: int, **kwargs
-    ) -> "CausalAttentionMask":
-        """Build a causal mask and apply padding for variable-length sequences."""
-        mask = cls(len(input_ids), max_seq_length, max_seq_length)
-        mask._mask = mask._mask.clone()
-        mask._mask_padding_positions(input_ids, max_seq_length)
-        return mask
-
 
 class SlidingWindowAttentionMask(BaseAttentionMask):
     def __init__(
@@ -210,24 +194,31 @@ def smart_mask_init(self, pos):
     def smart_mask_update(self, pos, n_updates, lade_pos_offset):
         """
         Smart Mask mechanism for attention mask updating
+
         Initial mask(5x15) layout (before any updates):
             Each row represents a query token in the autoregressive context.
             ● = activate (can attend), ○ = inactivate (masked)
+
             0 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ○ ○ ○ ○
             1 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ○ ○ ○
             2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○
             3 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○
             4 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ●
+
         After 1st update (e.g., pos=0, n_updates=5, sliding_window=3):
             Newly added tokens are unmasked (set to 0).
             Earlier tokens lose access to older cache due to sliding window limits.
+
             0 ○ ○ ○ ● ● ○ ○ ○ ○ ○ ● ○ ○ ○ ○
             1 ○ ○ ○ ○ ● ○ ○ ○ ○ ○ ● ● ○ ○ ○
             2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○
             3 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○
             4 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ●
+
+
         After 2nd update (e.g., pos=5, n_updates=5, sliding_window=3):
             Sliding window shifts again, masking older positions and activate new position.
+
             0 ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ ○ ○
             1 ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ ○
             2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○
@@ -249,24 +240,7 @@ def smart_mask_update(self, pos, n_updates, lade_pos_offset):
             if end_pos > available_cache_len:
                 # Mask tokens that are no longer within the sliding window
                 # TODO: [Optional]: it can be optimized by computing the exact start index
-                self.mask[:, i, : end_pos - available_cache_len] = PADDING_MASK_VALUE
-
-    def _extra_init_kwargs(self) -> dict:
-        return {"sliding_window": self.sliding_window}
-
-    @classmethod
-    def from_input_ids(
-        cls,
-        input_ids: List[List[int]],
-        max_seq_length: int,
-        sliding_window: int,
-        **kwargs,
-    ) -> "SlidingWindowAttentionMask":
-        """Build a sliding-window mask and apply padding for variable-length sequences."""
-        mask = cls(len(input_ids), max_seq_length, max_seq_length, sliding_window)
-        mask._mask = mask._mask.clone()
-        mask._mask_padding_positions(input_ids, max_seq_length)
-        return mask
+                self.mask[:, i, : end_pos - available_cache_len] = -255.0
 
 
 class AttentionMask:
@@ -283,28 +257,3 @@ def smart_mask_update(self, pos, n_updates, lade_pos_offset=None):
 
     def __iter__(self):
         return iter([mask.mask for mask in self.masks])
-
-    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, ...]:
-        return tuple(m.mask[idx] for m in self.masks)
-
-    @classmethod
-    def from_input_ids(
-        cls,
-        template: "AttentionMask",
-        input_ids: List[List[int]],
-        max_seq_length: int,
-    ) -> "AttentionMask":
-        """
-        Build a calibration AttentionMask that mirrors template's mask types.
-
-        Delegates construction to each mask's own classmethod so that adding a
-        new mask type only requires implementing from_input_ids on that class —
-        no edits needed here.
-        """
-        masks = [
-            type(base_mask).from_input_ids(
-                input_ids, max_seq_length, **base_mask._extra_init_kwargs()
-            )
-            for base_mask in template.masks
-        ]
-        return cls(masks)
diff --git a/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py b/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py
index b16a5e2a252..02f19a0b676 100644
--- a/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py
+++ b/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py
@@ -26,9 +26,7 @@
 from executorch.devtools.inspector._intermediate_output_capturer import (
     IntermediateOutputCapturer,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.inference import DecoderInference
 from executorch.exir.debug_handle_utils import DEBUG_HANDLE_KEY
-from torch.utils.data import DataLoader
 from torchao.quantization.pt2e import MinMaxObserver
 from torchao.quantization.utils import compute_error
 
@@ -76,49 +74,45 @@ def __init__(
             torch.ops.quantized_decomposed.dequantize_per_tensor.default,
         }
 
-    def analyze(
-        self,
-        decoder_inference: DecoderInference,
-        text_dataloader: DataLoader,
-        num_sharding: int = 5,
-    ) -> "SqnrReport":
+    def analyze(self, samples: List[Tuple], num_sharding: int = 5) -> "SqnrReport":
         """
-        Evaluates both the fp32 and QDQ graphs using batches from text_dataloader
+        Evaluates both the fp32 and QDQ graphs using the provided ``samples``
         and computes the per-node Signal-to-Quantization-Noise Ratio (SQNR).
 
         Args:
-            decoder_inference: Provides get_inputs() to assemble each
-                batch into the compiled model's input signature.
-            text_dataloader: DataLoader for text-only calibration batches.
-            num_sharding: Number of contiguous layer groups to bucket the model
-                into for SQNR aggregation.
+            samples: A list of tuples of tensors matching the model's inputs; ``None`` entries are skipped.
+            num_sharding: Number of contiguous layer groups to bucket the model into for SQNR
+                aggregation. Rather than flagging individual layers, layers are grouped into
+                ``num_sharding`` consecutive ranges (e.g. layers 0-7, 8-15, …) and the SQNR is averaged
+                within each group, because upgrading isolated layers is usually ineffective: quantization
+                error from surrounding low-precision layers accumulates and dominates downstream behavior.
 
         Returns:
             An ``SqnrReport`` object containing the aggregated analysis results.
         """
+        input_samples = [sample for sample in samples if sample is not None]
+
+        if not input_samples:
+            logging.warning("No input samples provided for analysis.")
+            return SqnrReport(
+                self.model_name, defaultdict(list), [], self.analysis_recipe
+            )
+
         self._assign_debug_handles(self.fp32_gm)
         self._assign_debug_handles(self.qdq_gm)
 
-        num_samples = 0
+        num_samples = len(input_samples)
+        logging.info(f"num samples: {num_samples}")
+
+        # Accumulate SQNR per module path across all input samples
         path_sqnr_sum = defaultdict(float)
-        for text_batch in text_dataloader:
-            input_ids = text_batch["input_ids"]
-            attn_mask = text_batch["attention_mask"]
-            sample = tuple(decoder_inference.get_inputs(input_ids, attn_mask))
+        for sample in input_samples:
             fp_outputs = self._capture(self.fp32_gm, sample)
             qdq_outputs = self._capture(self.qdq_gm, sample)
             for path, sqnr in self._match_and_score(fp_outputs, qdq_outputs).items():
                 path_sqnr_sum[path] += sqnr
-            num_samples += 1
-
-        if num_samples == 0:
-            logging.warning("No input samples provided for analysis.")
-            return SqnrReport(
-                self.model_name, defaultdict(list), [], self.analysis_recipe
-            )
-
-        logging.info(f"num samples: {num_samples}")
 
+        # Average the SQNRs and group them by normalized layer ranges
         report = defaultdict(list)
         for path, total_sqnr in path_sqnr_sum.items():
             group = self._normalize_group_name(
diff --git a/examples/qualcomm/oss_scripts/llama/tokenizer.py b/examples/qualcomm/oss_scripts/llama/tokenizer.py
index 954b73384fa..2894777f776 100644
--- a/examples/qualcomm/oss_scripts/llama/tokenizer.py
+++ b/examples/qualcomm/oss_scripts/llama/tokenizer.py
@@ -8,21 +8,19 @@
 import json
 import logging
 import re
-from typing import Dict, List
+import warnings
+from typing import Callable, List
 
 from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig
 from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
     AUDIO_ENCODER,
     VISION_ENCODER,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import ModelArgs
 from pytorch_tokenizers import get_tokenizer, TiktokenTokenizer
 from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer
 
 from transformers import AutoTokenizer
 
-
-# Generic special tokens for multimodality, used for runtime identification.
 IMG_TOKEN = "<image>"
 AUDIO_TOKEN = "<audio>"
 
@@ -68,32 +66,7 @@ def __init__(self, control_args: argparse.Namespace, config: LLMModelConfig):
         self.control_args = control_args
         self.config = config
         self.repo_id = config.repo_id
-        self._instruct_model = config.instruct_model
-
-        self.tokenizer = None
-        self.chat_template = None
-
-        params_path = (
-            config.params_path if control_args.params is None else control_args.params
-        )
-        with open(params_path) as f:
-            model_args = ModelArgs(**json.load(f))
-        self.vocab_size = model_args.vocab_size
-
-        self.runtime_tokenizer_path = self._init_tokenizer(
-            control_args.tokenizer_model, control_args.tokenizer_bin
-        )
-
-    def _init_tokenizer(self, tokenizer_model, tokenizer_bin) -> str:
-        if self.decoder_model in {"stories110m", "stories260k"}:
-            path, self.tokenizer = self._from_tokenizer_model_and_bin(
-                tokenizer_model, tokenizer_bin
-            )
-        elif "llama3_2" in self.decoder_model:
-            path, self.tokenizer = self._from_tokenizer_model(tokenizer_model)
-        else:
-            path, self.tokenizer, self.chat_template = self._from_hf()
-        return path
+        self.apply_chat_template = config.instruct_model
 
     def _from_tokenizer_model_and_bin(self, tokenizer_model, tokenizer_bin):
         tokenizer = get_tokenizer(tokenizer_model)
@@ -116,7 +89,7 @@ def _from_hf(self):
         tokenizer = AutoTokenizer.from_pretrained(self.repo_id)
         chat_template = (
             tokenizer.apply_chat_template
-            if hasattr(tokenizer, "apply_chat_template") and self._instruct_model
+            if hasattr(tokenizer, "apply_chat_template") and self.apply_chat_template
             else None
         )
         tokenizer_artifacts = tokenizer.save_pretrained(self.artifact)
@@ -139,6 +112,23 @@ def _from_hf(self):
 
         return runtime_tokenizer_path, tokenizer, chat_template
 
+    def get_runtime_tokenizer(self, tokenizer_model, tokenizer_bin):
+        """Resolve (runtime_tokenizer_path, tokenizer, chat_template) per decoder."""
+        # Dispatch on the decoder name; only the HuggingFace fallback branch
+        # can yield a chat template, the other two leave it as None.
+        template = None
+        name = self.decoder_model
+        if name in {"stories110m", "stories260k"}:
+            path, tok = self._from_tokenizer_model_and_bin(
+                tokenizer_model, tokenizer_bin
+            )
+        elif "llama3_2" in name:
+            path, tok = self._from_tokenizer_model(tokenizer_model)
+        else:
+            path, tok, template = self._from_hf()
+
+        return path, tok, template
+
     def prepare_messages(self, prompts: List[str]):  # noqa: C901
         """
         Validate and normalize a multi-turn prompt sequence, then prepare it into
@@ -194,9 +184,14 @@ def prepare_messages(self, prompts: List[str]):  # noqa: C901
 
         audio_paths = self.control_args.audio_path
         if hasattr(self.config, AUDIO_ENCODER):
+            # Load audio from user-specified path (URL or local file)
+            # fall back to the default audio URL if no audio is provided.
             if not audio_paths:
-                raise ValueError(
-                    "No audio path/URL provided. Please specify --audio_path."
+                audio_paths = [getattr(self.config, AUDIO_ENCODER).audio_url]
+                warnings.warn(
+                    f"No audio path/URL provided, using default audio URL from huggingface: {audio_paths}",
+                    UserWarning,
+                    stacklevel=1,
                 )
             num_audios = len(audio_paths)
             total_audio_tokens = sum(prompt.count(AUDIO_TOKEN) for prompt in prompts)
@@ -205,17 +200,24 @@ def prepare_messages(self, prompts: List[str]):  # noqa: C901
             elif total_audio_tokens != num_audios:
                 raise ValueError(
                     f"Number of <audio> tokens ({total_audio_tokens}) does not match "
-                    f"number of audios ({num_audios}). Please check your prompts and audio paths.\n\n"
+                    f"number of audios ({num_audios}). "
+                    "Please check your prompts and audio paths.\n\n"
                     f"=== Prompt ===\n{prompts}\n"
                     f"=== Audio paths ===\n{audio_paths}"
                 )
 
         image_paths = self.control_args.image_path
         if hasattr(self.config, VISION_ENCODER):
+            # Load image from user-specified path (URL or local file)
+            # fall back to the default image URL if no image is provided.
             if not image_paths:
-                raise ValueError(
-                    "No image path/URL provided. Please specify --image_path."
+                image_paths = [getattr(self.config, VISION_ENCODER).img_url]
+                warnings.warn(
+                    f"No image path/URL provided, using default image URL: {image_paths}",
+                    UserWarning,
+                    stacklevel=1,
                 )
+
             num_images = len(image_paths)
             total_image_tokens = sum(prompt.count(IMG_TOKEN) for prompt in prompts)
 
@@ -224,7 +226,8 @@ def prepare_messages(self, prompts: List[str]):  # noqa: C901
             elif total_image_tokens != num_images:
                 raise ValueError(
                     f"Number of <image> tokens ({total_image_tokens}) does not match "
-                    f"number of images ({num_images}). Please check your prompts and image paths.\n\n"
+                    f"number of images ({num_images}). "
+                    "Please check your prompts and image paths.\n\n"
                     f"=== Prompt ===\n{prompts}\n"
                     f"=== Image paths ===\n{image_paths}"
                 )
@@ -331,34 +334,26 @@ def _split_prompt(self, prompt: str):
         pattern = f"({'|'.join(map(re.escape, split_tokens))})"
         return [part for part in re.split(pattern, prompt) if part]
 
-    def make_chat_template(
+    def apply_prompt_template(
         self,
+        chat_template: Callable,
         prompt: str,
         system_prompt: str = None,
-        assistant_text: str = None,
-    ) -> List[Dict]:
-        """Build a HuggingFace-format message list for runtime evaluation.
-
-        Converts a raw prompt into the structured message format expected by
-        ``apply_chat_template``
+    ) -> str:
+        """
+        Apply chat template to format the prompt for different modalities.
 
         Args:
-            prompt: Raw user prompt, may contain ``<image>`` or ``<audio>`` tokens.
-            system_prompt: Optional system message appended to the message list.
-            assistant_text: Optional assistant turn; disables generation prompt when set.
+            chat_template: The chat template function from tokenizer
+            prompt: Input text prompt
+            system_prompt: Optional system prompt
 
         Returns:
-            HuggingFace-format message list
+            Formatted prompt string
         """
 
         messages = []
-        if system_prompt:
-            messages.append({"role": "system", "content": system_prompt})
         message = {"role": "user", "content": prompt}
-        if self.chat_template is None:
-            messages.append(message)
-            return messages
-
         if self.decoder_model in VLM_SPECIAL_TOKENS:
             contents = self._split_prompt(prompt)
             message["content"] = []
@@ -372,43 +367,32 @@ def make_chat_template(
                         {"type": "text", "text": content},
                     )
         elif self.decoder_model in ALM_SPECIAL_TOKENS:
-            specials = ALM_SPECIAL_TOKENS[self.decoder_model]
-
             contents = self._split_prompt(prompt)
             message["content"] = ""
             for content in contents:
                 if content == AUDIO_TOKEN:
-                    message["content"] += specials[AUDIO_TOKEN]
+                    message["content"] += ALM_SPECIAL_TOKENS[self.decoder_model][
+                        AUDIO_TOKEN
+                    ]
                 else:
                     message["content"] += content
 
         messages.append(message)
-        if assistant_text is not None:
-            messages.append({"role": "assistant", "content": assistant_text})
-
-        return messages
-
-    def apply_chat_template(
-        self,
-        messages: List[Dict],
-    ) -> str:
-        """Format a message list into a prompt string.
-
-        Intended for calibration dataset formatting where the input is already
-        a HuggingFace-format message list (e.g. loaded from --calib_samples JSON).
-
-        If chat_template is not set (non-instruct or non-HF models), falls back
-        to concatenating each message's 'content' field directly.
-        """
-        if self.chat_template is None:
-            return "".join(m["content"] for m in messages)
+        if system_prompt:  # the system turn must precede the user turn
+            messages.insert(0, {"role": "system", "content": system_prompt})
 
-        template_prompt = self.chat_template(
-            messages, tokenize=False, add_generation_prompt=False
+        template_prompt = chat_template(
+            messages, tokenize=False, add_generation_prompt=True
         )
 
+        # edge cases handling:
         # Gemma may produce unexpected output if the prompt contains an extra <bos> token.
-        if self.decoder_model in {"gemma-2b", "gemma3-1b"}:
+        # This can happen after applying a prompt template, which might inject <bos> unintentionally.
+        # To prevent decoding issues, we explicitly remove <bos> token
+        if chat_template and self.decoder_model in {
+            "gemma-2b",
+            "gemma3-1b",
+        }:
             template_prompt = template_prompt.replace("<bos>", "")
 
         return template_prompt
diff --git a/examples/qualcomm/oss_scripts/llama/wrappers/base_component.py b/examples/qualcomm/oss_scripts/llama/wrappers/base_component.py
index 149a376e918..0026354d5d3 100644
--- a/examples/qualcomm/oss_scripts/llama/wrappers/base_component.py
+++ b/examples/qualcomm/oss_scripts/llama/wrappers/base_component.py
@@ -14,8 +14,9 @@
 from dataclasses import dataclass
 from enum import Enum
 from functools import wraps
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Tuple
 
+import torch
 from executorch.backends.qualcomm.serialization.qc_schema import (
     QnnExecuTorchBackendType,
 )
@@ -33,17 +34,12 @@
     StaticLLMQuantRecipe,
 )
 from executorch.exir.backend.compile_spec_schema import CompileSpec
-from torch.utils.data import DataLoader
 from transformers import AutoConfig
 
 
 class Mode(Enum):
-    # AR-N graph compiled and deployed for runtime.
     PREFILL = 1
-    # AR-1 graph compiled and deployed for runtime.
     DECODE = 2
-    # Full AR sequence mode; used for quantization, never deployed.
-    # After convert_pt2e, its scale/zp are propagated to DECODE and PREFILL via _encoding_override.
     CALIBRATE = 3
 
 
@@ -107,7 +103,6 @@ def process_model_args(
     else:
         raise ValueError(f"Unsupported mode: {mode}")
 
-    # TODO: support multi_batch for CALIBRATION MODE
     model_args.max_batch_size = 1
     model_args.max_seq_len = control_args.max_seq_len
     model_args.max_context_len = control_args.max_context_len
@@ -167,9 +162,9 @@ def process(self, request: Any):
 class Request:
     @dataclass
     class CalibrationData:
-        datasets: Optional[DataLoader] = None
-        intermediate_outputs: Optional[DataLoader] = None
-        qdq_intermediate_outputs: Optional[DataLoader] = None
+        datasets: List[Tuple[torch.Tensor]] = None
+        intermediate_outputs: List[Tuple[torch.Tensor]] = None
+        qdq_intermediate_outputs: List[Tuple[torch.Tensor]] = None
 
     @dataclass
     class Data:
diff --git a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
index 9bab682eac8..720ddb97800 100644
--- a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
+++ b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
@@ -9,11 +9,13 @@
 import inspect
 import json
 import logging
+import os
 import re
+import time
 import types
 
 from functools import partial
-from typing import Dict, List
+from typing import Any, Dict, List
 
 import torch
 
@@ -46,11 +48,6 @@
     LLM_VARIANT_ARCHS,
     LLMModelConfig,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.dataset import (
-    DataConfig,
-    DatasetBuilder,
-    ModalityEncoderDataset,
-)
 from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
     AUDIO_ENCODER,
     DECODE_QDQ_FILENAME,
@@ -61,19 +58,16 @@
     TOK_EMBEDDING_GRAPH_NAMES,
     VISION_ENCODER,
 )
+from executorch.examples.qualcomm.oss_scripts.llama.decoder_utils import (
+    _modality_inputs_merger,
+    graph_module_inference,
+)
 from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import (
     GraniteSpeechEncoder,
 )
 from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_quant_recipe import (
     EncoderQuantRecipe,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.evaluator.lm_eval_adapter import (
-    run_lm_eval,
-)
-from executorch.examples.qualcomm.oss_scripts.llama.inference import (
-    DecoderInference,
-    EncoderInference,
-)
 from executorch.examples.qualcomm.oss_scripts.llama.mix_precision_analyzer import (
     PerLayerSqnrAnalyzer,
     save_suggest_recipes,
@@ -85,11 +79,9 @@
     LlamaModel,
     ModelArgs,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.quantize import PTQStrategy
 from executorch.examples.qualcomm.oss_scripts.llama.static_llm_quant_recipe import (
     StaticLLMQuantRecipe,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.tokenizer import TokenizerWrapper
 from executorch.examples.qualcomm.oss_scripts.llama.wrappers.base_component import (
     Component,
     get_model_specific_kwargs,
@@ -105,11 +97,16 @@
 from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass
 from executorch.extension.llm.custom_ops import model_sharding
 from executorch.extension.llm.export.builder import DType
-from torch.utils.data import DataLoader
 from torchao.prototype.spinquant import apply_spinquant
 from torchao.quantization.pt2e import MinMaxObserver
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
-from transformers import AutoModel, AutoModelForSpeechSeq2Seq
+from transformers import (
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForImageTextToText,
+    AutoModelForSpeechSeq2Seq,
+    AutoModelForVision2Seq,
+)
 
 
 def is_node_src_start_with_name(node: torch.fx.Node, kv_cache_prefix: str) -> bool:
@@ -204,19 +201,6 @@ def __init__(
             self.dep_table[SplitGraph] = [FoldQDQ]
             self.dep_table[TagQuantIO] = [SplitGraph]
 
-        self._decoder_inference = (
-            DecoderInference(
-                get_example_inputs=self.get_example_inputs,
-                audio_token_id=self.meta.get("audio_token_id", None),
-                image_token_id=self.meta.get("image_token_id", None),
-                max_context_len=self.meta["get_max_context_len"],
-                max_batch_size=self.meta["get_max_batch_size"],
-                use_i64_token=self.control_args.embedding_quantize is not None,
-            )
-            if self.decoder is not None
-            else None
-        )
-
     def _prepare_model(self):  # noqa: C901
         if (instance := self._get_model_instance()) is None:
             return None, None
@@ -415,11 +399,6 @@ def _get_model_instance(self) -> LlamaModel:
 
         return tok_embedding, decoder
 
-    @property
-    def attn_mask(self):
-        """Attention mask for this decoder graph, used as a schema for dataset construction."""
-        return self.example_input[1]
-
     def _save_logits_quant_attrs(self):
         for node in self.decoder.graph.nodes:
             if node.op == "output":
@@ -503,7 +482,7 @@ def _quant_recipe_suggestion(
         self,
         fp32_gm: torch.fx.GraphModule,
         qdq_gm: torch.fx.GraphModule,
-        text_dataloader: DataLoader,
+        input_sample: tuple,
         recipe: StaticLLMQuantRecipe,
     ):
         """
@@ -524,16 +503,148 @@ def _quant_recipe_suggestion(
             fp32_gm=fp32_gm,
             qdq_gm=qdq_gm,
             analysis_recipe=recipe,
-        ).analyze(
-            self._decoder_inference,
-            text_dataloader,
-        )
+        ).analyze(input_sample)
         report.save_analysis_summary()
         suggest_recipe_overrides = report.suggest_recipe_overrides()
         save_suggest_recipes(report, suggest_recipe_overrides)
 
+    def _auto_tune_calibration_threads(self):
+        """Pick the fastest thread count for calibration via a quick microbenchmark.
+
+        AR1 decode calibration is SGEMV-dominated (memory-bandwidth-bound); the
+        default thread count is typically far too high, causing massive OpenMP sync
+        overhead. Times a few forward passes per candidate and returns the fastest;
+        failing candidates are skipped (baseline is returned if all of them fail).
+        """
+        # Prefer sched_getaffinity when available: it respects cgroup/taskset
+        # constraints (e.g. containers); os.cpu_count() returns the host total.
+        if hasattr(os, "sched_getaffinity"):
+            available = len(os.sched_getaffinity(0))
+        else:
+            available = os.cpu_count() or 1
+        original = torch.get_num_threads()
+        baseline = min(original, available)
+        # Sample fractions of the thread ceiling, from low up to the current default.
+        fractions = (1 / 8, 1 / 4, 3 / 8, 1 / 2, 2 / 3, 3 / 4, 1.0)
+        candidates = sorted(
+            {1, baseline} | {max(1, round(baseline * f)) for f in fractions}
+        )
+        best_threads, best_time = original, float("inf")
+        try:
+            for n_threads in candidates:
+                torch.set_num_threads(n_threads)
+                try:
+                    with torch.no_grad():
+                        self.decoder(*self.export_input)  # warmup
+                        t0 = time.perf_counter()
+                        for _ in range(3):
+                            self.decoder(*self.export_input)
+                        elapsed = time.perf_counter() - t0
+                    if elapsed < best_time:
+                        best_threads, best_time = n_threads, elapsed
+                except Exception:
+                    logging.debug(
+                        "Auto-tune: threads=%d failed, skipping",
+                        n_threads,
+                        exc_info=True,  # keep the traceback for diagnosis
+                    )
+        finally:
+            torch.set_num_threads(original)
+        if best_time == float("inf"):
+            logging.warning(
+                "Auto-tune: all candidates %s failed, falling back to %d threads",
+                candidates,
+                baseline,
+            )
+            return baseline
+        logging.info(
+            "Auto-tune calibration threads: tested %s, best=%d (%.1fms/fwd)",
+            candidates,
+            best_threads,
+            best_time / 3 * 1000,
+        )
+        return best_threads
+
+    def _calibrate(
+        self,
+        model,
+        tokenizer,
+        event,
+        user_calibration_data,
+        tok_embedding=None,
+        intermediate_outputs=None,
+        collect_input_samples=False,
+    ):
+        """
+        Run calibration inference on ``model`` and return the collected input samples.
+
+        Two sources are used, in this order:
+        1. Task datasets (``control_args.calib_tasks``); skipped if tok_embedding is set
+        2. User-provided prompts, for both text-only and multimodal models
+
+        Args:
+            model: The decoder model to run (e.g. GraphModule after prepare_pt2e)
+            tokenizer: Tokenizer forwarded to graph_module_inference
+            event: Event name prefix for logging (e.g., "prepare_pt2e", "convert_pt2e")
+            user_calibration_data: Prompts (token ids, cast to long); skipped when falsy
+            tok_embedding: Optional text embedding module (set only for multimodal models)
+            intermediate_outputs: Optional per-prompt encoder outputs, zipped with the
+                prompts; when None, every prompt is paired with an empty list
+            collect_input_samples: Forwarded to graph_module_inference
+
+        Returns:
+            List of input samples gathered from every graph_module_inference call.
+        """
+        has_task_calibration = self.control_args.calib_tasks is not None
+        input_samples = []
+        if has_task_calibration and tok_embedding is None:
+            result = graph_module_inference(
+                use_kv_cache=self.meta["get_use_kv_cache"],
+                get_example_inputs=self.get_example_inputs,
+                module=model,
+                tokenizer=tokenizer,
+                ar_len=self.meta["get_ar_len"],
+                max_seq_len=self.meta["get_max_context_len"],
+                tasks=self.control_args.calib_tasks,
+                tasks_limit=self.control_args.calib_limit,
+                num_fewshot=self.control_args.calib_num_fewshot,
+                use_i64_token=self.control_args.embedding_quantize is not None,
+                event_name=f"{event}_tasks",
+                seq_mse_candidates=self.config.seq_mse_candidates,
+                collect_input_samples=collect_input_samples,
+            )
+            if result.input_samples:
+                input_samples.extend(result.input_samples)
+
+        # the user's prompt helps calibrate the special tokens.
+        if user_calibration_data:
+            # Without encoder outputs, pair each prompt with [] instead of zip(None, ...).
+            if intermediate_outputs is None:
+                intermediate_outputs = [[] for _ in user_calibration_data]
+            for turn in zip(intermediate_outputs, user_calibration_data):
+                hidden_states, prompt = turn
+                result = graph_module_inference(
+                    use_kv_cache=self.meta["get_use_kv_cache"],
+                    get_example_inputs=self.get_example_inputs,
+                    hidden_states=hidden_states,  # hidden_states for multimodal
+                    module=model,
+                    tok_embedding=tok_embedding,
+                    audio_token_id=self.meta.get("audio_token_id", None),
+                    image_token_id=self.meta.get("image_token_id", None),
+                    tokenizer=tokenizer,
+                    ar_len=self.meta["get_ar_len"],
+                    max_seq_len=self.meta["get_max_context_len"],
+                    prompt=torch.Tensor(prompt).to(torch.long),
+                    use_i64_token=self.control_args.embedding_quantize is not None,
+                    event_name=f"{event}_prompt",
+                    collect_input_samples=collect_input_samples,
+                )
+                if result.input_samples:
+                    input_samples.extend(result.input_samples)
+        return input_samples
+
     @log_info
-    def quantize(self, request: Request):  # noqa: C901
+    def quantize(self, request: Request, calibration_tokens=None):  # noqa: C901
         if self.quant_recipe is None:
             return
 
@@ -579,7 +690,6 @@ def quantize(self, request: Request):  # noqa: C901
         )
 
         with torch.no_grad():
-            graph_module = None
             self.decoder = torch.export.export(
                 self.decoder, self.export_input, strict=True
             ).module()
@@ -595,23 +705,6 @@ def quantize(self, request: Request):  # noqa: C901
                     strict=True,
                 ).module()
 
-            if (
-                self.control_args.verbose
-                and self.mode == Mode.CALIBRATE
-                and not self.apply_embedding
-            ):
-                run_lm_eval(
-                    module=self.decoder,
-                    get_example_inputs=self.get_example_inputs,
-                    tokenizer=data.tokenizer,
-                    max_seq_length=self.meta["get_max_context_len"],
-                    tasks=self.control_args.eval_tasks,
-                    use_i64_token=self.control_args.embedding_quantize is not None,
-                    num_fewshot=self.control_args.eval_num_fewshot,
-                    limit=self.control_args.eval_limit,
-                    event_name="export_tasks",
-                )
-
             self.decoder = prepare_pt2e(self.decoder, quantizer)
             if self.apply_embedding:
                 self.tok_embedding = prepare_pt2e(
@@ -619,22 +712,33 @@ def quantize(self, request: Request):  # noqa: C901
                 )
 
             if self.mode == Mode.CALIBRATE:
-                calibration_dataloaders = {
-                    AUDIO_ENCODER: request.method_data[
-                        AUDIO_ENCODER
-                    ].calibration_data.intermediate_outputs,
-                    VISION_ENCODER: request.method_data[
-                        VISION_ENCODER
-                    ].calibration_data.intermediate_outputs,
-                    TEXT_DECODER: data.calibration_data.datasets,
-                }
-                PTQStrategy(
-                    inference=self._decoder_inference,
-                    module=self.decoder,
-                    seq_mse_candidates=self.config.seq_mse_candidates,
+                audio_turns = request.method_data[
+                    AUDIO_ENCODER
+                ].calibration_data.intermediate_outputs
+                vision_turns = request.method_data[
+                    VISION_ENCODER
+                ].calibration_data.intermediate_outputs
+                if audio_turns is None:
+                    audio_turns = [
+                        [] for _ in range(len(data.calibration_data.datasets))
+                    ]
+                if vision_turns is None:
+                    vision_turns = [
+                        [] for _ in range(len(data.calibration_data.datasets))
+                    ]
+                intermediate_outputs = [
+                    [*audio_turn, *vision_turn]
+                    for audio_turn, vision_turn in zip(audio_turns, vision_turns)
+                ]
+                input_samples = self._calibrate(
+                    model=self.decoder,
+                    tokenizer=data.tokenizer,
+                    event="prepare_pt2e",
+                    user_calibration_data=calibration_tokens,
                     tok_embedding=self.tok_embedding,
-                ).quantize(calib_loader=calibration_dataloaders)
-                logging.info("Calibration complete for prepare_pt2e")
+                    intermediate_outputs=intermediate_outputs,
+                    collect_input_samples=self.control_args.quant_recipe_suggestion,
+                )
             else:
                 # one dummy inference to remove affine observer
                 # error happened in convert_pt2e
@@ -649,32 +753,39 @@ def quantize(self, request: Request):  # noqa: C901
                 self._quant_recipe_suggestion(
                     graph_module,
                     self.decoder,
-                    calibration_dataloaders[TEXT_DECODER],
+                    input_samples,
                     self.quant_recipe.recipe,
                 )
 
-            # FP32 model used for quant-recipe-suggestion reference; release after use.
-            del graph_module
-            gc.collect()
-
             if self.apply_embedding:
                 self.tok_embedding = convert_pt2e(self.tok_embedding)
 
-            if (
-                self.control_args.verbose
-                and self.mode == Mode.CALIBRATE
-                and not self.apply_embedding
-            ):
-                run_lm_eval(
-                    module=self.decoder,
-                    get_example_inputs=self.get_example_inputs,
+            if self.control_args.verbose and self.mode == Mode.CALIBRATE:
+                audio_turns = request.method_data[
+                    AUDIO_ENCODER
+                ].calibration_data.qdq_intermediate_outputs
+                vision_turns = request.method_data[
+                    VISION_ENCODER
+                ].calibration_data.qdq_intermediate_outputs
+                if audio_turns is None:
+                    audio_turns = [
+                        [] for _ in range(len(data.calibration_data.datasets))
+                    ]
+                if vision_turns is None:
+                    vision_turns = [
+                        [] for _ in range(len(data.calibration_data.datasets))
+                    ]
+                qdq_intermediate_outputs = [
+                    [*audio_turn, *vision_turn]
+                    for audio_turn, vision_turn in zip(audio_turns, vision_turns)
+                ]
+                self._calibrate(
+                    model=self.decoder,
                     tokenizer=data.tokenizer,
-                    max_seq_length=self.meta["get_max_context_len"],
-                    tasks=self.control_args.eval_tasks,
-                    use_i64_token=self.control_args.embedding_quantize is not None,
-                    num_fewshot=self.control_args.eval_num_fewshot,
-                    limit=self.control_args.eval_limit,
-                    event_name="convert_pt2e_tasks",
+                    event="convert_pt2e",
+                    user_calibration_data=calibration_tokens,
+                    tok_embedding=self.tok_embedding,
+                    intermediate_outputs=qdq_intermediate_outputs,
                 )
 
         # setup quantized IO
@@ -711,13 +822,8 @@ def __init__(
             Mode.PREFILL,
             apply_embedding=apply_embedding,
         )
-        # Full AR sequence with KV cache; used only for quantization.
-        # Scales/zp collected here are propagated to decode and prefill graphs via _encoding_override.
-        self.calibration_prefill = TextDecoder(
-            control_args,
-            config,
-            Mode.CALIBRATE,
-            apply_embedding=apply_embedding,
+        self.calibration_prefill = TextDecoder(  # for quantization only
+            control_args, config, Mode.CALIBRATE, apply_embedding=apply_embedding
         )
 
         self.control_args = control_args
@@ -874,10 +980,149 @@ def parameter_override(quantized_node, unquantized_node):
 
         unquantized_model.recompile()
 
+    def _generate_tokens_from_hf(self, model: AutoModel, data, intermediate_outputs):
+        """Greedy-decode every calibration prompt with the HuggingFace ``model``.
+
+        Returns one list per prompt: the prompt token ids followed by the newly
+        generated ids. Non-empty encoder outputs are merged into the embeddings."""
+        from pytorch_tokenizers.tiktoken import TiktokenTokenizer
+
+        tok_embedding = self.decode.tok_embedding
+        # Test against None (not truthiness) so a placeholder id of 0 still counts.
+        modality_token_id = self.decode.meta.get("audio_token_id")
+        if modality_token_id is None:
+            modality_token_id = self.decode.meta.get("image_token_id")
+        use_i64_token = self.decode.control_args.embedding_quantize is not None
+        token_dtype = torch.int64 if use_i64_token else torch.int32
+        max_seq_len = self.decode.meta["get_max_context_len"]
+        tokenizer = data.tokenizer
+        is_multimodal = tok_embedding is not None and modality_token_id is not None
+        # Only the tiktoken tokenizer is passed ``allowed_special``.
+        encode_kwargs = {}
+        if isinstance(tokenizer, TiktokenTokenizer):
+            encode_kwargs["allowed_special"] = "all"
+
+        calibration_tokens = []
+        for hidden_states, prompt in zip(
+            intermediate_outputs, data.calibration_data.datasets
+        ):
+            token_ids = tokenizer.encode(prompt, bos=True, eos=False, **encode_kwargs)
+            budget = max_seq_len - len(token_ids)
+            if budget <= 0:
+                # generate() rejects max_new_tokens <= 0: keep the clipped prompt.
+                logging.warning("Prompt fills max context; skipping generation.")
+                calibration_tokens.append(token_ids[:max_seq_len])
+                continue
+            input_ids = torch.tensor([token_ids], dtype=torch.int64)
+            gen_kwargs = {
+                "max_new_tokens": budget,
+                "eos_token_id": tokenizer.eos_id,
+                "do_sample": False,
+            }
+            with torch.no_grad():
+                if is_multimodal and hidden_states:
+                    merged_embeds = _modality_inputs_merger(
+                        input_ids,
+                        tok_embedding(input_ids.to(token_dtype)),
+                        torch.cat(hidden_states, dim=1),
+                        modality_token_id,
+                    )
+                    new_ids = model.generate(inputs_embeds=merged_embeds, **gen_kwargs)
+                    # The prompt ids are prepended: only new ids are expected back.
+                    full_tokens = token_ids + new_ids[0].tolist()
+                else:
+                    output_ids = model.generate(input_ids=input_ids, **gen_kwargs)
+                    full_tokens = output_ids[0].tolist()
+
+            calibration_tokens.append(full_tokens)
+
+        return calibration_tokens
+
+    def _generate_calibration_tokens(self, request: Request):
+        """Produce one calibration token sequence per user prompt.
+
+        A HuggingFace checkpoint is used when ``self.config.repo_id`` is set;
+        otherwise the tokens come from running the decode graph module.
+        Returns a list with one entry per calibration prompt.
+        """
+        data = request.method_data[TEXT_DECODER]
+        prompts = data.calibration_data.datasets
+        audio_turns = request.method_data[
+            AUDIO_ENCODER
+        ].calibration_data.intermediate_outputs
+        vision_turns = request.method_data[
+            VISION_ENCODER
+        ].calibration_data.intermediate_outputs
+        # A modality without recorded outputs contributes nothing to any turn.
+        if audio_turns is None:
+            audio_turns = [[] for _ in range(len(prompts))]
+        if vision_turns is None:
+            vision_turns = [[] for _ in range(len(prompts))]
+        intermediate_outputs = [
+            [*audio_turn, *vision_turn]
+            for audio_turn, vision_turn in zip(audio_turns, vision_turns)
+        ]
+
+        if self.config.repo_id:
+            # Multimodal checkpoints need their task-specific auto class.
+            loaders = {
+                "smolvlm_500m_instruct": AutoModelForVision2Seq,
+                "internvl3_1b": AutoModelForImageTextToText,
+                "granite_speech_3_3-2b": AutoModelForSpeechSeq2Seq,
+            }
+            loader = loaders.get(self.control_args.decoder_model, AutoModelForCausalLM)
+            # Pin fp32 for every model, the text-only fallback included.
+            hf_model = loader.from_pretrained(
+                self.config.repo_id, torch_dtype=torch.float32
+            )
+            return self._generate_tokens_from_hf(
+                model=hf_model,
+                data=data,
+                intermediate_outputs=intermediate_outputs,
+            )
+
+        # Auto-tune thread count for the without-cache calibration pass.
+        calib_threads = getattr(self.control_args, "calibration_num_threads", 0)
+        if calib_threads <= 0:
+            calib_threads = self.decode._auto_tune_calibration_threads()
+        original_threads = torch.get_num_threads()
+        torch.set_num_threads(calib_threads)
+        calibration_tokens = []
+        try:
+            for hidden_states, prompt in zip(intermediate_outputs, prompts):
+                result = graph_module_inference(
+                    use_kv_cache=self.decode.meta["get_use_kv_cache"],
+                    get_example_inputs=self.decode.get_example_inputs,
+                    hidden_states=hidden_states,
+                    module=self.decode.decoder,
+                    tok_embedding=self.decode.tok_embedding,
+                    image_token_id=self.decode.meta.get("image_token_id", None),
+                    tokenizer=data.tokenizer,
+                    ar_len=self.decode.meta["get_ar_len"],
+                    max_seq_len=self.decode.meta["get_max_context_len"],
+                    prompt=prompt,
+                    use_i64_token=self.decode.control_args.embedding_quantize
+                    is not None,
+                    event_name="generated_user_prompt",
+                )
+                calibration_tokens.append(result.token_list)
+        finally:
+            # Restore the previous thread count even if inference raises.
+            torch.set_num_threads(original_threads)
+
+        return calibration_tokens
+
     def quantize(self, request: Request):
         if request.method_data[TEXT_DECODER].skip_quantize:
             return
-        self.calibration_prefill.quantize(request)
+
+        # Token sequences are generated only when user-prompt calibration is enabled.
+        tokens = (
+            None
+            if self.control_args.skip_user_prompt_calibration
+            else self._generate_calibration_tokens(request)
+        )
+        self.calibration_prefill.quantize(request, calibration_tokens=tokens)
 
     @log_info
     def compile(self, request: Request):  # noqa: C901
@@ -912,21 +1157,12 @@ def compile(self, request: Request):  # noqa: C901
 
             # Saving Decode QDQ Model EP for SQNR evaluation
             qdq_ep = torch.export.export(
-                self.calibration_prefill.decoder,
-                self.calibration_prefill.export_input,
-                strict=True,
+                self.decode.decoder, self.decode.export_input, strict=True
             )
             qdq_ep_path = f"{self.decode.control_args.artifact}/{DECODE_QDQ_FILENAME}"
             torch.export.save(qdq_ep, qdq_ep_path)
             logging.info(f"QDQ EP saved to {qdq_ep_path}")
 
-            if self.apply_embedding:
-                self._encoding_override(
-                    quantized_model=self.calibration_prefill.tok_embedding,
-                    unquantized_model=self.decode.tok_embedding,
-                    override_kv_cache=False,
-                )
-
             # For hybrid mode, override encoding of prefill model.
             if (
                 self.prefill.decoder is not None
@@ -1092,8 +1328,6 @@ def __init__(
             # metadata
             self.config = config
 
-            self._encoder_inference = EncoderInference()
-
         self.pass_manager_cls = get_qnn_pass_manager_cls()
         self.passes_job = self.pass_manager_cls.get_capture_program_passes()
         self.dep_table = (
@@ -1145,7 +1379,7 @@ def compile(self, request: Request):
             self.dep_table[TagQuantIO] = [SplitGraph]
 
             if not request_data.skip_quantize:
-                fixed_point_type = {"io_type": torch.float32}
+                fixed_point_type = {"io_type": torch.uint16}
 
                 # setup quantized IO
                 self.passes_job[TagQuantIO][QCOM_PASS_ACTIVATE_KEY] = True
@@ -1178,17 +1412,10 @@ def compile(self, request: Request):
 
     def _calibrate(self, model, calibration_datasets):
         outputs = []
-        for batch in calibration_datasets:
-            outputs_each_batch = [
-                self._encoder_inference.predict_step(model, data)
-                for data in batch["inputs"]
-            ]
-            outputs.append(outputs_each_batch)
-        return DataLoader(
-            ModalityEncoderDataset(outputs),
-            batch_size=1,
-            shuffle=False,
-        )
+        # Run every sample through the model, keeping one inner list of outputs
+        # per turn in dataset order.
+        outputs.extend([model(*s) for s in turn] for turn in calibration_datasets)
+        return outputs
 
     def quantize(self, request: Request):
         if self.model is None:
@@ -1232,8 +1459,6 @@ def quantize(self, request: Request):
 
 class MultiModalManager(Component):
     def __init__(self, control_args: argparse.Namespace, config: LLMModelConfig):
-        self.control_args = control_args
-        self.config = config
         self.audio_encoder = Modality(
             control_args,
             config,
@@ -1292,20 +1517,12 @@ def compile(
     @log_info
     def quantize(
         self,
-        tokenizer_wrapper: TokenizerWrapper,
+        calibration_data: Dict[str, List[Any]],
         skip_quantize: Dict[str, bool],
+        tokenizer,
         backend,
         soc_model,
     ):
-        data_config = DataConfig.from_args(self.control_args)
-        dataset_builder = DatasetBuilder(
-            data_config=data_config,
-            llm_config=self.config,
-            tokenizer_wrapper=tokenizer_wrapper,
-            attn_mask=self.text_decoder.calibration_prefill.attn_mask,
-        )
-        calibration_data = dataset_builder.build_calib_dataloaders()
-
         quantize_request = Request(
             inspect.currentframe().f_code.co_name,
             {
@@ -1314,7 +1531,7 @@ def quantize(
                         datasets=calibration_data[m]
                     ),
                     skip_quantize=skip_quantize.get(m, False),
-                    tokenizer=tokenizer_wrapper.tokenizer,
+                    tokenizer=tokenizer,
                     backend=backend,
                     soc_model=soc_model,
                 )
diff --git a/examples/qualcomm/oss_scripts/llm_utils/decoder_model_wrapper.py b/examples/qualcomm/oss_scripts/llm_utils/decoder_model_wrapper.py
index 8dc334baf28..f59dc548c44 100644
--- a/examples/qualcomm/oss_scripts/llm_utils/decoder_model_wrapper.py
+++ b/examples/qualcomm/oss_scripts/llm_utils/decoder_model_wrapper.py
@@ -38,10 +38,7 @@ def save_config_to_constant_methods(
         # Check for cache_config and its attributes
         cache_config = getattr(generation_config, "cache_config", None)
         if cache_config is not None:
-            if isinstance(cache_config, dict):
-                max_seq_len = cache_config.get("max_cache_len", None)
-            else:
-                max_seq_len = getattr(cache_config, "max_cache_len", None)
+            max_seq_len = getattr(cache_config, "max_cache_len", None)
             if max_seq_len is not None:
                 metadata["get_max_seq_len"] = max_seq_len
 
@@ -118,7 +115,7 @@ def _qnn_attention_mask(
 
     # Simplest and most efficient way to obtain a causal mask
     causal_mask = kv_arange <= reshaped_cache_position
-    atten_mask = torch.full((causal_mask.shape[0], kv_length), -65504.0)
+    atten_mask = torch.full((causal_mask.shape[0], kv_length), torch.tensor(-65504.0))
     atten_mask = atten_mask.masked_fill(causal_mask, 0)
     atten_mask = atten_mask[None, None, :, :].expand(batch_size, -1, -1, -1)
 
@@ -136,7 +133,7 @@ def __init__(self, model):
         logging.info(f"Metadata to be recorded in PTE: {self._metadata}")
         self.exportable_module = TorchExportableModuleForDecoderOnlyLM(
             self.model,
-            batch_size=1,
+            max_batch_size=1,
             max_cache_len=self._metadata.get("get_max_seq_len"),
         )
         self._register_attention_mask_for_4_53(self.exportable_module)
@@ -157,9 +154,7 @@ def get_example_inputs(self):
         return (example_input_ids, example_cache_position)
 
     def forward(self, input_ids: torch.Tensor, cache_position: torch.Tensor):
-        return self.exportable_module(
-            input_ids=input_ids, cache_position=cache_position
-        )
+        return self.exportable_module(input_ids, cache_position)
 
     def get_metadata(self):
         return self._metadata
diff --git a/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py b/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py
index f9d1b8993a3..89277bcaac8 100644
--- a/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py
+++ b/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py
@@ -171,6 +171,16 @@ def pt2e_calibrate(
         calibration_data,
         tokenizer_path,
     ):
+        try:
+            from executorch.examples.qualcomm.oss_scripts.llm_utils.eval_decoder_model_qnn import (
+                GraphModuleCalibrationWrapper,
+            )
+            from lm_eval.evaluator import simple_evaluate
+        except ImportError:
+            raise ImportError(
+                "Please install the llm eval dependency via examples/models/llama/install_requirements.sh"
+            )
+
         tokenizer = get_tokenizer(tokenizer_path)
         logging.info(
             f"Calibrating with tasks: {calibration_tasks}, limit: {calibration_limit}, calibration_data: {calibration_data}, tokenizer_path: {tokenizer_path}, seq_length: {self.config.max_seq_len}"
@@ -201,17 +211,6 @@ def calibrate_template(
             max_len=calibration_seq_length,
         )
         if calibration_tasks is not None and calibration_limit is not None:
-            # Import lazily so only import lm_eval when user use it.
-            try:
-                from executorch.examples.qualcomm.oss_scripts.llm_utils.eval_decoder_model_qnn import (
-                    GraphModuleCalibrationWrapper,
-                )
-                from lm_eval.evaluator import simple_evaluate
-            except ImportError:
-                raise ImportError(
-                    "Please install the llm eval dependency via examples/models/llama/install_requirements.sh"
-                )
-
             eval_wrapper = GraphModuleCalibrationWrapper(
                 model=self.graph_module,
                 tokenizer=tokenizer,
diff --git a/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py b/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py
index 7876a5b54b3..70641af8fb7 100644
--- a/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py
+++ b/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py
@@ -14,6 +14,7 @@
 
 import torch
 from executorch.backends.qualcomm.export_utils import (
+    get_backend_type,
     QnnConfig,
     setup_common_args_and_variables,
     SimpleADB,
@@ -74,7 +75,7 @@ def compile(args: argparse.Namespace, qnn_config: QnnConfig):  # noqa: C901
             args.calibration_limit,
             args.prompt,
             tokenizer_json_path,
-            qnn_config.backend,
+            get_backend_type(qnn_config.backend),
             qnn_config.soc_model,
         )
 
@@ -157,7 +158,7 @@ def post_process():
             runner="examples/models/llama/llama_main",
         )
         # No pregen inputs, input_list is not required
-        adb.push(inputs=[], files=[tokenizer_json_path])
+        adb.push(inputs=[], input_list="", files=[tokenizer_json_path])
         adb.execute(custom_runner_cmd=runner_cmd)
 
         adb.pull(host_output_path=args.artifact, callback=post_process)
diff --git a/exir/backend/utils.py b/exir/backend/utils.py
index 9bdba810138..f63f20717d1 100644
--- a/exir/backend/utils.py
+++ b/exir/backend/utils.py
@@ -390,10 +390,9 @@ def tag_constant_data(edge_program: ExportedProgram) -> None:
                         "If the data is too large and it's not preferred to copy, please tag the "
                         "constant node like node.['no_copy'] = True and they won't be copied."
                     )
-                # Pick a deterministic consumer tag so a constant shared across
-                # partitions is assigned reproducibly across runs.
+                # tag the data node with the same tag as the last user
                 if len(user_tags) > 0:
-                    node.meta["delegation_tag"] = min(user_tags)
+                    node.meta["delegation_tag"] = user_tags.pop()
 
 
 def tag_mutated_buffer(edge_program: ExportedProgram) -> None:
@@ -430,10 +429,9 @@ def tag_mutated_buffer(edge_program: ExportedProgram) -> None:
                     "If the data is too large and it's not preferred to copy, please tag the "
                     "constant node like node.['no_copy'] = True and they won't be copied."
                 )
-            # Pick a deterministic consumer tag so a buffer shared across
-            # partitions is assigned reproducibly across runs.
+            # tag the data node with the same tag as the last user
             if len(user_tags) > 0:
-                node.meta["delegation_tag"] = min(user_tags)
+                node.meta["delegation_tag"] = user_tags.pop()
 
 
 def is_shape_dynamic(node: torch.fx.Node) -> bool:
diff --git a/exir/pass_base.py b/exir/pass_base.py
index c657ac53a91..910adf64de9 100644
--- a/exir/pass_base.py
+++ b/exir/pass_base.py
@@ -97,52 +97,6 @@ def _unstack_pytree(xs) -> List[PyTree]:  # pyre-ignore
     return pytrees
 
 
-@dataclass(frozen=True)
-class _SymbolicTensorSnapshot:
-    shape: Tuple[Optional[str], ...]
-
-
-def _symbolic_scalar_snapshot(
-    value: Argument,
-) -> Optional[Tuple[str, str]]:
-    if isinstance(value, torch.SymInt):
-        return ("SymInt", str(value))
-    if isinstance(value, torch.SymFloat):
-        return ("SymFloat", str(value))
-    if isinstance(value, torch.SymBool):
-        return ("SymBool", str(value))
-    return None
-
-
-def _leaf_symbolic_snapshot(value: Argument) -> Any:
-    scalar_snapshot = _symbolic_scalar_snapshot(value)
-    if scalar_snapshot is not None:
-        return scalar_snapshot
-
-    if isinstance(value, FakeTensor):
-        dims = []
-        has_symbolic_dim = False
-        for dim in value.shape:
-            dim_snapshot = _symbolic_scalar_snapshot(dim)
-            if dim_snapshot is None:
-                dims.append(None)
-            else:
-                has_symbolic_dim = True
-                dims.append(dim_snapshot[1])
-        if has_symbolic_dim:
-            return _SymbolicTensorSnapshot(tuple(dims))
-
-    return None
-
-
-def _extract_symbolic_snapshot(value: Argument) -> Any:
-    snapshot = pytree.tree_map(_leaf_symbolic_snapshot, value)
-    leaves = pytree.tree_leaves(snapshot)
-    if any(leaf is not None for leaf in leaves):
-        return snapshot
-    return None
-
-
 class NodeMetadata:
     def __init__(self, data: Dict[str, Any]) -> None:
         self.data: Dict[str, Any] = data.copy()
@@ -526,50 +480,6 @@ def __init__(self) -> None:
         self._initialized = True
         self.node_debug_str: Optional[str] = None
 
-    def should_preserve_symbolic_input_metadata(self) -> bool:
-        """Returns whether replay should validate symbolic input preservation.
-
-        Override to ``False`` for passes that intentionally change symbolic
-        input metadata during replay.
-        """
-        return True
-
-    def _capture_symbolic_input_snapshots(
-        self, graph_module: fx.GraphModule
-    ) -> List[Any]:
-        return [
-            _extract_symbolic_snapshot(node.meta.get("val"))
-            for node in graph_module.graph.nodes
-            if node.op == "placeholder"
-        ]
-
-    def _validate_symbolic_input_snapshots(
-        self,
-        graph_module: fx.GraphModule,
-        new_graph_module: fx.GraphModule,
-    ) -> None:
-        if not self.should_preserve_symbolic_input_metadata():
-            return
-
-        symbolic_inputs = self._capture_symbolic_input_snapshots(graph_module)
-        if all(snapshot is None for snapshot in symbolic_inputs):
-            return
-
-        new_symbolic_inputs = self._capture_symbolic_input_snapshots(new_graph_module)
-        for input_index, snapshot in enumerate(symbolic_inputs):
-            if snapshot is None:
-                continue
-            if input_index >= len(new_symbolic_inputs):
-                raise ExportPassBaseError(
-                    f"Input at position {input_index} did not preserve symbolic metadata across pass replay."
-                )
-
-            current_snapshot = new_symbolic_inputs[input_index]
-            if current_snapshot != snapshot:
-                raise ExportPassBaseError(
-                    f"Input at position {input_index} did not preserve symbolic metadata across pass replay."
-                )
-
     def _fx(
         self,
         kind: str,
@@ -781,7 +691,6 @@ def call_submodule(
             interpreter.run(*inputs_data)
 
         new_graph_module = torch.fx.GraphModule(self.tracer.root, self.tracer.graph)
-        self._validate_symbolic_input_snapshots(graph_module, new_graph_module)
 
         # Preserve GraphModule-level metadata from the input module.
         new_graph_module.meta = graph_module.meta.copy()
diff --git a/exir/tensor.py b/exir/tensor.py
index a4e480ffce0..fa1287fbd85 100644
--- a/exir/tensor.py
+++ b/exir/tensor.py
@@ -10,6 +10,8 @@
 # pyre-ignore-all-errors[16]
 from __future__ import annotations
 
+import copy
+
 import math
 import typing
 from typing import Dict, List, NamedTuple, Optional, Tuple, Union
@@ -110,7 +112,7 @@ def stride_from_dim_order(sizes: List[int], dim_order: List[int]) -> List[int]:
     """
     if len(sizes) == 0:
         return []
-    strides = list(sizes)
+    strides = copy.deepcopy(sizes)
     ndim = len(sizes)
     strides[dim_order[ndim - 1]] = 1
     for i in range(ndim - 2, -1, -1):
diff --git a/exir/tests/test_pass_infra.py b/exir/tests/test_pass_infra.py
index 59406b13f8f..7df6b76b93a 100644
--- a/exir/tests/test_pass_infra.py
+++ b/exir/tests/test_pass_infra.py
@@ -15,9 +15,7 @@
 from executorch.exir.pass_base import (
     ExportedProgramPassBase,
     ExportedProgramPassResult,
-    ExportPass,
     ExportPassBaseError,
-    NodeMetadata,
     ProxyValue,
 )
 from executorch.exir.pass_manager import ExportedProgramPassManager, PassManager
@@ -451,109 +449,3 @@ def f(x: torch.Tensor) -> torch.Tensor:
 
         with self.assertRaisesRegex(Exception, "call_method"):
             pm(exported_program)
-
-
-class TestPassBaseSymbolicInputs(unittest.TestCase):
-    class SymSizeModule(torch.nn.Module):
-        def forward(self, x: torch.Tensor) -> torch.Tensor:
-            return x.view(x.size(0), -1)
-
-    @staticmethod
-    def _find_input_node(gm: torch.fx.GraphModule) -> torch.fx.Node:
-        for node in gm.graph.nodes:
-            if node.op == "placeholder" and "val" in node.meta:
-                return node
-        raise AssertionError("Expected to find an input placeholder")
-
-    @staticmethod
-    def _symbolic_input_shape(node: torch.fx.Node) -> tuple[str | None, ...]:
-        value = node.meta["val"]
-        assert isinstance(value, torch.Tensor)
-        return tuple(
-            str(dim) if isinstance(dim, torch.SymInt) else None for dim in value.shape
-        )
-
-    def _export_dynamic_graph_module(self) -> torch.fx.GraphModule:
-        exported = export(
-            self.SymSizeModule(),
-            (torch.randn(2, 3),),
-            dynamic_shapes=({0: Dim("batch", min=1, max=8)},),
-            strict=True,
-        )
-        return to_edge(exported).exported_program().graph_module
-
-    def test_export_pass_preserves_symbolic_input_metadata(self) -> None:
-        graph_module = self._export_dynamic_graph_module()
-        original_input = self._find_input_node(graph_module)
-        original_snapshot = self._symbolic_input_shape(original_input)
-        self.assertTrue(any(dim is not None for dim in original_snapshot))
-
-        new_graph_module = ExportPass()(graph_module).graph_module
-        new_input = self._find_input_node(new_graph_module)
-
-        self.assertEqual(self._symbolic_input_shape(new_input), original_snapshot)
-
-    def test_export_pass_matches_symbolic_inputs_by_position(self) -> None:
-        class RenamePlaceholderPass(ExportPass):
-            def placeholder(
-                self,
-                name: str,
-                arg: torch.Tensor,
-                meta: NodeMetadata,
-            ) -> ProxyValue:
-                return super().placeholder(f"renamed_{name}", arg, meta)
-
-        new_graph_module = RenamePlaceholderPass()(
-            self._export_dynamic_graph_module()
-        ).graph_module
-        new_input = self._find_input_node(new_graph_module)
-
-        self.assertEqual(new_input.name, "renamed_x")
-        self.assertTrue(
-            any(dim is not None for dim in self._symbolic_input_shape(new_input))
-        )
-
-    def test_export_pass_rejects_collapsed_symbolic_input_metadata(self) -> None:
-        class CollapseSymbolicInputPass(ExportPass):
-            def placeholder(
-                self,
-                name: str,
-                arg: torch.Tensor,
-                meta: NodeMetadata,
-            ) -> ProxyValue:
-                proxy = super().placeholder(name, arg, meta)
-                if any(isinstance(dim, torch.SymInt) for dim in arg.shape):
-                    proxy.node.meta["val"] = torch.empty(2, 3, device="meta")
-                return proxy
-
-        with self.assertRaisesRegex(
-            ExportPassBaseError,
-            "Input at position 0 did not preserve symbolic metadata",
-        ):
-            CollapseSymbolicInputPass()(self._export_dynamic_graph_module())
-
-    def test_export_pass_can_disable_symbolic_input_validation(self) -> None:
-        class CollapseSymbolicInputPass(ExportPass):
-            def should_preserve_symbolic_input_metadata(self) -> bool:
-                return False
-
-            def placeholder(
-                self,
-                name: str,
-                arg: torch.Tensor,
-                meta: NodeMetadata,
-            ) -> ProxyValue:
-                proxy = super().placeholder(name, arg, meta)
-                if any(isinstance(dim, torch.SymInt) for dim in arg.shape):
-                    proxy.node.meta["val"] = torch.empty(2, 3, device="meta")
-                return proxy
-
-        graph_module = self._export_dynamic_graph_module()
-        original_snapshot = self._symbolic_input_shape(
-            self._find_input_node(graph_module)
-        )
-
-        new_graph_module = CollapseSymbolicInputPass()(graph_module).graph_module
-        new_input = self._find_input_node(new_graph_module)
-
-        self.assertNotEqual(self._symbolic_input_shape(new_input), original_snapshot)
diff --git a/exir/tests/test_tensor.py b/exir/tests/test_tensor.py
index 6435ca98a13..25bf2ea451e 100644
--- a/exir/tests/test_tensor.py
+++ b/exir/tests/test_tensor.py
@@ -388,26 +388,6 @@ def test_strides_from_dim_order(self) -> None:
         strides = stride_from_dim_order(sizes, dim_order)
         self.assertEqual(expected_strides, strides)
 
-    def test_strides_from_dim_order_with_symbolic_sizes(self) -> None:
-        class ViewModule(torch.nn.Module):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return x.view(x.shape[0], -1)
-
-        exported_program = torch.export.export(
-            ViewModule(),
-            (torch.randn(2, 3, 4),),
-            dynamic_shapes={"x": {0: torch.export.Dim("batch", min=1, max=8)}},
-        )
-        placeholder = next(
-            node
-            for node in exported_program.graph_module.graph.nodes
-            if node.op == "placeholder"
-        )
-        sizes = list(placeholder.meta["val"].shape)
-
-        self.assertIsInstance(sizes[0], torch.SymInt)
-        self.assertEqual([12, 4, 1], stride_from_dim_order(sizes, [0, 1, 2]))
-
     def test_num_bytes_from_shape_and_dtype(self) -> None:
         shape = (2, 3, 4)
         self.assertEqual(24, num_bytes_from_shape_and_dtype(shape, torch.int8))
diff --git a/extension/aten_util/make_aten_functor_from_et_functor.h b/extension/aten_util/make_aten_functor_from_et_functor.h
index c7a823c26df..8e1c2bf0143 100644
--- a/extension/aten_util/make_aten_functor_from_et_functor.h
+++ b/extension/aten_util/make_aten_functor_from_et_functor.h
@@ -67,12 +67,12 @@ struct type_map<torch::executor::Tensor> final {
 
 // Optional.
 template <class T>
-struct type_map<std::optional<T>> final {
+struct type_map<torch::executor::optional<T>> final {
   using type = std::optional<typename type_map<T>::type>;
 };
 
 template <class T>
-struct type_map<std::optional<T>&> final {
+struct type_map<torch::executor::optional<T>&> final {
   using type = std::optional<typename type_map<T>::type>&;
 };
 
@@ -177,7 +177,7 @@ struct type_convert<
                 typename remove_const_ref<AOptional>::type::value_type>> &&
         std::is_same_v<
             typename remove_const_ref<EOptional>::type,
-            std::optional<
+            torch::executor::optional<
                 typename remove_const_ref<EOptional>::type::value_type>>>>
     final {
  public:
diff --git a/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp b/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp
index 3abc84897ce..b76596b9963 100644
--- a/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp
+++ b/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp
@@ -32,8 +32,8 @@ Tensor& add_1_out(const Tensor& a, Tensor& out) {
 }
 
 Tensor& add_optional_scalar_out(
-    std::optional<int64_t> s1,
-    std::optional<int64_t> s2,
+    torch::executor::optional<int64_t> s1,
+    torch::executor::optional<int64_t> s2,
     Tensor& out) {
   if (s1.has_value()) {
     out.mutable_data_ptr<int64_t>()[0] += s1.value();
@@ -45,8 +45,8 @@ Tensor& add_optional_scalar_out(
 }
 
 Tensor& add_optional_tensor_out(
-    std::optional<torch::executor::Tensor> s1,
-    std::optional<torch::executor::Tensor> s2,
+    torch::executor::optional<torch::executor::Tensor> s1,
+    torch::executor::optional<torch::executor::Tensor> s2,
     Tensor& out) {
   if (s1.has_value()) {
     out.mutable_data_ptr<int64_t>()[0] +=
@@ -78,7 +78,8 @@ Tensor& sum_arrayref_tensor_out(
 }
 
 Tensor& sum_arrayref_optional_tensor_out(
-    torch::executor::ArrayRef<std::optional<torch::executor::Tensor>> a,
+    torch::executor::ArrayRef<
+        torch::executor::optional<torch::executor::Tensor>> a,
     Tensor& out) {
   for (int i = 0; i < a.size(); i++) {
     if (a[i].has_value()) {
@@ -168,19 +169,20 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestTypeMap_Tuple_TensorRef3x) {
 TEST_F(MakeATenFunctorFromETFunctorTest, TestTypeMap_Optionals) {
   // Scalar.
   EXPECT_TRUE((std::is_same<
-               type_map<std::optional<int64_t>>::type,
+               type_map<torch::executor::optional<int64_t>>::type,
                std::optional<int64_t>>::value));
   // Tensor.
-  EXPECT_TRUE((std::is_same<
-               type_map<std::optional<torch::executor::Tensor>>::type,
-               std::optional<at::Tensor>>::value));
-  // ArrayRef.
   EXPECT_TRUE(
       (std::is_same<
-          type_map<std::optional<torch::executor::ArrayRef<int64_t>>>::type,
-          std::optional<c10::ArrayRef<int64_t>>>::value));
+          type_map<torch::executor::optional<torch::executor::Tensor>>::type,
+          std::optional<at::Tensor>>::value));
+  // ArrayRef.
+  EXPECT_TRUE((std::is_same<
+               type_map<torch::executor::optional<
+                   torch::executor::ArrayRef<int64_t>>>::type,
+               std::optional<c10::ArrayRef<int64_t>>>::value));
   EXPECT_TRUE((std::is_same<
-               type_map<std::optional<
+               type_map<torch::executor::optional<
                    torch::executor::ArrayRef<torch::executor::Tensor>>>::type,
                std::optional<c10::ArrayRef<at::Tensor>>>::value));
 }
@@ -196,13 +198,13 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestTypeMap_ArrayRef) {
           type_map<torch::executor::ArrayRef<torch::executor::Tensor>>::type,
           c10::ArrayRef<at::Tensor>>::value));
   // Optionals.
-  EXPECT_TRUE(
-      (std::is_same<
-          type_map<torch::executor::ArrayRef<std::optional<int64_t>>>::type,
-          c10::ArrayRef<std::optional<int64_t>>>::value));
   EXPECT_TRUE((std::is_same<
                type_map<torch::executor::ArrayRef<
-                   std::optional<torch::executor::Tensor>>>::type,
+                   torch::executor::optional<int64_t>>>::type,
+               c10::ArrayRef<std::optional<int64_t>>>::value));
+  EXPECT_TRUE((std::is_same<
+               type_map<torch::executor::ArrayRef<
+                   torch::executor::optional<torch::executor::Tensor>>>::type,
                c10::ArrayRef<std::optional<at::Tensor>>>::value));
 }
 
@@ -251,16 +253,17 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_OptionalScalar) {
   // Convert optional at to et.
   auto optional_at_in = std::optional<int64_t>();
   auto optional_et =
-      type_convert<std::optional<int64_t>, std::optional<int64_t>>(
+      type_convert<std::optional<int64_t>, torch::executor::optional<int64_t>>(
           optional_at_in)
           .call();
   EXPECT_TRUE(
-      (std::is_same<decltype(optional_et), std::optional<int64_t>>::value));
+      (std::is_same<decltype(optional_et), torch::executor::optional<int64_t>>::
+           value));
 
   // Convert optional et to at.
-  auto optional_et_in = std::optional<int64_t>();
+  auto optional_et_in = torch::executor::optional<int64_t>();
   auto optional_at_out =
-      type_convert<std::optional<int64_t>, std::optional<int64_t>>(
+      type_convert<torch::executor::optional<int64_t>, std::optional<int64_t>>(
           optional_et_in)
           .call();
   EXPECT_TRUE(
@@ -270,19 +273,20 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_OptionalScalar) {
 TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_OptionalTensor) {
   // Convert optional at to et.
   auto optional_at_in = std::optional<at::Tensor>();
-  auto optional_et = type_convert<
-                         std::optional<at::Tensor>,
-                         std::optional<torch::executor::Tensor>>(optional_at_in)
-                         .call();
+  auto optional_et =
+      type_convert<
+          std::optional<at::Tensor>,
+          torch::executor::optional<torch::executor::Tensor>>(optional_at_in)
+          .call();
   EXPECT_TRUE((std::is_same<
                decltype(optional_et),
-               std::optional<torch::executor::Tensor>>::value));
+               torch::executor::optional<torch::executor::Tensor>>::value));
 
   // Convert optional et to at.
   torch::executor::testing::TensorFactory<ScalarType::Int> tf;
-  auto et_in = std::optional<torch::executor::Tensor>(tf.ones({3}));
+  auto et_in = torch::executor::optional<torch::executor::Tensor>(tf.ones({3}));
   auto optional_at_out = type_convert<
-                             std::optional<torch::executor::Tensor>,
+                             torch::executor::optional<torch::executor::Tensor>,
                              std::optional<at::Tensor>>(optional_et)
                              .call();
   EXPECT_TRUE(
@@ -515,8 +519,9 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
   const std::optional<int64_t> const_optional_at_in =
       std::optional<int64_t>(42);
   auto const_optional_et =
-      type_convert<const std::optional<int64_t>, std::optional<int64_t>>(
-          const_optional_at_in)
+      type_convert<
+          const std::optional<int64_t>,
+          torch::executor::optional<int64_t>>(const_optional_at_in)
           .call();
   EXPECT_TRUE(const_optional_et.has_value());
   EXPECT_EQ(const_optional_et.value(), 42);
@@ -524,7 +529,7 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
   // Test optional scalar reference conversion
   std::optional<int64_t> optional_at_ref_in = std::optional<int64_t>(24);
   auto optional_et_from_ref =
-      type_convert<std::optional<int64_t>&, std::optional<int64_t>>(
+      type_convert<std::optional<int64_t>&, torch::executor::optional<int64_t>>(
           optional_at_ref_in)
           .call();
   EXPECT_TRUE(optional_et_from_ref.has_value());
@@ -534,8 +539,9 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
   const std::optional<int64_t> const_optional_at_ref_in =
       std::optional<int64_t>(84);
   auto const_optional_et_from_ref =
-      type_convert<const std::optional<int64_t>&, std::optional<int64_t>>(
-          const_optional_at_ref_in)
+      type_convert<
+          const std::optional<int64_t>&,
+          torch::executor::optional<int64_t>>(const_optional_at_ref_in)
           .call();
   EXPECT_TRUE(const_optional_et_from_ref.has_value());
   EXPECT_EQ(const_optional_et_from_ref.value(), 84);
@@ -545,7 +551,8 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
       std::optional<at::Tensor>(torch::tensor({5}));
   auto const_optional_tensor_converter = type_convert<
       const std::optional<at::Tensor>,
-      std::optional<torch::executor::Tensor>>(const_optional_tensor_at_in);
+      torch::executor::optional<torch::executor::Tensor>>(
+      const_optional_tensor_at_in);
   auto const_optional_tensor_et = const_optional_tensor_converter.call();
   EXPECT_TRUE(const_optional_tensor_et.has_value());
   EXPECT_EQ(const_optional_tensor_et.value().const_data_ptr<int64_t>()[0], 5);
@@ -555,7 +562,8 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
       std::optional<at::Tensor>(torch::tensor({7}));
   auto optional_tensor_converter_from_ref = type_convert<
       std::optional<at::Tensor>&,
-      std::optional<torch::executor::Tensor>>(optional_tensor_at_ref_in);
+      torch::executor::optional<torch::executor::Tensor>>(
+      optional_tensor_at_ref_in);
   auto optional_tensor_et_from_ref = optional_tensor_converter_from_ref.call();
   EXPECT_TRUE(optional_tensor_et_from_ref.has_value());
   EXPECT_EQ(
@@ -566,7 +574,8 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
       std::optional<at::Tensor>(torch::tensor({9}));
   auto const_optional_tensor_converter_from_ref = type_convert<
       const std::optional<at::Tensor>&,
-      std::optional<torch::executor::Tensor>>(const_optional_tensor_at_ref_in);
+      torch::executor::optional<torch::executor::Tensor>>(
+      const_optional_tensor_at_ref_in);
   auto const_optional_tensor_et_from_ref =
       const_optional_tensor_converter_from_ref.call();
   EXPECT_TRUE(const_optional_tensor_et_from_ref.has_value());
@@ -577,8 +586,9 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
   // Test empty const optional conversions
   const std::optional<int64_t> empty_const_optional_at_in = std::nullopt;
   auto empty_const_optional_et =
-      type_convert<const std::optional<int64_t>, std::optional<int64_t>>(
-          empty_const_optional_at_in)
+      type_convert<
+          const std::optional<int64_t>,
+          torch::executor::optional<int64_t>>(empty_const_optional_at_in)
           .call();
   EXPECT_FALSE(empty_const_optional_et.has_value());
 
@@ -587,7 +597,7 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
   auto empty_const_optional_tensor_et =
       type_convert<
           const std::optional<at::Tensor>,
-          std::optional<torch::executor::Tensor>>(
+          torch::executor::optional<torch::executor::Tensor>>(
           empty_const_optional_tensor_at_in)
           .call();
   EXPECT_FALSE(empty_const_optional_tensor_et.has_value());
diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp
index 206499b26eb..bc5c17ef33f 100644
--- a/extension/data_loader/file_data_loader.cpp
+++ b/extension/data_loader/file_data_loader.cpp
@@ -13,7 +13,6 @@
 #include <cstddef>
 #include <cstring>
 #include <limits>
-#include <new>
 
 #include <executorch/runtime/platform/compat_unistd.h>
 #include <fcntl.h>
@@ -45,12 +44,7 @@ namespace extension {
 
 namespace {
 inline void* et_aligned_alloc(size_t size, std::align_val_t alignment) {
-  // Use the nothrow form so allocation failure returns nullptr instead of
-  // throwing std::bad_alloc. ExecuTorch is built exception-free and callers
-  // (e.g. FileDataLoader::load) check for nullptr and return
-  // Error::MemoryAllocationFailed; a throw here would unwind with no landing
-  // pad and abort the process.
-  return ::operator new(size, alignment, std::nothrow);
+  return ::operator new(size, alignment);
 }
 
 inline void et_aligned_free(void* ptr, std::align_val_t alignment) {
diff --git a/extension/data_loader/test/file_data_loader_test.cpp b/extension/data_loader/test/file_data_loader_test.cpp
index bcf17e4afee..7dc872995a5 100644
--- a/extension/data_loader/test/file_data_loader_test.cpp
+++ b/extension/data_loader/test/file_data_loader_test.cpp
@@ -8,9 +8,7 @@
 
 #include <executorch/extension/data_loader/file_data_loader.h>
 
-#include <atomic>
 #include <cstring>
-#include <new>
 
 #include <gtest/gtest.h>
 
@@ -27,59 +25,6 @@ using executorch::runtime::Error;
 using executorch::runtime::FreeableBuffer;
 using executorch::runtime::Result;
 
-namespace {
-// When set, the replacement nothrow aligned operator new below returns nullptr,
-// simulating an allocation failure without needing a real OOM.
-std::atomic<bool> g_fail_aligned_nothrow_alloc{false};
-
-// RAII guard to ensure flag is reset even if test asserts early.
-struct FailAllocGuard {
-  FailAllocGuard() {
-    g_fail_aligned_nothrow_alloc.store(true, std::memory_order_relaxed);
-  }
-  ~FailAllocGuard() {
-    g_fail_aligned_nothrow_alloc.store(false, std::memory_order_relaxed);
-  }
-};
-} // namespace
-
-// Detect ASAN to avoid multiple definition link error and to skip test when
-// ASAN runtime provides its own strong operator new.
-#if defined(__SANITIZE_ADDRESS__) || \
-    (defined(__has_feature) && __has_feature(address_sanitizer))
-#define ET_TEST_ASAN_ENABLED 1
-#else
-#define ET_TEST_ASAN_ENABLED 0
-#endif
-
-#if !ET_TEST_ASAN_ENABLED
-// Replaces the global nothrow aligned allocation function for this test binary
-// so FileDataLoader's segment allocation can be made to fail on demand. When
-// the toggle is off it forwards to the real aligned allocator. We call the
-// throwing aligned new inside a try/catch and convert exceptions to nullptr
-// to emulate nothrow semantics without recursing into this same nothrow
-// overload (calling ::operator new(size, alignment, std::nothrow) here would
-// infinite-loop). Memory allocated here is released through the default
-// operator delete, which is not replaced.
-// This is a strong (non-weak) replacement so it reliably overrides libc++'s
-// default on all platforms (a weak definition loses to libc++'s own weak
-// definition on Apple's linker, leaving the override silently unused). Under
-// ASAN this whole block is excluded so it can't clash with ASAN's allocator.
-void* operator new(
-    std::size_t size,
-    std::align_val_t alignment,
-    const std::nothrow_t& /* tag */) noexcept {
-  if (g_fail_aligned_nothrow_alloc.load(std::memory_order_relaxed)) {
-    return nullptr;
-  }
-  try {
-    return ::operator new(size, alignment);
-  } catch (...) {
-    return nullptr;
-  }
-}
-#endif // !ET_TEST_ASAN_ENABLED
-
 class FileDataLoaderTest : public ::testing::TestWithParam<size_t> {
  protected:
   void SetUp() override {
@@ -202,46 +147,6 @@ TEST_P(FileDataLoaderTest, OutOfBoundsLoadFails) {
   }
 }
 
-#if !ET_TEST_ASAN_ENABLED
-TEST_P(FileDataLoaderTest, AllocationFailureDuringLoadReturnsError) {
-  // Create a temp file; contents don't matter.
-  uint8_t data[256] = {};
-  TempFile tf(data, sizeof(data));
-
-  Result<FileDataLoader> fdl =
-      FileDataLoader::from(tf.path().c_str(), alignment());
-  ASSERT_EQ(fdl.error(), Error::Ok);
-
-  // Force the segment allocation inside load() to fail. The loader must surface
-  // Error::MemoryAllocationFailed rather than letting std::bad_alloc escape,
-  // which would abort the process in the exception-free runtime.
-  FailAllocGuard fail_guard;
-  Result<FreeableBuffer> fb = fdl->load(
-      /*offset=*/0,
-      /*size=*/sizeof(data),
-      DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::Program));
-
-  EXPECT_EQ(fb.error(), Error::MemoryAllocationFailed);
-}
-#endif // !ET_TEST_ASAN_ENABLED
-
-#if !ET_TEST_ASAN_ENABLED
-TEST_P(FileDataLoaderTest, AllocationFailureDuringFromReturnsError) {
-  // Create a temp file; contents don't matter.
-  uint8_t data[256] = {};
-  TempFile tf(data, sizeof(data));
-
-  // Force the filename allocation inside from() to fail. FileDataLoader::from
-  // copies the filename using et_aligned_alloc and must return
-  // Error::MemoryAllocationFailed on nullptr rather than throwing.
-  FailAllocGuard fail_guard;
-  Result<FileDataLoader> fdl =
-      FileDataLoader::from(tf.path().c_str(), alignment());
-
-  EXPECT_EQ(fdl.error(), Error::MemoryAllocationFailed);
-}
-#endif // !ET_TEST_ASAN_ENABLED
-
 TEST_P(FileDataLoaderTest, FromMissingFileFails) {
   // Wrapping a file that doesn't exist should fail.
   Result<FileDataLoader> fdl = FileDataLoader::from(
diff --git a/extension/flat_tensor/flat_tensor_data_map.cpp b/extension/flat_tensor/flat_tensor_data_map.cpp
index 342e29e63fc..845778f45c2 100644
--- a/extension/flat_tensor/flat_tensor_data_map.cpp
+++ b/extension/flat_tensor/flat_tensor_data_map.cpp
@@ -49,7 +49,7 @@ bool is_aligned(const void* data) {
 }
 
 Result<const flat_tensor_flatbuffer::NamedData*> get_named_data(
-    std::string_view key,
+    executorch::aten::string_view key,
     const flatbuffers::Vector<
         flatbuffers::Offset<flat_tensor_flatbuffer::NamedData>>* named_data,
     const flatbuffers::Vector<
@@ -127,7 +127,7 @@ Result<const TensorLayout> create_tensor_layout(
 } // namespace
 
 ET_NODISCARD Result<const TensorLayout> FlatTensorDataMap::get_tensor_layout(
-    std::string_view key) const {
+    executorch::aten::string_view key) const {
   Result<uint64_t> segment_end_offset = get_segment_end_offset(header_);
   if (!segment_end_offset.ok()) {
     return segment_end_offset.error();
@@ -144,7 +144,7 @@ ET_NODISCARD Result<const TensorLayout> FlatTensorDataMap::get_tensor_layout(
 }
 
 ET_NODISCARD Result<FreeableBuffer> FlatTensorDataMap::get_data(
-    std::string_view key) const {
+    executorch::aten::string_view key) const {
   Result<uint64_t> segment_end_offset = get_segment_end_offset(header_);
   if (!segment_end_offset.ok()) {
     return segment_end_offset.error();
@@ -170,7 +170,7 @@ ET_NODISCARD Result<FreeableBuffer> FlatTensorDataMap::get_data(
 }
 
 ET_NODISCARD Error FlatTensorDataMap::load_data_into(
-    ET_UNUSED std::string_view key,
+    ET_UNUSED executorch::aten::string_view key,
     ET_UNUSED void* buffer,
     ET_UNUSED size_t size) const {
   Result<uint64_t> segment_end_offset = get_segment_end_offset(header_);
diff --git a/extension/flat_tensor/flat_tensor_data_map.h b/extension/flat_tensor/flat_tensor_data_map.h
index 7b66eeab470..751e312f7ef 100644
--- a/extension/flat_tensor/flat_tensor_data_map.h
+++ b/extension/flat_tensor/flat_tensor_data_map.h
@@ -54,7 +54,7 @@ class FlatTensorDataMap final
   ET_NODISCARD
   executorch::runtime::Result<
       const executorch::ET_RUNTIME_NAMESPACE::TensorLayout>
-  get_tensor_layout(std::string_view key) const override;
+  get_tensor_layout(executorch::aten::string_view key) const override;
 
   /**
    * Retrieve read-only data for the specified key.
@@ -65,7 +65,7 @@ class FlatTensorDataMap final
    */
   ET_NODISCARD
   executorch::runtime::Result<executorch::runtime::FreeableBuffer> get_data(
-      std::string_view key) const override;
+      executorch::aten::string_view key) const override;
 
   /**
    * Loads the data of the specified tensor into the provided buffer.
@@ -78,7 +78,7 @@ class FlatTensorDataMap final
    * @returns an Error indicating if the load was successful.
    */
   ET_NODISCARD executorch::runtime::Error load_data_into(
-      std::string_view key,
+      executorch::aten::string_view key,
       void* buffer,
       size_t size) const override;
 
diff --git a/extension/llm/modules/turboquant/kv_cache.py b/extension/llm/modules/turboquant/kv_cache.py
index 684f763b44e..12c01721a15 100644
--- a/extension/llm/modules/turboquant/kv_cache.py
+++ b/extension/llm/modules/turboquant/kv_cache.py
@@ -158,13 +158,9 @@ def update(self, input_pos, k_val, v_val):
         k_packed, k_norms = self._compress(k_val)
         v_packed, v_norms = self._compress(v_val)
 
-        # index_copy_ (not self.x[:, :, input_pos] = ...) keeps the decode
-        # write CUDA-graph-capturable: a static scatter along the position
-        # dim, matching the model's flat global KV cache. Plain index
-        # assignment lowers to index_put_, which breaks cuda_graph capture.
-        self.k_packed.index_copy_(2, input_pos, k_packed)
-        self.k_norms.index_copy_(2, input_pos, k_norms)
-        self.v_packed.index_copy_(2, input_pos, v_packed)
-        self.v_norms.index_copy_(2, input_pos, v_norms)
+        self.k_packed[:, :, input_pos] = k_packed
+        self.k_norms[:, :, input_pos] = k_norms
+        self.v_packed[:, :, input_pos] = v_packed
+        self.v_norms[:, :, input_pos] = v_norms
 
         return self.k_packed, self.k_norms, self.v_packed, self.v_norms
diff --git a/extension/named_data_map/merged_data_map.cpp b/extension/named_data_map/merged_data_map.cpp
index d76f741fbf4..630395e006c 100644
--- a/extension/named_data_map/merged_data_map.cpp
+++ b/extension/named_data_map/merged_data_map.cpp
@@ -12,13 +12,13 @@
 #include <unordered_map>
 #include <vector>
 
+using executorch::aten::string_view;
 using executorch::ET_RUNTIME_NAMESPACE::NamedDataMap;
 using executorch::ET_RUNTIME_NAMESPACE::TensorLayout;
 using executorch::runtime::Error;
 using executorch::runtime::FreeableBuffer;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
-using std::string_view;
 
 namespace executorch::extension {
 namespace ET_MERGED_DATA_MAP_NAMESPACE {
diff --git a/extension/named_data_map/merged_data_map.h b/extension/named_data_map/merged_data_map.h
index cc291b4d093..42490ec3d58 100644
--- a/extension/named_data_map/merged_data_map.h
+++ b/extension/named_data_map/merged_data_map.h
@@ -48,7 +48,7 @@ class MergedDataMap final
   ET_NODISCARD
   executorch::runtime::Result<
       const executorch::ET_RUNTIME_NAMESPACE::TensorLayout>
-  get_tensor_layout(std::string_view key) const override;
+  get_tensor_layout(executorch::aten::string_view key) const override;
 
   /**
    * Retrieve read-only data for the specified key.
@@ -59,7 +59,7 @@ class MergedDataMap final
    */
   ET_NODISCARD
   executorch::runtime::Result<executorch::runtime::FreeableBuffer> get_data(
-      std::string_view key) const override;
+      executorch::aten::string_view key) const override;
 
   /**
    * Loads the data of the specified tensor into the provided buffer.
@@ -72,7 +72,7 @@ class MergedDataMap final
    * @returns an Error indicating if the load was successful.
    */
   ET_NODISCARD executorch::runtime::Error load_data_into(
-      std::string_view key,
+      executorch::aten::string_view key,
       void* buffer,
       size_t size) const override;
 
diff --git a/kernels/portable/cpu/op_index_put.cpp b/kernels/portable/cpu/op_index_put.cpp
index 519842db598..812d3e8fab3 100644
--- a/kernels/portable/cpu/op_index_put.cpp
+++ b/kernels/portable/cpu/op_index_put.cpp
@@ -19,7 +19,8 @@ namespace executor {
 namespace native {
 
 using Tensor = executorch::aten::Tensor;
-using TensorOptList = executorch::aten::ArrayRef<std::optional<Tensor>>;
+using TensorOptList =
+    executorch::aten::ArrayRef<executorch::aten::optional<Tensor>>;
 
 Tensor& index_put_out(
     KernelRuntimeContext& ctx,
diff --git a/kernels/portable/cpu/op_log_softmax.cpp b/kernels/portable/cpu/op_log_softmax.cpp
index a2e1afeff32..1fa7a903e7f 100644
--- a/kernels/portable/cpu/op_log_softmax.cpp
+++ b/kernels/portable/cpu/op_log_softmax.cpp
@@ -70,7 +70,7 @@ Tensor& log_softmax_out(
                   size,
                   stride);
 
-              const ACC exp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
+              ACC temp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
                   [max_in](const CTYPE val_in) {
                     return std::exp(
                         static_cast<ACC>(val_in) - static_cast<ACC>(max_in));
@@ -81,13 +81,13 @@ Tensor& log_softmax_out(
                   in_data + base,
                   size,
                   stride);
-              const ACC log_sum = std::log(exp_sum);
+              temp_sum = std::log(temp_sum);
 
               apply_unary_map_fn(
-                  [max_in, log_sum](const CTYPE val_in) {
+                  [max_in, temp_sum](const CTYPE val_in) {
                     return static_cast<CTYPE>(
                         static_cast<ACC>(val_in) - static_cast<ACC>(max_in) -
-                        log_sum);
+                        temp_sum);
                   },
                   in_data + base,
                   out_data + base,
diff --git a/kernels/portable/cpu/op_native_dropout.cpp b/kernels/portable/cpu/op_native_dropout.cpp
index dc72fb54599..fae7928568d 100644
--- a/kernels/portable/cpu/op_native_dropout.cpp
+++ b/kernels/portable/cpu/op_native_dropout.cpp
@@ -17,7 +17,7 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
     KernelRuntimeContext& ctx,
     const Tensor& input,
     double prob,
-    std::optional<bool> train,
+    torch::executor::optional<bool> train,
     Tensor& out,
     Tensor& mask) {
   std::tuple<Tensor&, Tensor&> ret(out, mask);
diff --git a/kernels/test/op_native_dropout_test.cpp b/kernels/test/op_native_dropout_test.cpp
index fec00c87862..931205f54a5 100644
--- a/kernels/test/op_native_dropout_test.cpp
+++ b/kernels/test/op_native_dropout_test.cpp
@@ -25,7 +25,7 @@ class OpNativeDropoutTest : public OperatorTest {
   void op_native_dropout_out(
       const Tensor& self,
       double prob,
-      std::optional<bool> train,
+      executorch::aten::optional<bool> train,
       Tensor& out,
       Tensor& mask) {
     torch::executor::aten::native_dropout_outf(
diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h
index ac4fb9a126e..f539414aec9 100644
--- a/runtime/core/exec_aten/exec_aten.h
+++ b/runtime/core/exec_aten/exec_aten.h
@@ -8,9 +8,6 @@
 
 #pragma once
 
-#include <optional>
-#include <string_view>
-
 #include <executorch/runtime/core/error.h> // @manual
 #include <executorch/runtime/core/result.h> // @manual
 #include <executorch/runtime/core/tensor_shape_dynamism.h> // @manual
@@ -186,7 +183,8 @@ using quint2x4 = torch::executor::quint2x4;
 using IntArrayRef = torch::executor::IntArrayRef;
 
 template <typename T>
-using OptionalArrayRef = std::optional<torch::executor::ArrayRef<T>>;
+using OptionalArrayRef =
+    torch::executor::optional<torch::executor::ArrayRef<T>>;
 using OptionalIntArrayRef = OptionalArrayRef<int64_t>;
 
 using torch::executor::compute_numel;
diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h
index 3e8e36b442e..f48b50a0786 100644
--- a/runtime/core/exec_aten/util/scalar_type_util.h
+++ b/runtime/core/exec_aten/util/scalar_type_util.h
@@ -51,7 +51,7 @@ using ScalarType = at::ScalarType;
 namespace executorch {
 namespace aten {
 using ScalarType = torch::executor::ScalarType;
-using string_view = std::string_view;
+using string_view = torch::executor::string_view;
 } // namespace aten
 } // namespace executorch
 #endif // USE_ATEN_LIB
diff --git a/runtime/core/memory_allocator.h b/runtime/core/memory_allocator.h
index 4d8f8da3b4f..001ebd7ac4f 100644
--- a/runtime/core/memory_allocator.h
+++ b/runtime/core/memory_allocator.h
@@ -178,22 +178,6 @@ class MemoryAllocator {
     return size_;
   }
 
-  // Returns the number of bytes currently allocated from this allocator. The
-  // default implementation reports the bump cursor's offset from the base
-  // (cur_ - begin_); subclasses backed by a different allocator should override
-  // this to match their own accounting.
-  virtual size_t used_size() const {
-    return static_cast<size_t>(cur_ - begin_);
-  }
-
-  // Returns the number of bytes still available for allocation, not accounting
-  // for any alignment padding a future allocation may require. The default
-  // implementation reports end_ - cur_; subclasses should override to stay
-  // consistent with used_size().
-  virtual size_t free_size() const {
-    return static_cast<size_t>(end_ - cur_);
-  }
-
   // Resets the current pointer to the base address. It does nothing to
   // the contents.
   virtual void reset() {
diff --git a/runtime/core/named_data_map.h b/runtime/core/named_data_map.h
index dbd5b21a66f..c6b6aa4bb7b 100644
--- a/runtime/core/named_data_map.h
+++ b/runtime/core/named_data_map.h
@@ -31,7 +31,7 @@ class NamedDataMap {
    * @return Result containing TensorLayout.
    */
   ET_NODISCARD virtual Result<const TensorLayout> get_tensor_layout(
-      std::string_view key) const = 0;
+      executorch::aten::string_view key) const = 0;
   /**
    * Get data by key.
    *
@@ -39,7 +39,7 @@ class NamedDataMap {
    * @return Result containing a FreeableBuffer.
    */
   ET_NODISCARD virtual Result<FreeableBuffer> get_data(
-      std::string_view key) const = 0;
+      executorch::aten::string_view key) const = 0;
 
   /**
    * Loads data corresponding to the key into the provided buffer.
@@ -51,8 +51,10 @@ class NamedDataMap {
    * `size` bytes of memory.
    * @returns an Error indicating if the load was successful.
    */
-  ET_NODISCARD virtual Error
-  load_data_into(std::string_view key, void* buffer, size_t size) const = 0;
+  ET_NODISCARD virtual Error load_data_into(
+      executorch::aten::string_view key,
+      void* buffer,
+      size_t size) const = 0;
 
   /**
    * Get the number of keys in the NamedDataMap.
diff --git a/runtime/core/portable_type/optional.h b/runtime/core/portable_type/optional.h
index deff1f1b2cc..31ad06fd093 100644
--- a/runtime/core/portable_type/optional.h
+++ b/runtime/core/portable_type/optional.h
@@ -10,16 +10,16 @@
 
 #include <optional>
 
-#include <executorch/runtime/platform/compiler.h>
-
 namespace executorch {
 namespace runtime {
 namespace etensor {
 
-template <typename T>
-using optional ET_DEPRECATED = std::optional<T>;
-using nullopt_t ET_DEPRECATED = std::nullopt_t;
-ET_DEPRECATED inline constexpr std::nullopt_t nullopt{std::nullopt};
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::nullopt;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::nullopt_t;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::optional;
 
 } // namespace etensor
 } // namespace runtime
@@ -29,9 +29,8 @@ namespace torch {
 namespace executor {
 // TODO(T197294990): Remove these deprecated aliases once all users have moved
 // to the new `::executorch` namespaces.
-template <typename T>
-using optional ET_DEPRECATED = std::optional<T>;
-using nullopt_t ET_DEPRECATED = std::nullopt_t;
-ET_DEPRECATED inline constexpr std::nullopt_t nullopt{std::nullopt};
+using ::executorch::runtime::etensor::nullopt;
+using ::executorch::runtime::etensor::nullopt_t;
+using ::executorch::runtime::etensor::optional;
 } // namespace executor
 } // namespace torch
diff --git a/runtime/core/portable_type/string_view.h b/runtime/core/portable_type/string_view.h
index f1f25f0b881..8e28fa022cc 100644
--- a/runtime/core/portable_type/string_view.h
+++ b/runtime/core/portable_type/string_view.h
@@ -10,13 +10,11 @@
 
 #include <string_view>
 
-#include <executorch/runtime/platform/compiler.h>
-
 namespace executorch {
 namespace runtime {
 namespace etensor {
 
-using string_view ET_DEPRECATED = std::string_view;
+using std::string_view;
 
 } // namespace etensor
 } // namespace runtime
@@ -26,6 +24,6 @@ namespace torch {
 namespace executor {
 // TODO(T197294990): Remove these deprecated aliases once all users have moved
 // to the new `::executorch` namespaces.
-using string_view ET_DEPRECATED = std::string_view;
+using ::executorch::runtime::etensor::string_view;
 } // namespace executor
 } // namespace torch
diff --git a/runtime/core/portable_type/test/CMakeLists.txt b/runtime/core/portable_type/test/CMakeLists.txt
index 15fda045875..b1f57a93ab5 100644
--- a/runtime/core/portable_type/test/CMakeLists.txt
+++ b/runtime/core/portable_type/test/CMakeLists.txt
@@ -19,8 +19,14 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..)
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
-set(_test_srcs bfloat16_test.cpp dont_shadow_complex_test.c half_test.cpp
-               scalar_test.cpp tensor_impl_test.cpp tensor_test.cpp
+set(_test_srcs
+    bfloat16_test.cpp
+    dont_shadow_complex_test.c
+    half_test.cpp
+    optional_test.cpp
+    scalar_test.cpp
+    tensor_impl_test.cpp
+    tensor_test.cpp
 )
 
 et_cxx_test(runtime_core_portable_type_test SOURCES ${_test_srcs} EXTRA_LIBS)
diff --git a/runtime/core/portable_type/test/targets.bzl b/runtime/core/portable_type/test/targets.bzl
index ce831f89327..a6671d7d400 100644
--- a/runtime/core/portable_type/test/targets.bzl
+++ b/runtime/core/portable_type/test/targets.bzl
@@ -15,6 +15,14 @@ def define_common_targets():
         ],
     )
 
+    runtime.cxx_test(
+        name = "optional_test",
+        srcs = ["optional_test.cpp"],
+        deps = [
+            "//executorch/runtime/core/portable_type:portable_type",
+        ],
+    )
+
     runtime.cxx_test(
         name = "tensor_test",
         srcs = ["tensor_test.cpp"],
diff --git a/runtime/core/test/memory_allocator_test.cpp b/runtime/core/test/memory_allocator_test.cpp
index 922ec9a828c..fee95a6407e 100644
--- a/runtime/core/test/memory_allocator_test.cpp
+++ b/runtime/core/test/memory_allocator_test.cpp
@@ -52,89 +52,6 @@ TEST_F(MemoryAllocatorTest, MemoryAllocator) {
   ASSERT_NE(nullptr, allocator.allocate(16));
 }
 
-TEST_F(MemoryAllocatorTest, UsedAndFreeSize) {
-  constexpr size_t mem_size = 64;
-  std::array<uint8_t, mem_size> mem_pool{};
-  MemoryAllocator allocator(mem_size, mem_pool.data());
-
-  EXPECT_EQ(allocator.used_size(), 0u);
-  EXPECT_EQ(allocator.free_size(), mem_size);
-
-  void* p1 = allocator.allocate(8, /*alignment=*/8);
-  ASSERT_NE(p1, nullptr);
-  // Independently derive the expected bump offset from the returned block, so
-  // the free_size() check is a real comparison and not the identity
-  // free_size() == size() - used_size().
-  const size_t expected_used1 = static_cast<size_t>(
-      static_cast<uint8_t*>(p1) + 8 - allocator.base_address());
-  EXPECT_EQ(allocator.used_size(), expected_used1);
-  EXPECT_EQ(allocator.free_size(), mem_size - expected_used1);
-
-  void* p2 = allocator.allocate(8, /*alignment=*/8);
-  ASSERT_NE(p2, nullptr);
-  const size_t expected_used2 = static_cast<size_t>(
-      static_cast<uint8_t*>(p2) + 8 - allocator.base_address());
-  EXPECT_GT(expected_used2, expected_used1);
-  EXPECT_EQ(allocator.used_size(), expected_used2);
-  EXPECT_EQ(allocator.free_size(), mem_size - expected_used2);
-
-  allocator.reset();
-  EXPECT_EQ(allocator.used_size(), 0u);
-  EXPECT_EQ(allocator.free_size(), mem_size);
-}
-
-TEST_F(MemoryAllocatorTest, UsedAndFreeSizeZeroCapacity) {
-  MemoryAllocator allocator(0, nullptr);
-  EXPECT_EQ(allocator.used_size(), 0u);
-  EXPECT_EQ(allocator.free_size(), 0u);
-}
-
-namespace {
-// Overrides the accessors with sentinel values to prove base-reference calls
-// dispatch virtually to the override.
-class SentinelAccessorAllocator : public MemoryAllocator {
- public:
-  using MemoryAllocator::MemoryAllocator;
-  size_t used_size() const override {
-    return 111;
-  }
-  size_t free_size() const override {
-    return 222;
-  }
-};
-} // namespace
-
-TEST_F(MemoryAllocatorTest, UsedAndFreeSizeDispatchVirtually) {
-  std::array<uint8_t, 16> mem_pool{};
-  SentinelAccessorAllocator derived(mem_pool.size(), mem_pool.data());
-  MemoryAllocator& base = derived;
-  EXPECT_EQ(base.used_size(), 111u);
-  EXPECT_EQ(base.free_size(), 222u);
-}
-
-// The base used_size()/free_size() report the true bump-cursor offset
-// (cur_ - begin_): the end of the last block relative to base_address(),
-// including the padding inserted before an aligned allocation. That matches the
-// deleted EspMemoryAllocator's formula (end_block - base_address) exactly. The
-// deleted ArmMemoryAllocator tracker used a different formula that skipped this
-// inter-allocation padding -- for the sequence below it would have reported 17,
-// not 32 -- so dropping it intentionally makes Arm report this corrected value.
-TEST_F(MemoryAllocatorTest, UsedAndFreeSizeAcrossAlignmentPadding) {
-  constexpr size_t mem_size = 128;
-  std::array<uint8_t, mem_size> mem_pool{};
-  MemoryAllocator allocator(mem_size, mem_pool.data());
-
-  // 1-byte block, then a 16-aligned block: 1 + 15 padding + 16 = 32 used.
-  ASSERT_NE(allocator.allocate(1, /*alignment=*/1), nullptr);
-  void* p2 = allocator.allocate(16, /*alignment=*/16);
-  ASSERT_NE(p2, nullptr);
-
-  const size_t expected_used = static_cast<size_t>(
-      static_cast<uint8_t*>(p2) + 16 - allocator.base_address());
-  EXPECT_EQ(allocator.used_size(), expected_used);
-  EXPECT_EQ(allocator.free_size(), allocator.size() - expected_used);
-}
-
 TEST_F(MemoryAllocatorTest, MemoryAllocatorAlignment) {
   constexpr size_t arr_size = 6;
   size_t allocation[arr_size] = {7, 6, 3, 76, 4, 1};
diff --git a/runtime/executor/merged_data_map.h b/runtime/executor/merged_data_map.h
index aae92d90a9b..d5ae97057f2 100644
--- a/runtime/executor/merged_data_map.h
+++ b/runtime/executor/merged_data_map.h
@@ -57,7 +57,7 @@ class MergedDataMap final : public NamedDataMap {
    */
   ET_NODISCARD
   Result<const TensorLayout> get_tensor_layout(
-      std::string_view key) const override {
+      executorch::aten::string_view key) const override {
     auto layout = first_->get_tensor_layout(key);
     if (layout.ok()) {
       return layout.get();
@@ -76,7 +76,8 @@ class MergedDataMap final : public NamedDataMap {
    * @return error if the key is not present or data cannot be loaded.
    */
   ET_NODISCARD
-  Result<FreeableBuffer> get_data(std::string_view key) const override {
+  Result<FreeableBuffer> get_data(
+      executorch::aten::string_view key) const override {
     auto data = first_->get_data(key);
     if (data.error() != Error::NotFound) {
       return data;
@@ -96,7 +97,7 @@ class MergedDataMap final : public NamedDataMap {
    * @returns an Error indicating if the load was successful.
    */
   ET_NODISCARD Error load_data_into(
-      ET_UNUSED std::string_view key,
+      ET_UNUSED executorch::aten::string_view key,
       ET_UNUSED void* buffer,
       ET_UNUSED size_t size) const override {
     return Error::NotImplemented;
diff --git a/runtime/executor/pte_data_map.cpp b/runtime/executor/pte_data_map.cpp
index e35745e7689..881bfd5165a 100644
--- a/runtime/executor/pte_data_map.cpp
+++ b/runtime/executor/pte_data_map.cpp
@@ -26,7 +26,8 @@ namespace internal {
 }
 
 ET_NODISCARD
-Result<FreeableBuffer> PteDataMap::get_data(std::string_view key) const {
+Result<FreeableBuffer> PteDataMap::get_data(
+    executorch::aten::string_view key) const {
   for (uint32_t i = 0; i < named_data_->size(); i++) {
     const auto* named_data_item = named_data_->Get(i);
     ET_CHECK_OR_RETURN_ERROR(
diff --git a/runtime/executor/pte_data_map.h b/runtime/executor/pte_data_map.h
index 36d33ae3945..b4b46a6b541 100644
--- a/runtime/executor/pte_data_map.h
+++ b/runtime/executor/pte_data_map.h
@@ -79,7 +79,7 @@ class PteDataMap final : public NamedDataMap {
    */
   ET_NODISCARD
   Result<const TensorLayout> get_tensor_layout(
-      ET_UNUSED std::string_view key) const override {
+      ET_UNUSED executorch::aten::string_view key) const override {
     return Error::NotImplemented;
   }
 
@@ -91,13 +91,14 @@ class PteDataMap final : public NamedDataMap {
    * @return error if the key is not present or data cannot be loaded.
    */
   ET_NODISCARD
-  Result<FreeableBuffer> get_data(std::string_view key) const override;
+  Result<FreeableBuffer> get_data(
+      executorch::aten::string_view key) const override;
 
   /**
    * The PteDataMap currently does not implement load_into.
    */
   ET_NODISCARD Error load_data_into(
-      ET_UNUSED std::string_view key,
+      ET_UNUSED executorch::aten::string_view key,
       ET_UNUSED void* buffer,
       ET_UNUSED size_t size) const override {
     return Error::NotImplemented;
diff --git a/runtime/executor/test/method_meta_test.cpp b/runtime/executor/test/method_meta_test.cpp
index 1324a40cf52..a1991a0562c 100644
--- a/runtime/executor/test/method_meta_test.cpp
+++ b/runtime/executor/test/method_meta_test.cpp
@@ -38,7 +38,7 @@ class TensorInfoTestFriend final {
       Span<const uint8_t> dim_order,
       executorch::aten::ScalarType scalar_type,
       const bool is_memory_planned,
-      std::string_view name) {
+      executorch::aten::string_view name) {
     return TensorInfo::create(
                Span<const int32_t>(sizes.data(), sizes.size()),
                Span<const uint8_t>(dim_order.data(), dim_order.size()),
@@ -236,7 +236,7 @@ TEST_F(MethodMetaTest, TensorInfoSizeOverflow) {
           Span<const uint8_t>(dim_order.data(), dim_order.size()),
           executorch::aten::ScalarType::Float,
           false, // is_memory_planned
-          std::string_view{nullptr, 0}),
+          executorch::aten::string_view{nullptr, 0}),
       "");
 }
 
diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json
index d7d0cb08567..c0877aac924 100644
--- a/test/utils/OSSTestConfig.json
+++ b/test/utils/OSSTestConfig.json
@@ -75,6 +75,7 @@
             "bfloat16_test.cpp",
             "dont_shadow_complex_test.c",
             "half_test.cpp",
+            "optional_test.cpp",
             "scalar_test.cpp",
             "tensor_impl_test.cpp",
             "tensor_test.cpp"

From 15a10d6d08f12ff8f028512f796400f09f0f8154 Mon Sep 17 00:00:00 2001
From: Songhao Jia <gasoonjia@meta.com>
Date: Thu, 25 Jun 2026 10:20:04 -0700
Subject: [PATCH 2/2] =?UTF-8?q?gemma4=5F31b:=20combined=20internal-test=20?=
 =?UTF-8?q?branch=20(export=5Funder=5F32gb=20+=20cuda-decode-speedup=20+?=
 =?UTF-8?q?=20cuda-attn-perf=20+=20tq4-prefill-decode-tuned).=20DO=20NOT?=
 =?UTF-8?q?=20LAND=20=E2=80=94=20internal=20test=20only.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .ci/scripts/export_model_artifact.sh          |   3 +-
 .ci/scripts/test_model_e2e.sh                 | 101 ++++
 .claude/skills/qualcomm/new_op_development.md |  15 +-
 .flake8                                       |   1 +
 .github/workflows/build-cadence-runner.yml    |  64 +--
 .github/workflows/mlx.yml                     |  23 +
 .github/workflows/validate_flatbuffer_gen.yml |  16 +-
 .gitignore                                    |   1 +
 .lintrunner.toml                              |   3 +
 Makefile                                      |  12 +-
 backends/aoti/aoti_backend.py                 |  44 +-
 backends/aoti/aoti_delegate_handle.h          |  26 +
 backends/aoti/aoti_partitioner.py             | 120 ++--
 backends/aoti/tests/TARGETS                   |  12 +
 .../apple/metal/runtime/metal_backend.cpp     |  10 +-
 backends/arm/_passes/__init__.py              |   2 +
 backends/arm/_passes/arm_pass_manager.py      |   7 +-
 .../aten_to_tosa_activation_functions.py      |  18 +
 backends/arm/_passes/decompose_round_pass.py  |  14 +-
 .../arm/_passes/deduplicate_get_attr_pass.py  |  27 +-
 backends/arm/_passes/exir_to_tosa_pass.py     |  43 +-
 .../arm/_passes/insert_dynamic_padding.py     |  14 +-
 backends/arm/_passes/insert_rescales_pass.py  |  35 +-
 backends/arm/_passes/insert_table_ops.py      |   1 +
 backends/arm/_passes/rewrite_conv_pass.py     |  26 +-
 backends/arm/_passes/rewrite_mxfp_linear.py   |  98 +++-
 .../arm/_passes/size_adjust_input_pass.py     |  43 +-
 backends/arm/_passes/symbolic_value_range.py  | 113 +++-
 backends/arm/ao_ext/mxfp.py                   |  79 ++-
 backends/arm/ao_ext/mxfp_tosa_lib.py          |   1 +
 backends/arm/ao_ext/mxfp_transform.py         |   3 +
 backends/arm/ao_ext/ops/__init__.py           |   2 +
 backends/arm/ao_ext/ops/mxfp_linear_op.py     |  77 ++-
 backends/arm/operator_support/TARGETS         |   1 +
 backends/arm/operator_support/__init__.py     |   1 +
 .../tosa_profile_supported_op_lists.py        |   3 +
 .../tosa_supported_operators.py               | 406 +++++++++----
 backends/arm/operators/__init__.py            |   2 +
 .../operators/op_tosa_cast_to_block_scaled.py |  85 ++-
 .../op_tosa_matmul_t_block_scaled.py          |   8 +-
 backends/arm/operators/op_tosa_shapes.py      | 218 ++++++-
 backends/arm/process_node.py                  | 101 +++-
 .../arm/quantizer/quantization_annotator.py   |   1 +
 .../arm/scripts/install_models_for_test.sh    |   7 +-
 backends/arm/scripts/pre-push                 |   2 +-
 backends/arm/test/misc/test_mxfp_linear_ao.py | 103 +++-
 backends/arm/test/misc/test_process_node.py   |  80 ++-
 backends/arm/test/misc/test_runner_utils.py   | 116 ++++
 backends/arm/test/misc/test_vgf_backend.py    | 188 +++++-
 backends/arm/test/misc/test_vgf_check_env.py  |  99 +++-
 .../test_tosa_dialect_cast_to_block_scaled.py |  50 +-
 .../test_tosa_dialect_mxfp_linear.py          |  34 ++
 .../arm/test/ops/mxfp/test_mxfp_linear.py     | 218 +++++--
 backends/arm/test/ops/test_round.py           |   2 -
 .../test_insert_dynamic_padding_pass.py       | 154 +++--
 .../arm/test/passes/test_rewrite_conv_pass.py |  38 +-
 .../passes/test_rewrite_mxfp_linear_pass.py   |  42 +-
 .../test/passes/test_symbolic_value_range.py  |  13 +
 backends/arm/test/runner_utils.py             | 142 ++++-
 backends/arm/test/targets.bzl                 |   3 +
 backends/arm/tosa/dialect/__init__.py         |   1 +
 .../tosa/dialect/ops/cast_to_block_scaled.py  |  25 +-
 backends/arm/tosa/dialect/ops/conv2d.py       |  31 +-
 backends/arm/tosa/dialect/ops/conv3d.py       |  28 +-
 .../arm/tosa/dialect/ops/depthwise_conv2d.py  |  19 +-
 .../tosa/dialect/ops/matmul_t_block_scaled.py |  44 +-
 backends/arm/tosa/mapping.py                  |  17 +-
 backends/arm/tosa/partitioner.py              |  12 +-
 backends/arm/tosa/utils.py                    |   4 +
 backends/arm/vgf/backend.py                   | 160 +++++-
 backends/arm/vgf/check_env.py                 | 177 +-----
 backends/arm/vgf/model_converter.py           | 180 +++++-
 backends/cadence/fused_quant/op_add.cpp       |   2 +-
 backends/cadence/fused_quant/op_add.h         |  13 +-
 backends/cadence/fused_quant/op_bmm.cpp       |   2 +-
 backends/cadence/fused_quant/op_bmm.h         |  13 +-
 backends/cadence/fused_quant/op_hardswish.cpp |   2 +-
 backends/cadence/fused_quant/op_hardswish.h   |   8 +-
 backends/cadence/fused_quant/op_mul.cpp       |   2 +-
 backends/cadence/fused_quant/op_mul.h         |  13 +-
 backends/cadence/fused_quant/op_relu.cpp      |   2 +-
 backends/cadence/fused_quant/op_relu.h        |   8 +-
 backends/cadence/fused_quant/quant_utils.h    |   4 +-
 .../cadence/fused_quant/tests/test_op_add.cpp |   2 +-
 .../cadence/fused_quant/tests/test_op_bmm.cpp |   2 +-
 .../fused_quant/tests/test_op_hardswish.cpp   |   2 +-
 .../cadence/fused_quant/tests/test_op_mul.cpp |   2 +-
 .../fused_quant/tests/test_op_relu.cpp        |   2 +-
 .../generic/operators/op_avg_pool2d.cpp       |   2 +-
 .../cadence/generic/operators/op_avg_pool2d.h |   5 +-
 .../generic/operators/op_fully_connected.cpp  |   2 +-
 .../generic/operators/op_fully_connected.h    |   2 +-
 .../generic/operators/op_linalg_svd.cpp       |   2 +-
 .../cadence/generic/operators/op_linalg_svd.h |   2 +-
 .../operators/op_quantized_conv1d_nlc.cpp     |   2 +-
 .../operators/op_quantized_conv1d_nlc.h       |   2 +-
 .../generic/operators/op_quantized_conv2d.cpp |   4 +-
 .../generic/operators/op_quantized_conv2d.h   |   2 +-
 .../op_quantized_depthwise_conv1d_nlc.cpp     |   2 +-
 .../operators/op_quantized_embedding_byte.cpp |   2 +-
 .../operators/op_quantized_embedding_byte.h   |   3 +-
 .../op_quantized_fully_connected.cpp          |   2 +-
 .../operators/op_quantized_fully_connected.h  |   8 +-
 .../operators/op_quantized_layer_norm.cpp     |   2 +-
 .../generic/operators/op_quantized_linear.cpp |   2 +-
 .../generic/operators/op_quantized_linear.h   |   4 +-
 .../generic/operators/op_quantized_matmul.cpp |   2 +-
 .../generic/operators/op_quantized_matmul.h   |   2 +-
 .../generic/operators/op_quantized_mul.cpp    |   2 +-
 .../generic/operators/op_quantized_relu.cpp   |   2 +-
 .../generic/operators/op_requantize.cpp       |   2 +-
 .../cadence/generic/operators/op_rope.cpp     |   4 +-
 backends/cadence/generic/operators/op_rope.h  |   4 +-
 .../cadence/generic/operators/op_softmax.cpp  |   2 +-
 .../cadence/generic/operators/op_softmax.h    |   2 +-
 .../operators/op_transposed_convolution.cpp   |   2 +-
 backends/cadence/hifi/operators/op_mean.cpp   |   2 +-
 .../operators/op_quantized_conv1d_nlc.cpp     |   2 +-
 .../op_quantized_conv2d_nhwc_out.cpp          |   2 +-
 .../op_quantized_depthwise_conv1d_nlc.cpp     |   2 +-
 .../hifi/operators/op_quantized_matmul_out.h  |   2 +-
 .../hifi/operators/op_softmax_f32_f32.cpp     |   4 +-
 backends/cadence/hifi/operators/operators.h   |   6 +-
 .../operators/op_quantized_conv_out.cpp       |   2 +-
 .../op_quantized_fully_connected_out.cpp      |   2 +-
 .../operators/op_quantized_linear_out.cpp     |   4 +-
 .../operators/op_quantized_matmul_out.cpp     |   4 +-
 .../cadence/vision/operators/op_softmax.cpp   |   2 +-
 backends/cadence/vision/operators/operators.h |   2 +-
 backends/cortex_m/TARGETS                     |  13 +-
 .../ops/cmsis_scratch_buffer_context.h        |   2 +-
 backends/cortex_m/ops/op_quantized_conv2d.cpp |   4 +-
 .../ops/op_quantized_depthwise_conv2d.cpp     |   4 +-
 backends/cortex_m/ops/op_quantized_linear.cpp |   4 +-
 .../ops/op_quantized_transpose_conv2d.cpp     |   4 +-
 backends/cortex_m/passes/BUCK                 |   2 +
 backends/cortex_m/passes/__init__.py          |  30 -
 .../cortex_m/passes/aten_to_cortex_m_pass.py  |   2 +-
 .../cortex_m/passes/scratch_buffer_sizes.py   |   2 +-
 backends/cortex_m/target_config.py            |   3 +-
 .../cortex_m/test/misc/test_cmsis_pybind.py   |   3 +-
 .../cortex_m/test/misc/test_target_config.py  |   3 +-
 backends/cortex_m/test/ops/test_avg_pool2d.py |   2 +-
 backends/cuda/cuda_backend.py                 | 190 ++++++-
 .../quantize_op_dispatch/int4_dispatch.py     |  46 +-
 backends/cuda/runtime/cuda_backend.cpp        |  25 +-
 backends/cuda/tests/test_cuda_partitioner.py  | 160 +++++-
 backends/cuda/tests/test_tq4_sdpa.py          | 438 +++++++++++++-
 backends/cuda/triton/kernels/sdpa.py          | 309 ++++++----
 backends/cuda/triton/kernels/tq4_sdpa.py      | 186 ++----
 .../nxp/backend/edge_program_converter.py     |   1 +
 backends/nxp/backend/graph_utils.py           |   2 +-
 .../ops_converters/__init__.py                |   4 +
 .../ops_converters/clamp_converter.py         |  50 +-
 .../ops_converters/hardtanh_converter.py      |  85 +--
 .../ops_converters/mean_dim_converter.py      | 125 +++-
 .../ir/converter/quantization_utils.py        |   5 +-
 backends/nxp/backend/node_format_inference.py |  73 ++-
 backends/nxp/neutron_partitioner.py           |   3 +-
 backends/nxp/nxp_backend.py                   |  55 +-
 backends/nxp/quantizer/neutron_quantizer.py   |   2 +
 backends/nxp/quantizer/patterns.py            |  43 +-
 backends/nxp/run_unittests.sh                 |   2 +-
 backends/nxp/tests/conftest.py                |   2 +-
 backends/nxp/tests/executorch_pipeline.py     |   4 +
 .../nxp/tests/generic_tests/test_cifarnet.py  |  10 +-
 .../generic_tests/test_convert_div_to_mul.py  |   3 +-
 .../tests/generic_tests/test_integration.py   |   2 +-
 .../test_quantized_input_data.py              |  43 +-
 .../node_converter/test_abs_converter.py      |   6 +-
 .../test_adaptive_avg_pool2d_converter.py     |  11 +-
 .../test_add_tensor_converter.py              |  83 ++-
 .../test_avg_pool2d_converter.py              |  19 +-
 .../node_converter/test_cat_converter.py      |  24 +-
 .../node_converter/test_clamp_converter.py    |  38 +-
 .../test_constant_pad_nd_converter.py         |  29 +-
 .../node_converter/test_hardtanh_converter.py | 313 +++++++---
 .../test_leaky_relu_converter.py              |  18 +-
 .../node_converter/test_log_converter.py      |   6 +-
 .../test_max_pool_2d_converter.py             |  35 +-
 .../node_converter/test_mean_dim_converter.py | 340 ++++++++---
 .../test_mul_tensor_converter.py              |  13 +-
 .../test_permute_copy_converter.py            |  49 +-
 .../node_converter/test_relu_converter.py     |  18 +-
 .../node_converter/test_sigmoid_converter.py  |  13 +-
 .../test_slice_tensor_converter.py            |  40 +-
 .../test_sub_tensor_converter.py              |  89 ++-
 .../node_converter/test_tanh_converter.py     |  18 +-
 .../test_upsample_bilinear2d.py               |  37 +-
 .../node_converter/test_upsample_nearest2d.py |  17 +-
 backends/nxp/tests/model_output_comparator.py |  30 +-
 backends/nxp/tests/nsys_testing.py            | 106 +++-
 backends/nxp/tests/ops_aliases.py             |   2 +
 backends/nxp/tests/utils.py                   |  32 ++
 backends/qualcomm/_passes/__init__.py         |   4 +
 backends/qualcomm/_passes/decompose_acos.py   |   4 +-
 backends/qualcomm/_passes/decompose_atan2.py  |   4 +-
 .../_passes/decompose_log_variants.py         |   6 +-
 .../qualcomm/_passes/decompose_remainder.py   |   6 +-
 backends/qualcomm/_passes/decompose_var.py    |   4 +-
 backends/qualcomm/_passes/qnn_pass_manager.py |   8 +
 backends/qualcomm/_passes/utils.py            |   2 +-
 backends/qualcomm/aot/wrappers/targets.bzl    |   3 +-
 backends/qualcomm/builders/README.md          |   6 +
 backends/qualcomm/debugger/README.md          |   4 +-
 backends/qualcomm/export_utils.py             |   7 +-
 .../quantizer/annotators/htp_rules.py         |   6 +-
 .../quantizer/annotators/lpai_rules.py        |   6 +-
 backends/qualcomm/runtime/targets.bzl         |   5 +-
 backends/qualcomm/targets.bzl                 |  32 +-
 backends/qualcomm/tests/models.py             |  50 ++
 backends/qualcomm/tests/test_qnn_delegate.py  | 166 +++++-
 .../postpone_permute_below_squeeze_view.py    |   6 +-
 .../test/test_permute_optimization_passes.py  |  35 ++
 .../runtime/graph/ops/impl/Q8taBinary.cpp     |  28 +-
 .../runtime/graph/ops/impl/Q8taConv2d.cpp     |  52 +-
 .../runtime/graph/ops/impl/Q8taConv2d.h       |   7 +-
 .../runtime/graph/ops/impl/Q8taConv2dDW.cpp   |  52 +-
 .../graph/ops/impl/Q8taConv2dIm2Col.cpp       |  72 ++-
 .../runtime/graph/ops/impl/Q8taConv2dPW.cpp   |  89 ++-
 .../graph/ops/impl/Q8taConv2dTransposed.cpp   |  58 +-
 .../runtime/graph/ops/impl/Q8taLinear.cpp     |  30 +-
 .../graph/ops/impl/Q8taQuantizeDequantize.cpp |  23 +-
 .../test/custom_ops/impl/TestConv2dDw.cpp     |   8 +-
 backends/vulkan/test/custom_ops/utils.cpp     |   4 +
 backends/vulkan/test/custom_ops/utils.h       |  18 +
 backends/webgpu/CMakeLists.txt                |  39 +-
 backends/webgpu/runtime/WebGPUBackend.cpp     |  11 +-
 backends/webgpu/runtime/WebGPUGraph.cpp       | 247 ++++++--
 backends/webgpu/runtime/WebGPUGraph.h         |  52 +-
 backends/webgpu/runtime/WebGPUUtils.h         |  22 +
 .../webgpu/scripts/test_webgpu_native_ci.sh   | 125 ++--
 backends/webgpu/test/test_build_webgpu.sh     |  29 +-
 backends/webgpu/test/test_webgpu_native.cpp   | 538 +++++++++++++-----
 codegen/api/et_cpp.py                         |   3 +-
 codegen/api/types/types.py                    |   3 +-
 devtools/bundled_program/schema/README.md     |  10 +
 devtools/bundled_program/serialize/BUCK       |   3 +-
 .../bundled_program/serialize/__init__.py     | 184 +++++-
 docs/source/backends/nxp/op-support.csv       |   1 +
 .../executor_runner/arm_memory_allocator.cpp  |  25 +-
 .../executor_runner/arm_memory_allocator.h    |  15 +-
 examples/espressif/README.md                  |   2 -
 .../espressif/executor_runner/CMakeLists.txt  |   3 +-
 .../executor_runner/esp_executor_runner.cpp   |   9 +-
 examples/models/BUCK                          |   3 +
 examples/models/__init__.py                   |   8 +
 examples/models/gemma4_31b/README.md          |  21 +-
 .../gemma4_31b/cuda_source_transformations.py | 223 +++++++-
 examples/models/gemma4_31b/export.py          |  18 +-
 examples/models/gemma4_31b/gguf_loader.py     |  70 ++-
 examples/models/gemma4_31b/main.cpp           | 122 +++-
 examples/models/gemma4_31b/model.py           |   2 +-
 examples/models/gemma4_31b/sampler.py         |   7 +-
 .../gemma4_31b/tests/test_cuda_pipeline.py    |  49 +-
 examples/models/parakeet/CMakeLists.txt       |  61 +-
 examples/models/parakeet/CMakePresets.json    |  12 +-
 examples/models/parakeet/README.md            |  25 +
 examples/models/parakeet/main.cpp             | 274 ++-------
 examples/models/qwen3_5_moe/CMakeLists.txt    |  19 +-
 examples/models/qwen3_5_moe/CMakePresets.json |  43 +-
 examples/models/qwen3_5_moe/README.md         |  83 ++-
 examples/models/qwen3_5_moe/export.py         |   6 +
 .../models/qwen3_5_moe/qwen35_moe_engine.cpp  | 111 +++-
 examples/qualcomm/oss_scripts/llama/README.md | 108 +++-
 examples/qualcomm/oss_scripts/llama/TARGETS   | 131 ++++-
 .../llama/encoder/encoder_config.py           |   7 -
 examples/qualcomm/oss_scripts/llama/llama.py  | 183 +++---
 .../oss_scripts/llama/masking_utils.py        |  91 ++-
 .../llama/mix_precision_analyzer.py           |  50 +-
 .../qualcomm/oss_scripts/llama/tokenizer.py   | 142 +++--
 .../llama/wrappers/base_component.py          |  15 +-
 .../llama/wrappers/llm_wrappers.py            | 493 +++++-----------
 .../llm_utils/decoder_model_wrapper.py        |  13 +-
 .../llm_utils/qnn_decoder_model_manager.py    |  21 +-
 .../qualcomm/oss_scripts/qwen2_5/qwen2_5.py   |   5 +-
 exir/backend/utils.py                         |  10 +-
 exir/pass_base.py                             |  91 +++
 exir/tensor.py                                |   4 +-
 exir/tests/test_pass_infra.py                 | 108 ++++
 exir/tests/test_tensor.py                     |  20 +
 .../make_aten_functor_from_et_functor.h       |   6 +-
 ...make_aten_functor_from_et_functor_test.cpp |  90 ++-
 extension/data_loader/file_data_loader.cpp    |   8 +-
 .../test/file_data_loader_test.cpp            |  95 ++++
 .../flat_tensor/flat_tensor_data_map.cpp      |   8 +-
 extension/flat_tensor/flat_tensor_data_map.h  |   6 +-
 extension/llm/modules/turboquant/kv_cache.py  |  12 +-
 extension/named_data_map/merged_data_map.cpp  |   2 +-
 extension/named_data_map/merged_data_map.h    |   6 +-
 kernels/portable/cpu/op_index_put.cpp         |   3 +-
 kernels/portable/cpu/op_log_softmax.cpp       |   8 +-
 kernels/portable/cpu/op_native_dropout.cpp    |   2 +-
 kernels/test/op_native_dropout_test.cpp       |   2 +-
 runtime/core/exec_aten/exec_aten.h            |   6 +-
 .../core/exec_aten/util/scalar_type_util.h    |   2 +-
 runtime/core/memory_allocator.h               |  16 +
 runtime/core/named_data_map.h                 |  10 +-
 runtime/core/portable_type/optional.h         |  19 +-
 runtime/core/portable_type/string_view.h      |   6 +-
 .../core/portable_type/test/CMakeLists.txt    |  10 +-
 runtime/core/portable_type/test/targets.bzl   |   8 -
 runtime/core/test/memory_allocator_test.cpp   |  83 +++
 runtime/executor/merged_data_map.h            |   7 +-
 runtime/executor/pte_data_map.cpp             |   3 +-
 runtime/executor/pte_data_map.h               |   7 +-
 runtime/executor/test/method_meta_test.cpp    |   4 +-
 test/utils/OSSTestConfig.json                 |   1 -
 308 files changed, 9311 insertions(+), 3284 deletions(-)

diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
index b5bf19f4155..0e28098a1e8 100755
--- a/.ci/scripts/export_model_artifact.sh
+++ b/.ci/scripts/export_model_artifact.sh
@@ -422,8 +422,9 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
       --no-compile
   echo "::endgroup::"
 
-  # Copy tokenizer for the runner
+  # Copy tokenizer files for the runner and model-specific serving launcher.
   cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json"
+  cp "$LOCAL_MODEL_DIR/tokenizer_config.json" "${OUTPUT_DIR}/tokenizer_config.json"
 
   # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
   echo "::group::Export"
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
index 503bd381a8d..d8bca45e695 100755
--- a/.ci/scripts/test_model_e2e.sh
+++ b/.ci/scripts/test_model_e2e.sh
@@ -447,4 +447,105 @@ case "$MODEL_NAME" in
 esac
 echo "::endgroup::"
 
+if [ "$DEVICE" = "cuda" ] && [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
+  # Smoke-test the OpenAI-compatible server: launch it on a free local port, wait for /health, then check /v1/models and one chat completion.
+  echo "::group::Run $MODEL_NAME OpenAI serving smoke"
+  python -m pip install -r examples/llm_server/python/requirements.txt "transformers==5.0.0rc1"
+  python -m pip install --no-deps --no-build-isolation --editable . -v
+
+  PORT=$(python - <<'PY'
+import socket
+
+with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+    s.bind(("127.0.0.1", 0))
+    print(s.getsockname()[1])
+PY
+)
+  SERVER_LOG=$(mktemp)  # receives the server's stdout and stderr
+  WORKER_BIN="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
+  python -u -m executorch.examples.models.qwen3_5_moe.serve \
+    --model-path "${MODEL_DIR}/model.pte" \
+    --data-path "${MODEL_DIR}/aoti_cuda_blob.ptd" \
+    --tokenizer-path "${MODEL_DIR}/tokenizer.json" \
+    --hf-tokenizer "${MODEL_DIR}" \
+    --model-id qwen3.5-moe \
+    --max-context 4096 \
+    --max-sessions 2 \
+    --no-think \
+    --worker-bin "$WORKER_BIN" \
+    --host 127.0.0.1 \
+    --port "$PORT" >"$SERVER_LOG" 2>&1 &
+  SERVER_PID=$!
+
+  cleanup_qwen_server() {  # stop the server if it is still running, then delete its log
+    if kill -0 "$SERVER_PID" 2>/dev/null; then
+      kill "$SERVER_PID" 2>/dev/null || true
+      wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    rm -f "$SERVER_LOG"
+  }
+  trap cleanup_qwen_server EXIT  # also runs when a later step exits early
+
+  if ! python - "$PORT" <<'PY'
+import json
+import sys
+import time
+import urllib.request
+
+port = sys.argv[1]
+base = f"http://127.0.0.1:{port}"
+
+
+def request(path, payload=None, timeout=120):
+    data = None
+    headers = {}
+    if payload is not None:
+        data = json.dumps(payload).encode("utf-8")
+        headers["Content-Type"] = "application/json"
+    req = urllib.request.Request(base + path, data=data, headers=headers)
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+# Wait for startup. Probes use a short timeout so a hung server cannot stall CI; on failure the calling shell prints the server log.
+last = None
+for _ in range(180):
+    try:
+        request("/health", timeout=10)
+        break
+    except Exception as e:
+        last = e
+        time.sleep(1)
+else:
+    raise RuntimeError(f"server did not become healthy: {last}")
+
+models = request("/v1/models")
+ids = {m["id"] for m in models["data"]}
+if "qwen3.5-moe" not in ids:
+    raise AssertionError(f"qwen3.5-moe missing from /v1/models: {ids}")
+
+body = {
+    "model": "qwen3.5-moe",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "max_tokens": 32,
+    "temperature": 0,
+}
+resp = request("/v1/chat/completions", body)
+content = resp["choices"][0]["message"].get("content") or ""
+if "Paris" not in content:
+    raise AssertionError(f"expected Paris in serving response, got: {content!r}")
+
+print("Qwen3.5-MoE serving smoke passed")
+PY
+  then
+    echo "Qwen3.5-MoE serving smoke failed; server log:"
+    cat "$SERVER_LOG"
+    exit 1
+  fi
+
+  cleanup_qwen_server
+  trap - EXIT
+  echo "::endgroup::"
+fi
+
 popd
diff --git a/.claude/skills/qualcomm/new_op_development.md b/.claude/skills/qualcomm/new_op_development.md
index 4133a92ea48..149940b0796 100644
--- a/.claude/skills/qualcomm/new_op_development.md
+++ b/.claude/skills/qualcomm/new_op_development.md
@@ -217,8 +217,17 @@ class DecomposeMyOp(ExportPass):
 
 ### Registration (all decompose passes)
 1. `_passes/__init__.py` — import + `__all__`
-2. `_passes/qnn_pass_manager.py` — import + `transform_for_annotation_pipeline` + `transform_for_export_pipeline` + `get_capture_program_passes`
-3. `_passes/utils.py` — add to `get_passes_dependency_for_capture_program()` with `[RemoveRedundancy]` dependency
+2. `_passes/qnn_pass_manager.py` — The pass manager uses classmethods for pipeline definitions:
+   - **Import** — add to the import block at top of file
+   - **`get_annotation_passes()`** — add pass class to the returned list (runs before quantizer, ATen IR)
+   - **`get_export_passes()`** — add pass class if needed for float-only path (runs after quantization, before to-edge)
+   - **`get_default_pass_activations()`** — add `(PassClass, True)` ONLY if the pass also needs to run in the to-edge pipeline
+   - **`get_passes_dependency_for_capture_program()`** — add `PassClass: [RemoveRedundancy]` dependency ONLY if also in `get_default_pass_activations`
+
+**When to add to which pipeline:**
+- **Annotation only** (most common for decompose passes): add the pass to `get_annotation_passes()` so the op is decomposed before the quantizer sees it
+- **Export pipeline as well**: also add it to `get_export_passes()` if the float-only test fails without it (i.e., the op is not handled by PyTorch's built-in decomposition during to-edge)
+- **Capture program (to-edge) as well**: also add it to `get_default_pass_activations()` and `get_passes_dependency_for_capture_program()` if the op can appear in edge dialect and needs decomposition there (e.g., `DecomposeVar`, `DecomposeCDist`, `DecomposeDiagonal`)
 
 ---
 
@@ -255,4 +264,4 @@ class DecomposeMyOp(ExportPass):
 
 **Native QNN Op:** `qnn_constants.py` → `op_my_op.py` → `builders/__init__.py` → `htp_rules.py` → `lpai_rules.py` → `layout_transform.py` → `tests/models.py` → `test_qnn_delegate.py` → `partition/utils.py` (skip decomp) → `common_defs.py` (remove to_be_implemented) → `builders/README.md`
 
-**Decompose Pass:** `_passes/decompose_my_op.py` → `_passes/__init__.py` → `qnn_pass_manager.py` (annotation + export + capture) → `_passes/utils.py` (dependency) → `tests/models.py` → `test_qnn_delegate.py` → `common_defs.py` → `builders/README.md`
+**Decompose Pass:** `_passes/decompose_my_op.py` → `_passes/__init__.py` → `qnn_pass_manager.py` (`get_annotation_passes` + optionally `get_export_passes`; if also needed in to-edge: `get_default_pass_activations` + `get_passes_dependency_for_capture_program`) → `tests/models.py` → `test_qnn_delegate.py` → `common_defs.py` → `builders/README.md`
diff --git a/.flake8 b/.flake8
index fc9feb45d8b..ffb419da1e4 100644
--- a/.flake8
+++ b/.flake8
@@ -75,6 +75,7 @@ exclude =
     ./configurations,
     ./docs,
     ./exir/_serialize/generated/executorch_flatbuffer,
+    ./devtools/bundled_program/serialize/generated,
     ./third_party,
     *.pyi
 
diff --git a/.github/workflows/build-cadence-runner.yml b/.github/workflows/build-cadence-runner.yml
index 49f750eeea2..c447e4f9a20 100644
--- a/.github/workflows/build-cadence-runner.yml
+++ b/.github/workflows/build-cadence-runner.yml
@@ -19,36 +19,18 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  gate:
-    runs-on: ubuntu-latest
-    outputs:
-      run-cadence: ${{ steps.decide.outputs.run }}
-    steps:
-      - id: decide
-        env:
-          EVENT: ${{ github.event_name }}
-          IS_FORK: ${{ github.event.pull_request.head.repo.full_name != github.repository }}
-          HAS_CLA: ${{ contains(github.event.pull_request.labels.*.name, 'CLA Signed') }}
-          HAS_EXPORT: ${{ contains(github.event.pull_request.labels.*.name, 'meta-exported') }}
-        run: |
-          run=false
-          case "${EVENT}" in
-            push|schedule|workflow_dispatch)
-              run=true
-              ;;
-            pull_request)
-              [ "${IS_FORK}" = "false" ] && run=true
-              ;;
-            pull_request_target)
-              if [ "${IS_FORK}" = "true" ] && [ "${HAS_CLA}" = "true" ] && [ "${HAS_EXPORT}" = "true" ]; then
-                run=true
-              fi
-              ;;
-          esac
-          echo "run=${run}" >> "${GITHUB_OUTPUT}"
-
+  # Same-repo PRs run on pull_request, which reads the PR's own workflow AND code
+  # -- so CI changes, new test jobs, code, and tests are all validated pre-merge.
+  # Fork PRs can't get credentials (OIDC) on pull_request, so Meta-exported forks
+  # (labeled CLA Signed + meta-exported) run on pull_request_target instead. The
+  # run condition is inlined per job (GitHub Actions has no YAML anchors and env
+  # is unavailable in job-level if), so keep the copies in sync.
   cpu-build:
-    if: github.event_name != 'pull_request_target'
+    if: >-
+      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
+      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
+      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -58,7 +40,7 @@ jobs:
       runner: linux.2xlarge
       docker-image: ci-image:executorch-ubuntu-22.04-clang12
       submodules: recursive
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       upload-artifact: cadence-runner-build
       script: |
@@ -75,21 +57,28 @@ jobs:
 
   cpu-test:
     needs: cpu-build
-    if: github.event_name != 'pull_request_target'
+    if: >-
+      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
+      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
+      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
     permissions:
       id-token: write
       contents: read
     uses: ./.github/workflows/_test_cadence.yml
     with:
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }}
 
   # Cross-compile cadence_executor_runner for each Cadence Xtensa core, one job
   # per backend so they show as separate lines (no matrix grouping). Shared logic
   # lives in _xtensa_build.yml. fusion_g3 is omitted until the upstream fusion_g3
   # <-> nnlib-FusionG3 API skew is fixed (its runner does not link).
   hifi-build:
-    needs: gate
-    if: needs.gate.outputs.run-cadence == 'true'
+    if: >-
+      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
+      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
+      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
     permissions:
       id-token: write
       contents: read
@@ -99,8 +88,11 @@ jobs:
       ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && github.event.pull_request.head.sha || github.sha }}
 
   vision-build:
-    needs: gate
-    if: needs.gate.outputs.run-cadence == 'true'
+    if: >-
+      github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
+      (github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository &&
+      contains(github.event.pull_request.labels.*.name, 'CLA Signed') && contains(github.event.pull_request.labels.*.name, 'meta-exported'))
     permissions:
       id-token: write
       contents: read
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
index acc6b4840cf..5a4ccbb4952 100644
--- a/.github/workflows/mlx.yml
+++ b/.github/workflows/mlx.yml
@@ -161,6 +161,29 @@ jobs:
         fi
         echo "::endgroup::"
 
+        echo "::group::Verify chunked == unchunked prefill"
+        QWEN_TINY_PTE=/tmp/qwen35_moe_mlx_tiny/model.pte \
+          ${CONDA_RUN} python -m pytest \
+          examples/models/qwen3_5_moe/test_chunked_prefill.py -v
+        echo "::endgroup::"
+
+        echo "::group::Build Qwen 3.5 MoE MLX C++ runner"
+        # Validates the MLX C++ runner build wiring (compile + link + metallib).
+        # The tiny model has no compatible tokenizer (vocab 256, random weights),
+        # so we don't run C++ inference here — only confirm it builds.
+        ${CONDA_RUN} make qwen3_5_moe-mlx
+        RUNNER=cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner
+        if [ ! -x "$RUNNER" ]; then
+          echo "Failed: runner not found at $RUNNER"
+          exit 1
+        fi
+        if [ ! -f "$(dirname "$RUNNER")/mlx.metallib" ]; then
+          echo "Failed: mlx.metallib not copied next to runner"
+          exit 1
+        fi
+        echo "Success: built $RUNNER"
+        echo "::endgroup::"
+
   backend-tester:
     needs: run-decision
     if: |
diff --git a/.github/workflows/validate_flatbuffer_gen.yml b/.github/workflows/validate_flatbuffer_gen.yml
index 96eeda95e04..6c0455784c6 100644
--- a/.github/workflows/validate_flatbuffer_gen.yml
+++ b/.github/workflows/validate_flatbuffer_gen.yml
@@ -5,7 +5,9 @@ on:
   pull_request:
     paths:
       - "schema/**"
-      - "exir/_serialize/generated/executorch_flatbuffer/**"
+      - "devtools/bundled_program/schema/**"
+      - "exir/_serialize/generated/**"
+      - "devtools/bundled_program/serialize/generated/**"
 
 jobs:
   exir-flatbuffer:
@@ -33,3 +35,15 @@ jobs:
             echo "Please run 'python exir/_serialize/generate_program.py' to regenerate the files and commit the changes."
             exit 1
           fi
+
+      - name: Generate bundled program flatbuffer Python
+        run: python devtools/bundled_program/serialize/generate_bundled_program.py
+
+      - name: Validate bundled_program_flatbuffer is unchanged
+        run: |
+          git add -A devtools/bundled_program/serialize/generated
+          if ! git diff --cached --quiet -- devtools/bundled_program/serialize/generated; then
+            echo "Error: bundled_program_flatbuffer has uncommitted changes."
+            echo "Please run 'python devtools/bundled_program/serialize/generate_bundled_program.py' to regenerate the files and commit the changes."
+            exit 1
+          fi
diff --git a/.gitignore b/.gitignore
index 87772e21014..ee206e23d94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ arm-scratch/
 executorch.egg-info
 pip-out/
 build-profiling/
+**/ddr_*_temp
 
 # Any exported models and profiling outputs
 *.bin
diff --git a/.lintrunner.toml b/.lintrunner.toml
index ab498a5d0ac..98c46c78960 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -9,6 +9,7 @@ exclude_patterns = [
     '.github/scripts/**',
     'exir/serde/**',
     'exir/_serialize/generated/executorch_flatbuffer/**',
+    'devtools/bundled_program/serialize/generated/**',
 ]
 command = [
     'python',
@@ -41,6 +42,7 @@ exclude_patterns = [
     '**/third-party/**',
     'exir/serde/**',
     'exir/_serialize/generated/executorch_flatbuffer/**',
+    'devtools/bundled_program/serialize/generated/**',
 ]
 command = [
     'python',
@@ -389,6 +391,7 @@ exclude_patterns = [
     '**/*.gif',
     'extension/llm/tokenizers',
     'extension/llm/tokenizers/**',
+    'examples/llm_server',
     'backends/cadence/utils/FACTO',
     'examples/cuda',
     'examples/qualcomm',
diff --git a/Makefile b/Makefile
index c93085115aa..552bbf89bd7 100644
--- a/Makefile
+++ b/Makefile
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal qwen3_5_moe-mlx clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -131,6 +131,7 @@ help:
 	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
+	@echo "  qwen3_5_moe-mlx     - Build Qwen3.5 MoE runner with MLX backend"
 	@echo "  clean               - Clean build artifacts"
 
 voxtral-cuda:
@@ -467,6 +468,15 @@ qwen3_5_moe-metal:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
 
+qwen3_5_moe-mlx:
+	@echo "==> Building and installing ExecuTorch with MLX..."
+	cmake --workflow --preset mlx-release
+	@echo "==> Building Qwen3.5 MoE runner with MLX..."
+	cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-mlx
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
+
 clean:
 	rm -rf cmake-out \
 	       extension/llm/tokenizers/build \
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index a478b43cf0f..22f6feeab6c 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import contextlib
+import hashlib
 import os
 import typing
 from abc import ABC, abstractmethod
@@ -111,6 +112,21 @@ def codesign_so(cls, so_path: str, compile_specs: List[CompileSpec]) -> None:
         """
         return
 
+    @classmethod
+    def move_program_to_device(
+        cls,
+        edge_program: ExportedProgram,
+        device: str,
+        compile_specs: List[CompileSpec],
+    ) -> ExportedProgram:
+        """Move the exported program to the target device for compilation.
+
+        Default implementation moves everything (params, buffers, constants) via
+        ``move_to_device_pass`` and ignores ``compile_specs``. Backends may override
+        to keep large non-parameter tensors off the device during a low-memory export.
+        """
+        return move_to_device_pass(edge_program, device)
+
     @classmethod
     def release_moved_tensors(
         cls,
@@ -195,9 +211,13 @@ def preprocess(
         decomposition_table = cls.get_decomposition_table()
         options = cls.get_aoti_compile_options(compile_specs)
 
-        # Move the edge_program to the target device
-        device_edge_program = move_to_device_pass(
-            edge_program, device_name if device_name != "metal" else "mps"
+        # Move the edge_program to the target device. Routed through a hook so
+        # backends can keep large non-parameter tensors (e.g. KV-cache buffers)
+        # off the device during a low-memory export.
+        device_edge_program = cls.move_program_to_device(
+            edge_program,
+            device_name if device_name != "metal" else "mps",
+            compile_specs,
         )
 
         # Replace view_copy with view
@@ -276,18 +296,21 @@ def preprocess(
 
         # Create named data store
         named_data_store = NamedDataStore()
-        method_name = cls.method_name_from_compile_specs(compile_specs)
 
-        named_data_store.add_named_data(method_name + "_so_blob", so_data, 1, None)
+        # Key each blob by a content hash so partitions in one method get distinct
+        # keys (a method-name-only key collides). Runtime recovers them from
+        # processed_bytes below.
+        so_blob_key = hashlib.sha256(so_data).hexdigest() + "_so_blob"
+        weights_blob_key = hashlib.sha256(blob_data).hexdigest() + "_weights_blob"
+
+        named_data_store.add_named_data(so_blob_key, so_data, 1, None)
         # Determine whether to save named data externally based on backend setting
         # External: save to separate .ptd file, otherwise merge with .pte file
         external_tag = (
             f"aoti_{device_name}_blob" if cls.save_data_externally() else None
         )
 
-        named_data_store.add_named_data(
-            method_name + "_weights_blob", blob_data, 1, external_tag
-        )
+        named_data_store.add_named_data(weights_blob_key, blob_data, 1, external_tag)
 
         # Clean up the generated files
         os.remove(so_path)
@@ -299,8 +322,11 @@ def preprocess(
         # the next preprocess call (e.g. for the next method).
         cls.release_moved_tensors(device_edge_program, compile_specs)
 
+        # The runtime cannot recompute these hash keys, so carry them (one per line).
+        processed_bytes = (so_blob_key + "\n" + weights_blob_key).encode("utf-8")
+
         return PreprocessResult(
-            processed_bytes=b"",
+            processed_bytes=processed_bytes,
             debug_handle_map={},
             data_store_output=named_data_store.get_named_data_store_output(),
         )
diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h
index 2d1a3146ae5..fbd748306cc 100644
--- a/backends/aoti/aoti_delegate_handle.h
+++ b/backends/aoti/aoti_delegate_handle.h
@@ -10,6 +10,7 @@
 
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/freeable_buffer.h>
 #include <string>
 
 namespace executorch {
@@ -17,6 +18,7 @@ namespace backends {
 namespace aoti {
 
 using executorch::runtime::Error;
+using executorch::runtime::FreeableBuffer;
 using executorch::runtime::etensor::Tensor;
 
 extern "C" {
@@ -148,6 +150,30 @@ struct AOTIDelegateHandle {
       update_user_managed_constant_buffer_pairs;
 };
 
+// Resolves the blob keys. A non-empty payload is "<so_key>\n<weights_key>";
+// a null or empty one (pre-hash-key artifact) falls back to method-name keys.
+inline Error resolve_blob_keys(
+    const FreeableBuffer* processed,
+    const std::string& method_name,
+    std::string& so_blob_key,
+    std::string& weights_blob_key) {
+  if (processed == nullptr || processed->size() == 0) {
+    const std::string prefix = method_name.empty() ? "" : method_name + "_";
+    so_blob_key = prefix + "so_blob";
+    weights_blob_key = prefix + "weights_blob";
+    return Error::Ok;
+  }
+  const std::string payload(
+      static_cast<const char*>(processed->data()), processed->size());
+  const size_t separator = payload.find('\n');
+  if (separator == std::string::npos) {
+    return Error::Internal;
+  }
+  so_blob_key = payload.substr(0, separator);
+  weights_blob_key = payload.substr(separator + 1);
+  return Error::Ok;
+}
+
 } // namespace aoti
 } // namespace backends
 } // namespace executorch
diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py
index b263d0f9c81..f84febbdc24 100644
--- a/backends/aoti/aoti_partitioner.py
+++ b/backends/aoti/aoti_partitioner.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Callable, Dict, List, Optional, Tuple
+from typing import Callable, Dict, List, Mapping, Optional, Tuple
 
 import torch
 from executorch.exir._warnings import experimental
@@ -21,6 +21,8 @@
 )
 from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
 from torch.export.exported_program import ExportedProgram
+from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
+from torch.fx.passes.operator_support import OperatorSupportBase
 
 
 @experimental(
@@ -30,12 +32,10 @@ class AotiPartitioner(Partitioner):
     """
     Base partitioner for AOTInductor-driven backend integration.
 
-    This partitioner creates a single partition containing all operators from the input graph.
-    It skips core ATen decomposition, allowing the backend to handle decomposition using
+    Delegates the non-lowered operators to AOTInductor as one or more convex
+    partitions (a single partition when nothing else has claimed part of the
+    graph). It skips core ATen decomposition, letting the backend decompose via
     AOTInductor's backend-specific decomposition table.
-
-    Only operators that cannot be handled by the aoti library will be excluded from
-    the partition and fall back to ExecuTorch's default or custom handling.
     """
 
     def __init__(self, backend_name: str, compile_spec: List[CompileSpec]) -> None:
@@ -49,62 +49,76 @@ def __init__(self, backend_name: str, compile_spec: List[CompileSpec]) -> None:
         self.delegation_spec = DelegationSpec(backend_name, compile_spec)
 
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
-        """
-        Fully delegate the graph to AOTInductor by tagging all nodes as a single partition.
-        """
+        """Delegate the non-lowered ops to AOTInductor.
 
-        partition_tags: Dict[str, DelegationSpec] = {}
-        tag = "tag0"
-
-        # Tag torch.cond and other control flow operations
-        def is_control_flow(node: torch.fx.Node) -> bool:
-            return node.op == "call_function" and node.target in [
-                torch.ops.higher_order.cond,
-                torch.ops.higher_order.map_impl,
-                torch.ops.higher_order.while_loop,
-            ]
-
-        # Nodes already lowered by an earlier partitioner (e.g. a preceding
-        # TensorRT partition) appear as executorch_call_delegate calls and their
-        # output getitems; re-delegating them would nest a foreign delegate. Tag
-        # only the remaining non-lowered ops so this partitioner composes after
-        # others.
+        Uses CapabilityBasedPartitioner rather than a single tag because a
+        delegated submodule must be convex: if a node that is not delegated sits
+        between the delegated ops, one tag would span a non-convex set and fusion
+        would fail with a dependency cycle.
+        """
+        # Only nodes not already lowered are candidates for this backend.
         non_lowered_nodes = set(get_non_lowered_nodes(exported_program.graph))
 
-        for node in exported_program.graph.nodes:
-            if node.op == "call_function":
-                if node not in non_lowered_nodes:
-                    continue
+        control_flow_targets = [
+            torch.ops.higher_order.cond,
+            torch.ops.higher_order.map_impl,
+            torch.ops.higher_order.while_loop,
+            torch.ops.higher_order.scan,
+        ]
+
+        class AotiOperatorSupport(OperatorSupportBase):
+            def is_node_supported(
+                self, submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node
+            ) -> bool:
+                return node.op == "call_function" and node in non_lowered_nodes
+
+        partitioner = CapabilityBasedPartitioner(
+            exported_program.graph_module,
+            AotiOperatorSupport(),
+            allows_single_node_partition=True,
+        )
+
+        partition_tags: Dict[str, DelegationSpec] = {}
+        for partition in partitioner.propose_partitions():
+            tag = f"aoti_{partition.id}"
+            partition_tags[tag] = self.delegation_spec
+            for node in partition.nodes:
                 node.meta["delegation_tag"] = tag
-            # Tag get_attr nodes that are used by control flow operations
-            elif node.op == "get_attr":
-                # Check if any user is a control flow operation
-                for user in node.users:
-                    if is_control_flow(user):
-                        node.meta["delegation_tag"] = tag
-                        break
 
-        partition_tags[tag] = self.delegation_spec
+        # A control-flow op carries its branch GraphModules as get_attr operands;
+        # they must share the op's tag so they land inside the same submodule. A
+        # branch module feeds a single control-flow op, so first match wins.
+        for node in exported_program.graph.nodes:
+            if node.op != "get_attr":
+                continue
+            for user in node.users:
+                if (
+                    user.op == "call_function"
+                    and user.target in control_flow_targets
+                    and "delegation_tag" in user.meta
+                ):
+                    node.meta["delegation_tag"] = user.meta["delegation_tag"]
+                    break
 
         tag_constant_data(exported_program)
         tag_mutated_buffer(exported_program)
 
-        # A constant that still has users feeds only a prior delegate; tagging it
-        # would fail backend lowering's same-tag check (its user keeps the prior
-        # tag). tag_constant_data already claimed the ones this partition uses, so
-        # tag only the genuinely unused constants here.
-        for node in exported_program.graph.nodes:
-            if (
-                node.op == "placeholder"
-                and not node.users
-                and "delegation_tag" not in node.meta
-                and (
-                    is_param(exported_program, node)
-                    or is_buffer(exported_program, node)
-                    or is_lifted_tensor_constant(exported_program, node)
-                )
-            ):
-                node.meta["delegation_tag"] = tag
+        # tag_constant_data only tags constants that have users; tag the
+        # genuinely unused ones too so none are left dangling.
+        if partition_tags:
+            fallback_tag = next(iter(partition_tags))
+            for node in exported_program.graph.nodes:
+                if (
+                    node.op == "placeholder"
+                    and not node.users
+                    and "delegation_tag" not in node.meta
+                    and (
+                        is_param(exported_program, node)
+                        or is_buffer(exported_program, node)
+                        or is_lifted_tensor_constant(exported_program, node)
+                    )
+                ):
+                    node.meta["delegation_tag"] = fallback_tag
 
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
diff --git a/backends/aoti/tests/TARGETS b/backends/aoti/tests/TARGETS
index d92e0e32a1f..f41c1bfb517 100644
--- a/backends/aoti/tests/TARGETS
+++ b/backends/aoti/tests/TARGETS
@@ -3,6 +3,18 @@ load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils")
 
 oncall("executorch")
 
+cpp_unittest(
+    name = "test_resolve_blob_keys",
+    srcs = [
+        "test_resolve_blob_keys.cpp",
+    ],
+    deps = [
+        "//executorch/backends/aoti:delegate_handle",
+        "//executorch/runtime/core:core",
+        "//executorch/runtime/core:evalue",
+    ],
+)
+
 cpp_unittest(
     name = "test_common_shims",
     srcs = [
diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp
index c0d996df62b..b9579d59d9c 100644
--- a/backends/apple/metal/runtime/metal_backend.cpp
+++ b/backends/apple/metal/runtime/metal_backend.cpp
@@ -245,8 +245,12 @@ class ET_EXPERIMENTAL MetalBackend final
       }
     }
 
-    std::string so_blob_key =
-        method_name.empty() ? "so_blob" : method_name + "_so_blob";
+    std::string so_blob_key;
+    std::string weights_blob_key;
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        executorch::backends::aoti::resolve_blob_keys(
+            processed, method_name, so_blob_key, weights_blob_key),
+        "Malformed named-data key payload");
     ET_LOG(Info, "MetalBackend::init - so_blob_key: %s", so_blob_key.c_str());
 
     const NamedDataMap* named_data_map = context.get_named_data_map();
@@ -258,8 +262,6 @@ class ET_EXPERIMENTAL MetalBackend final
     // Prefetch the weights blob — trigger async readahead so pages are
     // resident by the time update_constants_from_blob memcpy's them.
     // This overlaps disk I/O with the .so write + dlopen (~200ms).
-    std::string weights_blob_key =
-        method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
     {
       auto prefetch_buf = named_data_map->get_data(weights_blob_key.c_str());
       if (prefetch_buf.ok() && prefetch_buf->data() != nullptr) {
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
index 29062b57579..27e5088fc72 100644
--- a/backends/arm/_passes/__init__.py
+++ b/backends/arm/_passes/__init__.py
@@ -171,12 +171,14 @@
 from .rewrite_le_lt_to_ge_gt_pass import RewriteLeLtToGeGtPass  # noqa
 from .rewrite_matmul import RewriteMatmulPass  # noqa
 from .rewrite_max_pool2d_pass import RewriteMaxPool2dPass  # noqa
+from .rewrite_mxfp_conv2d import RewriteMXFPConv2dPass  # noqa
 from .rewrite_mxfp_linear import RewriteMXFPLinearPass  # noqa
 from .rewrite_pad import RewritePadPass  # noqa
 from .rewrite_slice import RewriteSlicePass  # noqa
 from .rewrite_upsample import RewriteUpsamplePass  # noqa
 from .scalars_to_attribute_pass import ScalarsToAttributePass  # noqa
 from .size_adjust_input_pass import SizeAdjustInputPass  # noqa
+from .symbolic_to_tosa_shape_pass import SymbolicToTosaShapesPass  # noqa
 from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass  # noqa
 from .unsqueeze_scalar_placeholders_pass import UnsqueezeScalarPlaceholdersPass  # noqa
 from .replace_inf_and_limit_values_pass import (  # noqa  # usort: skip
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
index 5ec57ee1787..fedca6eb65b 100644
--- a/backends/arm/_passes/arm_pass_manager.py
+++ b/backends/arm/_passes/arm_pass_manager.py
@@ -117,6 +117,7 @@
     InsertConstShapesPass,
     InsertControlFlowRescalesPass,
     InsertDataLayoutCastsPass,
+    InsertDynamicPaddingPass,
     InsertInt32CastsAfterInt64PlaceholdersPass,
     InsertRescaleInt32Pass,
     InsertRescalePass,
@@ -146,12 +147,14 @@
     RewriteLeLtToGeGtPass,
     RewriteMatmulPass,
     RewriteMaxPool2dPass,
+    RewriteMXFPConv2dPass,
     RewriteMXFPLinearPass,
     RewritePadPass,
     RewriteSlicePass,
     RewriteUpsamplePass,
     ScalarsToAttributePass,
     SizeAdjustInputPass,
+    SymbolicToTosaShapesPass,
     UnsqueezeBeforeRepeatPass,
     UnsqueezeScalarPlaceholdersPass,
 )
@@ -610,6 +613,7 @@ def _tosa_pipeline(
                 RewriteMaxPool2dPass(),
                 DecomposeAdaptiveMaxPool2dPass(),
                 RewriteConvPass(exported_program),
+                RewriteMXFPConv2dPass(exported_program),
                 RewriteMXFPLinearPass(exported_program),
                 RewriteMatmulPass(),
                 RewritePadPass(),
@@ -630,6 +634,8 @@ def _tosa_pipeline(
             [
                 CastInt64BuffersToInt32Pass(exported_program),
                 FuseEqualPlaceholdersPass(exported_program),
+                SymbolicToTosaShapesPass(),
+                InsertDynamicPaddingPass(),
                 FuseConsecutiveConcatShapesPass(),
                 EnsureUniqueOutputNodesPass(),
                 RemoveNoopPass(),
@@ -677,7 +683,6 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
                     InsertInt32CastsAfterInt64PlaceholdersPass(tfa_pass=True),
                     DecomposeEmbeddingPass(tfa_pass=True),
                     DecomposeScaledDotProductAttentionPass(tfa_pass=True),
-                    DecomposeRoundPass(tfa_pass=True),
                     DecomposeLogitPass(tfa_pass=True),
                     PromoteBoolOperandsPass(tfa_pass=True),
                     DecomposeSignPass(tfa_pass=True),
diff --git a/backends/arm/_passes/aten_to_tosa_activation_functions.py b/backends/arm/_passes/aten_to_tosa_activation_functions.py
index 9b92b31e630..8d51f092991 100644
--- a/backends/arm/_passes/aten_to_tosa_activation_functions.py
+++ b/backends/arm/_passes/aten_to_tosa_activation_functions.py
@@ -128,3 +128,21 @@ def rewrite_clamp(node: Node, pass_: AtenToDialectPass) -> DialectNodeSpec | Non
         exir_ops.backend.tosa.CLAMP.default,
         (node.args[0], *min_max_args),
     )
+
+
+def get_activation_replacement(
+    node: Node, pass_: AtenToDialectPass
+) -> DialectNodeSpec | None:
+    # Dispatch activation rewrites from their ATen target to the matching TOSA
+    # dialect node builder.
+    match node.target:
+        case exir_ops.edge.aten.clamp.default:
+            return rewrite_clamp(node, pass_)
+        case exir_ops.edge.aten.erf.default:
+            return rewrite_erf(node, pass_)
+        case exir_ops.edge.aten.sigmoid.default:
+            return rewrite_sigmoid(node, pass_)
+        case exir_ops.edge.aten.tanh.default:
+            return rewrite_tanh(node, pass_)
+        case _:
+            return None
diff --git a/backends/arm/_passes/decompose_round_pass.py b/backends/arm/_passes/decompose_round_pass.py
index 476f75d6b56..48b26f1d027 100644
--- a/backends/arm/_passes/decompose_round_pass.py
+++ b/backends/arm/_passes/decompose_round_pass.py
@@ -5,7 +5,6 @@
 
 from typing import Set, Type
 
-import torch
 from executorch.backends.arm._passes import ArmOpTargetedPass
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
@@ -33,16 +32,6 @@ def _get_round_decomposition_ops(op) -> tuple[Op, Op, Op, Op, Op, Op, Op]:
             exir_ops.edge.aten.ceil.default,
             exir_ops.edge.aten.where.self,
         )
-    elif op == torch.ops.aten.round.default:
-        return (
-            torch.ops.aten.full.default,
-            torch.ops.aten.ge.Tensor,
-            torch.ops.aten.add.Scalar,
-            torch.ops.aten.sub.Scalar,
-            torch.ops.aten.floor.default,
-            torch.ops.aten.ceil.default,
-            torch.ops.aten.where.self,
-        )
     raise RuntimeError(f"Can't get round decomposition ops for op {op}")
 
 
@@ -65,11 +54,10 @@ class DecomposeRoundPass(ArmOpTargetedPass):
 
     target_ops = {
         exir_ops.edge.aten.round.default,
-        torch.ops.aten.round.default,
     }
 
     def call_operator(self, op, args, kwargs, meta, updated=False):
-        if op not in self.target_ops or not self.allowed_to_transform(meta):
+        if op not in self.target_ops or self._is_quantized_meta(meta):
             return super().call_operator(op, args, kwargs, meta, updated)
         x = args[0]
         input_dtype = x.node.meta["val"].dtype
diff --git a/backends/arm/_passes/deduplicate_get_attr_pass.py b/backends/arm/_passes/deduplicate_get_attr_pass.py
index 201a9036e34..f5760a2fcb8 100644
--- a/backends/arm/_passes/deduplicate_get_attr_pass.py
+++ b/backends/arm/_passes/deduplicate_get_attr_pass.py
@@ -9,6 +9,7 @@
 from executorch.backends.arm._passes import ArmPass
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx import GraphModule, Node
+from torch.fx.node import map_arg
 from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix
 
 
@@ -24,6 +25,13 @@ class DeduplicateGetAttrPass(ArmPass):
 
     _passes_required_after: Set[Type[ExportPass]] = set()
 
+    def _replace_input_node(self, node: Node, old_node: Node, new_node: Node) -> None:
+        def maybe_replace_node(arg: Any) -> Any:
+            return new_node if arg is old_node else arg
+
+        node.args = map_arg(node.args, maybe_replace_node)
+        node.kwargs = map_arg(node.kwargs, maybe_replace_node)
+
     def _get_attr(self, graph_module: GraphModule, target: str) -> Any:
         attr: Any = graph_module
         for target_atom in target.split("."):
@@ -51,9 +59,26 @@ def _copy_attr(self, graph_module: GraphModule, node: Node) -> str:
 
         return attr_name
 
+    def _split_shared_get_attrs(self, graph_module: GraphModule) -> bool:
+        modified = False
+
+        for node in list(graph_module.graph.find_nodes(op="get_attr")):
+            users = list(node.users)
+            if len(users) <= 1:
+                continue
+
+            for user in users[1:]:
+                with graph_module.graph.inserting_before(user):
+                    new_node = graph_module.graph.get_attr(node.target)
+                    new_node.meta.update(node.meta)
+                self._replace_input_node(user, node, new_node)
+                modified = True
+
+        return modified
+
     def call(self, graph_module: GraphModule) -> PassResult:
         seen_targets: set[str] = set()
-        modified = False
+        modified = self._split_shared_get_attrs(graph_module)
 
         for node in graph_module.graph.find_nodes(op="get_attr"):
 
diff --git a/backends/arm/_passes/exir_to_tosa_pass.py b/backends/arm/_passes/exir_to_tosa_pass.py
index b77171b9eaf..c0c6efb1a6c 100644
--- a/backends/arm/_passes/exir_to_tosa_pass.py
+++ b/backends/arm/_passes/exir_to_tosa_pass.py
@@ -5,37 +5,38 @@
 
 import executorch.backends.arm.tosa.dialect  # noqa: F401
 from executorch.backends.arm._passes.aten_to_tosa_activation_functions import (
-    rewrite_clamp,
-    rewrite_erf,
-    rewrite_sigmoid,
-    rewrite_tanh,
+    get_activation_replacement,
+)
+from executorch.backends.arm._passes.aten_to_tosa_tensor_operators import rewrite_argmax
+from executorch.backends.transforms.aten_to_dialect_pass import (
+    AtenToDialectPass,
+    DialectNodeSpec,
 )
-from executorch.backends.transforms.aten_to_dialect_pass import AtenToDialectPass
 from executorch.exir.dialects._ops import ops as exir_ops
+from torch.fx import Node
 
 
 class ExirToTosaPass(AtenToDialectPass):
     """Rewrite simple EXIR ops to equivalent backend TOSA dialect ops.
 
-    Rewrite functions are grouped by op category and registered with the shared
-    ATen-to-dialect pass infrastructure.
+    Rewrite functions are registered with the shared ATen-to-dialect pass
+    infrastructure.
 
     """
 
 
-_ACTIVATION_FUNCTION_REWRITES = {
-    exir_ops.edge.aten.clamp.default: rewrite_clamp,
-    exir_ops.edge.aten.erf.default: rewrite_erf,
-    exir_ops.edge.aten.sigmoid.default: rewrite_sigmoid,
-    exir_ops.edge.aten.tanh.default: rewrite_tanh,
-}
+@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.argmax.default)
+def _get_tensor_operators_replacement(
+    node: Node, pass_: AtenToDialectPass
+) -> DialectNodeSpec:
+    return rewrite_argmax(node, pass_)
 
-_DIRECT_REWRITE_CATEGORIES = {
-    "activation_functions": _ACTIVATION_FUNCTION_REWRITES,
-}
 
-# Register each category's ATen targets with the function that builds the
-# corresponding TOSA dialect node spec.
-for _rewrite_category in _DIRECT_REWRITE_CATEGORIES.values():
-    for _edge_target, _rewrite_fn in _rewrite_category.items():
-        ExirToTosaPass.register_dialect_substitution(_edge_target)(_rewrite_fn)
+@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.clamp.default)
+@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.erf.default)
+@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.sigmoid.default)
+@ExirToTosaPass.register_dialect_substitution(exir_ops.edge.aten.tanh.default)
+def _get_activation_replacement(
+    node: Node, pass_: AtenToDialectPass
+) -> DialectNodeSpec | None:
+    return get_activation_replacement(node, pass_)
diff --git a/backends/arm/_passes/insert_dynamic_padding.py b/backends/arm/_passes/insert_dynamic_padding.py
index bfc0382e4ad..b1d998268eb 100644
--- a/backends/arm/_passes/insert_dynamic_padding.py
+++ b/backends/arm/_passes/insert_dynamic_padding.py
@@ -29,6 +29,7 @@ class InsertDynamicPaddingPass(ArmOpTargetedPass):
     _passes_required_after: Set[Type[ExportPass]] = set()
     target_ops = (
         exir_ops.backend.tosa.CONV2D.default,
+        exir_ops.backend.tosa.CONV3D.default,
         exir_ops.backend.tosa.DEPTHWISE_CONV2D.default,
         exir_ops.backend.tosa.MAX_POOL2D.default,
         exir_ops.backend.tosa.AVG_POOL2D.default,
@@ -57,11 +58,12 @@ def call_operator(self, op, args, kwargs, meta, updated=False) -> ProxyValue:
         if not self._is_dynamic_padding(padding):
             return super().call_operator(op, args, kwargs, meta, updated)
 
-        # Create a pad op before conv2d
+        # Create a pad op before the convolution/pool op.
         input_tensor = args[0]
 
         zero_padding_pair = [0, 0]
-        zero_spatial_padding = [0, 0, 0, 0]
+        spatial_rank = 3 if op == exir_ops.backend.tosa.CONV3D.default else 2
+        zero_spatial_padding = [0] * (spatial_rank * 2)
         N_padding = super().call_shape_operator(
             exir_ops.backend.tosa.CONST_SHAPE.default,
             (zero_padding_pair,),
@@ -93,7 +95,7 @@ def call_operator(self, op, args, kwargs, meta, updated=False) -> ProxyValue:
             meta,
             True,
         )
-        new_conv2d_args = list(args)
-        new_conv2d_args[0] = pad_res
-        new_conv2d_args[padding_index] = zero_spatial_padding
-        return super().call_operator(op, tuple(new_conv2d_args), kwargs, meta, updated)
+        new_args = list(args)
+        new_args[0] = pad_res
+        new_args[padding_index] = zero_spatial_padding
+        return super().call_operator(op, tuple(new_args), kwargs, meta, updated)
diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py
index 45374c12c3b..f84ec5b678e 100644
--- a/backends/arm/_passes/insert_rescales_pass.py
+++ b/backends/arm/_passes/insert_rescales_pass.py
@@ -18,6 +18,7 @@
 
 from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.backends.arm.constants import DQ_OPS, Q_OPS
+from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx import GraphModule, Node
@@ -35,6 +36,12 @@ class InsertRescalePass(ArmPass):
 
     _passes_required_after: Set[Type[ExportPass]] = set()
 
+    _mxfp_payload_dtypes = {
+        TosaSpecialDtype.FP4E2M1,
+        TosaSpecialDtype.FP6E2M3,
+        TosaSpecialDtype.FP6E3M2,
+    }
+
     def _ensure_uint8_io_only(self, graph_module: GraphModule) -> None:
         """Ensure uint8 tensors only appear at IO boundaries.
 
@@ -51,21 +58,23 @@ def _ensure_uint8_io_only(self, graph_module: GraphModule) -> None:
                 continue
             if node.op in ("placeholder", "output"):
                 continue
-            if node.op == "call_function" and node.target == operator.getitem:
-                if all(user.op == "output" for user in node.users):
+            if node.op == "call_function":
+                if node.target == operator.getitem and all(
+                    user.op == "output" for user in node.users
+                ):
                     continue
-            if (
-                node.op == "call_function"
-                and node.target
-                == exir_ops.edge.dim_order_ops._to_dim_order_copy.default
-            ):
-                # dim_order is a view-like transform; allow it to preserve uint8 at IO.
-                continue
-            if (
-                node.op == "call_function"
-                and node.target == exir_ops.backend.tosa.RESCALE.default
-            ):
+                if node.target == exir_ops.backend.tosa.RESCALE.default:
+                    continue
+                if (
+                    node.target
+                    == exir_ops.edge.dim_order_ops._to_dim_order_copy.default
+                ):
+                    # dim_order is a view-like transform; allow it to preserve uint8 at IO.
+                    continue
+            if node.meta.get(TosaSpecialDtype.meta_key()) in self._mxfp_payload_dtypes:
+                # Sub-byte FP types are stored as uint8 arrays, so we need an exception for those.
                 continue
+
             raise ValueError(
                 f"Found internal uint8 tensor at node {node.name} "
                 f"({node.target}). Uint8 is only allowed at IO boundaries."
diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py
index 10b85149dad..82d2ff1dbe0 100644
--- a/backends/arm/_passes/insert_table_ops.py
+++ b/backends/arm/_passes/insert_table_ops.py
@@ -58,6 +58,7 @@ class TableOps:
         exir_ops.edge.aten.acos.default: torch.acos,
         exir_ops.edge.aten.tan.default: torch.tan,
         exir_ops.edge.aten.silu.default: torch.nn.functional.silu,
+        exir_ops.edge.aten.round.default: torch.round,
     }
 
     # Targets that must be treated explicitly
diff --git a/backends/arm/_passes/rewrite_conv_pass.py b/backends/arm/_passes/rewrite_conv_pass.py
index 6f588a1a1f1..3ae5ae9f9fb 100644
--- a/backends/arm/_passes/rewrite_conv_pass.py
+++ b/backends/arm/_passes/rewrite_conv_pass.py
@@ -97,23 +97,25 @@ def _adjust_pad_if_needed(
 
         if isinstance(mod_remainder, torch.SymInt):
             shape_env = get_context_shape_env()
-            exact_values = evaluate_symbolic_expr_values(
-                mod_remainder.node.expr, shape_env
-            )
+            exact_values = evaluate_symbolic_expr_values(mod_remainder, shape_env)
             if exact_values is not None:
                 mod_remainder_upper = max(exact_values)
+                if len(exact_values) == 1:
+                    mod_remainder = int(next(iter(exact_values)))
+                elif mod_remainder_upper == 0:
+                    mod_remainder = 0
+                else:
+                    return pad - mod_remainder
             else:
-                value_ranges = shape_env.bound_sympy(mod_remainder.node.expr)
-                mod_remainder_upper = int(value_ranges.upper)
-            if mod_remainder_upper == 0:
-                mod_remainder = 0
-        else:
-            mod_remainder_upper = mod_remainder
-
-        if mod_remainder_upper > pad:
+                # SizeAdjustInputPass already trims symbolic remainder classes
+                # that would force negative padding. Keep the symbolic
+                # expression here instead of asking ShapeEnv to normalize it.
+                return pad - mod_remainder
+        if mod_remainder > pad:
             raise RuntimeError(
-                "This case should be handled by the SizeAdjustInputPass, is it enabled?\n"
+                "This case should be handled by SizeAdjustInputPass, is it enabled?\n"
             )
+
         return pad - mod_remainder
 
     def _is_depthwise_conv2d(self, node: torch.fx.Node) -> bool:
diff --git a/backends/arm/_passes/rewrite_mxfp_linear.py b/backends/arm/_passes/rewrite_mxfp_linear.py
index d4ca436dc41..6f4a475e46e 100644
--- a/backends/arm/_passes/rewrite_mxfp_linear.py
+++ b/backends/arm/_passes/rewrite_mxfp_linear.py
@@ -8,16 +8,53 @@
 from typing import Any, cast, Sequence, Set, Type
 
 import torch
-from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes import ArmOpTargetedPass
 from executorch.backends.arm._passes.arm_pass_utils import (
     create_node,
     get_first_fake_tensor,
 )
+from executorch.backends.arm.ao_ext.mxfp import (
+    mxfp_dtype_to_str,
+    mxfp_str_to_dtype,
+    MXFPDType,
+)
+from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
+from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2
+
+
+def _get_weights_payload_dtype(
+    qdata_node: torch.fx.Node,
+    dtype: str = "",
+) -> MXFPDType:
+    if dtype:
+        return mxfp_str_to_dtype(dtype)
+    qdata = get_first_fake_tensor(qdata_node)
+    if qdata.dtype == torch.uint8:
+        return torch.float4_e2m1fn_x2
+    return qdata.dtype
+
+
+def _mark_mxfp_payload(node: torch.fx.Node, payload_dtype: MXFPDType) -> None:
+    """Annotate uint8-backed MXFP payload nodes with their TOSA dtype.
 
+    PyTorch represents sub-byte MXFP payloads as ``torch.uint8`` tensors, so
+    the tensor dtype alone cannot distinguish FP4E2M1, FP6E2M3, and FP6E3M2.
+    Store the logical TOSA dtype in node metadata so later lowering and
+    serialization treat the payload as MXFP data rather than ordinary uint8.
+    FP8 payloads have native PyTorch dtypes and do not need this metadata.
 
-class RewriteMXFPLinearPass(ArmPass):
+    """
+    if payload_dtype == torch.float4_e2m1fn_x2:
+        node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.FP4E2M1
+    elif payload_dtype == DTYPE_FP6_E2M3:
+        node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.FP6E2M3
+    elif payload_dtype == DTYPE_FP6_E3M2:
+        node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.FP6E3M2
+
+
+class RewriteMXFPLinearPass(ArmOpTargetedPass):
     """Rewrite ``tosa_mxfp.linear`` into explicit TOSA MXFP operators.
 
     For each MXFP linear custom op, the pass:
@@ -32,15 +69,24 @@ class RewriteMXFPLinearPass(ArmPass):
 
     """
 
+    target_ops = {
+        torch.ops.tosa_mxfp.linear.default,
+        exir_ops.edge.tosa_mxfp.linear.default,
+    }
     _passes_required_after: Set[Type[ExportPass]] = set()
 
     def __init__(self, exported_program: torch.export.ExportedProgram, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.exported_program = exported_program
 
-    def _get_linear_args(
-        self, node: torch.fx.Node
-    ) -> tuple[torch.fx.Node, torch.fx.Node, torch.fx.Node, torch.fx.Node | None, int]:
+    def _get_linear_args(self, node: torch.fx.Node) -> tuple[
+        torch.fx.Node,
+        torch.fx.Node,
+        torch.fx.Node,
+        torch.fx.Node | None,
+        int,
+        MXFPDType,
+    ]:
         """Extract the MXFP linear operands from a custom-op node."""
         input_node = cast(torch.fx.Node, node.args[0])
         weight_qdata_node = cast(torch.fx.Node, node.args[1])
@@ -53,7 +99,26 @@ def _get_linear_args(
             int,
             node.args[4] if len(node.args) > 4 else node.kwargs.get("block_size", 32),
         )
-        return input_node, weight_qdata_node, weight_scale_node, bias_node, block_size
+        payload_dtype_str = cast(
+            str,
+            (
+                node.args[5]
+                if len(node.args) > 5
+                else node.kwargs.get(
+                    "weight_payload_dtype",
+                    node.kwargs.get("weight_dtype", ""),
+                )
+            ),
+        )
+        payload_dtype = _get_weights_payload_dtype(weight_qdata_node, payload_dtype_str)
+        return (
+            input_node,
+            weight_qdata_node,
+            weight_scale_node,
+            bias_node,
+            block_size,
+            payload_dtype,
+        )
 
     def _reshape_with_view(
         self,
@@ -84,12 +149,15 @@ def _create_block_scaled_inputs(
         weight_qdata_node: torch.fx.Node,
         weight_scale_node: torch.fx.Node,
         block_size: int,
+        payload_dtype: MXFPDType,
     ) -> tuple[torch.fx.Node, torch.fx.Node]:
         """Create rank-3 inputs for the block-scaled cast and matmul ops."""
         graph = graph_module.graph
         input_fake = get_first_fake_tensor(input_node)
         weight_qdata_fake = get_first_fake_tensor(weight_qdata_node)
         weight_scale_fake = get_first_fake_tensor(weight_scale_node)
+        payload_dtype_str = mxfp_dtype_to_str(payload_dtype)
+        _mark_mxfp_payload(weight_qdata_node, payload_dtype)
 
         batches = reduce(operator.mul, input_fake.shape[:-1], 1)
         input_reshape_shape = [1, batches, input_fake.shape[-1]]
@@ -109,13 +177,13 @@ def _create_block_scaled_inputs(
             graph=graph,
             op_target=exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default,
             args=(input_reshaped, block_size),
-            kwargs={"output_dtype": weight_qdata_fake.dtype},
+            kwargs={"output_dtype": payload_dtype_str},
             from_node=mxfp_linear_node,
         )
         cast_node.meta["val"] = exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default(
             get_first_fake_tensor(input_reshaped),
             block_size,
-            output_dtype=weight_qdata_fake.dtype,
+            output_dtype=payload_dtype_str,
         )
 
         input_qdata_node = create_node(
@@ -126,6 +194,7 @@ def _create_block_scaled_inputs(
             from_node=mxfp_linear_node,
         )
         input_qdata_node.meta["val"] = cast_node.meta["val"][0]
+        _mark_mxfp_payload(input_qdata_node, payload_dtype)
 
         input_scale_node = create_node(
             graph=graph,
@@ -150,8 +219,10 @@ def _create_matmul_node(
         weight_qdata_node: torch.fx.Node,
         weight_scale_node: torch.fx.Node,
         block_size: int,
+        payload_dtype: MXFPDType,
     ) -> torch.fx.Node:
         """Insert ``MATMUL_T_BLOCK_SCALED`` with updated fake metadata."""
+        payload_dtype_str = mxfp_dtype_to_str(payload_dtype)
         matmul_node = create_node(
             graph=graph_module.graph,
             op_target=exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default,
@@ -162,7 +233,7 @@ def _create_matmul_node(
                 weight_scale_node,
                 block_size,
             ),
-            kwargs={},
+            kwargs={"payload_dtype": payload_dtype_str},
             from_node=mxfp_linear_node,
         )
         matmul_node.meta["val"] = exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default(
@@ -171,6 +242,7 @@ def _create_matmul_node(
             get_first_fake_tensor(weight_qdata_node),
             get_first_fake_tensor(weight_scale_node),
             block_size,
+            payload_dtype=payload_dtype_str,
         )
         return matmul_node
 
@@ -255,6 +327,7 @@ def _rewrite_mxfp_linear_node(
             weight_scale_node,
             bias_node,
             block_size,
+            payload_dtype,
         ) = self._get_linear_args(mxfp_linear_node)
 
         with graph.inserting_before(mxfp_linear_node):
@@ -268,6 +341,7 @@ def _rewrite_mxfp_linear_node(
                 weight_qdata_node,
                 weight_scale_node,
                 block_size,
+                payload_dtype,
             )
             matmul_node = self._create_matmul_node(
                 graph_module,
@@ -277,6 +351,7 @@ def _rewrite_mxfp_linear_node(
                 weight_qdata_node,
                 weight_scale_node,
                 block_size,
+                payload_dtype,
             )
 
         with graph.inserting_after(matmul_node):
@@ -299,10 +374,7 @@ def call(self, graph_module: torch.fx.GraphModule):
         graph = graph_module.graph
 
         for node in list(graph.nodes):
-            if node.op != "call_function" or node.target not in (
-                torch.ops.tosa_mxfp.linear.default,
-                exir_ops.edge.tosa_mxfp.linear.default,
-            ):
+            if node.op != "call_function" or node.target not in self.target_ops:
                 continue
 
             modified = True
diff --git a/backends/arm/_passes/size_adjust_input_pass.py b/backends/arm/_passes/size_adjust_input_pass.py
index 1c331b9c329..6028e618d65 100644
--- a/backends/arm/_passes/size_adjust_input_pass.py
+++ b/backends/arm/_passes/size_adjust_input_pass.py
@@ -62,6 +62,41 @@ def _greater_than(input: SymIntLike, other: int) -> bool | torch.SymBool:
         return input > other
 
 
+def _get_slice_adjustment(
+    remainder: SymIntLike,
+    pad: int,
+    stride: int,
+) -> SymIntLike | None:
+    """Return the amount to slice from the end of a conv dimension.
+
+    The required trim is ``max(remainder - pad, 0)``. For symbolic shapes we
+    encode that clamp using only integer arithmetic that the TOSA shape
+    materializer already supports: a sum of floor-div terms over the possible
+    residue classes.
+
+    """
+    if not isinstance(remainder, torch.SymInt):
+        return remainder - pad if remainder > pad else None
+
+    shape_env = get_context_shape_env()
+    exact_values = evaluate_symbolic_expr_values(remainder.node.expr, shape_env)
+    if exact_values is not None:
+        adjustments = {max(value - pad, 0) for value in exact_values}
+        if len(adjustments) == 1:
+            adjustment = next(iter(adjustments))
+            return adjustment if adjustment > 0 else None
+
+    if pad >= stride - 1:
+        return None
+
+    adjustment: SymIntLike | None = None  # type: ignore[no-redef]
+    for threshold in range(pad + 1, stride):
+        term = (remainder + stride - threshold) // stride
+        adjustment = term if adjustment is None else adjustment + term
+
+    return adjustment
+
+
 def get_slices_convolution(conv_node: torch.fx.Node) -> Slices:
     slices: Slices = []
 
@@ -85,8 +120,12 @@ def get_slices_convolution(conv_node: torch.fx.Node) -> Slices:
         remainder = conv_remainder(
             input_shape[dim], pad, dilation, weight_shape[dim], stride
         )
-        if _greater_than(remainder, pad):
-            adjustment = remainder - pad
+        adjustment = _get_slice_adjustment(
+            remainder,
+            pad,
+            stride,
+        )
+        if adjustment is not None:
             args = (dim, 0, input_shape[dim] - adjustment)
             slices.append(args)
 
diff --git a/backends/arm/_passes/symbolic_value_range.py b/backends/arm/_passes/symbolic_value_range.py
index 0753fefa270..609a84edc54 100644
--- a/backends/arm/_passes/symbolic_value_range.py
+++ b/backends/arm/_passes/symbolic_value_range.py
@@ -39,11 +39,70 @@ def _symbol_values(symbol: sympy.Symbol, shape_env: ShapeEnv) -> _ExactValues:
     return frozenset(sympy.Integer(value) for value in range(lower, upper + 1))
 
 
+def _expr_symbols_to_values(
+    expr: sympy.Basic,
+    shape_env: ShapeEnv,
+) -> dict[sympy.Symbol, _ExactValues]:
+    return {symbol: _symbol_values(symbol, shape_env) for symbol in expr.free_symbols}
+
+
+def _try_expr_to_int(expr: sympy.Basic) -> Optional[int]:
+    integer_value = _expr_to_int(expr)
+    if integer_value is not None:
+        return integer_value
+
+    try:
+        return _expr_to_int(sympy.simplify(expr))
+    except (RecursionError, TypeError):
+        return None
+
+
+def _constant_expr_values(expr: sympy.Basic) -> Optional[set[int]]:
+    if expr.free_symbols:
+        return None
+
+    integer_value = _try_expr_to_int(expr)
+    return {integer_value} if integer_value is not None else None
+
+
+def _evaluate_exact_values(
+    expr: sympy.Basic,
+    shape_env: ShapeEnv,
+) -> _ExactValues:
+    try:
+        return sympy_interp(
+            _ExactValueAnalysis,
+            _expr_symbols_to_values(expr, shape_env),
+            expr,
+            missing_handler=lambda symbol: _symbol_values(symbol, shape_env),
+        )
+    except (RecursionError, TypeError):
+        return None
+
+
+def _exact_values_to_ints(exact_values: _ExactValues) -> Optional[set[int]]:
+    if exact_values is None:
+        return None
+
+    result: set[int] = set()
+    for value in exact_values:
+        integer_value = _try_expr_to_int(value)
+        if integer_value is None:
+            return None
+        result.add(integer_value)
+    return result
+
+
 def _map_values(values: _ExactValues, fn) -> _ExactValues:
     if values is None:
         return None
 
-    result = {sympy.simplify(fn(value)) for value in values}
+    result = set()
+    for value in values:
+        try:
+            result.add(fn(value))
+        except (RecursionError, TypeError):
+            return None
     if len(result) > _MAX_SET_SIZE:
         return None
     return frozenset(result)
@@ -55,7 +114,13 @@ def _combine_values(lhs: _ExactValues, rhs: _ExactValues, fn) -> _ExactValues:
     if len(lhs) * len(rhs) > _MAX_SET_SIZE * _MAX_SET_SIZE:
         return None
 
-    result = {sympy.simplify(fn(a, b)) for a in lhs for b in rhs}
+    result = set()
+    for a in lhs:
+        for b in rhs:
+            try:
+                result.add(fn(a, b))
+            except (RecursionError, TypeError):
+                return None
     if len(result) > _MAX_SET_SIZE:
         return None
     return frozenset(result)
@@ -80,6 +145,12 @@ def mod(lhs: _ExactValues, rhs: _ExactValues) -> _ExactValues:
             return None
         return _combine_values(lhs, rhs, lambda a, b: sympy.Mod(a, b))
 
+    @staticmethod
+    def floordiv(lhs: _ExactValues, rhs: _ExactValues) -> _ExactValues:
+        if rhs is None or any(value == 0 for value in rhs):
+            return None
+        return _combine_values(lhs, rhs, lambda a, b: sympy.floor(a / b))
+
     @staticmethod
     def pow(lhs: _ExactValues, rhs: _ExactValues) -> _ExactValues:
         return _combine_values(lhs, rhs, lambda a, b: a**b)
@@ -104,35 +175,15 @@ def evaluate_symbolic_expr_values(
 ) -> Optional[set[int]]:
     """Return a best-effort finite set of possible integer values.
 
-    The helper first relies on ``bound_sympy`` for cheap singleton detection.
-    When interval bounds are not precise enough, it falls back to a small
-    exact-set analysis over bounded symbols using ``sympy_interp``.
+    The helper avoids ShapeEnv bound queries here because some exported dynamic
+    expressions trigger very deep SymPy normalization. Instead, it relies on a
+    small exact-set analysis over bounded symbols using ``sympy_interp``.
 
     """
-    root_expr = sympy.simplify(
-        expr.node.expr if isinstance(expr, torch.SymInt) else expr
-    )
-    value_range = shape_env.bound_sympy(root_expr)
-    if value_range.is_int and value_range.is_singleton():
-        singleton = _expr_to_int(value_range.lower)
-        return {singleton} if singleton is not None else None
-
-    exact_values = sympy_interp(
-        _ExactValueAnalysis,
-        {
-            symbol: _symbol_values(symbol, shape_env)
-            for symbol in root_expr.free_symbols
-        },
-        root_expr,
-        missing_handler=lambda symbol: _symbol_values(symbol, shape_env),
-    )
-    if exact_values is None:
-        return None
+    root_expr = expr.node.expr if isinstance(expr, torch.SymInt) else expr
 
-    result: set[int] = set()
-    for value in exact_values:
-        integer_value = _expr_to_int(sympy.simplify(value))
-        if integer_value is None:
-            return None
-        result.add(integer_value)
-    return result
+    constant_values = _constant_expr_values(root_expr)
+    if constant_values is not None:
+        return constant_values
+
+    return _exact_values_to_ints(_evaluate_exact_values(root_expr, shape_env))
diff --git a/backends/arm/ao_ext/mxfp.py b/backends/arm/ao_ext/mxfp.py
index 783da92590e..f3b611ce14c 100644
--- a/backends/arm/ao_ext/mxfp.py
+++ b/backends/arm/ao_ext/mxfp.py
@@ -10,12 +10,85 @@
 from executorch.exir._warnings import experimental
 from torchao.core.config import AOBaseConfig
 from torchao.prototype.mx_formats.config import ScaleCalculationMode
+from torchao.prototype.mx_formats.mx_tensor import (
+    DTYPE_FP6_E2M3,
+    DTYPE_FP6_E3M2,
+    to_dtype,
+    to_mx,
+)
 from torchao.quantization import quantize_
 
 
+# PyTorch lacks dtypes for the FP6 types, so we use ao's string representations for those.
+MXFPDType = torch.dtype | str
+
+
+SUPPORTED_MXFP_DTYPES: set[MXFPDType] = {
+    torch.float4_e2m1fn_x2,
+    torch.float8_e4m3fn,
+    torch.float8_e5m2,
+    # Use ao's string representations.
+    DTYPE_FP6_E2M3,
+    DTYPE_FP6_E3M2,
+}
+
+
+_DTYPE_TO_STR: dict[MXFPDType, str] = {
+    DTYPE_FP6_E2M3: "fp6e2m3",
+    DTYPE_FP6_E3M2: "fp6e3m2",
+    torch.float4_e2m1fn_x2: "f4e2m1",
+    torch.float8_e4m3fn: "f8e4m3",
+    torch.float8_e5m2: "f8e5m2",
+}
+
+
+_STR_TO_DTYPE = {value: key for (key, value) in _DTYPE_TO_STR.items()}
+
+
+def mxfp_dtype_to_str(dtype: MXFPDType) -> str:
+    try:
+        return _DTYPE_TO_STR[dtype]
+    except KeyError as e:
+        supported = ", ".join(str(dtype) for dtype in _DTYPE_TO_STR)
+        raise ValueError(
+            f"Unsupported MXFP dtype {dtype}. Supported dtypes: {supported}"
+        ) from e
+
+
+def mxfp_str_to_dtype(dtype: str) -> MXFPDType:
+    try:
+        return _STR_TO_DTYPE[dtype]
+    except KeyError as e:
+        supported = ", ".join(sorted(_STR_TO_DTYPE))
+        raise ValueError(
+            f"Unsupported MXFP dtype string {dtype!r}. Supported strings: {supported}"
+        ) from e
+
+
 def _match_supported_modules(module: torch.nn.Module, _name: str) -> bool:
     """Default filter function that matches supported modules."""
-    return isinstance(module, torch.nn.Linear)
+    return isinstance(module, (torch.nn.Linear, torch.nn.Conv2d))
+
+
+def _cast_to_block_scaled_cpu_ref(
+    input: torch.Tensor,
+    output_dtype: MXFPDType,
+    block_size: int,
+) -> torch.Tensor:
+    """Emulate the current TOSA activation cast in eager mode."""
+    input_scale, input_qdata = to_mx(
+        input.to(torch.float32).contiguous(),
+        elem_dtype=output_dtype,
+        block_size=block_size,
+        scaling_mode=ScaleCalculationMode.RCEIL,
+    )
+    return to_dtype(
+        input_qdata,
+        input_scale,
+        output_dtype,
+        block_size,
+        torch.float32,
+    )
 
 
 @experimental("This API is experimental and may change without notice.")
@@ -23,7 +96,7 @@ def _match_supported_modules(module: torch.nn.Module, _name: str) -> bool:
 class MXFPOpConfig(AOBaseConfig):
     """Configuration for Arm MXFP source transforms."""
 
-    weight_dtype: torch.dtype = torch.float8_e4m3fn
+    weight_dtype: MXFPDType = torch.float8_e4m3fn
     weight_scaling_mode: ScaleCalculationMode = ScaleCalculationMode.RCEIL
 
     # Only block size of 32 is currently supported for now, so we hardcode it here.
@@ -32,7 +105,7 @@ def block_size(self) -> int:
         return 32
 
     def __post_init__(self) -> None:
-        if self.weight_dtype not in (torch.float8_e4m3fn, torch.float8_e5m2):
+        if self.weight_dtype not in SUPPORTED_MXFP_DTYPES:
             raise ValueError(f"Unsupported weight_dtype: {self.weight_dtype}")
         if not isinstance(self.weight_scaling_mode, ScaleCalculationMode):
             raise ValueError(
diff --git a/backends/arm/ao_ext/mxfp_tosa_lib.py b/backends/arm/ao_ext/mxfp_tosa_lib.py
index 4459ec59126..911d944c720 100644
--- a/backends/arm/ao_ext/mxfp_tosa_lib.py
+++ b/backends/arm/ao_ext/mxfp_tosa_lib.py
@@ -8,4 +8,5 @@
 # MXFP TOSA library definition for the Arm backend containing.
 # This library will generate custom ops like the following example:
 #   torch.ops.tosa_mxfp.linear.default
+#   torch.ops.tosa_mxfp.conv2d.default
 MXFP_TOSA_LIB = Library("tosa_mxfp", "DEF")
diff --git a/backends/arm/ao_ext/mxfp_transform.py b/backends/arm/ao_ext/mxfp_transform.py
index b7823524475..e1f119aa0a0 100644
--- a/backends/arm/ao_ext/mxfp_transform.py
+++ b/backends/arm/ao_ext/mxfp_transform.py
@@ -6,6 +6,7 @@
 import torch
 
 from executorch.backends.arm.ao_ext.mxfp import MXFPOpConfig
+from executorch.backends.arm.ao_ext.ops.mxfp_conv2d_op import transform_conv2d_to_mxfp
 from executorch.backends.arm.ao_ext.ops.mxfp_linear_op import transform_linear_to_mxfp
 from torchao.quantization.transform_module import register_quantize_module_handler
 
@@ -20,5 +21,7 @@ def _transform_to_mxfp(
     """
     if isinstance(module, torch.nn.Linear):
         return transform_linear_to_mxfp(module, config)
+    elif isinstance(module, torch.nn.Conv2d):
+        return transform_conv2d_to_mxfp(module, config)
     else:
         return module
diff --git a/backends/arm/ao_ext/ops/__init__.py b/backends/arm/ao_ext/ops/__init__.py
index a690c4b7b02..d4c602154fe 100644
--- a/backends/arm/ao_ext/ops/__init__.py
+++ b/backends/arm/ao_ext/ops/__init__.py
@@ -3,8 +3,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from .mxfp_conv2d_op import MXFPConv2dOp
 from .mxfp_linear_op import MXFPLinearOp
 
 __all__ = [
+    "MXFPConv2dOp",
     "MXFPLinearOp",
 ]
diff --git a/backends/arm/ao_ext/ops/mxfp_linear_op.py b/backends/arm/ao_ext/ops/mxfp_linear_op.py
index 5238f85a847..565d8695c5a 100644
--- a/backends/arm/ao_ext/ops/mxfp_linear_op.py
+++ b/backends/arm/ao_ext/ops/mxfp_linear_op.py
@@ -12,17 +12,50 @@
 
 import torch
 import torch.nn.functional as F
-from executorch.backends.arm.ao_ext.mxfp import MXFPOpConfig
+from executorch.backends.arm.ao_ext.mxfp import (
+    _cast_to_block_scaled_cpu_ref,
+    mxfp_dtype_to_str,
+    mxfp_str_to_dtype,
+    MXFPDType,
+    MXFPOpConfig,
+)
 from executorch.backends.arm.ao_ext.mxfp_tosa_lib import MXFP_TOSA_LIB
-from torchao.prototype.mx_formats.config import ScaleCalculationMode
 from torchao.prototype.mx_formats.mx_tensor import to_dtype, to_mx
 
+
+# Define the custom TOSA operator. Note that weight_payload_dtype is needed as
+# an extra argument because sub-byte dtypes (FP4 and FP6) are contained
+# in uint8 tensors, meaning the weight tensor itself does not contain
+# the dtype.
 MXFP_TOSA_LIB.define(
     "linear(Tensor input, Tensor weight_qdata, Tensor weight_scale, "
-    "Tensor? bias=None, SymInt block_size=32) -> Tensor"
+    "Tensor? bias=None, SymInt block_size=32, str weight_payload_dtype='') -> Tensor"
 )
 
 
+def _get_mx_elem_dtype(
+    weight_qdata: torch.Tensor, weight_payload_dtype: str = ""
+) -> MXFPDType:
+    """Resolve the MX element dtype of a (possibly uint8-packed) payload."""
+    if weight_payload_dtype:  # an explicit dtype string takes precedence
+        return mxfp_str_to_dtype(weight_payload_dtype)
+    # Without a dtype string, a uint8 payload is taken to be FP4.
+    is_untyped_uint8 = weight_qdata.dtype == torch.uint8
+    return torch.float4_e2m1fn_x2 if is_untyped_uint8 else weight_qdata.dtype
+
+
+def _get_num_input_features(
+    weight_qdata: torch.Tensor, weight_payload_dtype: str = ""
+) -> int:
+    """Return logical in_features; FP4 packs two elements per uint8 byte."""
+    # Resolve the dtype like the CPU op does, so a uint8 payload with an empty
+    # dtype string (decoded as FP4 by _get_mx_elem_dtype) is doubled here too.
+    elem_dtype = _get_mx_elem_dtype(weight_qdata, weight_payload_dtype)
+    if weight_qdata.dtype == torch.uint8 and elem_dtype == torch.float4_e2m1fn_x2:
+        return weight_qdata.shape[-1] * 2
+    return weight_qdata.shape[-1]
+
+
 @torch.library.register_fake("tosa_mxfp::linear", lib=MXFP_TOSA_LIB)  # type: ignore[misc]
 def _mxfp_linear_fake(
     input: torch.Tensor,
@@ -30,6 +63,7 @@ def _mxfp_linear_fake(
     weight_scale: torch.Tensor,
     bias: torch.Tensor | None = None,
     block_size: int = 32,
+    weight_payload_dtype: str = "",
 ) -> torch.Tensor:
     if weight_qdata.ndim != 3:
         raise ValueError(
@@ -39,15 +73,16 @@ def _mxfp_linear_fake(
         raise ValueError(
             f"Expected weight_qdata batch dim to be 1, got {weight_qdata.shape[0]}"
         )
-    if input.shape[-1] != weight_qdata.shape[-1]:
+    num_input_features = _get_num_input_features(weight_qdata, weight_payload_dtype)
+    if input.shape[-1] != num_input_features:
         raise ValueError(
             f"Input last dim {input.shape[-1]} must match linear in_features "
-            f"{weight_qdata.shape[-1]}"
+            f"{num_input_features}"
         )
     expected_scale_shape = (
         1,
         weight_qdata.shape[1],
-        weight_qdata.shape[-1] // block_size,
+        num_input_features // block_size,
     )
     if tuple(weight_scale.shape) != expected_scale_shape:
         raise ValueError(
@@ -58,27 +93,6 @@ def _mxfp_linear_fake(
     return input.new_empty(output_shape, dtype=torch.float32)
 
 
-def _cast_to_block_scaled_cpu_ref(
-    input: torch.Tensor,
-    output_dtype: torch.dtype,
-    block_size: int,
-) -> torch.Tensor:
-    """Emulate the current TOSA activation cast in eager mode."""
-    input_scale, input_qdata = to_mx(
-        input.to(torch.float32).contiguous(),
-        elem_dtype=output_dtype,
-        block_size=block_size,
-        scaling_mode=ScaleCalculationMode.RCEIL,
-    )
-    return to_dtype(
-        input_qdata,
-        input_scale,
-        output_dtype,
-        block_size,
-        torch.float32,
-    )
-
-
 @torch.library.impl("tosa_mxfp::linear", "cpu", lib=MXFP_TOSA_LIB)
 def _mxfp_linear_cpu(
     input: torch.Tensor,
@@ -86,23 +100,26 @@ def _mxfp_linear_cpu(
     weight_scale: torch.Tensor,
     bias: torch.Tensor | None = None,
     block_size: int = 32,
+    weight_payload_dtype: str = "",
 ) -> torch.Tensor:
     """CPU reference implementation of the MXFP linear op."""
 
     if weight_qdata.ndim != 3 or weight_scale.ndim != 3:
         raise ValueError("Expected rank-3 weight tensors for MXFP linear")
 
+    elem_dtype = _get_mx_elem_dtype(weight_qdata, weight_payload_dtype)
+
     # Cast the input to block-scaled format and back again to match the
     # expected input format of the TOSA
     dequantized_input = _cast_to_block_scaled_cpu_ref(
         input,
-        weight_qdata.dtype,
+        elem_dtype,
         block_size,
     )
     dequantized_weight = to_dtype(
         weight_qdata,
         weight_scale,
-        weight_qdata.dtype,
+        elem_dtype,
         block_size,
         torch.float32,
     )
@@ -124,6 +141,7 @@ def __init__(
     ) -> None:
         super().__init__()
         self.config = config
+        self.weight_dtype = mxfp_dtype_to_str(config.weight_dtype)
 
         self.register_buffer("weight_qdata", weight_qdata, persistent=True)
         self.register_buffer("weight_scale", weight_scale, persistent=True)
@@ -146,6 +164,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             self.weight_scale,
             self.bias,
             self.config.block_size,
+            self.weight_dtype,
         )
 
 
diff --git a/backends/arm/operator_support/TARGETS b/backends/arm/operator_support/TARGETS
index a2fd054d472..88e112feac5 100644
--- a/backends/arm/operator_support/TARGETS
+++ b/backends/arm/operator_support/TARGETS
@@ -4,6 +4,7 @@ runtime.python_library(
     name = "operator_support",
     srcs = glob(["*.py"]),
     deps = [
+        "//executorch/backends/arm:ao_ext",
         "//executorch/backends/arm:constants",
         "//executorch/backends/arm/_passes:passes",
         "//executorch/backends/arm/tosa:resize_utils",
diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py
index 066b5462f64..4d48d6ad0ff 100644
--- a/backends/arm/operator_support/__init__.py
+++ b/backends/arm/operator_support/__init__.py
@@ -21,6 +21,7 @@
     reduce_sum_support,
     right_shift_support,
     slice_copy_support,
+    sym_size_int_support,
     to_dim_order_copy_support,
     tosa_supported_operators,
     unfold_copy_support,
diff --git a/backends/arm/operator_support/tosa_profile_supported_op_lists.py b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
index fab4e6c60c1..dc448ba0d5f 100644
--- a/backends/arm/operator_support/tosa_profile_supported_op_lists.py
+++ b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
@@ -99,6 +99,7 @@
     exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
     exir_ops.edge.aten.pad.default,
     exir_ops.edge.aten.constant_pad_nd.default,
+    exir_ops.edge.aten.argmax.default,
     exir_ops.edge.aten.amax.default,
     exir_ops.edge.aten.amin.default,
     exir_ops.edge.aten.eye.default,
@@ -128,6 +129,7 @@
     exir_ops.edge.aten.tan.default,
     exir_ops.edge.aten.silu.default,
     exir_ops.edge.aten.detach_copy.default,
+    exir_ops.edge.aten.round.default,
 }
 
 
@@ -237,6 +239,7 @@
     operator.getitem,
     exir_ops.edge.aten.pad.default,
     exir_ops.edge.aten.constant_pad_nd.default,
+    exir_ops.edge.aten.argmax.default,
     exir_ops.edge.aten.amax.default,
     exir_ops.edge.aten.amin.default,
     exir_ops.edge.aten.eye.default,
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
index 2e640b758d2..82a529d62a2 100644
--- a/backends/arm/operator_support/tosa_supported_operators.py
+++ b/backends/arm/operator_support/tosa_supported_operators.py
@@ -14,9 +14,12 @@
 import typing
 from typing import final, Optional, Sequence, Type
 
+# Register Arm-specific torch.library ops and MXFP transforms at package
+# import time.
+import executorch.backends.arm.ao_ext  # noqa: F401
+
 import torch
 import torch.fx as fx
-
 from executorch.backends.arm._passes.arm_pass_utils import (
     get_first_fake_tensor,
     is_submodule_node,
@@ -84,7 +87,7 @@ def __init__(self, tosa_spec: TosaSpecification, reporter: WhyNoPartitionReporte
 
     # Class attributes populated by subclasses
     tosa_specs: list[TosaSpecification] = TosaSpecification.all_versions_and_profiles()
-    targets: list[str] = []
+    targets: list[object] = []
 
     @final
     def is_node_supported(
@@ -240,7 +243,10 @@ def get_registered_tosa_support_checks(
 class MXOpsSupportList(OperatorSupportBase):
     """Accept Arm MX custom ops when the active spec enables MX support."""
 
-    targets = (exir_ops.edge.tosa_mxfp.linear.default,)
+    targets = (
+        exir_ops.edge.tosa_mxfp.conv2d.default,
+        exir_ops.edge.tosa_mxfp.linear.default,
+    )
 
     def is_node_supported(
         self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
@@ -248,88 +254,141 @@ def is_node_supported(
         return node.op == "call_function" and node.target in self.targets
 
 
-def tosa_support_factory(
+def _profile_support_check(
     tosa_spec: TosaSpecification,
-    exported_program: ExportedProgram,
-    reporter: WhyNoPartitionReporter,
-    additional_checks: Optional[Sequence[OperatorSupportBase]] = None,
-) -> OperatorSupportBase:
-    """Create an OperatorSupport composite for a TOSA spec.
+) -> Optional[OperatorSupportBase]:
+    if tosa_spec.support_integer() and tosa_spec.support_float():
+        return TOSAProINTFPSupportList()
+    if tosa_spec.support_integer():
+        return TOSAProINTSupportList()
+    if tosa_spec.support_float():
+        return TOSAProFPSupportList()
+    return None
 
-    Combine profile-specific positive checks, registered operator checks, and
-    negative checks into a single :py:class:`OperatorSupportBase` chain.
 
-    Args:
-        tosa_spec (TosaSpecification): Active TOSA specification.
-        exported_program (ExportedProgram): Program context for checks.
-        reporter (WhyNoPartitionReporter): Reporter for rejections.
-        additional_checks (Optional[Sequence[OperatorSupportBase]]): Extra
-            negative checks to apply.
+def _registered_support_checks(
+    tosa_spec: TosaSpecification,
+    reporter: WhyNoPartitionReporter,
+) -> list[OperatorSupportBase]:
+    """Build the support checks registered for ``tosa_spec``."""
+    # Every registered entry is called with (tosa_spec, reporter).
+    registered = get_registered_tosa_support_checks(tosa_spec)
+    return [make_check(tosa_spec, reporter) for make_check in registered]
 
-    Returns:
-        OperatorSupportBase: Composite checker for the given spec.
 
-    """
-    # Postive checks: Add nodes to partitioning
-    positive_checks: list[OperatorSupportBase] = [
+def _positive_checks(
+    tosa_spec: TosaSpecification,
+    exported_program: ExportedProgram,
+    reporter: WhyNoPartitionReporter,
+) -> list[OperatorSupportBase]:
+    checks: list[OperatorSupportBase] = [
         ControlFlowSubmoduleSupported(exported_program, tosa_spec, reporter),
         ControlFlowOpSupported(exported_program, tosa_spec, reporter),
     ]
 
-    if tosa_spec.support_integer() and tosa_spec.support_float():
-        positive_checks.append(TOSAProINTFPSupportList())
-    elif tosa_spec.support_integer():
-        positive_checks.append(TOSAProINTSupportList())
-    elif tosa_spec.support_float():
-        positive_checks.append(TOSAProFPSupportList())
+    if profile_check := _profile_support_check(tosa_spec):
+        checks.append(profile_check)
+
     if tosa_spec.support_extension("mxfp"):
-        positive_checks.append(MXOpsSupportList())
+        checks.append(MXOpsSupportList())
+
     # TODO: Refactor to use TOSAProSupportLists + negtive checks
-    positive_checks += [
-        check(tosa_spec, reporter)
-        for check in get_registered_tosa_support_checks(tosa_spec)
-    ]
+    checks.extend(_registered_support_checks(tosa_spec, reporter))
 
-    # Negative checks: Remove nodes from partitioning
-    negative_checks: list[OperatorSupportBase] = [
-        CheckInt64InputsAndOutputs(exported_program, reporter),
-        RankCheck(reporter, max_rank=MAX_RANK),
-        *[
-            reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}")
-            for check in (additional_checks if additional_checks else [])
-        ],
-    ]
+    return checks
 
-    if tosa_spec.support_float():
-        negative_checks.append(CheckMixedFloatingInputs(reporter))
-    else:
-        negative_checks.append(CheckArmQuantized(reporter))
-        negative_checks.append(CheckProperQuantization(reporter))
 
-    disallowed_dtypes = [torch.float64]
+def _disallowed_dtypes(tosa_spec: TosaSpecification) -> list[torch.dtype]:
+    dtypes = [torch.float64]
     if not tosa_spec.support_extension("bf16"):
-        disallowed_dtypes.append(torch.bfloat16)
+        dtypes.append(torch.bfloat16)
     if not (
         tosa_spec.support_extension("fp8e4m3") or tosa_spec.support_extension("mxfp")
     ):
-        disallowed_dtypes.append(torch.float8_e4m3fn)
+        dtypes.append(torch.float8_e4m3fn)
     if not (
         tosa_spec.support_extension("fp8e5m2") or tosa_spec.support_extension("mxfp")
     ):
-        disallowed_dtypes.append(torch.float8_e5m2)
+        dtypes.append(torch.float8_e5m2)
     if tosa_spec.is_U55_subset:
-        disallowed_dtypes.append(torch.bool)
-    negative_checks.append(
+        dtypes.append(torch.bool)
+    return dtypes
+
+
+def _wrapped_additional_checks(
+    additional_checks: Optional[Sequence[OperatorSupportBase]],
+    reporter: WhyNoPartitionReporter,
+) -> list[OperatorSupportBase]:
+    """Pass each caller-supplied check through ``reporter.wrap_check``."""
+    wrapped: list[OperatorSupportBase] = []
+    for extra in additional_checks or ():
+        reason = f"Rejected by {extra.__class__.__name__}"
+        wrapped.append(reporter.wrap_check(extra, reason))
+    return wrapped
+
+
+def _negative_checks(
+    tosa_spec: TosaSpecification,
+    exported_program: ExportedProgram,
+    reporter: WhyNoPartitionReporter,
+    additional_checks: Optional[Sequence[OperatorSupportBase]],
+) -> list[OperatorSupportBase]:
+    checks: list[OperatorSupportBase] = [RankCheck(reporter, MAX_RANK)]
+
+    if not tosa_spec.support_extension("int64"):
+        checks.append(CheckInt64InputsAndOutputs(exported_program, reporter, tosa_spec))
+
+    checks.extend(_wrapped_additional_checks(additional_checks, reporter))
+
+    if tosa_spec.support_float():
+        checks.append(CheckMixedFloatingInputs(reporter))
+    else:
+        checks.append(CheckArmQuantized(reporter))
+        checks.append(CheckProperQuantization(reporter))
+
+    checks.append(
         CheckDtypeInputsAndOutputs(
-            exported_program, reporter, disallowed_dtypes, tosa_spec
+            exported_program, reporter, _disallowed_dtypes(tosa_spec), tosa_spec
         )
     )
+
     if tosa_spec.is_U55_subset:
-        negative_checks.append(EthosU55NotSupported(reporter))
-        negative_checks.append(EthosU55DtypeSupport(reporter))
-        negative_checks.append(EthosU55CastCheck(reporter))
+        checks.append(EthosU55NotSupported(reporter))
+        checks.append(EthosU55DtypeSupport(reporter))
+        checks.append(EthosU55CastCheck(reporter))
+
     if not tosa_spec.support_extension("shape"):
-        negative_checks.append(SymbolicShapeSupportCheck(reporter))
+        checks.append(SymbolicShapeSupportCheck(reporter))
+
+    return checks
+
+
+def tosa_support_factory(
+    tosa_spec: TosaSpecification,
+    exported_program: ExportedProgram,
+    reporter: WhyNoPartitionReporter,
+    additional_checks: Optional[Sequence[OperatorSupportBase]] = None,
+) -> OperatorSupportBase:
+    """Create an OperatorSupport composite for a TOSA spec.
+
+    Combine profile-specific positive checks, registered operator checks, and
+    negative checks into a single :py:class:`OperatorSupportBase` chain.
+
+    Args:
+        tosa_spec (TosaSpecification): Active TOSA specification.
+        exported_program (ExportedProgram): Program context for checks.
+        reporter (WhyNoPartitionReporter): Reporter for rejections.
+        additional_checks (Optional[Sequence[OperatorSupportBase]]): Extra
+            negative checks to apply.
+
+    Returns:
+        OperatorSupportBase: Composite checker for the given spec.
+
+    """
+    positive_checks = _positive_checks(tosa_spec, exported_program, reporter)
+    negative_checks = _negative_checks(
+        tosa_spec, exported_program, reporter, additional_checks
+    )
 
     return chain(
         reporter.wrap_check(
@@ -368,6 +427,40 @@ def _has_symbolic_shape(node: fx.Node) -> bool:
 
         return False
 
+    def _partition_dynamic_upmsample_nearest2d(self, node: fx.Node) -> bool:
+        """Decide whether an ``upsample_nearest2d.vec`` node may be partitioned.
+
+        Args:
+            node (fx.Node): FX node to check.
+
+        Returns:
+            bool: True if the node targets ``upsample_nearest2d.vec`` and both
+                its input and output fake tensors have two dims in ``[2:4]``.
+
+        """
+        if node.target != exir_ops.edge.aten.upsample_nearest2d.vec:
+            return False
+
+        try:
+            input_tensor = get_first_fake_tensor(node.all_input_nodes[0])
+            output_tensor = get_first_fake_tensor(node)
+        except Exception as exc:  # any lookup failure is reported as a rejection
+            self.reporter.report_reject(
+                node,
+                f"upsample_nearest2d symbolic shapes need tensor metadata: {exc}",
+            )
+            return False
+
+        input_size_xy = input_tensor.shape[2:4]
+        output_size_xy = output_tensor.shape[2:4]
+        if len(input_size_xy) != 2 or len(output_size_xy) != 2:
+            self.reporter.report_reject(
+                node, "upsample_nearest2d expects 2D spatial input/output."
+            )
+            return False
+
+        return True
+
     def is_node_supported(
         self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
     ) -> bool:
@@ -394,14 +487,13 @@ def is_node_supported(
             self._has_symbolic_shape(input_node) for input_node in node.all_input_nodes
         ):
             if node.target == exir_ops.edge.aten.upsample_nearest2d.vec:
-                return True
-
-            self.reporter.report_reject(
-                node,
-                "Node has symbolic shape but the TOSA spec does not support "
-                "the shape extension.",
-            )
-            return False
+                return self._partition_dynamic_upmsample_nearest2d(node)
+            else:
+                self.reporter.report_reject(
+                    node,
+                    "Node has symbolic shape, has the TOSA spec shape extension support?",
+                )
+                return False
 
         return True
 
@@ -562,7 +654,10 @@ def is_node_supported(
             self.reporter.report_reject(node, "One or more inputs were not quantized.")
             return False
 
-        all_q_users = all((output_node.target in Q_OPS) for output_node in node.users)
+        all_q_users = all(
+            output_node.target in (*Q_OPS, torch.ops.aten.sym_size.int)
+            for output_node in node.users
+        )
         output_dtype = get_first_fake_tensor(node).dtype
         output_quantized = (
             output_quantized or all_q_users or _is_integer_dtype(output_dtype)
@@ -588,7 +683,10 @@ class CheckInt64InputsAndOutputs(OperatorSupportBase):
     """
 
     def __init__(
-        self, exported_program: ExportedProgram, reporter: WhyNoPartitionReporter
+        self,
+        exported_program: ExportedProgram,
+        reporter: WhyNoPartitionReporter,
+        tosa_spec: TosaSpecification,
     ):
         """Initialize the check with program context and reporter."""
         self.input_names = [
@@ -597,6 +695,7 @@ def __init__(
             if spec.kind == InputKind.USER_INPUT
         ]
         self.reporter = reporter
+        self.tosa_spec = tosa_spec
         self.int32_min = torch.iinfo(torch.int32).min
         self.int32_max = torch.iinfo(torch.int32).max
         super().__init__()
@@ -609,6 +708,138 @@ def inside_int32_bounds(self, node: torch.fx.Node) -> bool:
         min_val, max_val = int(torch.min(data)), int(torch.max(data))
         return min_val >= self.int32_min and max_val <= self.int32_max
 
+    def has_rejected_int64_output(
+        self, node: torch.fx.Node, tensor_list: Sequence[typing.Any]
+    ) -> bool:
+        """Return True if the node's outputs should be rejected as int64."""
+        # argmax is judged by the dedicated ARGMAX check, not by output dtype.
+        argmax_targets = (
+            torch.ops.aten.argmax.default,
+            exir_ops.edge.aten.argmax.default,
+        )
+        if node.target in argmax_targets:
+            return not self._is_tosa_argmax_supported(node)
+        fake_outputs = (t for t in tensor_list if isinstance(t, FakeTensor))
+        return any(t.dtype == torch.int64 for t in fake_outputs)
+
+    def _is_tosa_argmax_dtype_supported(
+        self, node: torch.fx.Node, input_dtype: torch.dtype
+    ) -> bool:
+        """Check the ARGMAX input dtype against the active TOSA spec.
+
+        Args:
+            node (torch.fx.Node): Node that a rejection is reported on.
+            input_dtype (torch.dtype): Dtype of the ARGMAX input tensor.
+
+        Returns:
+            bool: True if the spec supports ``input_dtype``; otherwise False,
+                after reporting the reason.
+
+        """
+        if input_dtype == torch.int8:
+            supported = self.tosa_spec.support_integer()
+            reason = "TOSA ARGMAX requires PRO-INT for int8 input."
+        elif input_dtype == torch.int16:
+            supported = (
+                self.tosa_spec.support_integer()
+                and self.tosa_spec.support_extension("int16")
+            )
+            reason = "TOSA ARGMAX requires EXT-INT16 for int16 input."
+        elif input_dtype in (torch.float16, torch.float32):
+            supported = self.tosa_spec.support_float()
+            reason = f"TOSA ARGMAX requires PRO-FP for {input_dtype} input."
+        elif input_dtype == torch.bfloat16:
+            supported = (
+                self.tosa_spec.support_float()
+                and self.tosa_spec.support_extension("bf16")
+            )
+            reason = "TOSA ARGMAX requires EXT-BF16 for bfloat16 input."
+        else:
+            supported = False
+            reason = f"TOSA ARGMAX does not support {input_dtype} input."
+        if not supported:
+            self.reporter.report_reject(node, reason)
+            return False
+        return True
+
+    def _is_tosa_argmax_supported(self, node: torch.fx.Node) -> bool:
+        """Check whether an argmax node can be lowered to TOSA ARGMAX.
+
+        Requires a static integer ``dim`` inside the input rank, a supported
+        input dtype, an input rank of at least 1 and a falsy ``keepdim``. The
+        first unmet requirement is reported on ``node``.
+        """
+
+        def reject(reason: str) -> bool:
+            self.reporter.report_reject(node, reason)
+            return False
+
+        positional = node.args
+        dim = node.kwargs.get("dim", positional[1] if len(positional) > 1 else None)
+        if dim is None:
+            return reject("TOSA ARGMAX requires an explicit reduction dimension.")
+        if not isinstance(dim, int):
+            return reject(
+                "TOSA ARGMAX requires a statically known reduction dimension."
+            )
+
+        input_tensor = get_first_fake_tensor(typing.cast(torch.fx.Node, positional[0]))
+        if not self._is_tosa_argmax_dtype_supported(node, input_tensor.dtype):
+            return False
+
+        input_rank = len(input_tensor.shape)
+        if input_rank == 0:
+            return reject("TOSA ARGMAX requires an input with rank at least 1.")
+
+        # Normalize a negative dim before range-checking it.
+        axis = dim if dim >= 0 else dim + input_rank
+        if not 0 <= axis < input_rank:
+            return reject(
+                f"TOSA ARGMAX axis must be in [0, {input_rank - 1}] but got {dim}."
+            )
+
+        keepdim = node.kwargs.get(
+            "keepdim", positional[2] if len(positional) > 2 else False
+        )
+        if keepdim:
+            return reject("TOSA ARGMAX does not support keepdim=True.")
+
+        return True
+
+    def _check_int64_input_nodes(self, node: torch.fx.Node) -> bool:
+        """Return True unless a non-constant int64 tensor feeds ``node``;
+        reports a rejection on ``node`` for the first such input found.
+        """
+        for input_node in (
+            input_node
+            for input_node in node.all_input_nodes
+            if input_node.op != "get_attr"
+        ):
+            if isinstance(input_node.meta["val"], torch.SymInt):
+                continue  # symbolic ints are not tensors; nothing to check
+            tensor_in = get_first_fake_tensor(input_node)
+            if tensor_in.dtype != torch.int64:
+                continue
+            # Not an operator and not a user input: treated as a constant placeholder.
+            if (
+                input_node.op != "call_function"
+                and input_node.name not in self.input_names
+            ):
+                continue
+            # Operator: OK if targeted by ComputeConstantOpsAOTPass and itself supported.
+            if input_node.op == "call_function":
+                if input_node.target in ComputeConstantOpsAOTPass.targeted_ops:
+                    # This is not perfect, since input_node can still be rejected by other
+                    # checks, but it should cover the majority of cases.
+                    if self.is_node_supported({}, input_node):
+                        continue
+            self.reporter.report_reject(
+                node, f"Non-constant int64 input {input_node.name}"
+            )
+            return False
+
+        return True
+
     def is_node_supported(
         self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
     ) -> bool:
@@ -618,7 +849,7 @@ def is_node_supported(
         vals = node.meta["val"]
         tensor_list = vals if isinstance(vals, (list, tuple)) else [vals]
 
-        any_int64 = any(tensor.dtype == torch.int64 for tensor in tensor_list)
+        any_int64 = self.has_rejected_int64_output(node, tensor_list)
         # Don't partition nodes with int64 output...
         if any_int64:
             # ... Except for constant ops that are directly cast to something non-int64.
@@ -652,35 +883,7 @@ def is_node_supported(
                 )
                 return False
 
-        # Ops with int64 inputs are only partitioned if input nodes are constant and will be partitioned.
-        # If it is not partitioned, the partition will get an int64 input and fail.
-        for input_node in (
-            input_node
-            for input_node in node.all_input_nodes
-            if input_node.op != "get_attr"
-        ):
-            tensor_in = get_first_fake_tensor(input_node)
-            if tensor_in.dtype != torch.int64:
-                continue
-            # Constant placeholder
-            if (
-                input_node.op != "call_function"
-                and input_node.name not in self.input_names
-            ):
-                continue
-            # Constant operator
-            if input_node.op == "call_function":
-                if input_node.target in ComputeConstantOpsAOTPass.targeted_ops:
-                    # This is not perfect since the input_node can still be rejected by other checks but
-                    # this should cover the majority of cases.
-                    if self.is_node_supported({}, input_node):
-                        continue
-            self.reporter.report_reject(
-                node, f"Non-constant int64 input {input_node.name}"
-            )
-            return False
-
-        return True
+        return self._check_int64_input_nodes(node)
 
 
 class CheckDtypeInputsAndOutputs(OperatorSupportBase):
@@ -712,6 +915,9 @@ def is_node_supported(
             for input_node in node.all_input_nodes
             if input_node.op != "get_attr"
         ):
+            if isinstance(input_node.meta["val"], torch.SymInt):
+                continue
+
             tensor = get_first_fake_tensor(input_node)
             if tensor.dtype in self.disallowed_dtypes:
                 self.reporter.report_reject(
@@ -772,6 +978,8 @@ def is_node_supported(
             for input_node in node.all_input_nodes
             if input_node.op != "get_attr"
         ):
+            if isinstance(input_node.meta["val"], torch.SymInt):
+                continue
             dtype = get_first_fake_tensor(input_node).dtype
             if dtype.is_floating_point:
                 floating_dtypes.add(dtype)
@@ -809,6 +1017,8 @@ def is_node_supported(
         )
         # check if any input node has an unsupported rank
         for input_node in input_nodes:
+            if isinstance(input_node.meta["val"], torch.SymInt):
+                continue
             input_node_shape = get_first_fake_tensor(input_node).shape
             if len(input_node_shape) > self.max_rank:
                 self.reporter.report_reject(
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
index aa988a1ccd7..1acaf4e65ef 100644
--- a/backends/arm/operators/__init__.py
+++ b/backends/arm/operators/__init__.py
@@ -42,11 +42,13 @@
     op_sub,
     op_sum,
     op_to_dim_order_copy,
+    op_tosa_argmax,
     op_tosa_avg_pool2d,
     op_tosa_avg_pool2d_adaptive,
     op_tosa_cast_to_block_scaled,
     op_tosa_clamp,
     op_tosa_conv2d,
+    op_tosa_conv2d_block_scaled,
     op_tosa_conv3d,
     op_tosa_custom,
     op_tosa_depthwise_conv2d,
diff --git a/backends/arm/operators/op_tosa_cast_to_block_scaled.py b/backends/arm/operators/op_tosa_cast_to_block_scaled.py
index 454c28ddfe2..b49fa521866 100644
--- a/backends/arm/operators/op_tosa_cast_to_block_scaled.py
+++ b/backends/arm/operators/op_tosa_cast_to_block_scaled.py
@@ -5,7 +5,7 @@
 """Provide a visitor for lowering block-scaled casts to TOSA."""
 
 import operator
-from typing import Any, cast, List
+from typing import Any, List
 
 import torch
 import tosa_serializer as ts
@@ -16,25 +16,36 @@
 )
 from executorch.backends.arm.operators.operator_validation_utils import (
     validate_num_inputs,
+    validate_valid_dtype,
 )
-from executorch.backends.arm.tosa.mapping import TosaArg
+from executorch.backends.arm.tosa.mapping import TosaArg, TosaSpecialDtype
 from executorch.backends.arm.tosa.specification import TosaSpecification
 
 
-def _ordered_getitem_output_names(node: torch.fx.Node) -> list[str]:
+def _getitem_index(node: torch.fx.Node) -> int:
+    index = node.args[1]
+    if not isinstance(index, int):
+        raise ValueError(
+            f"CAST_TO_BLOCK_SCALED: expected integer getitem index, got {index!r}"
+        )
+    return index
+
+
+def _ordered_getitem_outputs(node: torch.fx.Node) -> list[torch.fx.Node]:
     getitem_users = [
         user
         for user in node.users
         if user.op == "call_function" and user.target == operator.getitem
     ]
 
-    ordered_users = sorted(getitem_users, key=lambda user: cast(int, user.args[1]))
+    ordered_users = sorted(getitem_users, key=_getitem_index)
     if len(ordered_users) != 2:
         raise ValueError(
-            f"{CastToBlockScaledVisitor.target}: Expected exactly two getitem outputs, got {len(ordered_users)}"
+            f"{CastToBlockScaledVisitor.target}: Expected exactly two getitem "
+            f"outputs, got {len(ordered_users)}"
         )
 
-    return [user.name for user in ordered_users]
+    return ordered_users
 
 
 @register_node_visitor
@@ -58,15 +69,67 @@ def define_node(
             raise ValueError(f"{self.target} requires the TOSA mxfp extension")
 
         input_tensor = inputs[0]
-        block_size = inputs[1].number
+        block_size = inputs[1].number if hasattr(inputs[1], "number") else None
+        if not isinstance(block_size, int) or isinstance(block_size, bool):
+            raise ValueError(f"{self.target}: missing block_size argument")
+
+        validate_valid_dtype(
+            self.target,
+            input_tensor,
+            [ts.DType.FP32, ts.DType.BF16, ts.DType.FP16],
+            self.tosa_spec,
+        )
+
+        if not isinstance(node.meta.get("val"), tuple) or len(node.meta["val"]) != 2:
+            raise ValueError(
+                f"{self.target}: expected tuple metadata with two outputs, got {node.meta.get('val')!r}"
+            )
         output_data_tensor, output_scale_tensor = node.meta["val"]
+        output_getitems = _ordered_getitem_outputs(node)
+        output_names = [user.name for user in output_getitems]
+        output_payload_dtype = output_getitems[0].meta.get(TosaSpecialDtype.meta_key())
+
+        if output_payload_dtype in (
+            TosaSpecialDtype.FP4E2M1,
+            TosaSpecialDtype.FP6E2M3,
+            TosaSpecialDtype.FP6E3M2,
+        ):
+            output_data_dtype = output_payload_dtype.get_tosa_dtype()
+        elif output_data_tensor.dtype == torch.float8_e4m3fn:
+            output_data_dtype = ts.DType.FP8E4M3
+        elif output_data_tensor.dtype == torch.float8_e5m2:
+            output_data_dtype = ts.DType.FP8E5M2
+        else:
+            raise ValueError(
+                f"{self.target}: unsupported payload dtype {output_data_tensor.dtype}"
+            )
+        if output_data_dtype not in (
+            ts.DType.FP4E2M1,
+            ts.DType.FP6E2M3,
+            ts.DType.FP6E3M2,
+            ts.DType.FP8E4M3,
+            ts.DType.FP8E5M2,
+        ):
+            raise ValueError(
+                f"{self.target}: unsupported payload dtype {output_data_dtype}"
+            )
+        if output_scale_tensor.dtype != torch.float8_e8m0fnu:
+            raise ValueError(
+                f"{self.target}: unsupported scale dtype {output_scale_tensor.dtype}"
+            )
 
-        # TODO(MLETORCH-2018): This is a local workaround for multi-output TOSA ops.
-        # Remove it once twe can handle multiple outputs generally.
-        output_names = _ordered_getitem_output_names(node)
+        if not hasattr(ts.Op, "CAST_TO_BLOCK_SCALED"):
+            raise NotImplementedError(
+                "tosa_serializer does not provide CAST_TO_BLOCK_SCALED yet"
+            )
 
         attr = ts.TosaSerializerAttribute()
-        attr.CastToBlockScaledAttribute(block_size)
+        attr_ctor = getattr(attr, "CastToBlockScaledAttribute", None)
+        if attr_ctor is None:
+            raise NotImplementedError(
+                "tosa_serializer does not provide CastToBlockScaledAttribute yet"
+            )
+        attr_ctor(block_size)
 
         self._serialize_operator(
             node,
diff --git a/backends/arm/operators/op_tosa_matmul_t_block_scaled.py b/backends/arm/operators/op_tosa_matmul_t_block_scaled.py
index 2f1bd88c2bb..4c3a8ba99b2 100644
--- a/backends/arm/operators/op_tosa_matmul_t_block_scaled.py
+++ b/backends/arm/operators/op_tosa_matmul_t_block_scaled.py
@@ -53,7 +53,13 @@ def define_node(
         validate_valid_dtype(
             self.target,
             [A_data, B_data],
-            [ts.DType.FP8E4M3, ts.DType.FP8E5M2],
+            [
+                ts.DType.FP4E2M1,
+                ts.DType.FP6E2M3,
+                ts.DType.FP6E3M2,
+                ts.DType.FP8E4M3,
+                ts.DType.FP8E5M2,
+            ],
             self.tosa_spec,
         )
         validate_valid_dtype(
diff --git a/backends/arm/operators/op_tosa_shapes.py b/backends/arm/operators/op_tosa_shapes.py
index 25c861a403d..b7480d78a4d 100644
--- a/backends/arm/operators/op_tosa_shapes.py
+++ b/backends/arm/operators/op_tosa_shapes.py
@@ -13,6 +13,7 @@
     NodeVisitor,
     register_node_visitor,
 )
+from executorch.backends.arm.tosa import TosaSpecification
 from executorch.backends.arm.tosa.mapping import TosaArg
 from executorch.backends.arm.tosa.utils import normalize_symint
 
@@ -21,9 +22,6 @@
 class TosaConstShapeVisitor(NodeVisitor):
     target = "tosa.CONST_SHAPE.default"
 
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def define_node(
         self,
         node: torch.fx.Node,
@@ -43,3 +41,217 @@ def define_node(
             vals=vals,
             name=output.name,
         )
+
+
+class TosaShapeNodeVisitor(NodeVisitor):
+    # Base for the shape visitors below; define_node adds the output via addShape.
+    tosa_specs = TosaSpecification.all_profiles_for_version("1.1")
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: Any,
+        inputs: List[TosaArg],
+        output: TosaArg,
+    ) -> None:
+        tosa_graph = cast(ts.TosaSerializer, tosa_graph)
+        tosa_graph.currRegion.currBasicBlock.addShape(
+            output.name,
+            output.shape[0],
+        )
+
+
+class TosaBasicShapeVisitor(TosaShapeNodeVisitor):
+    tosa_op: ts.Op
+    attr_method: str
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: Any,
+        inputs: List[TosaArg],
+        output: TosaArg,
+    ) -> None:
+        super().define_node(node, tosa_graph, inputs, output)
+        self.serialize(
+            node,
+            tosa_graph,
+            tosa_op=self.tosa_op,
+            inputs=inputs,
+            output=output,
+            attr_method=self.attr_method,
+        )
+
+
+@register_node_visitor
+class TosaDimShapeVisitor(TosaShapeNodeVisitor):
+    target = "tosa.DIM.default"
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: Any,
+        inputs: List[TosaArg],
+        output: TosaArg,
+    ) -> None:
+        super().define_node(node, tosa_graph, inputs, output)
+
+        attr = ts.TosaSerializerAttribute()
+        attr.DimAttribute(axis=node.kwargs["axis"])
+        self._serialize_operator(
+            node,
+            tosa_graph,
+            ts.Op.DIM,
+            [inputs[0].name],
+            [output.name],
+            attr,
+        )
+
+
+@register_node_visitor
+class TosaAddShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.ADD_SHAPE.default"
+
+    tosa_op = ts.Op.ADD_SHAPE
+    attr_method = "AddShapeAttribute"
+
+
+@register_node_visitor
+class TosaSubShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.SUB_SHAPE.default"
+
+    tosa_op = ts.Op.SUB_SHAPE
+    attr_method = "SubShapeAttribute"
+
+
+@register_node_visitor
+class TosaAssertEqualShapeVisitor(TosaShapeNodeVisitor):
+    target = "tosa.ASSERT_EQUAL_SHAPE.default"
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: Any,
+        inputs: List[TosaArg],
+        output: TosaArg,
+    ) -> None:
+        super().define_node(node, tosa_graph, inputs, output)
+        tosa_graph = cast(ts.TosaSerializer, tosa_graph)
+        attr = ts.TosaSerializerAttribute()
+        attr.AssertEqualShapeAttribute(allow_broadcast=node.kwargs["allow_broadcast"])
+        self._serialize_operator(
+            node,
+            tosa_graph,
+            ts.Op.ASSERT_EQUAL_SHAPE,
+            [inputs[0].name, inputs[1].name],
+            [output.name],
+            attr,
+        )
+
+
+@register_node_visitor
+class TosaCatShapeVisitor(TosaShapeNodeVisitor):
+    target = "tosa.CONCAT_SHAPE.default"
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: Any,
+        inputs: List[TosaArg],
+        output: TosaArg,
+    ) -> None:
+        super().define_node(node, tosa_graph, inputs, output)
+        tosa_graph = cast(ts.TosaSerializer, tosa_graph)
+        # Each entry of inputs[0].special becomes one CONCAT_SHAPE operand.
+        input_shape_list = [input.name for input in inputs[0].special]
+
+        attr = ts.TosaSerializerAttribute()
+        attr.ConcatShapeAttribute()
+        self._serialize_operator(
+            node,
+            tosa_graph,
+            ts.Op.CONCAT_SHAPE,
+            input_shape_list,
+            [output.name],
+            attr,
+        )
+
+
+@register_node_visitor
+class TosaDivCeilShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.DIV_CEIL_SHAPE.default"
+
+    tosa_op = ts.Op.DIV_CEIL_SHAPE
+    attr_method = "DivCeilShapeAttribute"
+
+
+@register_node_visitor
+class TosaDivShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.DIV_FLOOR_SHAPE.default"
+
+    tosa_op = ts.Op.DIV_FLOOR_SHAPE
+    attr_method = "DivFloorShapeAttribute"
+
+
+@register_node_visitor
+class TosaExp2ShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.EXP2_SHAPE.default"
+
+    tosa_op = ts.Op.EXP2_SHAPE
+    attr_method = "Exp2ShapeAttribute"
+
+
+@register_node_visitor
+class TosaLog2CeilShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.LOG2_CEIL_SHAPE.default"
+
+    tosa_op = ts.Op.LOG2_CEIL_SHAPE
+    attr_method = "Log2CeilShapeAttribute"
+
+
+@register_node_visitor
+class TosaLog2FloorShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.LOG2_FLOOR_SHAPE.default"
+
+    tosa_op = ts.Op.LOG2_FLOOR_SHAPE
+    attr_method = "Log2FloorShapeAttribute"
+
+
+@register_node_visitor
+class TosaMaxShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.MAX_SHAPE.default"
+
+    tosa_op = ts.Op.MAX_SHAPE
+    attr_method = "MaxShapeAttribute"
+
+
+@register_node_visitor
+class TosaMinShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.MIN_SHAPE.default"
+
+    tosa_op = ts.Op.MIN_SHAPE
+    attr_method = "MinShapeAttribute"
+
+
+@register_node_visitor
+class TosaMulShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.MUL_SHAPE.default"
+
+    tosa_op = ts.Op.MUL_SHAPE
+    attr_method = "MulShapeAttribute"
+
+
+@register_node_visitor
+class TosaSliceShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.SLICE_SHAPE.default"
+
+    tosa_op = ts.Op.SLICE_SHAPE
+    attr_method = "SliceShapeAttribute"
+
+
+@register_node_visitor
+class TosaModShapeVisitor(TosaBasicShapeVisitor):
+    target = "tosa.MOD_SHAPE.default"
+
+    tosa_op = ts.Op.MOD_SHAPE
+    attr_method = "ModShapeAttribute"
diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py
index 5f9c3e3938c..a0c2dbeb1fb 100644
--- a/backends/arm/process_node.py
+++ b/backends/arm/process_node.py
@@ -56,14 +56,82 @@ def _tensor_to_numpy(tensor: torch.Tensor) -> np.ndarray:
 
 
 def _prepare_const_values_for_tosa_dtype(
-    values: np.ndarray, tosa_dtype: ts.DType
+    values: np.ndarray, tosa_arg: TosaArg
 ) -> np.ndarray:
     """Normalize constant storage to the expected TOSA serializer dtype."""
-    if tosa_dtype == ts.DType.INT48 and values.dtype != np.int64:
+    if tosa_arg.dtype == ts.DType.INT48 and values.dtype != np.int64:
         return values.astype(np.int64)
+    if tosa_arg.dtype in (ts.DType.FP6E2M3, ts.DType.FP6E3M2):
+        if values.dtype == np.uint8:
+            try:
+                import ml_dtypes  # type: ignore[import-not-found]
+            except ImportError as e:
+                raise RuntimeError(
+                    "ml_dtypes is required to serialize FP6 tensors for TOSA. "
+                    "Have you run setup.sh?"
+                ) from e
+            ml_dtype = {
+                ts.DType.FP6E2M3: ml_dtypes.float6_e2m3fn,
+                ts.DType.FP6E3M2: ml_dtypes.float6_e3m2fn,
+            }[tosa_arg.dtype]
+            return values.view(ml_dtype)
     return values
 
 
+def _get_const_shape(values: np.ndarray, tosa_arg: TosaArg) -> list[int]:
+    """Return tosa_arg.shape for FP4E2M1 constants, else values.shape (normalized)."""
+    if tosa_arg.dtype == ts.DType.FP4E2M1:
+        return normalize_symint(tosa_arg.shape)
+    return normalize_symint(values.shape)
+
+
+def _is_packed_fp4_const(values: np.ndarray, tosa_arg: TosaArg) -> bool:
+    """Check whether ``values`` is packed FP4 storage for ``tosa_arg``.
+
+    True only for an FP4E2M1 argument backed by a uint8 array whose last
+    dimension is half the logical one, i.e. two FP4 elements per byte.
+
+    """
+
+    return (
+        tosa_arg.dtype == ts.DType.FP4E2M1
+        and values.dtype == np.uint8
+        and values.shape[-1] * 2 == tosa_arg.shape[-1]
+    )
+
+
+def _add_const(
+    tosa_graph: Any,
+    values: np.ndarray,
+    tosa_arg: TosaArg,
+    name: str,
+) -> None:
+    """Add ``values`` as constant ``name``; packed FP4 bytes are stored unchanged."""
+    if _is_packed_fp4_const(values, tosa_arg):
+        # TOSA FP4 tensors keep their logical FP4 shape, but constants are
+        # stored as packed bytes (two values per byte). Add the raw bytes as
+        # INT8 first, then overwrite the tensor's dtype and dims in place.
+        tosa_graph.addConst(
+            normalize_symint(values.shape),
+            ts.DType.INT8,
+            values,
+            name=name,
+        )
+        tensor = tosa_graph.currRegion.currBasicBlock.tensors[name]
+        tensor.setDtype(ts.DType.FP4E2M1)
+        for dim, size in enumerate(normalize_symint(tosa_arg.shape)):
+            tensor.SetDimSize(dim, size)
+        return
+
+    prepared_values = _prepare_const_values_for_tosa_dtype(values, tosa_arg)
+    tosa_graph.addConst(
+        _get_const_shape(prepared_values, tosa_arg),
+        tosa_arg.dtype,
+        prepared_values,
+        name=name,
+    )
+
+
 def process_call_function(
     node: torch.fx.Node,
     tosa_graph: Any,
@@ -154,16 +222,7 @@ def process_inputs_to_parameters(
             f"{type(parameter_data).__name__}"
         )
     parameter_values = _tensor_to_numpy(parameter_data)
-    parameter_values = _prepare_const_values_for_tosa_dtype(
-        parameter_values, tosa_arg.dtype
-    )
-
-    tosa_graph.addConst(
-        normalize_symint(parameter_values.shape),
-        tosa_arg.dtype,
-        parameter_values,
-        name=tosa_arg.name,
-    )
+    _add_const(tosa_graph, parameter_values, tosa_arg, name=tosa_arg.name)
 
 
 def process_inputs_to_buffers(
@@ -188,14 +247,7 @@ def process_inputs_to_buffers(
             f"{type(buffer_data).__name__}"
         )
     buffer_values = _tensor_to_numpy(buffer_data)
-    buffer_values = _prepare_const_values_for_tosa_dtype(buffer_values, tosa_arg.dtype)
-
-    tosa_graph.addConst(
-        normalize_symint(buffer_values.shape),
-        tosa_arg.dtype,
-        buffer_values,
-        name=tosa_arg.name,
-    )
+    _add_const(tosa_graph, buffer_values, tosa_arg, name=tosa_arg.name)
 
 
 def process_inputs_to_lifted_tensor_constants(
@@ -217,14 +269,7 @@ def process_inputs_to_lifted_tensor_constants(
         f"{type(tensor).__name__}"
     )
     tensor_values = _tensor_to_numpy(tensor)
-    tensor_values = _prepare_const_values_for_tosa_dtype(tensor_values, tosa_arg.dtype)
-
-    tosa_graph.addConst(
-        normalize_symint(tensor_values.shape),
-        tosa_arg.dtype,
-        tensor_values,
-        name=tosa_arg.name,
-    )
+    _add_const(tosa_graph, tensor_values, tosa_arg, name=tosa_arg.name)
 
 
 def _is_submodule_input(
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
index 7810077a679..3b713659e84 100644
--- a/backends/arm/quantizer/quantization_annotator.py
+++ b/backends/arm/quantizer/quantization_annotator.py
@@ -532,6 +532,7 @@ def _get_fixed_qparams_qspec(
     torch.ops.aten.selu.default,
     torch.ops.aten.celu.default,
     torch.ops.aten.floor.default,
+    torch.ops.aten.round.default,
     torch.ops.aten.log.default,
     torch.ops.aten.reciprocal.default,
     torch.ops.aten.rsqrt.default,
diff --git a/backends/arm/scripts/install_models_for_test.sh b/backends/arm/scripts/install_models_for_test.sh
index d6a7b9cdec0..1e91cd9c08f 100644
--- a/backends/arm/scripts/install_models_for_test.sh
+++ b/backends/arm/scripts/install_models_for_test.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -8,7 +8,8 @@ set -e
 pip install -r backends/arm/requirements-arm-models-test.txt
 
 # Install model gym repository
-git clone https://github.com/arm/neural-graphics-model-gym.git
+MODEL_GYM_REF="${MODEL_GYM_REF:-v0.3.0}"
+git clone --depth 1 --branch "$MODEL_GYM_REF" https://github.com/arm/neural-graphics-model-gym.git
 cd neural-graphics-model-gym
 # Remove model-converter installation from model-gym repository (to prevent overwriting executorch version)
 if [[ "$(uname)" == "Darwin" ]]; then
@@ -18,4 +19,4 @@ else
 fi
 pip install . --no-deps
 cd ..
-rm -rf neural-graphics-model-gym
\ No newline at end of file
+rm -rf neural-graphics-model-gym
diff --git a/backends/arm/scripts/pre-push b/backends/arm/scripts/pre-push
index 1aa51a8f9ac..9c324e0d784 100755
--- a/backends/arm/scripts/pre-push
+++ b/backends/arm/scripts/pre-push
@@ -177,7 +177,7 @@ for COMMIT in ${COMMITS}; do
     for committed_file in "${license_files[@]}"; do
         # Skip files with certain extensions
         case "$committed_file" in
-            *.md|*.md.in|*.json|*.yml|*.yaml|*.cmake|*.patch|.gitignore|*.bzl|BUCK|*/BUCK|TARGETS|*/TARGETS)
+            *.md|*.md.in|*.json|*.yml|*.yaml|*.cmake|*.patch|.gitignore|*.bzl|BUCK|*/BUCK|TARGETS|*/TARGETS|*/generated/*)
                 echo -e "${INFO} Skipping license check for ${committed_file} (excluded extension)"
                 continue
                 ;;
diff --git a/backends/arm/test/misc/test_mxfp_linear_ao.py b/backends/arm/test/misc/test_mxfp_linear_ao.py
index 0f2b6b9198c..1412d8ffdfe 100644
--- a/backends/arm/test/misc/test_mxfp_linear_ao.py
+++ b/backends/arm/test/misc/test_mxfp_linear_ao.py
@@ -5,9 +5,11 @@
 
 import torch
 from executorch.backends.arm.ao_ext import MXFPOpConfig, to_mxfp
+from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str, MXFPDType
 from executorch.backends.arm.ao_ext.ops import MXFPLinearOp
 
 from torch.export import export
+from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2
 
 
 class LinearModule(torch.nn.Module):
@@ -19,21 +21,86 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.linear(x)
 
 
-def test_mxfp_linear_quantize_swaps_module() -> None:
+def _test_mxfp_linear_quantize_swaps_module(
+    weight_dtype: MXFPDType,
+    expected_weight_qdata_dtype: torch.dtype,
+    expected_weight_qdata_shape: tuple[int, ...],
+) -> None:
     model = LinearModule().eval()
 
-    to_mxfp(model, MXFPOpConfig())
+    to_mxfp(
+        model,
+        MXFPOpConfig(weight_dtype=weight_dtype),
+    )
 
     assert isinstance(model.linear, MXFPLinearOp)
-    assert model.linear.weight_qdata.dtype == torch.float8_e4m3fn
+    assert model.linear.weight_qdata.dtype == expected_weight_qdata_dtype
+    assert model.linear.weight_dtype == mxfp_dtype_to_str(weight_dtype)
     assert model.linear.weight_scale.dtype == torch.float8_e8m0fnu
-    assert tuple(model.linear.weight_qdata.shape) == (1, 8, 32)
+    assert tuple(model.linear.weight_qdata.shape) == expected_weight_qdata_shape
     assert tuple(model.linear.weight_scale.shape) == (1, 8, 1)
 
 
-def test_mxfp_linear_export_preserves_custom_op() -> None:
+def test_mxfp8_e4m3_linear_quantize_swaps_module() -> None:
+    _test_mxfp_linear_quantize_swaps_module(
+        torch.float8_e4m3fn,
+        torch.float8_e4m3fn,
+        (1, 8, 32),
+    )
+
+
+def test_mxfp4_linear_quantize_swaps_module() -> None:
+    _test_mxfp_linear_quantize_swaps_module(
+        torch.float4_e2m1fn_x2,
+        torch.uint8,
+        (1, 8, 16),
+    )
+
+
+def test_mxfp6_e2m3_linear_quantize_swaps_module() -> None:
+    _test_mxfp_linear_quantize_swaps_module(
+        DTYPE_FP6_E2M3,
+        torch.uint8,
+        (1, 8, 32),
+    )
+
+
+def test_mxfp6_e3m2_linear_quantize_swaps_module() -> None:
+    _test_mxfp_linear_quantize_swaps_module(
+        DTYPE_FP6_E3M2,
+        torch.uint8,
+        (1, 8, 32),
+    )
+
+
+def test_mxfp_linear_quantize_filter_fn_selects_modules() -> None:
+    class TwoLinearModule(torch.nn.Module):
+        def __init__(self) -> None:
+            super().__init__()
+            self.selected = torch.nn.Linear(32, 8)
+            self.skipped = torch.nn.Linear(32, 8)
+
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            return self.selected(x) + self.skipped(x)
+
+    def _is_selected_linear(module: torch.nn.Module, fqn: str) -> bool:
+        return isinstance(module, torch.nn.Linear) and fqn == "selected"
+
+    model = TwoLinearModule().eval()
+
+    to_mxfp(
+        model,
+        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn),
+        filter_fn=_is_selected_linear,
+    )
+
+    assert isinstance(model.selected, MXFPLinearOp)
+    assert isinstance(model.skipped, torch.nn.Linear)
+
+
+def _test_mxfp_linear_export_preserves_custom_op(config: MXFPOpConfig) -> None:
     model = LinearModule().eval()
-    to_mxfp(model, MXFPOpConfig())
+    to_mxfp(model, config)
 
     exported = export(model, (torch.randn(4, 32),), strict=False)
 
@@ -44,3 +111,27 @@ def test_mxfp_linear_export_preserves_custom_op() -> None:
     ]
 
     assert torch.ops.tosa_mxfp.linear.default in targets
+
+
+def test_mxfp8_e4m3_linear_export_preserves_custom_op() -> None:
+    _test_mxfp_linear_export_preserves_custom_op(
+        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn)
+    )
+
+
+def test_mxfp4_linear_export_preserves_custom_op() -> None:
+    _test_mxfp_linear_export_preserves_custom_op(
+        MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2)
+    )
+
+
+def test_mxfp6_e2m3_linear_export_preserves_custom_op() -> None:
+    _test_mxfp_linear_export_preserves_custom_op(
+        MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3)
+    )
+
+
+def test_mxfp6_e3m2_linear_export_preserves_custom_op() -> None:
+    _test_mxfp_linear_export_preserves_custom_op(
+        MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2)
+    )
diff --git a/backends/arm/test/misc/test_process_node.py b/backends/arm/test/misc/test_process_node.py
index 1ef348abdbf..02d2a5e012b 100644
--- a/backends/arm/test/misc/test_process_node.py
+++ b/backends/arm/test/misc/test_process_node.py
@@ -3,14 +3,19 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from types import SimpleNamespace
+from typing import cast
+
 import numpy as np
 import torch
 import tosa_serializer as ts
-from executorch.backends.arm.process_node import process_placeholder
-from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
+from executorch.backends.arm.process_node import _add_const, process_placeholder
+from executorch.backends.arm.tosa.mapping import TosaArg, TosaSpecialDtype
 from executorch.backends.arm.tosa.specification import TosaSpecification
 from executorch.exir import to_edge
 from torch._export.utils import is_param
+from tosa.TosaGraph import TosaGraph  # type: ignore[import-not-found, import-untyped]
+from tosa_serializer.numpy_utils import pack_6bit_array
 
 
 class Int32BiasModule(torch.nn.Module):
@@ -94,3 +99,74 @@ def test_process_placeholder_int48_normalizes_int32_const_values() -> None:
     assert tosa_graph.values is not None
     assert tosa_graph.values.dtype == np.int64
     assert tosa_graph.serialized_bytes == _expected_int48_bytes(module.bias)
+
+
+def test_add_const_fp4_in_packed_storage() -> None:
+    packed_values = np.array([0xDE, 0xFE, 0x6D, 0x55], dtype=np.uint8).reshape(
+        1,
+        1,
+        4,
+    )
+    tosa_arg = cast(
+        TosaArg,
+        SimpleNamespace(dtype=ts.DType.FP4E2M1, shape=(1, 1, 8)),
+    )
+    tosa_graph = ts.TosaSerializer()
+    # Four packed bytes carry the eight FP4 values of the logical (1, 1, 8) shape.
+    _add_const(tosa_graph, packed_values, tosa_arg, name="fp4_weight")
+
+    graph = TosaGraph.GetRootAs(bytes(tosa_graph.serialize()), 0)
+    block = graph.Regions(0).Blocks(0)
+    tensors = {
+        block.Tensors(index).Name().decode(): block.Tensors(index)
+        for index in range(block.TensorsLength())
+    }
+    tensor = tensors["fp4_weight"]
+
+    assert tensor.Type() == ts.DType.FP4E2M1
+    assert [tensor.Shape(index) for index in range(tensor.ShapeLength())] == [1, 1, 8]
+    assert [tensor.Data(index) for index in range(tensor.DataLength())] == [
+        0xDE,
+        0xFE,
+        0x6D,
+        0x55,
+    ]
+
+
+def _test_add_const_fp6_in_packed_storage(dtype: int) -> None:
+    values = np.arange(32, dtype=np.uint8).reshape(1, 1, 32)
+
+    tosa_arg = cast(
+        TosaArg,
+        SimpleNamespace(dtype=dtype, shape=(1, 1, 32)),
+    )
+    tosa_graph = ts.TosaSerializer()
+    # 32 one-byte FP6 codes are expected to serialize as 24 packed bytes.
+    _add_const(tosa_graph, values, tosa_arg, name="fp6_weight")
+
+    graph = TosaGraph.GetRootAs(bytes(tosa_graph.serialize()), 0)
+    block = graph.Regions(0).Blocks(0)
+    tensors = {
+        block.Tensors(index).Name().decode(): block.Tensors(index)
+        for index in range(block.TensorsLength())
+    }
+    tensor = tensors["fp6_weight"]
+
+    assert tensor.Type() == dtype
+    assert [tensor.Shape(index) for index in range(tensor.ShapeLength())] == [
+        1,
+        1,
+        32,
+    ]
+    assert tensor.DataLength() == 24
+    assert [tensor.Data(index) for index in range(tensor.DataLength())] == (
+        pack_6bit_array(values).reshape(-1).tolist()
+    )
+
+
+def test_add_const_fp6e2m3_in_packed_storage() -> None:
+    _test_add_const_fp6_in_packed_storage(ts.DType.FP6E2M3)
+
+
+def test_add_const_fp6e3m2_in_packed_storage() -> None:
+    _test_add_const_fp6_in_packed_storage(ts.DType.FP6E3M2)
diff --git a/backends/arm/test/misc/test_runner_utils.py b/backends/arm/test/misc/test_runner_utils.py
index 3c78b21e008..54d41548a22 100644
--- a/backends/arm/test/misc/test_runner_utils.py
+++ b/backends/arm/test/misc/test_runner_utils.py
@@ -3,9 +3,13 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import json
 from pathlib import Path
+from types import SimpleNamespace
 from typing import Any, cast
 
+import numpy as np
+import torch
 from executorch.backends.arm.test import runner_utils
 
 
@@ -113,3 +117,115 @@ def test_get_elf_path_accepts_nested_runner_output(monkeypatch, tmp_path: Path)
     monkeypatch.setattr(runner_utils, "_elf_search_roots", lambda: [tmp_path])
 
     assert runner_utils.get_elf_path("corstone-300") == str(elf_path)
+
+
+def test_shape_inference_json_uses_tosa_input_layout(tmp_path: Path) -> None:
+    test_case_path = tmp_path / "test_case.json"
+    artifact_path = tmp_path / "model.tosa"
+    input_tensor = torch.randn(1, 3, 4, 5).to(memory_format=torch.channels_last)
+
+    runner_utils.TosaReferenceModelDispatch()._generate_shape_inference_json(
+        b"",
+        artifact_path,
+        test_case_path,
+        ["input"],
+        (input_tensor,),
+    )
+
+    test_case = json.loads(test_case_path.read_text(encoding="utf-8"))
+
+    assert test_case == {
+        "tosa_file": str(artifact_path),
+        "shapes": {"input": [1, 4, 5, 3]},
+    }
+
+
+def test_numpy_to_torch_tensor_converts_dynamic_nhwc_output(monkeypatch) -> None:
+    symbolic_dim = object()
+    output_tensor = SimpleNamespace(
+        shape=(1, 3, symbolic_dim, 5),
+        dtype=torch.float32,
+        dim_order=lambda: runner_utils.NHWC_ORDER,
+    )
+    monkeypatch.setattr(
+        runner_utils, "get_first_fake_tensor", lambda output_node: output_tensor
+    )
+    array = np.arange(60, dtype=np.float32).reshape(1, 4, 5, 3)
+
+    result = runner_utils.numpy_to_torch_tensor(array, cast(Any, object()))
+
+    assert result.shape == (1, 3, 4, 5)
+    assert result.is_contiguous(memory_format=torch.channels_last)
+    torch.testing.assert_close(result, torch.from_numpy(array).permute(0, 3, 1, 2))
+
+
+def test_numpy_to_torch_tensor_converts_dynamic_nnhwc_output(monkeypatch) -> None:
+    symbolic_dim = object()
+    output_tensor = SimpleNamespace(
+        shape=(1, 2, 3, symbolic_dim, 5),
+        dtype=torch.float32,
+        dim_order=lambda: runner_utils.NNHWC_ORDER,
+    )
+    monkeypatch.setattr(
+        runner_utils, "get_first_fake_tensor", lambda output_node: output_tensor
+    )
+    array = np.arange(120, dtype=np.float32).reshape(1, 2, 4, 5, 3)
+
+    result = runner_utils.numpy_to_torch_tensor(array, cast(Any, object()))
+
+    assert result.shape == (1, 2, 3, 4, 5)
+    assert result.dim_order() == runner_utils.NNHWC_ORDER
+    torch.testing.assert_close(result, torch.from_numpy(array).permute(0, 1, 4, 2, 3))
+
+
+def _program_with_user_input(name: str) -> SimpleNamespace:
+    return SimpleNamespace(
+        graph_signature=SimpleNamespace(user_inputs=[name]),
+        graph=SimpleNamespace(nodes=[SimpleNamespace(op="placeholder", name=name)]),
+    )
+
+
+def test_user_inputs_need_shape_inference_rejects_static_input(monkeypatch) -> None:
+    monkeypatch.setattr(
+        runner_utils,
+        "get_first_fake_tensor",
+        lambda node: SimpleNamespace(shape=(1, 2)),
+    )
+
+    assert not runner_utils.user_inputs_need_shape_inference(
+        cast(Any, _program_with_user_input("input"))
+    )
+
+
+def test_user_inputs_need_shape_inference_accepts_symbolic_input(monkeypatch) -> None:
+    symbolic_dim = object()
+    monkeypatch.setattr(
+        runner_utils,
+        "get_first_fake_tensor",
+        lambda node: SimpleNamespace(shape=(1, symbolic_dim)),
+    )
+
+    assert runner_utils.user_inputs_need_shape_inference(
+        cast(Any, _program_with_user_input("input"))
+    )
+
+
+def test_user_inputs_need_shape_inference_ignores_non_user_inputs(monkeypatch) -> None:
+    program = SimpleNamespace(
+        graph_signature=SimpleNamespace(user_inputs=["input"]),
+        graph=SimpleNamespace(
+            nodes=[
+                SimpleNamespace(op="placeholder", name="input"),
+                SimpleNamespace(op="placeholder", name="param"),
+            ]
+        ),
+    )
+
+    def fake_tensor(node):
+        if node.name == "input":
+            return SimpleNamespace(shape=(1, 2))
+        return SimpleNamespace(shape=(1, object()))
+
+    monkeypatch.setattr(runner_utils, "get_first_fake_tensor", fake_tensor)
+
+    assert not runner_utils.user_inputs_need_shape_inference(cast(Any, program))
diff --git a/backends/arm/test/misc/test_vgf_backend.py b/backends/arm/test/misc/test_vgf_backend.py
index 22a8607fbc7..406ba1b405a 100644
--- a/backends/arm/test/misc/test_vgf_backend.py
+++ b/backends/arm/test/misc/test_vgf_backend.py
@@ -3,8 +3,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import os
 from types import SimpleNamespace
 from typing import cast
+from unittest import mock
 
 import pytest
 
@@ -14,7 +16,14 @@
     clear_registered_pass_insertions,
     PassInsertions,
 )
-from executorch.backends.arm.vgf import backend as vgf_backend, VgfCompileSpec
+
+from executorch.backends.arm.vgf import backend, backend as vgf_backend, VgfCompileSpec
+from executorch.backends.arm.vgf.backend import (
+    _copy_failure_artifacts,
+    _format_repro_command,
+    _replace_converter_input_path,
+    vgf_compile,
+)
 from executorch.exir.backend.backend_details import PreprocessResult
 from executorch.exir.pass_base import ExportPass
 from torch.export.exported_program import ExportedProgram
@@ -105,3 +114,180 @@ def _raise(*args, **kwargs):
         assert _registry_state() == original_registry
     finally:
         clear_registered_pass_insertions()
+
+
+def test_format_repro_command_quotes_shell_metacharacters():
+    """Arguments containing spaces are single-quoted; the others are left bare."""
+    command = [
+        "model-converter",
+        "--flag=value with spaces",
+        "-i",
+        "input file.tosa",
+        "-o",
+        "output file.vgf",
+    ]
+    expected_tokens = [
+        "model-converter",
+        "'--flag=value with spaces'",
+        "-i",
+        "'input file.tosa'",
+        "-o",
+        "'output file.vgf'",
+    ]
+
+    assert _format_repro_command(command) == " ".join(expected_tokens)
+
+
+def test_replace_converter_input_path_replaces_input_after_i():
+    """Only the operand following ``-i`` is swapped for the new path."""
+    command = [
+        "model-converter",
+        "--some-flag",
+        "-i",
+        "original.tosa",
+        "-o",
+        "output.vgf",
+    ]
+    # Expected result: identical to ``command`` except for the ``-i`` operand.
+    expected = [
+        "preserved.tosa" if token == "original.tosa" else token for token in command
+    ]
+
+    replaced = _replace_converter_input_path(command, "preserved.tosa")
+
+    assert replaced == expected
+    # The list handed to the helper must still hold the original operand,
+    # i.e. the helper must not edit its argument in place.
+    assert command[3] == "original.tosa"
+
+
+def test_copy_failure_artifacts_returns_none_without_artifact_path(tmp_path):
+    """With ``artifact_path=None`` the helper returns ``None``."""
+    source = tmp_path / "input.tosa"
+    source.write_bytes(b"tosa bytes")
+
+    # No artifact directory is configured for this call.
+    result = _copy_failure_artifacts(
+        str(source), artifact_path=None, tag_name="delegate_0"
+    )
+
+    assert result is None
+
+
+def test_copy_failure_artifacts_copies_tosa_with_tag_name(tmp_path):
+    """The copy is named ``failed_model_converter_input_delegate_0.tosa``."""
+    tosa_path = tmp_path / "input.tosa"
+    artifact_path = tmp_path / "artifacts"
+    tosa_path.write_bytes(b"tosa bytes")
+    expected_name = "failed_model_converter_input_delegate_0.tosa"
+
+    copied_path = _copy_failure_artifacts(
+        str(tosa_path), str(artifact_path), tag_name="delegate_0"
+    )
+
+    assert copied_path == os.path.join(str(artifact_path), expected_name)
+    assert os.path.exists(copied_path)
+    # Read through a context manager so the handle is closed right away;
+    # a bare open(...).read() leaves it to the garbage collector.
+    with open(copied_path, "rb") as copied_file:
+        assert copied_file.read() == b"tosa bytes"
+
+
+def test_copy_failure_artifacts_copies_tosa_without_tag_name(tmp_path):
+    """With an empty tag the copy is ``failed_model_converter_input.tosa``."""
+    tosa_path = tmp_path / "input.tosa"
+    artifact_path = tmp_path / "artifacts"
+    tosa_path.write_bytes(b"tosa bytes")
+    expected_name = "failed_model_converter_input.tosa"
+
+    copied_path = _copy_failure_artifacts(
+        str(tosa_path), str(artifact_path), tag_name=""
+    )
+
+    assert copied_path == os.path.join(str(artifact_path), expected_name)
+    assert os.path.exists(copied_path)
+    # Read through a context manager so the handle is closed right away;
+    # a bare open(...).read() leaves it to the garbage collector.
+    with open(copied_path, "rb") as copied_file:
+        assert copied_file.read() == b"tosa bytes"
+
+
+@mock.patch("executorch.backends.arm.vgf.backend.model_converter_env")
+@mock.patch("executorch.backends.arm.vgf.backend.require_model_converter_binary")
+@mock.patch("executorch.backends.arm.vgf.backend.subprocess.run")
+def test_vgf_compile_failure_includes_repro_command_and_copies_tosa(
+    mock_run,
+    mock_require_model_converter_binary,
+    mock_model_converter_env,
+    tmp_path,
+):
+    """A converter failure copies the TOSA input and reports a repro command."""
+    artifact_path = tmp_path / "artifacts"
+    mock_require_model_converter_binary.return_value = "model-converter"
+    mock_model_converter_env.return_value = {"PATH": "/test/bin"}
+    mock_run.side_effect = backend.subprocess.CalledProcessError(
+        returncode=1,
+        cmd=["model-converter"],
+        output=b"converter stdout",
+        stderr=b"converter stderr",
+    )
+
+    with pytest.raises(RuntimeError) as exc_info:
+        vgf_compile(
+            b"serialized tosa",
+            ["--flag=value with spaces"],
+            artifact_path=str(artifact_path),
+            tag_name="delegate_0",
+        )
+
+    copied_tosa_path = os.path.join(
+        str(artifact_path), "failed_model_converter_input_delegate_0.tosa"
+    )
+    assert os.path.exists(copied_tosa_path)
+    # Context manager closes the handle; a bare open(...).read() would leak it.
+    with open(copied_tosa_path, "rb") as copied_file:
+        assert copied_file.read() == b"serialized tosa"
+
+    error = str(exc_info.value)
+    assert "Vgf compiler failed." in error
+    assert "Repro command:" in error
+    assert "model-converter '--flag=value with spaces' -i" in error
+    assert copied_tosa_path in error
+    assert " -o " in error
+    assert "Stderr:\nconverter stderr" in error
+    assert "Stdout:\nconverter stdout" in error
+
+
+@mock.patch("executorch.backends.arm.vgf.backend.model_converter_env")
+@mock.patch("executorch.backends.arm.vgf.backend.require_model_converter_binary")
+@mock.patch("executorch.backends.arm.vgf.backend.subprocess.run")
+def test_vgf_compile_failure_includes_temp_repro_command_without_artifact_path(
+    mock_run,
+    mock_require_model_converter_binary,
+    mock_model_converter_env,
+):
+    """With ``artifact_path=None`` the error still carries a repro command."""
+    mock_require_model_converter_binary.return_value = "model-converter"
+    mock_model_converter_env.return_value = {"PATH": "/test/bin"}
+    converter_failure = backend.subprocess.CalledProcessError(
+        1, ["model-converter"], output=b"converter stdout", stderr=b"converter stderr"
+    )
+    mock_run.side_effect = converter_failure
+
+    with pytest.raises(RuntimeError) as exc_info:
+        vgf_compile(
+            b"serialized tosa",
+            ["--some-flag"],
+            artifact_path=None,
+            tag_name="delegate_0",
+        )
+
+    error = str(exc_info.value)
+    # No artifact_path was given, so the preserved-copy file name must be absent.
+    assert "failed_model_converter_input_delegate_0.tosa" not in error
+    assert "Vgf compiler failed." in error
+    assert "Repro command:" in error
+    assert "model-converter --some-flag -i" in error
+    assert "output_delegate_0.tosa.vgf" in error
+    assert "Stderr:\nconverter stderr" in error
+    assert "Stdout:\nconverter stdout" in error
diff --git a/backends/arm/test/misc/test_vgf_check_env.py b/backends/arm/test/misc/test_vgf_check_env.py
index 499a9f35db0..646eb6b9a79 100644
--- a/backends/arm/test/misc/test_vgf_check_env.py
+++ b/backends/arm/test/misc/test_vgf_check_env.py
@@ -9,8 +9,10 @@
 from pathlib import Path
 
 import executorch.backends.arm.vgf.check_env as check_env
+import executorch.backends.arm.vgf.model_converter as model_converter
 
 import pytest
+from executorch.backends.arm.vgf import backend as vgf_backend
 from executorch.backends.arm.vgf.compile_spec import VgfCompileSpec
 
 
@@ -119,7 +121,7 @@ def test_is_vgf_runtime_available(monkeypatch):
 
 
 def test_model_converter_check_fails_when_missing(monkeypatch):
-    monkeypatch.setattr(check_env, "find_model_converter_binary", lambda: None)
+    monkeypatch.setattr(model_converter, "find_model_converter_binary", lambda: None)
 
     result = check_env._check_model_converter()
 
@@ -139,7 +141,7 @@ def test_model_converter_check_reports_version(monkeypatch, tmp_path):
         "raise SystemExit(1)\n",
     )
     monkeypatch.setattr(
-        check_env, "find_model_converter_binary", lambda: str(converter)
+        model_converter, "find_model_converter_binary", lambda: str(converter)
     )
 
     result = check_env._check_model_converter()
@@ -172,20 +174,20 @@ def test_find_existing_lib_finds_libvgf(tmp_path):
 
 def test_runtime_backend_check_passes_when_vgf_registered(monkeypatch):
     class BackendRegistry:
-        registered_backend_names = [check_env.VGF_BACKEND_NAME]
+        registered_backend_names = [vgf_backend.VGF_BACKEND_NAME]
 
         def is_available(self, backend_name):
-            return backend_name == check_env.VGF_BACKEND_NAME
+            return backend_name == vgf_backend.VGF_BACKEND_NAME
 
     class Runtime:
         backend_registry = BackendRegistry()
 
-    monkeypatch.setattr(check_env, "_load_runtime", lambda: Runtime())
+    monkeypatch.setattr(vgf_backend, "_load_runtime", lambda: Runtime())
 
     result = check_env._check_runtime_vgf_backend()
 
     assert result.status == check_env.STATUS_OK
-    assert check_env.VGF_BACKEND_NAME in result.detail
+    assert vgf_backend.VGF_BACKEND_NAME in result.detail
 
 
 def test_runtime_backend_check_fails_when_vgf_not_registered(monkeypatch):
@@ -198,12 +200,12 @@ def is_available(self, backend_name):
     class Runtime:
         backend_registry = BackendRegistry()
 
-    monkeypatch.setattr(check_env, "_load_runtime", lambda: Runtime())
+    monkeypatch.setattr(vgf_backend, "_load_runtime", lambda: Runtime())
 
     result = check_env._check_runtime_vgf_backend()
 
     assert result.status == check_env.STATUS_FAIL
-    assert check_env.VGF_BACKEND_NAME in result.detail
+    assert vgf_backend.VGF_BACKEND_NAME in result.detail
     assert "XnnpackBackend" in result.detail
 
 
@@ -357,3 +359,84 @@ def test_main_source_build_mode(monkeypatch, capsys):
 def test_main_rejects_build_dir_without_source_build():
     with pytest.raises(SystemExit):
         check_env.main(["--build-dir", "cmake-out-vkml"])
+
+
+def test_check_env_model_converter_probe_delegates_to_model_converter_module(
+    monkeypatch,
+):
+    """check_env must return the status/detail produced by model_converter."""
+
+    def owner_check():
+        return model_converter.ModelConverterEnvironmentCheck(
+            "converter", model_converter.STATUS_OK, "from-owner"
+        )
+
+    monkeypatch.setattr(
+        model_converter, "check_model_converter_environment", owner_check
+    )
+    result = check_env._check_model_converter()
+    assert (result.status, result.detail) == (check_env.STATUS_OK, "from-owner")
+
+
+def test_check_env_model_converter_lib_dir_probe_delegates_to_model_converter_module(
+    monkeypatch,
+):
+    """The lib-dir probe must return the status/detail from model_converter."""
+
+    def owner_check():
+        return model_converter.ModelConverterEnvironmentCheck(
+            "lib-dir", model_converter.STATUS_OK, "from-owner"
+        )
+
+    monkeypatch.setattr(
+        model_converter, "check_model_converter_lib_dir_environment", owner_check
+    )
+    result = check_env._check_model_converter_lib_dir()
+    assert (result.status, result.detail) == (check_env.STATUS_OK, "from-owner")
+
+
+def test_check_env_runtime_probe_delegates_to_backend_module(monkeypatch):
+    """The runtime probe must return the status/detail from the backend module."""
+
+    def owner_check():
+        return vgf_backend.VgfRuntimeEnvironmentCheck(
+            "runtime", vgf_backend.STATUS_OK, "from-owner"
+        )
+
+    monkeypatch.setattr(
+        vgf_backend, "check_vgf_runtime_backend_environment", owner_check
+    )
+    result = check_env._check_runtime_vgf_backend()
+    assert (result.status, result.detail) == (check_env.STATUS_OK, "from-owner")
+
+
+def test_model_converter_preflight_and_vgf_compile_share_executable_resolution(
+    monkeypatch,
+    tmp_path,
+):
+    """Preflight and vgf_compile must both use the MODEL_CONVERTER_PATH binary."""
+    fake_converter_script = (
+        "#!/usr/bin/env python3\n"
+        "from pathlib import Path\n"
+        "import sys\n"
+        "\n"
+        "if '--version' in sys.argv:\n"
+        "    print('model-converter integration-test')\n"
+        "    raise SystemExit(0)\n"
+        "\n"
+        "out_index = sys.argv.index('-o') + 1\n"
+        "Path(sys.argv[out_index]).write_bytes(b'compiled-vgf')\n"
+        "raise SystemExit(0)\n"
+    )
+    converter = _make_executable(tmp_path / "model-converter", fake_converter_script)
+    monkeypatch.setenv("MODEL_CONVERTER_PATH", str(converter))
+
+    preflight = check_env._check_model_converter()
+    compiled = vgf_backend.vgf_compile(
+        tosa_flatbuffer=b"fake-tosa-flatbuffer", compile_flags=[]
+    )
+
+    # The stub writes b"compiled-vgf" to the path that follows "-o".
+    assert preflight.status == check_env.STATUS_OK
+    assert str(converter) in preflight.detail
+    assert compiled == b"compiled-vgf"
diff --git a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py
index 940023fa624..77c42bf9f24 100644
--- a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py
+++ b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py
@@ -5,6 +5,7 @@
 
 import pytest
 import torch
+from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str
 from executorch.backends.arm.tosa.dialect.lib import TosaValueError
 from executorch.backends.arm.tosa.dialect.ops import cast_to_block_scaled  # noqa: F401
 from executorch.backends.arm.tosa.specification import (
@@ -13,6 +14,7 @@
 )
 from executorch.exir.dialects._ops import ops as exir_ops
 from torch._subclasses.fake_tensor import FakeTensorMode
+from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2
 
 
 def test_cast_to_block_scaled_requires_mxfp_extension() -> None:
@@ -27,7 +29,7 @@ def test_cast_to_block_scaled_requires_mxfp_extension() -> None:
             exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default(
                 mode.from_tensor(sample_input),
                 32,
-                output_dtype=torch.float8_e4m3fn,
+                output_dtype=mxfp_dtype_to_str(torch.float8_e4m3fn),
             )
 
 
@@ -39,7 +41,7 @@ def test_cast_to_block_scaled_tosa_fp_mxfp() -> None:
         output_data, output_scale = exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default(
             mode.from_tensor(sample_input),
             32,
-            output_dtype=torch.float8_e4m3fn,
+            output_dtype=mxfp_dtype_to_str(torch.float8_e4m3fn),
         )
 
     assert output_data.dtype == torch.float8_e4m3fn
@@ -48,6 +50,48 @@ def test_cast_to_block_scaled_tosa_fp_mxfp() -> None:
     assert tuple(output_scale.shape) == (2, 1)
 
 
+def test_cast_to_block_scaled_tosa_fp_mxfp4() -> None:
+    """A (2, 32) float32 input yields uint8 (2, 16) data and an e8m0 (2, 1) scale."""
+    tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
+    sample_input = torch.randn((2, 32), dtype=torch.float32)
+
+    with TosaLoweringContext(tosa_spec), FakeTensorMode() as mode:
+        cast_op = exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default
+        fake_input = mode.from_tensor(sample_input)
+        dtype_name = mxfp_dtype_to_str(torch.float4_e2m1fn_x2)
+        output_data, output_scale = cast_op(fake_input, 32, output_dtype=dtype_name)
+
+    assert output_data.dtype == torch.uint8
+    assert tuple(output_data.shape) == (2, 16)
+    assert output_scale.dtype == torch.float8_e8m0fnu
+    assert tuple(output_scale.shape) == (2, 1)
+
+
+def _test_cast_to_block_scaled_tosa_fp_mxfp6(dtype: str) -> None:
+    """A (2, 32) float32 input yields uint8 (2, 32) data and an e8m0 (2, 1) scale."""
+    tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
+    sample_input = torch.randn((2, 32), dtype=torch.float32)
+
+    with TosaLoweringContext(tosa_spec), FakeTensorMode() as mode:
+        cast_op = exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default
+        fake_input = mode.from_tensor(sample_input)
+        dtype_name = mxfp_dtype_to_str(dtype)
+        output_data, output_scale = cast_op(fake_input, 32, output_dtype=dtype_name)
+
+    assert output_data.dtype == torch.uint8
+    assert tuple(output_data.shape) == (2, 32)
+    assert output_scale.dtype == torch.float8_e8m0fnu
+    assert tuple(output_scale.shape) == (2, 1)
+
+
+def test_cast_to_block_scaled_tosa_fp_mxfp6e2m3() -> None:
+    _test_cast_to_block_scaled_tosa_fp_mxfp6(DTYPE_FP6_E2M3)
+
+
+def test_cast_to_block_scaled_tosa_fp_mxfp6e3m2() -> None:
+    _test_cast_to_block_scaled_tosa_fp_mxfp6(DTYPE_FP6_E3M2)
+
+
 def test_cast_to_block_scaled_invalid_shape() -> None:
     tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
 
@@ -59,5 +103,5 @@ def test_cast_to_block_scaled_invalid_shape() -> None:
             exir_ops.backend.tosa.CAST_TO_BLOCK_SCALED.default(
                 mode.from_tensor(torch.randn((2, 30), dtype=torch.float32)),
                 32,
-                output_dtype=torch.float8_e4m3fn,
+                output_dtype=mxfp_dtype_to_str(torch.float8_e4m3fn),
             )
diff --git a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py
index 74ce04bf3c1..7dcffdeb4d9 100644
--- a/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py
+++ b/backends/arm/test/misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py
@@ -5,6 +5,7 @@
 
 import pytest
 import torch
+from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str, MXFPDType
 from executorch.backends.arm.tosa.dialect.lib import TosaValueError
 from executorch.backends.arm.tosa.dialect.ops import matmul_t_block_scaled  # noqa: F401
 from executorch.backends.arm.tosa.specification import (
@@ -13,6 +14,7 @@
 )
 from executorch.exir.dialects._ops import ops as exir_ops
 from torch._subclasses.fake_tensor import FakeTensorMode
+from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E3M2
 
 
 def test_matmul_t_block_scaled_tosa_fp_mxfp() -> None:
@@ -35,6 +37,38 @@ def test_matmul_t_block_scaled_tosa_fp_mxfp() -> None:
     assert tuple(output.shape) == (1, 4, 8)
 
 
+def _test_matmul_t_block_scaled_tosa_fp_subbyte(
+    payload_dtype: MXFPDType,
+    qdata_last_dim: int,
+) -> None:
+    """uint8 payloads of the given last dim must produce a float32 (1, 4, 8) output."""
+    tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
+    # Operand order expected by the op: A data, A scale, B data, B scale.
+    operands = [
+        torch.empty((1, 4, qdata_last_dim), dtype=torch.uint8),
+        torch.empty((1, 4, 1), dtype=torch.float8_e8m0fnu),
+        torch.empty((1, 8, qdata_last_dim), dtype=torch.uint8),
+        torch.empty((1, 8, 1), dtype=torch.float8_e8m0fnu),
+    ]
+
+    with TosaLoweringContext(tosa_spec), FakeTensorMode() as mode:
+        matmul_op = exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default
+        fake_operands = [mode.from_tensor(operand) for operand in operands]
+        dtype_name = mxfp_dtype_to_str(payload_dtype)
+        output = matmul_op(*fake_operands, 32, payload_dtype=dtype_name)
+
+    assert output.dtype == torch.float32
+    assert tuple(output.shape) == (1, 4, 8)
+
+
+def test_matmul_t_block_scaled_tosa_fp_mxfp4() -> None:
+    _test_matmul_t_block_scaled_tosa_fp_subbyte(torch.float4_e2m1fn_x2, 16)
+
+
+def test_matmul_t_block_scaled_tosa_fp_mxfp6() -> None:
+    _test_matmul_t_block_scaled_tosa_fp_subbyte(DTYPE_FP6_E3M2, 32)
+
+
 def test_matmul_t_block_scaled_invalid_scale_shape() -> None:
     tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
     a_data = torch.randn((1, 4, 32), dtype=torch.float32).to(torch.float8_e4m3fn)
diff --git a/backends/arm/test/ops/mxfp/test_mxfp_linear.py b/backends/arm/test/ops/mxfp/test_mxfp_linear.py
index 5cdd44cf138..fbec9307795 100644
--- a/backends/arm/test/ops/mxfp/test_mxfp_linear.py
+++ b/backends/arm/test/ops/mxfp/test_mxfp_linear.py
@@ -10,7 +10,7 @@
 
 import torch
 from executorch.backends.arm.ao_ext import MXFPOpConfig, to_mxfp
-from executorch.backends.arm.test import common as arm_common
+from executorch.backends.arm.test import common
 from executorch.backends.arm.test.ops.mxfp.common import (
     MXFPTosaPipelineFP,
     MXFPVgfPipeline,
@@ -18,14 +18,12 @@
 from executorch.backends.arm.test.tester.analyze_output_utils import (
     compare_rel_frobenius_and_cosine_similarity,
 )
+from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2
 
 aten_op = "torch.ops.tosa_mxfp.linear.default"
 
 input_t1 = Tuple[torch.Tensor]
 
-_MXFP_FROBENIUS_THRESHOLD = 0.06
-_MXFP_COSINE_THRESHOLD = 0.995
-
 
 def _block_input_rank1() -> torch.Tensor:
     """Create a rank-1 input with distinct MXFP activation block scales."""
@@ -161,6 +159,7 @@ def _channels_last_rank4_input() -> torch.Tensor:
 
 test_data_vgf_fp = test_data_fp
 
+# TODO: MLETORCH-2141
 _vgf_xfail_reason = (
     "MXFP is not yet supported in the VGF toolchain. Enable this test when "
     "toolchain support is available."
@@ -215,35 +214,45 @@ def _is_linear(module: torch.nn.Module, _fqn: str) -> bool:
     return isinstance(module, torch.nn.Linear)
 
 
-@arm_common.parametrize("test_data", test_data_fp)
-def test_mxfp_linear_tosa_FP(test_data) -> None:
+def _test_mxfp_linear_eager_cpu(
+    test_data,
+    config: MXFPOpConfig,
+    frobenius_threshold=0.3,
+    cosine_threshold=0.95,
+) -> None:
     test_input, out_features, has_bias, set_block_weights = test_data()
     in_features = test_input.shape[-1]
-    module = Linear(
+    ref_model = Linear(
         in_features=in_features,
         out_features=out_features,
         bias=has_bias,
     ).eval()
 
     if set_block_weights:
-        module.set_block_test_weights()
+        ref_model.set_block_test_weights()
+    test_model = copy.deepcopy(ref_model).eval()
 
-    pipeline = MXFPTosaPipelineFP[input_t1](
-        module,
-        (test_input,),
-        aten_op,
-        filter_fn=_is_linear,
-        frobenius_threshold=_MXFP_FROBENIUS_THRESHOLD,
-        cosine_threshold=_MXFP_COSINE_THRESHOLD,
-        tosa_version="1.1",
-        tosa_extensions=["mxfp"],
+    to_mxfp(test_model, config, filter_fn=_is_linear)
+
+    test_output = test_model(test_input)
+    ref_output = ref_model(test_input)
+
+    compare_rel_frobenius_and_cosine_similarity(
+        ref_output,
+        test_output,
+        quantization_parameters=None,
+        frobenius_threshold=frobenius_threshold,
+        cosine_threshold=cosine_threshold,
+        clean_reference=False,
     )
-    pipeline.run()
 
 
-@arm_common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails)
-@arm_common.SkipIfNoModelConverter
-def test_mxfp_linear_vgf(test_data) -> None:
+def _test_mxfp_linear_vgf(
+    test_data,
+    config: MXFPOpConfig,
+    frobenius_threshold,
+    cosine_threshold,
+) -> None:
     test_input, out_features, has_bias, set_block_weights = test_data()
     in_features = test_input.shape[-1]
     module = Linear(
@@ -260,36 +269,169 @@ def test_mxfp_linear_vgf(test_data) -> None:
         (test_input,),
         aten_op,
         filter_fn=_is_linear,
-        frobenius_threshold=_MXFP_FROBENIUS_THRESHOLD,
-        cosine_threshold=_MXFP_COSINE_THRESHOLD,
+        frobenius_threshold=frobenius_threshold,
+        cosine_threshold=cosine_threshold,
+        mxfp_config=config,
         tosa_spec="TOSA-1.1+FP+mxfp",
     )
     pipeline.run()
 
 
-@arm_common.parametrize("test_data", test_data_fp)
-def test_mxfp_linear_eager_cpu(test_data) -> None:
+def _test_mxfp_linear_tosa_FP(
+    test_data,
+    config: MXFPOpConfig,
+    frobenius_threshold=0.08,
+    cosine_threshold=0.995,
+) -> None:
     test_input, out_features, has_bias, set_block_weights = test_data()
     in_features = test_input.shape[-1]
-    ref_model = Linear(
+    module = Linear(
         in_features=in_features,
         out_features=out_features,
         bias=has_bias,
     ).eval()
+
     if set_block_weights:
-        ref_model.set_block_test_weights()
-    test_model = copy.deepcopy(ref_model).eval()
+        module.set_block_test_weights()
 
-    to_mxfp(test_model, MXFPOpConfig(), filter_fn=_is_linear)
+    pipeline = MXFPTosaPipelineFP[input_t1](
+        module,
+        (test_input,),
+        aten_op,
+        filter_fn=_is_linear,
+        frobenius_threshold=frobenius_threshold,
+        cosine_threshold=cosine_threshold,
+        mxfp_config=config,
+        tosa_version="1.1",
+        tosa_extensions=["mxfp"],
+    )
+    pipeline.run()
 
-    test_output = test_model(test_input)
-    ref_output = ref_model(test_input)
 
-    compare_rel_frobenius_and_cosine_similarity(
-        ref_output,
-        test_output,
-        quantization_parameters=None,
-        frobenius_threshold=_MXFP_FROBENIUS_THRESHOLD,
-        cosine_threshold=_MXFP_COSINE_THRESHOLD,
-        clean_reference=False,
+@common.parametrize("test_data", test_data_fp)
+def test_mxfp8_linear_tosa_FP(test_data) -> None:
+    """Run the TOSA FP pipeline helper with float8_e4m3fn weights."""
+    # test_data is a zero-argument factory (the helper calls it), not a tensor.
+    config = MXFPOpConfig(weight_dtype=torch.float8_e4m3fn)
+    _test_mxfp_linear_tosa_FP(test_data, config)
+
+
+@common.parametrize("test_data", test_data_fp)
+def test_mxfp4_linear_tosa_FP(test_data) -> None:
+    """Run the TOSA FP pipeline helper with float4_e2m1fn_x2 weights."""
+    # test_data is a zero-argument factory (the helper calls it), not a tensor.
+    config = MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2)
+    _test_mxfp_linear_tosa_FP(
+        test_data, config, frobenius_threshold=0.3, cosine_threshold=0.95
+    )
+
+
+@common.parametrize("test_data", test_data_fp)
+def test_mxfp6_e2m3_linear_tosa_FP(test_data) -> None:
+    """Run the TOSA FP pipeline helper with DTYPE_FP6_E2M3 weights."""
+    # test_data is a zero-argument factory (the helper calls it), not a tensor.
+    config = MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3)
+    _test_mxfp_linear_tosa_FP(
+        test_data, config, frobenius_threshold=0.2, cosine_threshold=0.98
+    )
+
+
+@common.parametrize("test_data", test_data_fp)
+def test_mxfp6_e3m2_linear_tosa_FP(test_data) -> None:
+    """Run the TOSA FP pipeline helper with DTYPE_FP6_E3M2 weights."""
+    # test_data is a zero-argument factory (the helper calls it), not a tensor.
+    config = MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2)
+    _test_mxfp_linear_tosa_FP(
+        test_data, config, frobenius_threshold=0.2, cosine_threshold=0.98
+    )
+
+
+@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails)
+@common.SkipIfNoModelConverter
+def test_mxfp8_linear_vgf(test_data) -> None:
+    """Run the VGF pipeline helper with float8_e4m3fn weights."""
+    # test_data is a zero-argument factory (the helper calls it), not a tensor.
+    config = MXFPOpConfig(weight_dtype=torch.float8_e4m3fn)
+    _test_mxfp_linear_vgf(
+        test_data, config, frobenius_threshold=0.08, cosine_threshold=0.995
+    )
+
+
+@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails)
+@common.SkipIfNoModelConverter
+def test_mxfp4_linear_vgf(test_data) -> None:
+    """Run the VGF pipeline helper with float4_e2m1fn_x2 weights."""
+    # test_data is a zero-argument factory (the helper calls it), not a tensor.
+    config = MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2)
+    _test_mxfp_linear_vgf(
+        test_data, config, frobenius_threshold=0.3, cosine_threshold=0.95
+    )
+
+
+@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails)
+@common.SkipIfNoModelConverter
+def test_mxfp6_e2m3_linear_vgf(test_data) -> None:
+    """Run the VGF pipeline helper with DTYPE_FP6_E2M3 weights."""
+    # test_data is a zero-argument factory (the helper calls it), not a tensor.
+    config = MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3)
+    _test_mxfp_linear_vgf(
+        test_data, config, frobenius_threshold=0.2, cosine_threshold=0.98
+    )
+
+
+@common.parametrize("test_data", test_data_vgf_fp, xfails=_vgf_xfails)
+@common.SkipIfNoModelConverter
+def test_mxfp6_e3m2_linear_vgf(test_data) -> None:
+    """Run the VGF pipeline helper with DTYPE_FP6_E3M2 weights."""
+    # test_data is a zero-argument factory (the helper calls it), not a tensor.
+    config = MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2)
+    _test_mxfp_linear_vgf(
+        test_data, config, frobenius_threshold=0.2, cosine_threshold=0.98
+    )
+
+
+@common.parametrize("test_data", test_data_fp)
+def test_mxfp8_linear_eager_cpu(test_data) -> None:
+    """Check eager MXFP implementation.
+
+    The Arm lowering tests compare lowered output against the eager CPU
+    implementation, so the eager implementation must be accurate for it to be
+    used as a reference in other tests.
+
+    """
+    # test_data is a zero-argument factory (the helper calls it and unpacks
+    # the returned tuple), so it is deliberately not annotated as a tensor.
+    config = MXFPOpConfig(weight_dtype=torch.float8_e4m3fn)
+    _test_mxfp_linear_eager_cpu(
+        test_data, config, frobenius_threshold=0.08, cosine_threshold=0.995
+    )
+
+
+@common.parametrize("test_data", test_data_fp)
+def test_mxfp4_linear_eager_cpu(test_data) -> None:
+    """Run the eager CPU comparison helper with float4_e2m1fn_x2 weights."""
+    # test_data is a zero-argument factory (the helper calls it), not a tensor.
+    config = MXFPOpConfig(weight_dtype=torch.float4_e2m1fn_x2)
+    _test_mxfp_linear_eager_cpu(
+        test_data, config, frobenius_threshold=0.3, cosine_threshold=0.95
+    )
+
+
+@common.parametrize("test_data", test_data_fp)
+def test_mxfp6_e2m3_linear_eager_cpu(test_data) -> None:
+    """Run the eager CPU comparison helper with DTYPE_FP6_E2M3 weights."""
+    # test_data is a zero-argument factory (the helper calls it), not a tensor.
+    config = MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3)
+    _test_mxfp_linear_eager_cpu(
+        test_data, config, frobenius_threshold=0.2, cosine_threshold=0.98
+    )
+
+
+@common.parametrize("test_data", test_data_fp)
+def test_mxfp6_e3m2_linear_eager_cpu(test_data) -> None:
+    """Run the eager CPU comparison helper with DTYPE_FP6_E3M2 weights."""
+    # test_data is a zero-argument factory (the helper calls it), not a tensor.
+    config = MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2)
+    _test_mxfp_linear_eager_cpu(
+        test_data, config, frobenius_threshold=0.2, cosine_threshold=0.98
     )
diff --git a/backends/arm/test/ops/test_round.py b/backends/arm/test/ops/test_round.py
index ff86dbffff0..bcc71b70725 100644
--- a/backends/arm/test/ops/test_round.py
+++ b/backends/arm/test/ops/test_round.py
@@ -6,7 +6,6 @@
 
 from typing import Tuple
 
-import pytest
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -67,7 +66,6 @@ def test_round_tosa_INT(test_data: torch.Tensor):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone300
-@pytest.mark.xfail(reason="where.self not supported on U55")
 def test_round_u55_INT(test_data: torch.Tensor):
     pipeline = EthosU55PipelineINT[input_t1](
         Round(),
diff --git a/backends/arm/test/passes/test_insert_dynamic_padding_pass.py b/backends/arm/test/passes/test_insert_dynamic_padding_pass.py
index 64594403dae..69e1830e3ee 100644
--- a/backends/arm/test/passes/test_insert_dynamic_padding_pass.py
+++ b/backends/arm/test/passes/test_insert_dynamic_padding_pass.py
@@ -3,72 +3,122 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import executorch.backends.arm.tosa.dialect  # noqa: F401
 import torch
 from executorch.backends.arm._passes.insert_dynamic_padding import (
     InsertDynamicPaddingPass,
 )
-from executorch.backends.arm._passes.rewrite_conv_pass import RewriteConvPass
+from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.backends.arm.tosa.specification import (
     TosaLoweringContext,
     TosaSpecification,
 )
-from executorch.exir import to_edge
+from executorch.backends.test.graph_builder import GraphBuilder
 from executorch.exir.dialects._ops import ops as exir_ops
-from torch._export.utils import _get_shape_env_from_gm
-from torch.export import Dim, export
+from executorch.exir.pass_base import ExportPass
+from torch.fx import GraphModule
+from torch.fx.passes.infra.pass_base import PassResult
 
 
-class ConvModule(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.conv = torch.nn.Conv2d(3, 16, kernel_size=2, stride=3, padding=2)
+SPEC = TosaSpecification.create_from_string("TOSA-1.1+FP+shape")
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.conv(x)
+
+def _build_conv_graph(
+    target_op,
+    input_shape: tuple[int, ...],
+    weight_shape: tuple[int, ...],
+    padding: list[int],
+    stride: list[int],
+    dilation: list[int],
+) -> GraphModule:
+    """Build a graph calling ``target_op`` with a CONST_SHAPE padding operand."""
+    with TosaLoweringContext(SPEC):
+        gb = GraphBuilder()
+        activation = gb.placeholder("input", torch.randn(input_shape))
+        kernel = gb.placeholder("weight", torch.randn(weight_shape))
+        bias_term = gb.placeholder("bias", torch.randn(weight_shape[0]))
+        # Padding enters the graph as a CONST_SHAPE node tagged as SHAPE.
+        const_shape_op = exir_ops.backend.tosa.CONST_SHAPE.default
+        pad_shape = gb.call_operator(const_shape_op, (padding,))
+        pad_shape.node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.SHAPE
+        conv_args = (activation, kernel, bias_term, stride, pad_shape, dilation)
+        conv_out = gb.call_operator(target_op, conv_args)
+        gb.output([conv_out])
+        built = gb.get_graph_module()
+        return ExportPass().call(built).graph_module
+
+
+def _run_insert_dynamic_padding(graph_module: GraphModule) -> GraphModule:
+    with TosaLoweringContext(SPEC):  # the pass runs under the module-level SPEC
+        result = InsertDynamicPaddingPass()(graph_module)
+    assert isinstance(result, PassResult)  # guards the .graph_module access below
+    return result.graph_module
+
+
+def _assert_inserted_padding(
+    graph_module: GraphModule,
+    target_op,
+    zero_spatial_padding: list[int],
+    expected_full_padding_len: int,
+) -> None:
+    """Check the conv's padding arg and the PAD node fed by CONCAT_SHAPE."""
+    nodes = graph_module.graph.nodes
+    conv_node = next(n for n in nodes if n.target == target_op)
+    assert conv_node.args[4] == zero_spatial_padding
+    # The PAD node's second argument must be a CONCAT_SHAPE node.
+    padding_node = next(
+        n for n in nodes if n.target == exir_ops.backend.tosa.PAD.default
+    )
+    padding_shape_node = padding_node.args[1]
+    assert padding_shape_node.target == exir_ops.backend.tosa.CONCAT_SHAPE.default
+    n_padding, spatial_padding, c_padding = padding_shape_node.args[0]
+    assert n_padding.meta["val"] == [0, 0]
+    assert spatial_padding.target == exir_ops.backend.tosa.CONST_SHAPE.default
+    assert c_padding.meta["val"] == [0, 0]
+    # Full value: [0, 0], then the spatial padding, then [0, 0].
+    pad_list = padding_shape_node.meta["val"]
+    spatial_padding_value = spatial_padding.meta["val"]
+    assert len(pad_list) == expected_full_padding_len
+    assert pad_list[:2] == [0, 0]
+    assert pad_list[2:-2] == spatial_padding_value
+    assert pad_list[-2:] == [0, 0]
 
 
 def test_insert_dynamic_padding():
-    model = ConvModule()
-    example_inputs = (torch.randn(1, 3, 8, 8),)
-    ep = export(
-        model,
-        example_inputs,
-        dynamic_shapes={
-            "x": {2: Dim("height", min=4, max=10), 3: Dim("width", min=4, max=10)}
-        },
+    graph_module = _build_conv_graph(
+        exir_ops.backend.tosa.CONV2D.default,
+        input_shape=(1, 8, 8, 3),
+        weight_shape=(16, 2, 2, 3),
+        padding=[2, 2, 2, 2],
+        stride=[3, 3],
+        dilation=[1, 1],
     )
-    edge_model = to_edge(ep)
-    shape_env = _get_shape_env_from_gm(edge_model.exported_program().graph_module)
-    with TosaLoweringContext(
-        TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env
-    ):
-        edge_model = edge_model.transform(
-            [RewriteConvPass(edge_model.exported_program())]
-        )
-        nodes = edge_model.exported_program().graph.nodes
-        conv_node = next(
-            n for n in nodes if n.target == exir_ops.backend.tosa.CONV2D.default
-        )
-        initial_padding = conv_node.args[4]
-        assert any(isinstance(p, torch.SymInt) for p in initial_padding)
 
-        edge_model = edge_model.transform(
-            [
-                InsertDynamicPaddingPass(),
-            ]
-        )
-        nodes = edge_model.exported_program().graph.nodes
-        conv_node = next(
-            n for n in nodes if n.target == exir_ops.backend.tosa.CONV2D.default
-        )
-        padding = conv_node.args[4]
-        assert padding == [0, 0, 0, 0]
-        padding_node = next(
-            n for n in nodes if n.target == exir_ops.backend.tosa.PAD.default
-        )
-        assert padding_node is not None
-        pad_list = padding_node.args[1].meta["val"]
-        assert len(pad_list) == 8
-        assert pad_list[:2] == [0, 0]  # N-padding
-        assert pad_list[2:6] == initial_padding  # HW-padding in NHWC order
-        assert pad_list[6:] == [0, 0]  # C-padding
+    graph_module = _run_insert_dynamic_padding(graph_module)
+
+    _assert_inserted_padding(
+        graph_module,
+        exir_ops.backend.tosa.CONV2D.default,
+        zero_spatial_padding=[0, 0, 0, 0],
+        expected_full_padding_len=8,
+    )
+
+
+def test_insert_dynamic_padding_conv3d():
+    """Exercise InsertDynamicPaddingPass on a TOSA CONV3D graph."""
+    conv3d_op = exir_ops.backend.tosa.CONV3D.default
+    before_pass = _build_conv_graph(
+        conv3d_op,
+        input_shape=(1, 8, 8, 8, 3),
+        weight_shape=(16, 2, 2, 2, 3),
+        padding=[2, 2, 2, 2, 2, 2],
+        stride=[3, 3, 3],
+        dilation=[1, 1, 1],
+    )
+    after_pass = _run_insert_dynamic_padding(before_pass)
+    _assert_inserted_padding(
+        after_pass,
+        conv3d_op,
+        zero_spatial_padding=[0, 0, 0, 0, 0, 0],
+        expected_full_padding_len=10,
+    )
diff --git a/backends/arm/test/passes/test_rewrite_conv_pass.py b/backends/arm/test/passes/test_rewrite_conv_pass.py
index fc8478afee5..736aa685b86 100644
--- a/backends/arm/test/passes/test_rewrite_conv_pass.py
+++ b/backends/arm/test/passes/test_rewrite_conv_pass.py
@@ -336,11 +336,15 @@ def test_rewrite_conv_dynamic_keeps_static_padding_when_symbolic_remainder_is_ze
     assert all(not isinstance(p, torch.SymInt) for p in padding)
 
 
-def test_rewrite_conv_adjust_pad_if_needed_static_raises_before_negative_padding():
+def test_rewrite_conv_adjust_pad_if_needed_static_allows_negative_padding_until_later_validation():
     rewrite_pass, _, _ = _make_rewrite_pass((torch.randn(1, 3, 9, 12),))
 
-    with pytest.raises(RuntimeError, match="SizeAdjustInputPass"):
+    try:
         rewrite_pass._adjust_pad_if_needed(6, 2, 3, 0, 1)
+    except RuntimeError as e:
+        assert "SizeAdjustInputPass" in str(e)
+    else:
+        pytest.fail("Expected RuntimeError was not raised")
 
 
 def test_rewrite_conv_adjust_pad_if_needed_static_positive_padding_stays_non_negative():
@@ -387,7 +391,7 @@ def test_rewrite_conv_adjust_pad_if_needed_symbolic_exact_zero_keeps_positive_pa
     assert adjusted_pad == 1
 
 
-def test_rewrite_conv_adjust_pad_if_needed_symbolic_positive_padding_range_raises_before_negative_padding():
+def test_rewrite_conv_adjust_pad_if_needed_symbolic_positive_padding_range_returns_symbolic_padding():
     rewrite_pass, shape_env, input_len = _make_rewrite_pass(
         (torch.randn(1, 3, 8, 8),),
         dynamic_shapes={
@@ -399,8 +403,9 @@ def test_rewrite_conv_adjust_pad_if_needed_symbolic_positive_padding_range_raise
     with TosaLoweringContext(
         TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env
     ):
-        with pytest.raises(RuntimeError, match="SizeAdjustInputPass"):
-            rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 1, 1)
+        adjusted_pad = rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 1, 1)
+
+    assert isinstance(adjusted_pad, torch.SymInt)
 
 
 def test_rewrite_conv_symbolic_comparison_with_int_specializes_to_hint():
@@ -438,11 +443,12 @@ def unsafe_adjust(input_len, input_weight, stride, pad, dilation):
     with TosaLoweringContext(
         TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env
     ):
-        with pytest.raises(RuntimeError, match="SizeAdjustInputPass"):
-            rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1)
+        adjusted_pad = rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1)
 
+    assert isinstance(adjusted_pad, torch.SymInt)
 
-def test_rewrite_conv_adjust_pad_if_needed_symbolic_zero_padding_range_raises_before_negative_padding():
+
+def test_rewrite_conv_adjust_pad_if_needed_symbolic_zero_padding_range_returns_symbolic_padding():
     rewrite_pass, shape_env, input_len = _make_rewrite_pass(
         (torch.randn(1, 3, 8, 8),),
         dynamic_shapes={
@@ -451,8 +457,22 @@ def test_rewrite_conv_adjust_pad_if_needed_symbolic_zero_padding_range_raises_be
         },
     )
 
+    with TosaLoweringContext(
+        TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env
+    ):
+        adjusted_pad = rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1)
+
+    assert isinstance(adjusted_pad, torch.SymInt)
+
+
+def test_rewrite_conv_adjust_pad_if_needed_symbolic_singleton_overflow_still_raises():
+    rewrite_pass, shape_env, input_len = _make_rewrite_pass(
+        (torch.randn(1, 3, 9, 12),),
+        dynamic_shapes=_multiples_of_three_dynamic_shapes(),
+    )
+
     with TosaLoweringContext(
         TosaSpecification.create_from_string("TOSA-1.1+FP+shape"), shape_env=shape_env
     ):
         with pytest.raises(RuntimeError, match="SizeAdjustInputPass"):
-            rewrite_pass._adjust_pad_if_needed(input_len, 2, 3, 0, 1)
+            rewrite_pass._adjust_pad_if_needed(input_len, 3, 3, 1, 1)
diff --git a/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py b/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py
index 572a2b247e9..f89872f93b8 100644
--- a/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py
+++ b/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py
@@ -9,12 +9,15 @@
 import torch
 from executorch.backends.arm._passes.rewrite_mxfp_linear import RewriteMXFPLinearPass
 from executorch.backends.arm.ao_ext import MXFPOpConfig, to_mxfp
+from executorch.backends.arm.ao_ext.mxfp import mxfp_dtype_to_str
+from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.backends.arm.tosa.specification import (
     TosaLoweringContext,
     TosaSpecification,
 )
 from executorch.exir.dialects._ops import ops as exir_ops
 from torch.export import export
+from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3
 
 
 class _LinearModule(torch.nn.Module):
@@ -49,9 +52,11 @@ def _get_nodes_from_target(
     ]
 
 
-def test_rewrite_mxfp_linear_replaces_custom_op() -> None:
+def _rewrite_linear_module(
+    config: MXFPOpConfig,
+) -> tuple[torch.fx.GraphModule, list[torch.fx.Node], list[torch.fx.Node]]:
     model = _LinearModule(bias=True).eval()
-    to_mxfp(model, MXFPOpConfig(), filter_fn=_is_linear)
+    to_mxfp(model, config, filter_fn=_is_linear)
     exported = export(model, (torch.randn(4, 5, 32),), strict=False)
     tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
 
@@ -66,6 +71,11 @@ def test_rewrite_mxfp_linear_replaces_custom_op() -> None:
     matmul_nodes = _get_nodes_from_target(
         graph_module, exir_ops.backend.tosa.MATMUL_T_BLOCK_SCALED.default
     )
+    return graph_module, cast_nodes, matmul_nodes
+
+
+def test_rewrite_mxfp_linear_replaces_custom_op() -> None:
+    graph_module, cast_nodes, matmul_nodes = _rewrite_linear_module(MXFPOpConfig())
 
     assert (
         len(_get_nodes_from_target(graph_module, torch.ops.tosa_mxfp.linear.default))
@@ -88,6 +98,34 @@ def test_rewrite_mxfp_linear_replaces_custom_op() -> None:
     assert tuple(output_node.meta["val"][0].shape) == (4, 5, 8)
 
 
+def test_rewrite_mxfp6_linear_marks_payload_dtype() -> None:
+    """Check an FP6 E2M3 config tags the cast and matmul nodes with that dtype."""
+    graph_module, cast_nodes, matmul_nodes = _rewrite_linear_module(
+        MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3)
+    )
+    cast_node = cast_nodes[0]
+    matmul_node = matmul_nodes[0]
+    input_qdata_node = next(
+        node
+        for node in graph_module.graph.nodes
+        if node.op == "call_function"
+        and node.target == operator.getitem
+        and node.args[0] == cast_node
+        and node.args[1] == 0
+    )
+    weight_qdata_node = matmul_node.args[2]
+    assert isinstance(weight_qdata_node, torch.fx.Node)
+    assert cast_node.kwargs["output_dtype"] == mxfp_dtype_to_str(DTYPE_FP6_E2M3)
+    assert matmul_node.kwargs["payload_dtype"] == mxfp_dtype_to_str(DTYPE_FP6_E2M3)
+    assert tuple(cast_node.meta["val"][0].shape) == (1, 4 * 5, 32)
+    assert (
+        input_qdata_node.meta[TosaSpecialDtype.meta_key()] == TosaSpecialDtype.FP6E2M3
+    )
+    assert (
+        weight_qdata_node.meta[TosaSpecialDtype.meta_key()] == TosaSpecialDtype.FP6E2M3
+    )
+
+
 def test_rewrite_mxfp_dual_linear() -> None:
     model = _DualLinearModule().eval()
     to_mxfp(model, MXFPOpConfig(), filter_fn=_is_linear)
diff --git a/backends/arm/test/passes/test_symbolic_value_range.py b/backends/arm/test/passes/test_symbolic_value_range.py
index 7a6ecfdf79c..99dfafc93a6 100644
--- a/backends/arm/test/passes/test_symbolic_value_range.py
+++ b/backends/arm/test/passes/test_symbolic_value_range.py
@@ -68,3 +68,16 @@ def test_evaluate_symbolic_expr_values_bails_out_for_large_symbol_ranges() -> No
     shape_env, symint = _make_shape_env(hint=3, compiler_min=1, compiler_max=400)
 
     assert evaluate_symbolic_expr_values(symint, shape_env) is None
+
+
+def test_evaluate_symbolic_expr_values_does_not_require_shape_env_bounds(
+    monkeypatch,
+) -> None:
+    """Expect values 2..6 even though ``shape_env.bound_sympy`` raises."""
+    shape_env, symint = _make_shape_env(hint=3, compiler_min=2, compiler_max=6)
+
+    def _always_recursion_error(_expr):
+        raise RecursionError
+
+    monkeypatch.setattr(shape_env, "bound_sympy", _always_recursion_error)
+    assert evaluate_symbolic_expr_values(symint, shape_env) == set(range(2, 7))
diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py
index ff26d17ee13..9a63452e325 100644
--- a/backends/arm/test/runner_utils.py
+++ b/backends/arm/test/runner_utils.py
@@ -2,10 +2,10 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-
 import importlib.resources as _resources
 import json
 import logging
+import numbers
 import os
 import re
 import shutil
@@ -14,13 +14,11 @@
 import tempfile
 from collections.abc import Iterable
 from pathlib import Path
-
 from types import NoneType
 from typing import Any, cast, Dict, List, Optional, Tuple
 
 import executorch.backends.arm.test as arm_test_package
 import executorch.backends.arm.tosa.schemas as tosa_schemas_package
-
 import numpy as np
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
@@ -31,7 +29,6 @@
     NNHWC_INVERSE_ORDER,
     NNHWC_ORDER,
 )
-
 from executorch.backends.arm.ethosu import EthosUCompileSpec
 from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
 from executorch.backends.arm.tosa.specification import Tosa_1_00, TosaSpecification
@@ -43,7 +40,6 @@
 from executorch.exir import ExecutorchProgramManager, ExportedProgram
 from executorch.exir.lowered_backend_module import LoweredBackendModule
 from torch.fx.node import Node
-
 from torch.overrides import TorchFunctionMode
 from tosa.TosaGraph import TosaGraph  # type: ignore[import-not-found, import-untyped]
 
@@ -79,6 +75,7 @@
     "corstone-320",
     "vkml_emulation_layer",
 }
+INFER_SHAPES_PATH = "infer_shapes"
 
 
 class QuantizationParams:
@@ -102,7 +99,9 @@ def __init__(
         self.dtype = dtype
 
 
-def get_input_names(program: ExportedProgram) -> list[str]:
+def get_input_names(
+    program: ExportedProgram, is_lowered_module: bool = False
+) -> list[str]:
     """Get a list[str] with the names of the inputs to this model.
 
     Args:
@@ -111,7 +110,15 @@ def get_input_names(program: ExportedProgram) -> list[str]:
         A list of strings with the names of the model input.
 
     """
-    return [spec.arg.name for spec in program.graph_signature.input_specs]
+
+    if not is_lowered_module:
+        return [spec.arg.name for spec in program.graph_signature.input_specs]
+    else:
+        return [
+            user_input
+            for user_input in program.graph_signature.user_inputs
+            if isinstance(user_input, str)
+        ]
 
 
 def get_input_quantization_params(
@@ -204,25 +211,59 @@ def torch_tensor_to_numpy(tensor: torch.Tensor) -> np.ndarray:
     return tensor.numpy()
 
 
+def torch_tensor_to_tosa_shape(tensor: torch.Tensor) -> list[int]:
+    """Return ``tensor``'s shape as ints, permuted for NHWC/NNHWC dim orders."""
+    order = tensor.dim_order()
+    if order not in (NHWC_ORDER, NNHWC_ORDER):
+        return [int(extent) for extent in tensor.shape]
+    return [int(tensor.shape[axis]) for axis in order]
+
+
+def user_inputs_need_shape_inference(program: ExportedProgram) -> bool:
+    """Return True if any user-input placeholder has a non-integral dim."""
+    names = {u for u in program.graph_signature.user_inputs if isinstance(u, str)}
+    placeholders = (
+        candidate
+        for candidate in program.graph.nodes
+        if candidate.op == "placeholder" and candidate.name in names
+    )
+    return any(
+        not isinstance(extent, numbers.Integral)
+        for placeholder in placeholders
+        for extent in get_first_fake_tensor(placeholder).shape
+    )
+
+
 def numpy_to_torch_tensor(array: np.ndarray, output_node: Node) -> torch.Tensor:
     output_tensor = get_first_fake_tensor(output_node)
     shape = output_tensor.shape
     dim_order = output_tensor.dim_order()
+
+    def is_concrete_shape(shape_like) -> bool:
+        return all(isinstance(dim, numbers.Integral) for dim in shape_like)
+
+    def to_torch_tensor() -> torch.Tensor:
+        if array.dtype.type is np.void:
+            # If dtype is void, "cheat" and use the output_tensor dtype.
+            return torch.frombuffer(array, dtype=output_tensor.dtype)
+        return torch.from_numpy(array)
+
     if dim_order == NHWC_ORDER:
-        shape_with_dim_order = [shape[i] for i in NHWC_ORDER]
-        tensor = torch.from_numpy(array).reshape(shape_with_dim_order)
+        tensor = to_torch_tensor()
+        if is_concrete_shape(shape):
+            tensor = tensor.reshape([shape[i] for i in NHWC_ORDER])
         return tensor.permute(NHWC_INVERSE_ORDER).to(memory_format=torch.channels_last)
     elif dim_order == NNHWC_ORDER:
-        shape_with_dim_order = [shape[i] for i in NNHWC_ORDER]
-        tensor = torch.from_numpy(array).reshape(shape_with_dim_order)
-        return tensor.permute(NNHWC_INVERSE_ORDER).to(memory_format=torch.channels_last)
+        tensor = to_torch_tensor()
+        if is_concrete_shape(shape):
+            tensor = tensor.reshape([shape[i] for i in NNHWC_ORDER])
+        return tensor.permute(NNHWC_INVERSE_ORDER)
     else:
-        if array.dtype.type is np.void:
-            # If dtype is void, "cheat" and use the output_tensor dtype.
-            tensor = torch.frombuffer(array, dtype=output_tensor.dtype)
-        else:
-            tensor = torch.from_numpy(array)
-        return tensor.reshape(shape)
+        tensor = to_torch_tensor()
+
+        if is_concrete_shape(shape):
+            return tensor.reshape(shape)
+        return tensor
 
 
 class TosaReferenceModelDispatch(TorchFunctionMode):
@@ -234,12 +275,65 @@ def __init__(self):
         self.ran_tosa_dispatch = False
         super().__init__()
 
+    def _generate_shape_inference_json(
+        self,
+        tosa_buffer: bytes,
+        artifact_path: Path,
+        test_case_path: Path,
+        input_names: list[str],
+        inputs: Tuple[torch.Tensor, ...],
+    ):
+        """Write the JSON test case: the TOSA file path plus per-input shapes."""
+        per_input = [torch_tensor_to_tosa_shape(tensor) for tensor in inputs]
+        shapes = dict(zip(input_names, per_input))
+        with open(test_case_path, "w", encoding="utf-8") as fp:
+            json.dump({"tosa_file": str(artifact_path), "shapes": shapes}, fp, indent=2)
+
+    def _run_infer_shapes(
+        self,
+        tosa_buffer: bytes,
+        input_names: list[str],
+        inputs: Tuple[torch.Tensor, ...],
+        temp_dir_path: Path,
+        infer_shapes_path: str = INFER_SHAPES_PATH,
+    ) -> bytes:
+        """Run the shape-inference tool on ``tosa_buffer`` and return its output.
+
+        The buffer is written to ``model.tosa`` in ``temp_dir_path`` next to a
+        JSON test case, the tool is invoked on that JSON, and the bytes of
+        ``resolved_model.tosa`` from the same directory are returned.
+
+        Raises:
+            subprocess.CalledProcessError: If the tool exits non-zero.
+        """
+        model_name = "model.tosa"
+        model_path = temp_dir_path / model_name
+        model_path.write_bytes(tosa_buffer)
+        case_path = temp_dir_path / "test_case.json"
+        self._generate_shape_inference_json(
+            tosa_buffer, model_path, case_path, input_names, inputs
+        )
+        command = [infer_shapes_path, f"{case_path}"]
+        subprocess.run(command, check=True, capture_output=True, text=True)  # nosec
+        with open(temp_dir_path / f"resolved_{model_name}", "rb") as resolved:
+            return resolved.read()
+
     def _tosa_dispatch(self, lowered_backend_module: LoweredBackendModule, inputs):
         tosa_buffer = lowered_backend_module.processed_bytes
         compile_spec = TosaCompileSpec._from_list(lowered_backend_module.compile_specs)
-
+        tosa_spec = compile_spec.tosa_spec
         output_node = lowered_backend_module.original_module.graph.output_node()
-        return run_tosa_graph(tosa_buffer, compile_spec.tosa_spec, inputs, output_node)
+        if tosa_spec.support_extension("shape") and user_inputs_need_shape_inference(
+            lowered_backend_module.original_module
+        ):
+            input_names = get_input_names(lowered_backend_module.original_module, True)
+            # Generate json file for shape inference extension, which is required by the reference model.
+            with tempfile.TemporaryDirectory() as temp_dir:
+                tosa_buffer = self._run_infer_shapes(
+                    tosa_buffer, input_names, inputs, Path(temp_dir)
+                )
+
+        return run_tosa_graph(tosa_buffer, tosa_spec, inputs, output_node)
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         super().__exit__(exc_type, exc_val, exc_tb)
@@ -282,7 +376,7 @@ def __torch_function__(self, func, types, args=..., kwargs=None):
 
 def run_target(
     executorch_program_manager: ExecutorchProgramManager,
-    inputs: Tuple[torch.Tensor],
+    inputs: Tuple[torch.Tensor, ...],
     intermediate_path: str | Path,
     target_board: str,
     elf_path: str | Path,
@@ -310,7 +404,7 @@ def run_target(
 
 def save_inputs_to_file(
     exported_program: ExportedProgram,
-    inputs: Tuple[torch.Tensor],
+    inputs: Tuple[torch.Tensor, ...],
     intermediate_path: str | Path,
 ):
     input_file_paths: list[str] = []
@@ -342,7 +436,7 @@ def get_output_from_file(
 
 def run_vkml_emulation_layer(
     executorch_program_manager: ExecutorchProgramManager,
-    inputs: Tuple[torch.Tensor],
+    inputs: Tuple[torch.Tensor, ...],
     intermediate_path: str | Path,
     elf_path: str | Path,
 ):
@@ -390,7 +484,7 @@ def run_vkml_emulation_layer(
 
 def run_corstone(
     executorch_program_manager: ExecutorchProgramManager,
-    inputs: Tuple[torch.Tensor],
+    inputs: Tuple[torch.Tensor, ...],
     intermediate_path: str | Path,
     target_board: str,
     elf_path: str | Path,
diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl
index 4df310f6dc1..d321766e8d8 100644
--- a/backends/arm/test/targets.bzl
+++ b/backends/arm/test/targets.bzl
@@ -23,6 +23,7 @@ def define_arm_tests():
         "ops/test_log10.py",
         "ops/test_max_pool1d.py",
         "ops/test_mul.py",
+        "ops/test_mxfp_conv2d.py",
         "ops/mxfp/test_mxfp_linear.py",
         "ops/test_permute.py",
         "ops/test_rsqrt.py",
@@ -57,12 +58,14 @@ def define_arm_tests():
         # "misc/test_evaluate_model.py",
         "misc/test_pass_pipeline_config.py",
         "misc/tosa_dialect/test_tosa_dialect_cast_to_block_scaled.py",
+        "misc/tosa_dialect/test_tosa_dialect_mxfp_conv2d.py",
         "misc/tosa_dialect/test_tosa_dialect_mxfp_linear.py",
         "misc/tosa_dialect/test_tosa_resize.py",
         "misc/test_tosa_spec.py",
         "misc/test_bn_relu_folding_qat.py",
         "misc/test_custom_partition.py",
         "misc/test_debug_hook.py",
+        "misc/test_mxfp_conv2d_ao.py",
         "misc/test_mxfp_linear_ao.py",
         "misc/test_post_quant_device_switch.py",
         "misc/test_vgf_check_env.py",
diff --git a/backends/arm/tosa/dialect/__init__.py b/backends/arm/tosa/dialect/__init__.py
index 0585f7a1ff8..4d059b64efe 100644
--- a/backends/arm/tosa/dialect/__init__.py
+++ b/backends/arm/tosa/dialect/__init__.py
@@ -11,6 +11,7 @@
     binary_elementwise,
     cast_to_block_scaled,
     conv2d,
+    conv2d_block_scaled,
     conv3d,
     custom,
     data_layout_ops,
diff --git a/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py b/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py
index ed109be6124..8dbff7c11c5 100644
--- a/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py
+++ b/backends/arm/tosa/dialect/ops/cast_to_block_scaled.py
@@ -5,24 +5,28 @@
 
 from __future__ import annotations
 
+from typing import cast
+
 import torch
 
+from executorch.backends.arm.ao_ext.mxfp import mxfp_str_to_dtype
 from executorch.backends.arm.tosa.dialect.lib import TosaValueError
 from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op
 from executorch.backends.arm.tosa.specification import (
     get_context_spec,
     TosaSpecification,
 )
+from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP6_E2M3, DTYPE_FP6_E3M2
 
 
 @register_fake_tosa_op(
-    "CAST_TO_BLOCK_SCALED(Tensor input, SymInt block_size, ScalarType output_dtype) -> (Tensor, Tensor)",
+    "CAST_TO_BLOCK_SCALED(Tensor input, SymInt block_size, str output_dtype) -> (Tensor, Tensor)",
     [TosaSpecification.create_from_string("TOSA-1.1+FP")],
 )
 def CAST_TO_BLOCK_SCALED(
     input: torch.Tensor,
     block_size: int,
-    output_dtype: torch.dtype,
+    output_dtype: str,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     tosa_spec = get_context_spec()
 
@@ -62,12 +66,25 @@ def CAST_TO_BLOCK_SCALED(
         )
 
     scale_tensor_dtype = torch.float8_e8m0fnu
-    if output_dtype not in (torch.float8_e4m3fn, torch.float8_e5m2):
+    elem_dtype = mxfp_str_to_dtype(output_dtype)
+    if elem_dtype not in (
+        torch.float4_e2m1fn_x2,
+        DTYPE_FP6_E2M3,
+        DTYPE_FP6_E3M2,
+        torch.float8_e4m3fn,
+        torch.float8_e5m2,
+    ):
         raise TosaValueError(
             f"Unsupported block-scaled output dtype {output_dtype}",
             op="CAST_TO_BLOCK_SCALED",
         )
     scale_shape = (*input.shape[:-1], input.shape[-1] // block_size)
-    output_data = torch.empty_like(input, dtype=output_dtype)
+    if elem_dtype == torch.float4_e2m1fn_x2:
+        output_shape = (*input.shape[:-1], input.shape[-1] // 2)
+        output_data = input.new_empty(output_shape, dtype=torch.uint8)
+    elif elem_dtype in (DTYPE_FP6_E2M3, DTYPE_FP6_E3M2):
+        output_data = input.new_empty(input.shape, dtype=torch.uint8)
+    else:
+        output_data = torch.empty_like(input, dtype=cast(torch.dtype, elem_dtype))
     output_scale = input.new_empty(scale_shape, dtype=scale_tensor_dtype)
     return output_data, output_scale
diff --git a/backends/arm/tosa/dialect/ops/conv2d.py b/backends/arm/tosa/dialect/ops/conv2d.py
index 5af0ca1617a..d0db2d60fcd 100644
--- a/backends/arm/tosa/dialect/ops/conv2d.py
+++ b/backends/arm/tosa/dialect/ops/conv2d.py
@@ -3,7 +3,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import math
 from typing import Optional
 
 import torch
@@ -89,6 +88,23 @@ def validate_conv2d_args_dtypes(  # noqa: C901
     return output_dtype
 
 
+def conv_output_dim(
+    input_dim: int | torch.SymInt,
+    kernel_dim: int,
+    stride: int,
+    pad_before: int | torch.SymInt,
+    pad_after: int | torch.SymInt,
+    dilation: int,
+) -> int | torch.SymInt:
+    """Return the convolution output size along one spatial dimension."""
+    dilated_kernel = dilation * (kernel_dim - 1) + 1
+    slack = input_dim + (pad_before + pad_after) - dilated_kernel
+    if stride == 1:
+        # Unit stride: no floor division is applied.
+        return slack + 1
+    return slack // stride + 1
+
+
 @register_fake_tosa_op(
     "CONV2D(Tensor input, "
     "Tensor weight, "
@@ -110,17 +126,14 @@ def CONV2D(
 
     output_dtype = validate_conv2d_args_dtypes(tosa_spec, x, weight, bias, op="CONV2D")
 
-    torch_pad = [pad[0], pad[2]]
     N = x.shape[0]
+    H_in, W_in = x.shape[1:3]
     C_out = weight.shape[0]
-    H_in, W_in = x.shape[1], x.shape[2]
-    H_out = math.floor(
-        (H_in + 2 * torch_pad[0] - dilation[0] * (weight.shape[1] - 1) - 1) / stride[0]
-        + 1
+    H_out = conv_output_dim(
+        H_in, weight.shape[1], stride[0], pad[0], pad[1], dilation[0]
     )
-    W_out = math.floor(
-        (W_in + 2 * torch_pad[1] - dilation[1] * (weight.shape[2] - 1) - 1) / stride[1]
-        + 1
+    W_out = conv_output_dim(
+        W_in, weight.shape[2], stride[1], pad[2], pad[3], dilation[1]
     )
     output_shape = [N, H_out, W_out, C_out]
     return torch.empty(size=output_shape, dtype=output_dtype)
diff --git a/backends/arm/tosa/dialect/ops/conv3d.py b/backends/arm/tosa/dialect/ops/conv3d.py
index 67ceb0596c6..a81ae0dae53 100644
--- a/backends/arm/tosa/dialect/ops/conv3d.py
+++ b/backends/arm/tosa/dialect/ops/conv3d.py
@@ -3,12 +3,14 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import math
 from typing import Optional
 
 import torch
 from executorch.backends.arm.tosa.dialect.lib import TosaValueError
-from executorch.backends.arm.tosa.dialect.ops.conv2d import validate_conv2d_args_dtypes
+from executorch.backends.arm.tosa.dialect.ops.conv2d import (
+    conv_output_dim,
+    validate_conv2d_args_dtypes,
+)
 from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op
 from executorch.backends.arm.tosa.specification import (
     get_context_spec,
@@ -35,7 +37,7 @@ def validate_conv3d_args_dtypes(
     "Tensor weight, "
     "Tensor bias, "
     "int[3] stride, "
-    "int[6] pad, "
+    "SymInt[6] pad, "
     "int[3] dilation) -> Tensor",
     TosaSpecification.all_versions_and_profiles(),
 )
@@ -44,28 +46,24 @@ def CONV3D(
     weight: torch.Tensor,
     bias: torch.Tensor,
     stride: list[int],
-    pad: list[int],
+    pad: list[int | torch.SymInt],
     dilation: list[int],
 ) -> torch.Tensor:
     tosa_spec = get_context_spec()
 
     output_dtype = validate_conv3d_args_dtypes(tosa_spec, x, weight, bias)
 
-    torch_pad = [pad[0], pad[2], pad[4]]
     N = x.shape[0]
     C_out = weight.shape[0]
-    D_in, H_in, W_in = x.shape[1], x.shape[2], x.shape[3]
-    D_out = math.floor(
-        (D_in + 2 * torch_pad[0] - dilation[0] * (weight.shape[1] - 1) - 1) / stride[0]
-        + 1
+    D_in, H_in, W_in = x.shape[1:4]
+    D_out = conv_output_dim(
+        D_in, weight.shape[1], stride[0], pad[0], pad[1], dilation[0]
     )
-    H_out = math.floor(
-        (H_in + 2 * torch_pad[1] - dilation[1] * (weight.shape[2] - 1) - 1) / stride[1]
-        + 1
+    H_out = conv_output_dim(
+        H_in, weight.shape[2], stride[1], pad[2], pad[3], dilation[1]
     )
-    W_out = math.floor(
-        (W_in + 2 * torch_pad[2] - dilation[2] * (weight.shape[3] - 1) - 1) / stride[2]
-        + 1
+    W_out = conv_output_dim(
+        W_in, weight.shape[3], stride[2], pad[4], pad[5], dilation[2]
     )
     output_shape = [N, D_out, H_out, W_out, C_out]
     return torch.empty(size=output_shape, dtype=output_dtype)
diff --git a/backends/arm/tosa/dialect/ops/depthwise_conv2d.py b/backends/arm/tosa/dialect/ops/depthwise_conv2d.py
index ae864f29d62..83ef3ff72fb 100644
--- a/backends/arm/tosa/dialect/ops/depthwise_conv2d.py
+++ b/backends/arm/tosa/dialect/ops/depthwise_conv2d.py
@@ -3,10 +3,11 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import math
-
 import torch
-from executorch.backends.arm.tosa.dialect.ops.conv2d import validate_conv2d_args_dtypes
+from executorch.backends.arm.tosa.dialect.ops.conv2d import (
+    conv_output_dim,
+    validate_conv2d_args_dtypes,
+)
 from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op
 
 from executorch.backends.arm.tosa.specification import (
@@ -38,17 +39,11 @@ def DEPTHWISE_CONV2D(
         tosa_spec, x, weight, bias, op="DEPTHWISE_CONV2D"
     )
 
-    torch_pad = [pad[0], pad[2]]
-    # Weight format is [KH, KW, IC, M], where C_out = IC * M.
     kernel_h, kernel_w = weight.shape[0], weight.shape[1]
     C_out = weight.shape[2] * weight.shape[3]
     N = x.shape[0]
-    H_in, W_in = x.shape[1], x.shape[2]
-    H_out = math.floor(
-        (H_in + 2 * torch_pad[0] - dilation[0] * (kernel_h - 1) - 1) / stride[0] + 1
-    )
-    W_out = math.floor(
-        (W_in + 2 * torch_pad[1] - dilation[1] * (kernel_w - 1) - 1) / stride[1] + 1
-    )
+    H_in, W_in = x.shape[1:3]
+    H_out = conv_output_dim(H_in, kernel_h, stride[0], pad[0], pad[1], dilation[0])
+    W_out = conv_output_dim(W_in, kernel_w, stride[1], pad[2], pad[3], dilation[1])
     output_shape = [N, H_out, W_out, C_out]
     return torch.empty(size=output_shape, dtype=output_dtype)
diff --git a/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py b/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py
index b42e2855e4c..fcea104320f 100644
--- a/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py
+++ b/backends/arm/tosa/dialect/ops/matmul_t_block_scaled.py
@@ -7,6 +7,11 @@
 
 import torch
 
+from executorch.backends.arm.ao_ext.mxfp import (
+    mxfp_str_to_dtype,
+    MXFPDType,
+    SUPPORTED_MXFP_DTYPES,
+)
 from executorch.backends.arm.tosa.dialect.lib import TosaValueError
 from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op
 from executorch.backends.arm.tosa.specification import (
@@ -28,18 +33,39 @@ def _validate_block_size(block_size: int) -> None:
         )
 
 
+def _get_payload_dtype(
+    data: torch.Tensor,
+    payload_dtype: str = "",
+) -> MXFPDType:
+    if payload_dtype:
+        return mxfp_str_to_dtype(payload_dtype)
+    if data.dtype == torch.uint8:
+        return torch.float4_e2m1fn_x2
+    return data.dtype
+
+
+def _get_logical_last_dim(data: torch.Tensor, payload_dtype: str = "") -> int:
+    last_dim = data.shape[-1]
+    if _get_payload_dtype(data, payload_dtype) == torch.float4_e2m1fn_x2:
+        return last_dim * 2
+    return last_dim
+
+
 def _validate_dtypes(
     A_data: torch.Tensor,
     A_scale: torch.Tensor,
     B_data: torch.Tensor,
     B_scale: torch.Tensor,
+    payload_dtype: str = "",
 ) -> None:
-    if A_data.dtype not in (torch.float8_e4m3fn, torch.float8_e5m2):
+    A_dtype = _get_payload_dtype(A_data, payload_dtype)
+    B_dtype = _get_payload_dtype(B_data, payload_dtype)
+    if A_dtype not in SUPPORTED_MXFP_DTYPES:
         raise TosaValueError(
             f"Unsupported A_data dtype {A_data.dtype}",
             op="MATMUL_T_BLOCK_SCALED",
         )
-    if B_data.dtype != A_data.dtype:
+    if B_dtype != A_dtype:
         raise TosaValueError(
             f"B_data dtype {B_data.dtype} must match A_data dtype {A_data.dtype}",
             op="MATMUL_T_BLOCK_SCALED",
@@ -57,6 +83,7 @@ def _validate_shapes(
     B_data: torch.Tensor,
     B_scale: torch.Tensor,
     block_size: int,
+    payload_dtype: str = "",
 ) -> tuple[int, int, int]:
     if A_data.ndim != 3 or A_scale.ndim != 3 or B_data.ndim != 3 or B_scale.ndim != 3:
         raise TosaValueError(
@@ -64,8 +91,10 @@ def _validate_shapes(
             op="MATMUL_T_BLOCK_SCALED",
         )
 
-    N, H, C = A_data.shape
-    D, W, Cb = B_data.shape
+    N, H = A_data.shape[:2]
+    D, W = B_data.shape[:2]
+    C = _get_logical_last_dim(A_data, payload_dtype)
+    Cb = _get_logical_last_dim(B_data, payload_dtype)
     if C != Cb:
         raise TosaValueError(
             f"A_data last dim {C} must match B_data last dim {Cb}",
@@ -100,7 +129,8 @@ def _validate_shapes(
 
 
 @register_fake_tosa_op(
-    "MATMUL_T_BLOCK_SCALED(Tensor A_data, Tensor A_scale, Tensor B_data, Tensor B_scale, SymInt block_size) -> Tensor",
+    "MATMUL_T_BLOCK_SCALED(Tensor A_data, Tensor A_scale, Tensor B_data, "
+    "Tensor B_scale, SymInt block_size, str payload_dtype='') -> Tensor",
     [TosaSpecification.create_from_string("TOSA-1.1+FP")],
 )
 def MATMUL_T_BLOCK_SCALED(
@@ -109,6 +139,7 @@ def MATMUL_T_BLOCK_SCALED(
     B_data: torch.Tensor,
     B_scale: torch.Tensor,
     block_size: int,
+    payload_dtype: str = "",
 ) -> torch.Tensor:
     tosa_spec = get_context_spec()
 
@@ -119,12 +150,13 @@ def MATMUL_T_BLOCK_SCALED(
         )
 
     _validate_block_size(block_size)
-    _validate_dtypes(A_data, A_scale, B_data, B_scale)
+    _validate_dtypes(A_data, A_scale, B_data, B_scale, payload_dtype)
     output_shape = _validate_shapes(
         A_data,
         A_scale,
         B_data,
         B_scale,
         block_size,
+        payload_dtype,
     )
     return A_data.new_empty(output_shape, dtype=torch.float32)
diff --git a/backends/arm/tosa/mapping.py b/backends/arm/tosa/mapping.py
index 245a9c00235..5e661676149 100644
--- a/backends/arm/tosa/mapping.py
+++ b/backends/arm/tosa/mapping.py
@@ -35,6 +35,9 @@
 class TosaSpecialDtype(Enum):
     """Special TOSA dtypes not natively expressed in PyTorch."""
 
+    FP4E2M1 = ts.DType.FP4E2M1
+    FP6E2M3 = ts.DType.FP6E2M3
+    FP6E3M2 = ts.DType.FP6E3M2
     INT48 = ts.DType.INT48
     INT4 = ts.DType.INT4
     SHAPE = ts.DType.SHAPE
@@ -102,6 +105,7 @@ def map_dtype(data_type: torch.dtype) -> Any:
         torch.float8_e4m3fn: ts.DType.FP8E4M3,
         torch.float8_e5m2: ts.DType.FP8E5M2,
         torch.float8_e8m0fnu: ts.DType.FP8UE8M0,
+        torch.float4_e2m1fn_x2: ts.DType.FP4E2M1,
         torch.int8: ts.DType.INT8,
         # TOSA uses signless int8; unsigned semantics are expressed via RESCALE.
         torch.uint8: ts.DType.INT8,
@@ -156,8 +160,10 @@ def extract_tensor_meta(meta):
         raise ValueError(
             f"Expected first value in node.meta['val'] to be FakeTensor, got {val.__class__}"
         )
-    dtype = map_dtype(val.dtype)
     shape = tuple(val.size())
+    if special_dtype == TosaSpecialDtype.FP4E2M1 and val.dtype == torch.uint8:
+        shape = (*shape[:-1], shape[-1] * 2)
+    dtype = map_dtype(val.dtype)
 
     return (dtype, shape)
 
@@ -249,6 +255,15 @@ def __validate(self, tosa_spec: TosaSpecification) -> bool:
                     or tosa_spec.support_extension("mxfp")
                 ):
                     return False
+            case ts.DType.FP4E2M1:
+                if not tosa_spec.support_extension("mxfp"):
+                    return False
+            case ts.DType.FP6E2M3:
+                if not tosa_spec.support_extension("mxfp"):
+                    return False
+            case ts.DType.FP6E3M2:
+                if not tosa_spec.support_extension("mxfp"):
+                    return False
 
         return True
 
diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py
index 37b9cd7cc2a..8c4257e9472 100644
--- a/backends/arm/tosa/partitioner.py
+++ b/backends/arm/tosa/partitioner.py
@@ -309,7 +309,9 @@ def _detag_boundary_nodes(
             elif detag_first_fp_node and not is_q_node and not is_dq_node:
                 # For non Q/DQ nodes, remove tag from first node in partition if any input has fp dtype
                 for input in node.all_input_nodes:
-                    if is_partitioned(input, tag):
+                    if is_partitioned(input, tag) or isinstance(
+                        input.meta["val"], torch.SymInt
+                    ):
                         continue
                     if get_first_fake_tensor(input).dtype.is_floating_point:
                         reporter.report_reject(
@@ -356,7 +358,13 @@ def _partition_has_invalid_uint8(self, partition: Partition, tag: str) -> bool:
                 if dtype is None:
                     try:
                         dtype = get_first_fake_tensor(node).dtype
-                    except (AttributeError, KeyError, RuntimeError, ValueError):
+                    except (
+                        AttributeError,
+                        KeyError,
+                        RuntimeError,
+                        ValueError,
+                        TypeError,
+                    ):
                         dtype = None
             if dtype is None:
                 continue
diff --git a/backends/arm/tosa/utils.py b/backends/arm/tosa/utils.py
index b44793cec5f..b1d727d7d01 100644
--- a/backends/arm/tosa/utils.py
+++ b/backends/arm/tosa/utils.py
@@ -164,6 +164,10 @@ def build_reshape_tosa(
 def normalize_symint(shape):
     """Dynamic shapes in executorch are represented with torch.SymInt objects in
     the shapes, in TOSA we do not have this concept and instead use -1.
+
+    This function replaces each symbolic dimension with -1. Static dimensions
+    are preserved unchanged.
+
     """
     removed_symints = tuple([-1 if isinstance(d, torch.SymInt) else d for d in shape])
     return list(removed_symints)
diff --git a/backends/arm/vgf/backend.py b/backends/arm/vgf/backend.py
index f062cdc90c6..cc2e5a088f4 100644
--- a/backends/arm/vgf/backend.py
+++ b/backends/arm/vgf/backend.py
@@ -14,10 +14,12 @@
 
 import logging
 import os  # nosec B404 - used alongside subprocess for tool invocation
+import shlex
 import shutil
 import subprocess  # nosec B404 - required to drive external converter CLI
 import tempfile
-from typing import final, List
+from dataclasses import dataclass
+from typing import Any, final, List
 
 from executorch.backends.arm._passes import RewriteConvPass
 from executorch.backends.arm._passes.arm_pass_manager import (
@@ -38,7 +40,7 @@
 )
 from executorch.backends.arm.vgf.model_converter import (  # type: ignore[import-not-found]
     model_converter_env,
-    require_model_converter_binary,
+    require_model_converter_executable,
 )
 from executorch.exir.backend.backend_details import (  # type: ignore[import-not-found]
     BackendDetails,
@@ -52,6 +54,94 @@
 # debug functionality
 logger = logging.getLogger(__name__)
 
+STATUS_OK = "PASS"
+STATUS_FAIL = "FAIL"
+VGF_BACKEND_NAME = "VgfBackend"
+
+
+@dataclass(frozen=True)
+class VgfRuntimeEnvironmentCheck:
+    """One VGF runtime backend environment preflight result.
+
+    This lives next to the Python VGF backend name and backend implementation,
+    while importing the actual ExecuTorch runtime lazily so AoT import behavior
+    remains unchanged.
+
+    """
+
+    name: str
+    status: str
+    detail: str
+    action: str | None = None
+
+    @property
+    def ok(self) -> bool:
+        return self.status != STATUS_FAIL
+
+    def to_dict(self) -> dict[str, str | None]:
+        return {
+            "name": self.name,
+            "status": self.status,
+            "detail": self.detail,
+            "action": self.action,
+        }
+
+
+def _load_runtime() -> Any:
+    from executorch.runtime import Runtime
+
+    return Runtime.get()
+
+
+def check_vgf_runtime_backend_environment() -> VgfRuntimeEnvironmentCheck:
+    """Check whether the installed runtime exposes the VGF backend."""
+
+    try:
+        runtime = _load_runtime()
+    except Exception as exc:
+        return VgfRuntimeEnvironmentCheck(
+            "VGF runtime backend",
+            STATUS_FAIL,
+            f"Could not initialize executorch.runtime.Runtime: {exc}",
+            "Install or rebuild ExecuTorch with runtime pybindings. For source "
+            "builds, enable the VGF runtime backend and reinstall the package.",
+        )
+
+    try:
+        registered_backend_names = list(
+            runtime.backend_registry.registered_backend_names
+        )
+        is_available = runtime.backend_registry.is_available(
+            backend_name=VGF_BACKEND_NAME
+        )
+    except Exception as exc:
+        return VgfRuntimeEnvironmentCheck(
+            "VGF runtime backend",
+            STATUS_FAIL,
+            f"Runtime backend registry query failed: {exc}",
+            "Reinstall or rebuild ExecuTorch with backend registry pybindings.",
+        )
+
+    if is_available:
+        return VgfRuntimeEnvironmentCheck(
+            "VGF runtime backend",
+            STATUS_OK,
+            f"{VGF_BACKEND_NAME} is available in the runtime backend registry.",
+        )
+
+    rendered = ", ".join(registered_backend_names[:20])
+    if len(registered_backend_names) > 20:
+        rendered += ", ..."
+
+    return VgfRuntimeEnvironmentCheck(
+        "VGF runtime backend",
+        STATUS_FAIL,
+        f"{VGF_BACKEND_NAME} is not available. Registered backends: "
+        f"{rendered or '<none>'}.",
+        "Use a runtime build/package that includes the VGF backend. For source "
+        "builds, configure with -DEXECUTORCH_BUILD_VGF=ON and reinstall.",
+    )
+
 
 def _register_grid_sampler_rewrite_pass() -> None:
     """Register VGF-only custom shader lowering passes."""
@@ -162,6 +252,52 @@ def preprocess(
         return PreprocessResult(processed_bytes=binary)
 
 
+def _format_repro_command(command: List[str]) -> str:
+    """Return a shell-safe command string for reproducing converter failures."""
+    return " ".join(shlex.quote(arg) for arg in command)
+
+
+def _copy_failure_artifacts(
+    tosa_path: str,
+    artifact_path: str | None,
+    tag_name: str,
+) -> str | None:
+    """Copy the failing TOSA input to the artifact directory, if configured.
+
+    Args:
+        tosa_path: Temporary TOSA flatbuffer passed to model-converter.
+        artifact_path: User-configured intermediate artifact directory.
+        tag_name: Optional delegation tag used to disambiguate artifacts.
+
+    Returns:
+        Path to the copied TOSA file, or None if no artifact path was configured.
+
+    """
+    if not artifact_path:
+        return None
+
+    os.makedirs(artifact_path, exist_ok=True)
+
+    suffix = f"_{tag_name}" if tag_name else ""
+    failure_tosa_path = os.path.join(
+        artifact_path,
+        f"failed_model_converter_input{suffix}.tosa",
+    )
+    shutil.copy2(tosa_path, failure_tosa_path)
+    return failure_tosa_path
+
+
+def _replace_converter_input_path(
+    conversion_command: List[str],
+    input_path: str,
+) -> List[str]:
+    """Return a converter command that uses a preserved TOSA input path."""
+    input_flag_index = conversion_command.index("-i")
+    repro_command = list(conversion_command)
+    repro_command[input_flag_index + 1] = input_path
+    return repro_command
+
+
 def vgf_compile(
     tosa_flatbuffer: bytes,
     compile_flags: List[str],
@@ -191,7 +327,7 @@ def vgf_compile(
             f.write(tosa_flatbuffer)
 
         compile_flags = [f for f in compile_flags if f and f.strip()]
-        converter_binary = require_model_converter_binary()
+        converter_binary = str(require_model_converter_executable())
         vgf_path = tosa_path + ".vgf"
         conversion_command = [
             converter_binary,
@@ -210,11 +346,21 @@ def vgf_compile(
                 env=model_converter_env(),
             )
         except subprocess.CalledProcessError as process_error:
-            conversion_command_str = " ".join(conversion_command)
+            failure_tosa_path = _copy_failure_artifacts(
+                tosa_path,
+                artifact_path,
+                tag_name,
+            )
+            repro_command = (
+                _replace_converter_input_path(conversion_command, failure_tosa_path)
+                if failure_tosa_path
+                else conversion_command
+            )
             raise RuntimeError(
-                f"Vgf compiler ('{conversion_command_str}') failed with error:\n \
-                {process_error.stderr.decode()}\n \
-                Stdout:\n{process_error.stdout.decode()}"
+                "Vgf compiler failed.\n"
+                f"Repro command:\n  {_format_repro_command(repro_command)}\n"
+                f"Stderr:\n{process_error.stderr.decode()}\n"
+                f"Stdout:\n{process_error.stdout.decode()}"
             )
 
         if artifact_path:
diff --git a/backends/arm/vgf/check_env.py b/backends/arm/vgf/check_env.py
index 576964df160..2c7fb9c5396 100644
--- a/backends/arm/vgf/check_env.py
+++ b/backends/arm/vgf/check_env.py
@@ -26,25 +26,18 @@
 import os
 import re
 import shutil
-import subprocess  # nosec B404 - invoked only for trusted local tools
 import sys
 from collections.abc import Sequence
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
-from executorch.backends.arm.vgf.model_converter import (
-    find_model_converter_binary,
-    model_converter_env,
-)
-
+from executorch.backends.arm.vgf import model_converter
 
 STATUS_OK = "PASS"
 STATUS_WARN = "WARN"
 STATUS_FAIL = "FAIL"
 
-VGF_BACKEND_NAME = "VgfBackend"
-
 _REQUIRED_VKML_INSTANCE_LAYERS = {
     "VK_LAYER_ML_Graph_Emulation",
     "VK_LAYER_ML_Tensor_Emulation",
@@ -216,6 +209,17 @@ def _format_check(check: VgfEnvironmentCheck) -> str:
     return "\n".join(lines)
 
 
+def _as_environment_check(check: Any) -> VgfEnvironmentCheck:
+    """Convert a module-owned preflight result into the CLI report type."""
+
+    return VgfEnvironmentCheck(
+        check.name,
+        check.status,
+        check.detail,
+        getattr(check, "action", None),
+    )
+
+
 def _repo_root() -> Path:
     resolved = Path(__file__).resolve()
     for parent in resolved.parents:
@@ -297,165 +301,22 @@ def _check_tosa_serializer() -> VgfEnvironmentCheck:
     )
 
 
-def _resolve_executable(binary: str) -> Path | None:
-    path = Path(binary)
-    if path.is_absolute() or path.parent != Path("."):
-        if _safe_is_file(path) and os.access(path, os.X_OK):
-            return path
-        return None
-
-    resolved = shutil.which(binary)
-    if resolved:
-        return Path(resolved)
-    return None
-
-
-def _command_output(result: subprocess.CompletedProcess[str]) -> str:
-    text = "\n".join(
-        part.strip() for part in (result.stdout, result.stderr) if part.strip()
-    )
-    lines = text.splitlines()
-    if not lines:
-        return "<no output>"
-    return "\n".join(lines[:4])
-
-
 def _check_model_converter() -> VgfEnvironmentCheck:
-    binary = find_model_converter_binary()
-    if binary is None:
-        return VgfEnvironmentCheck(
-            "MLSDK model converter",
-            STATUS_FAIL,
-            "Could not find model-converter on PATH and MODEL_CONVERTER_PATH "
-            "does not point to an executable file.",
-            "Install VGF AoT dependencies with "
-            "python -m pip install 'executorch[vgf]' or, in a source checkout, "
-            "python -m pip install -r backends/arm/requirements-arm-vgf.txt. "
-            "Alternatively set MODEL_CONVERTER_PATH to the converter executable.",
-        )
-
-    executable = _resolve_executable(binary)
-    if executable is None:
-        return VgfEnvironmentCheck(
-            "MLSDK model converter",
-            STATUS_FAIL,
-            f"Resolved converter candidate {binary!r}, but it is not executable.",
-            "Fix MODEL_CONVERTER_PATH or place model-converter on PATH.",
-        )
-
-    try:
-        result = subprocess.run(  # nosec B603 - local converter executable
-            [str(executable), "--version"],
-            check=False,
-            capture_output=True,
-            text=True,
-            timeout=20,
-            env=model_converter_env(),
-        )
-    except Exception as exc:
-        return VgfEnvironmentCheck(
-            "MLSDK model converter",
-            STATUS_FAIL,
-            f"Found {executable}, but running '--version' failed: {exc}",
-            "Check MODEL_CONVERTER_LIB_DIR and the process loader paths. "
-            "For source setup, source examples/arm/arm-scratch/setup_path.sh.",
-        )
-
-    if result.returncode != 0:
-        return VgfEnvironmentCheck(
-            "MLSDK model converter",
-            STATUS_FAIL,
-            f"{executable} --version exited with {result.returncode}:\n"
-            f"{_command_output(result)}",
-            "Check that the model-converter binary and its shared libraries are "
-            "from the same MLSDK install.",
-        )
-
-    return VgfEnvironmentCheck(
-        "MLSDK model converter",
-        STATUS_OK,
-        f"{executable} --version succeeded:\n{_command_output(result)}",
-    )
+    """Run the model-converter preflight and wrap it as a CLI environment check."""
+    return _as_environment_check(model_converter.check_model_converter_environment())
 
 
 def _check_model_converter_lib_dir() -> VgfEnvironmentCheck:
-    lib_dir = os.environ.get("MODEL_CONVERTER_LIB_DIR")
-    if not lib_dir:
-        return VgfEnvironmentCheck(
-            "MODEL_CONVERTER_LIB_DIR",
-            STATUS_OK,
-            "MODEL_CONVERTER_LIB_DIR is not set; relying on the process loader "
-            "paths. This is OK when model-converter --version succeeds.",
-        )
-
-    path = Path(lib_dir).expanduser()
-    if _safe_is_dir(path):
-        return VgfEnvironmentCheck(
-            "MODEL_CONVERTER_LIB_DIR",
-            STATUS_OK,
-            f"MODEL_CONVERTER_LIB_DIR points to existing directory: {path}",
-        )
-
-    return VgfEnvironmentCheck(
-        "MODEL_CONVERTER_LIB_DIR",
-        STATUS_FAIL,
-        f"MODEL_CONVERTER_LIB_DIR={lib_dir!r} does not exist or is not a directory.",
-        "Unset MODEL_CONVERTER_LIB_DIR or set it to the converter library directory.",
+    """Run the MODEL_CONVERTER_LIB_DIR preflight and wrap it as a CLI check."""
+    return _as_environment_check(
+        model_converter.check_model_converter_lib_dir_environment()
     )
 
 
-def _load_runtime() -> Any:
-    from executorch.runtime import Runtime
-
-    return Runtime.get()
-
-
 def _check_runtime_vgf_backend() -> VgfEnvironmentCheck:
-    try:
-        runtime = _load_runtime()
-    except Exception as exc:
-        return VgfEnvironmentCheck(
-            "VGF runtime backend",
-            STATUS_FAIL,
-            f"Could not initialize executorch.runtime.Runtime: {exc}",
-            "Install or rebuild ExecuTorch with runtime pybindings. For source "
-            "builds, enable the VGF runtime backend and reinstall the package.",
-        )
-
-    try:
-        registered_backend_names = list(
-            runtime.backend_registry.registered_backend_names
-        )
-        is_available = runtime.backend_registry.is_available(
-            backend_name=VGF_BACKEND_NAME
-        )
-    except Exception as exc:
-        return VgfEnvironmentCheck(
-            "VGF runtime backend",
-            STATUS_FAIL,
-            f"Runtime backend registry query failed: {exc}",
-            "Reinstall or rebuild ExecuTorch with backend registry pybindings.",
-        )
-
-    if is_available:
-        return VgfEnvironmentCheck(
-            "VGF runtime backend",
-            STATUS_OK,
-            f"{VGF_BACKEND_NAME} is available in the runtime backend registry.",
-        )
+    from executorch.backends.arm.vgf import backend as vgf_backend
 
-    rendered = ", ".join(registered_backend_names[:20])
-    if len(registered_backend_names) > 20:
-        rendered += ", ..."
-
-    return VgfEnvironmentCheck(
-        "VGF runtime backend",
-        STATUS_FAIL,
-        f"{VGF_BACKEND_NAME} is not available. Registered backends: "
-        f"{rendered or '<none>'}.",
-        "Use a runtime build/package that includes the VGF backend. For source "
-        "builds, configure with -DEXECUTORCH_BUILD_VGF=ON and reinstall.",
-    )
+    return _as_environment_check(vgf_backend.check_vgf_runtime_backend_environment())
 
 
 def _package_dirs(package: str) -> list[Path]:
diff --git a/backends/arm/vgf/model_converter.py b/backends/arm/vgf/model_converter.py
index 2d3868837b1..d76abbbcdf6 100644
--- a/backends/arm/vgf/model_converter.py
+++ b/backends/arm/vgf/model_converter.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -6,12 +6,45 @@
 from __future__ import annotations
 
 import os
+import subprocess  # nosec B404 - invoked only for trusted local converter tools
+from dataclasses import dataclass
+from pathlib import Path
 from shutil import which
 from typing import Optional
 
 MODEL_CONVERTER_BINARY = "model-converter"
 _MODEL_CONVERTER_FALLBACK_BINARY = "model_converter"
 
+STATUS_OK = "PASS"
+STATUS_FAIL = "FAIL"
+
+
+@dataclass(frozen=True)
+class ModelConverterEnvironmentCheck:
+    """One model-converter environment preflight result.
+
+    This lives in the same module that resolves and launches the converter so
+    the standalone VGF preflight CLI cannot drift from the actual compiler path.
+
+    """
+
+    name: str
+    status: str
+    detail: str
+    action: str | None = None
+
+    @property
+    def ok(self) -> bool:
+        return self.status != STATUS_FAIL
+
+    def to_dict(self) -> dict[str, str | None]:
+        return {
+            "name": self.name,
+            "status": self.status,
+            "detail": self.detail,
+            "action": self.action,
+        }
+
 
 def find_model_converter_binary() -> Optional[str]:
     """Return the path/name of the first model converter executable found."""
@@ -25,6 +58,20 @@ def find_model_converter_binary() -> Optional[str]:
     return None
 
 
+def _safe_is_file(path: Path) -> bool:
+    try:
+        return path.is_file()
+    except OSError:
+        return False
+
+
+def _safe_is_dir(path: Path) -> bool:
+    try:
+        return path.is_dir()
+    except OSError:
+        return False
+
+
 def model_converter_env() -> dict[str, str]:
     """Return an env dict suitable for running model-converter as a subprocess.
 
@@ -52,3 +99,134 @@ def require_model_converter_binary() -> str:
             f"Tried: {tried}. Ensure the Model Converter is installed and on PATH."
         )
     return binary
+
+
+def resolve_model_converter_executable(binary: str) -> Path | None:
+    """Resolve a converter candidate to an executable path, if possible.
+
+    This is shared by the VGF compiler path and the preflight checker so both
+    agree on what a usable converter executable means.
+
+    """
+
+    path = Path(binary)
+    if path.is_absolute() or path.parent != Path("."):
+        if _safe_is_file(path) and os.access(path, os.X_OK):
+            return path
+        return None
+
+    resolved = which(binary)
+    if resolved:
+        return Path(resolved)
+    return None
+
+
+def require_model_converter_executable() -> Path:
+    """Return a usable converter executable path or raise a helpful error."""
+
+    binary = require_model_converter_binary()
+    executable = resolve_model_converter_executable(binary)
+    if executable is None:
+        raise RuntimeError(
+            f"Resolved converter candidate {binary!r}, but it is not executable. "
+            "Fix MODEL_CONVERTER_PATH or place model-converter on PATH."
+        )
+    return executable
+
+
+def _command_output(result: subprocess.CompletedProcess[str]) -> str:
+    text = "\n".join(
+        part.strip() for part in (result.stdout, result.stderr) if part.strip()
+    )
+    lines = text.splitlines()
+    if not lines:
+        return "<no output>"
+    return "\n".join(lines[:4])
+
+
+def check_model_converter_environment() -> ModelConverterEnvironmentCheck:
+    """Check the model-converter dependency used by VGF compilation."""
+
+    binary = find_model_converter_binary()
+    if binary is None:
+        return ModelConverterEnvironmentCheck(
+            "MLSDK model converter",
+            STATUS_FAIL,
+            "Could not find model-converter on PATH and MODEL_CONVERTER_PATH "
+            "does not point to an executable file.",
+            "Install VGF AoT dependencies with "
+            "python -m pip install 'executorch[vgf]' or, in a source checkout, "
+            "python -m pip install -r backends/arm/requirements-arm-vgf.txt. "
+            "Alternatively set MODEL_CONVERTER_PATH to the converter executable.",
+        )
+
+    executable = resolve_model_converter_executable(binary)
+    if executable is None:
+        return ModelConverterEnvironmentCheck(
+            "MLSDK model converter",
+            STATUS_FAIL,
+            f"Resolved converter candidate {binary!r}, but it is not executable.",
+            "Fix MODEL_CONVERTER_PATH or place model-converter on PATH.",
+        )
+
+    try:
+        result = subprocess.run(  # nosec B603 - local converter executable
+            [str(executable), "--version"],
+            check=False,
+            capture_output=True,
+            text=True,
+            timeout=20,
+            env=model_converter_env(),
+        )
+    except Exception as exc:
+        return ModelConverterEnvironmentCheck(
+            "MLSDK model converter",
+            STATUS_FAIL,
+            f"Found {executable}, but running '--version' failed: {exc}",
+            "Check MODEL_CONVERTER_LIB_DIR and the process loader paths. "
+            "For source setup, source examples/arm/arm-scratch/setup_path.sh.",
+        )
+
+    if result.returncode != 0:
+        return ModelConverterEnvironmentCheck(
+            "MLSDK model converter",
+            STATUS_FAIL,
+            f"{executable} --version exited with {result.returncode}:\n"
+            f"{_command_output(result)}",
+            "Check that the model-converter binary and its shared libraries are "
+            "from the same MLSDK install.",
+        )
+
+    return ModelConverterEnvironmentCheck(
+        "MLSDK model converter",
+        STATUS_OK,
+        f"{executable} --version succeeded:\n{_command_output(result)}",
+    )
+
+
+def check_model_converter_lib_dir_environment() -> ModelConverterEnvironmentCheck:
+    """Check MODEL_CONVERTER_LIB_DIR used by model_converter_env()."""
+
+    lib_dir = os.environ.get("MODEL_CONVERTER_LIB_DIR")
+    if not lib_dir:
+        return ModelConverterEnvironmentCheck(
+            "MODEL_CONVERTER_LIB_DIR",
+            STATUS_OK,
+            "MODEL_CONVERTER_LIB_DIR is not set; relying on the process loader "
+            "paths. This is OK when model-converter --version succeeds.",
+        )
+
+    path = Path(lib_dir).expanduser()
+    if _safe_is_dir(path):
+        return ModelConverterEnvironmentCheck(
+            "MODEL_CONVERTER_LIB_DIR",
+            STATUS_OK,
+            f"MODEL_CONVERTER_LIB_DIR points to existing directory: {path}",
+        )
+
+    return ModelConverterEnvironmentCheck(
+        "MODEL_CONVERTER_LIB_DIR",
+        STATUS_FAIL,
+        f"MODEL_CONVERTER_LIB_DIR={lib_dir!r} does not exist or is not a directory.",
+        "Unset MODEL_CONVERTER_LIB_DIR or set it to the converter library directory.",
+    )
diff --git a/backends/cadence/fused_quant/op_add.cpp b/backends/cadence/fused_quant/op_add.cpp
index 62e58c71c83..1aea2ccfb6c 100644
--- a/backends/cadence/fused_quant/op_add.cpp
+++ b/backends/cadence/fused_quant/op_add.cpp
@@ -14,10 +14,10 @@ namespace cadence {
 namespace fused_quant {
 namespace native {
 
-using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/op_add.h b/backends/cadence/fused_quant/op_add.h
index 9db1e907294..b32710f41de 100644
--- a/backends/cadence/fused_quant/op_add.h
+++ b/backends/cadence/fused_quant/op_add.h
@@ -19,19 +19,18 @@ executorch::aten::Tensor& add_out(
     executorch::runtime::KernelRuntimeContext& ctx,
     const executorch::aten::Tensor& inp,
     const executorch::aten::Tensor& other,
-    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    const std::optional<executorch::aten::Tensor>& inp_scale,
+    const std::optional<executorch::aten::Tensor>& inp_zero_point,
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>&
-        other_zero_point,
+    const std::optional<executorch::aten::Tensor>& other_scale,
+    const std::optional<executorch::aten::Tensor>& other_zero_point,
     executorch::aten::ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    const std::optional<executorch::aten::Tensor>& out_scale,
+    const std::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
diff --git a/backends/cadence/fused_quant/op_bmm.cpp b/backends/cadence/fused_quant/op_bmm.cpp
index 7204ab6c88f..8d071b48a33 100644
--- a/backends/cadence/fused_quant/op_bmm.cpp
+++ b/backends/cadence/fused_quant/op_bmm.cpp
@@ -14,10 +14,10 @@ namespace cadence {
 namespace fused_quant {
 namespace native {
 
-using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/op_bmm.h b/backends/cadence/fused_quant/op_bmm.h
index ef9598eac98..c6a4502983f 100644
--- a/backends/cadence/fused_quant/op_bmm.h
+++ b/backends/cadence/fused_quant/op_bmm.h
@@ -19,19 +19,18 @@ executorch::aten::Tensor& bmm_out(
     executorch::runtime::KernelRuntimeContext& ctx,
     const executorch::aten::Tensor& inp,
     const executorch::aten::Tensor& other,
-    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    const std::optional<executorch::aten::Tensor>& inp_scale,
+    const std::optional<executorch::aten::Tensor>& inp_zero_point,
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>&
-        other_zero_point,
+    const std::optional<executorch::aten::Tensor>& other_scale,
+    const std::optional<executorch::aten::Tensor>& other_zero_point,
     executorch::aten::ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    const std::optional<executorch::aten::Tensor>& out_scale,
+    const std::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
diff --git a/backends/cadence/fused_quant/op_hardswish.cpp b/backends/cadence/fused_quant/op_hardswish.cpp
index 452ea90a405..4b968cebe6c 100644
--- a/backends/cadence/fused_quant/op_hardswish.cpp
+++ b/backends/cadence/fused_quant/op_hardswish.cpp
@@ -16,10 +16,10 @@ namespace cadence {
 namespace fused_quant {
 namespace native {
 
-using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/op_hardswish.h b/backends/cadence/fused_quant/op_hardswish.h
index ba9e09da23c..de7d88b427b 100644
--- a/backends/cadence/fused_quant/op_hardswish.h
+++ b/backends/cadence/fused_quant/op_hardswish.h
@@ -18,13 +18,13 @@ namespace native {
 executorch::aten::Tensor& hardswish_out(
     executorch::runtime::KernelRuntimeContext& ctx,
     const executorch::aten::Tensor& inp,
-    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    const std::optional<executorch::aten::Tensor>& inp_scale,
+    const std::optional<executorch::aten::Tensor>& inp_zero_point,
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    const std::optional<executorch::aten::Tensor>& out_scale,
+    const std::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
diff --git a/backends/cadence/fused_quant/op_mul.cpp b/backends/cadence/fused_quant/op_mul.cpp
index 3d071f7c2da..a2595104ae8 100644
--- a/backends/cadence/fused_quant/op_mul.cpp
+++ b/backends/cadence/fused_quant/op_mul.cpp
@@ -14,10 +14,10 @@ namespace cadence {
 namespace fused_quant {
 namespace native {
 
-using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/op_mul.h b/backends/cadence/fused_quant/op_mul.h
index f7afa016b79..62314c98003 100644
--- a/backends/cadence/fused_quant/op_mul.h
+++ b/backends/cadence/fused_quant/op_mul.h
@@ -19,19 +19,18 @@ executorch::aten::Tensor& mul_out(
     executorch::runtime::KernelRuntimeContext& ctx,
     const executorch::aten::Tensor& inp,
     const executorch::aten::Tensor& other,
-    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    const std::optional<executorch::aten::Tensor>& inp_scale,
+    const std::optional<executorch::aten::Tensor>& inp_zero_point,
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>&
-        other_zero_point,
+    const std::optional<executorch::aten::Tensor>& other_scale,
+    const std::optional<executorch::aten::Tensor>& other_zero_point,
     executorch::aten::ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    const std::optional<executorch::aten::Tensor>& out_scale,
+    const std::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
diff --git a/backends/cadence/fused_quant/op_relu.cpp b/backends/cadence/fused_quant/op_relu.cpp
index ebe7933a7b9..e8e58522d2e 100644
--- a/backends/cadence/fused_quant/op_relu.cpp
+++ b/backends/cadence/fused_quant/op_relu.cpp
@@ -16,10 +16,10 @@ namespace cadence {
 namespace fused_quant {
 namespace native {
 
-using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/op_relu.h b/backends/cadence/fused_quant/op_relu.h
index e8527c7633f..522144eacd0 100644
--- a/backends/cadence/fused_quant/op_relu.h
+++ b/backends/cadence/fused_quant/op_relu.h
@@ -18,13 +18,13 @@ namespace native {
 executorch::aten::Tensor& relu_out(
     executorch::runtime::KernelRuntimeContext& ctx,
     const executorch::aten::Tensor& inp,
-    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    const std::optional<executorch::aten::Tensor>& inp_scale,
+    const std::optional<executorch::aten::Tensor>& inp_zero_point,
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
-    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    const std::optional<executorch::aten::Tensor>& out_scale,
+    const std::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
diff --git a/backends/cadence/fused_quant/quant_utils.h b/backends/cadence/fused_quant/quant_utils.h
index fff669a9e0e..78884bfcceb 100644
--- a/backends/cadence/fused_quant/quant_utils.h
+++ b/backends/cadence/fused_quant/quant_utils.h
@@ -64,8 +64,8 @@ struct QParams {
 };
 
 inline QParams extract_qparams(
-    const executorch::aten::optional<executorch::aten::Tensor>& scale_tensor,
-    const executorch::aten::optional<executorch::aten::Tensor>& zp_tensor,
+    const std::optional<executorch::aten::Tensor>& scale_tensor,
+    const std::optional<executorch::aten::Tensor>& zp_tensor,
     int64_t quant_min,
     int64_t quant_max,
     const executorch::aten::Tensor& data_tensor) {
diff --git a/backends/cadence/fused_quant/tests/test_op_add.cpp b/backends/cadence/fused_quant/tests/test_op_add.cpp
index dca110cf0e1..61124f0b9b2 100644
--- a/backends/cadence/fused_quant/tests/test_op_add.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_add.cpp
@@ -14,10 +14,10 @@
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 
-using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::testing::TensorFactory;
+using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/tests/test_op_bmm.cpp b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
index 5ede47ea8a9..bae04993a7a 100644
--- a/backends/cadence/fused_quant/tests/test_op_bmm.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
@@ -14,10 +14,10 @@
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 
-using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::testing::TensorFactory;
+using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/tests/test_op_hardswish.cpp b/backends/cadence/fused_quant/tests/test_op_hardswish.cpp
index 502d680d2e3..eb6231161f2 100644
--- a/backends/cadence/fused_quant/tests/test_op_hardswish.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_hardswish.cpp
@@ -14,10 +14,10 @@
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 
-using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::testing::TensorFactory;
+using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/tests/test_op_mul.cpp b/backends/cadence/fused_quant/tests/test_op_mul.cpp
index 0b9addabc5e..da27c7287c9 100644
--- a/backends/cadence/fused_quant/tests/test_op_mul.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_mul.cpp
@@ -14,10 +14,10 @@
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 
-using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::testing::TensorFactory;
+using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/fused_quant/tests/test_op_relu.cpp b/backends/cadence/fused_quant/tests/test_op_relu.cpp
index 6b83551fd2b..1096daae202 100644
--- a/backends/cadence/fused_quant/tests/test_op_relu.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_relu.cpp
@@ -14,10 +14,10 @@
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 
-using executorch::aten::optional;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::testing::TensorFactory;
+using std::optional;
 
 namespace {
 
diff --git a/backends/cadence/generic/operators/op_avg_pool2d.cpp b/backends/cadence/generic/operators/op_avg_pool2d.cpp
index b04187db62e..c33f91151fb 100644
--- a/backends/cadence/generic/operators/op_avg_pool2d.cpp
+++ b/backends/cadence/generic/operators/op_avg_pool2d.cpp
@@ -19,11 +19,11 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
-using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::getLeadingDims;
 using ::executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 // Compute the avg_pool2d for in_data in NCHW layout. IT is the input datatype,
 // and AT is the accumulation datatype. 'quantized' is true when the input is
diff --git a/backends/cadence/generic/operators/op_avg_pool2d.h b/backends/cadence/generic/operators/op_avg_pool2d.h
index 05f1810bb61..85b5d55a84b 100644
--- a/backends/cadence/generic/operators/op_avg_pool2d.h
+++ b/backends/cadence/generic/operators/op_avg_pool2d.h
@@ -23,9 +23,8 @@ ::executorch::aten::Tensor& avg_pool2d_out(
     ::executorch::aten::IntArrayRef padding,
     bool ceil_mode,
     bool count_include_pad,
-    ::executorch::aten::optional<int64_t> divisor_override,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>&
-        in_zero_point_t,
+    std::optional<int64_t> divisor_override,
+    const std::optional<::executorch::aten::Tensor>& in_zero_point_t,
     bool channel_last,
     ::executorch::aten::Tensor& out);
 
diff --git a/backends/cadence/generic/operators/op_fully_connected.cpp b/backends/cadence/generic/operators/op_fully_connected.cpp
index 36befc52102..b65f8016880 100644
--- a/backends/cadence/generic/operators/op_fully_connected.cpp
+++ b/backends/cadence/generic/operators/op_fully_connected.cpp
@@ -15,10 +15,10 @@ namespace impl {
 namespace generic {
 namespace native {
 
-using ::executorch::aten::optional;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::getLeadingDims;
 using ::executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 void linear(
     const Tensor& input,
diff --git a/backends/cadence/generic/operators/op_fully_connected.h b/backends/cadence/generic/operators/op_fully_connected.h
index d23bcbeb70c..7e03f5ef664 100644
--- a/backends/cadence/generic/operators/op_fully_connected.h
+++ b/backends/cadence/generic/operators/op_fully_connected.h
@@ -15,9 +15,9 @@ namespace impl {
 namespace generic {
 namespace native {
 
-using ::executorch::aten::optional;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 Tensor& fully_connected_out(
     KernelRuntimeContext& ctx,
diff --git a/backends/cadence/generic/operators/op_linalg_svd.cpp b/backends/cadence/generic/operators/op_linalg_svd.cpp
index 4974b617418..4cb4f6397ea 100644
--- a/backends/cadence/generic/operators/op_linalg_svd.cpp
+++ b/backends/cadence/generic/operators/op_linalg_svd.cpp
@@ -261,7 +261,7 @@ std::tuple<Tensor&, Tensor&, Tensor&> linalg_svd_out(
     const Tensor& A,
     bool full_matrices,
     bool compute_uv,
-    ::executorch::aten::optional<::executorch::aten::string_view> driver,
+    std::optional<std::string_view> driver,
     Tensor& U,
     Tensor& S,
     Tensor& Vh) {
diff --git a/backends/cadence/generic/operators/op_linalg_svd.h b/backends/cadence/generic/operators/op_linalg_svd.h
index 7635276c4f5..e8335b7fa0e 100644
--- a/backends/cadence/generic/operators/op_linalg_svd.h
+++ b/backends/cadence/generic/operators/op_linalg_svd.h
@@ -26,7 +26,7 @@ linalg_svd_out(
     const ::executorch::aten::Tensor& A,
     bool full_matrices,
     bool compute_uv,
-    ::executorch::aten::optional<::executorch::aten::string_view> driver,
+    std::optional<std::string_view> driver,
     ::executorch::aten::Tensor& U,
     ::executorch::aten::Tensor& S,
     ::executorch::aten::Tensor& Vh);
diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp
index 6f42543cfc1..8a427045a83 100644
--- a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp
+++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp
@@ -256,7 +256,7 @@ ::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out(
     int64_t output_zero_point,
     __ET_UNUSED int64_t out_multiplier,
     __ET_UNUSED int64_t out_shift,
-    __ET_UNUSED const ::executorch::aten::optional<Tensor>& offset,
+    __ET_UNUSED const std::optional<Tensor>& offset,
     Tensor& out) {
   (void)ctx;
   quantized_conv1d_nlc(
diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h
index 4f4d2877b27..f1780497f73 100644
--- a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h
+++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h
@@ -54,7 +54,7 @@ ::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out(
     int64_t output_zero_point,
     int64_t out_multiplier,
     int64_t out_shift,
-    const ::executorch::aten::optional<Tensor>& offset,
+    const std::optional<Tensor>& offset,
     Tensor& out);
 
 } // namespace native
diff --git a/backends/cadence/generic/operators/op_quantized_conv2d.cpp b/backends/cadence/generic/operators/op_quantized_conv2d.cpp
index 0811267a3b8..f6755f9dda8 100644
--- a/backends/cadence/generic/operators/op_quantized_conv2d.cpp
+++ b/backends/cadence/generic/operators/op_quantized_conv2d.cpp
@@ -16,11 +16,11 @@ namespace impl {
 namespace generic {
 namespace native {
 
-using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::quantize;
+using std::optional;
 
 /* This implements a generic 2d conv kernel that operates on raw pointers.
  * The quantized version handles quantized convolutions for 2D inputs.
@@ -936,7 +936,7 @@ Tensor& quantized_conv2d_nhwc_per_tensor_out(
     int64_t output_zero_point,
     ET_UNUSED int64_t out_multiplier,
     ET_UNUSED int64_t out_shift,
-    ET_UNUSED const ::executorch::aten::optional<Tensor>& offset,
+    ET_UNUSED const std::optional<Tensor>& offset,
     Tensor& out) {
   quantized_conv2d_nhwc(
       input,
diff --git a/backends/cadence/generic/operators/op_quantized_conv2d.h b/backends/cadence/generic/operators/op_quantized_conv2d.h
index bb9476e2644..02740d3afec 100644
--- a/backends/cadence/generic/operators/op_quantized_conv2d.h
+++ b/backends/cadence/generic/operators/op_quantized_conv2d.h
@@ -205,7 +205,7 @@ ::executorch::aten::Tensor& quantized_conv2d_nhwc_per_tensor_out(
     int64_t output_zero_point,
     int64_t out_multiplier,
     int64_t out_shift,
-    const ::executorch::aten::optional<Tensor>& offset,
+    const std::optional<Tensor>& offset,
     Tensor& out);
 
 ::executorch::aten::Tensor& quantized_conv2d_depthwise_nhwc_out(
diff --git a/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp b/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp
index a8f98a76ffc..05fb809cd51 100644
--- a/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp
+++ b/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp
@@ -57,7 +57,7 @@ ::executorch::aten::Tensor& quantized_depthwise_conv1d_nlc_per_tensor_out(
       output_zero_point,
       out_multiplier,
       out_shift,
-      ::executorch::aten::optional<Tensor>(),
+      std::optional<Tensor>(),
       out);
 }
 
diff --git a/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp b/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp
index 55ca67648ca..d2e0d6a8bd9 100644
--- a/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp
+++ b/backends/cadence/generic/operators/op_quantized_embedding_byte.cpp
@@ -19,11 +19,11 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
-using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 #define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \
   _(uint8_t, Byte)                           \
diff --git a/backends/cadence/generic/operators/op_quantized_embedding_byte.h b/backends/cadence/generic/operators/op_quantized_embedding_byte.h
index a46bebe09df..84fc53620a0 100644
--- a/backends/cadence/generic/operators/op_quantized_embedding_byte.h
+++ b/backends/cadence/generic/operators/op_quantized_embedding_byte.h
@@ -19,8 +19,7 @@ ::executorch::aten::Tensor& quantized_embedding_byte_out(
     ::executorch::runtime::KernelRuntimeContext& ctx,
     const ::executorch::aten::Tensor& weight,
     const ::executorch::aten::Tensor& weight_scales,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>&
-        weight_zero_points,
+    const std::optional<::executorch::aten::Tensor>& weight_zero_points,
     const ::executorch::aten::Tensor& indices,
     bool pruned_weights,
     ::executorch::aten::Tensor& out);
diff --git a/backends/cadence/generic/operators/op_quantized_fully_connected.cpp b/backends/cadence/generic/operators/op_quantized_fully_connected.cpp
index 55e29cb7f52..ce74b5b8b7f 100644
--- a/backends/cadence/generic/operators/op_quantized_fully_connected.cpp
+++ b/backends/cadence/generic/operators/op_quantized_fully_connected.cpp
@@ -16,10 +16,10 @@ namespace impl {
 namespace generic {
 namespace native {
 
-using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 Tensor& quantized_fully_connected_out(
     ET_UNUSED KernelRuntimeContext& ctx,
diff --git a/backends/cadence/generic/operators/op_quantized_fully_connected.h b/backends/cadence/generic/operators/op_quantized_fully_connected.h
index a7510fba95f..408fbabe726 100644
--- a/backends/cadence/generic/operators/op_quantized_fully_connected.h
+++ b/backends/cadence/generic/operators/op_quantized_fully_connected.h
@@ -25,7 +25,7 @@ ::executorch::aten::Tensor& quantized_fully_connected_out(
     const ::executorch::aten::Tensor& out_multiplier,
     const ::executorch::aten::Tensor& out_shift,
     int64_t out_zero_point,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
+    const std::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor& quantized_fully_connected_per_tensor_out(
@@ -38,7 +38,7 @@ ::executorch::aten::Tensor& quantized_fully_connected_per_tensor_out(
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
+    const std::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor&
@@ -52,7 +52,7 @@ quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor_out(
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
+    const std::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor&
@@ -66,7 +66,7 @@ quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out(
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
+    const std::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 } // namespace native
diff --git a/backends/cadence/generic/operators/op_quantized_layer_norm.cpp b/backends/cadence/generic/operators/op_quantized_layer_norm.cpp
index e34ed342d22..85825cff94d 100644
--- a/backends/cadence/generic/operators/op_quantized_layer_norm.cpp
+++ b/backends/cadence/generic/operators/op_quantized_layer_norm.cpp
@@ -24,7 +24,6 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
-using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
@@ -32,6 +31,7 @@ using ::executorch::runtime::getLeadingDims;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::dequantize;
 using ::impl::generic::kernels::quantize;
+using std::optional;
 
 // Compute quantized layer_norm. The current implementation assumes that the
 // input is per-tensor quantized.
diff --git a/backends/cadence/generic/operators/op_quantized_linear.cpp b/backends/cadence/generic/operators/op_quantized_linear.cpp
index 87f990a855b..02ff97de74d 100644
--- a/backends/cadence/generic/operators/op_quantized_linear.cpp
+++ b/backends/cadence/generic/operators/op_quantized_linear.cpp
@@ -18,11 +18,11 @@ namespace impl {
 namespace generic {
 namespace native {
 
-using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::executorch::runtime::toString;
+using std::optional;
 
 Tensor& quantized_linear_out(
     ET_UNUSED KernelRuntimeContext& ctx,
diff --git a/backends/cadence/generic/operators/op_quantized_linear.h b/backends/cadence/generic/operators/op_quantized_linear.h
index b5396cb9701..517357d5bf9 100644
--- a/backends/cadence/generic/operators/op_quantized_linear.h
+++ b/backends/cadence/generic/operators/op_quantized_linear.h
@@ -25,7 +25,7 @@ ::executorch::aten::Tensor& quantized_linear_out(
     const ::executorch::aten::Tensor& out_multiplier,
     const ::executorch::aten::Tensor& out_shift,
     int64_t out_zero_point,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
+    const std::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor& quantized_linear_per_tensor_out(
@@ -38,7 +38,7 @@ ::executorch::aten::Tensor& quantized_linear_per_tensor_out(
     const int64_t out_multiplier,
     const int64_t out_shift,
     const int64_t out_zero_point,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
+    const std::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor&
diff --git a/backends/cadence/generic/operators/op_quantized_matmul.cpp b/backends/cadence/generic/operators/op_quantized_matmul.cpp
index e3fb0f00fdc..b84c879e65d 100644
--- a/backends/cadence/generic/operators/op_quantized_matmul.cpp
+++ b/backends/cadence/generic/operators/op_quantized_matmul.cpp
@@ -21,12 +21,12 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
-using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::quantize;
+using std::optional;
 
 // The quantized matmul. The quantized matmul accumulates in a wider register,
 // whose type is TA.
diff --git a/backends/cadence/generic/operators/op_quantized_matmul.h b/backends/cadence/generic/operators/op_quantized_matmul.h
index 70775380aac..c28862aa11e 100644
--- a/backends/cadence/generic/operators/op_quantized_matmul.h
+++ b/backends/cadence/generic/operators/op_quantized_matmul.h
@@ -15,9 +15,9 @@ namespace impl {
 namespace generic {
 namespace native {
 
-using ::executorch::aten::optional;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 Tensor& quantized_matmul_out(
     KernelRuntimeContext& ctx,
diff --git a/backends/cadence/generic/operators/op_quantized_mul.cpp b/backends/cadence/generic/operators/op_quantized_mul.cpp
index 30352ee9d52..359a305b020 100644
--- a/backends/cadence/generic/operators/op_quantized_mul.cpp
+++ b/backends/cadence/generic/operators/op_quantized_mul.cpp
@@ -21,13 +21,13 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
-using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::dequantize;
 using ::impl::generic::kernels::quantize;
+using std::optional;
 
 DECLARE_POINTWISE_TENSOR_QUANTIZED_BINARY_OP(quantized_mul_, *);
 
diff --git a/backends/cadence/generic/operators/op_quantized_relu.cpp b/backends/cadence/generic/operators/op_quantized_relu.cpp
index 9430951f65b..ecb87bd1b90 100644
--- a/backends/cadence/generic/operators/op_quantized_relu.cpp
+++ b/backends/cadence/generic/operators/op_quantized_relu.cpp
@@ -21,12 +21,12 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
-using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::quantize;
+using std::optional;
 
 template <typename T>
 void quantized_relu_per_tensor_out_(
diff --git a/backends/cadence/generic/operators/op_requantize.cpp b/backends/cadence/generic/operators/op_requantize.cpp
index f846a1964a3..b9df6f1f355 100644
--- a/backends/cadence/generic/operators/op_requantize.cpp
+++ b/backends/cadence/generic/operators/op_requantize.cpp
@@ -19,13 +19,13 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
-using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::dequantize;
 using ::impl::generic::kernels::quantize;
+using std::optional;
 
 // Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor.
 // The scale and zero_point for requantization are in the args.
diff --git a/backends/cadence/generic/operators/op_rope.cpp b/backends/cadence/generic/operators/op_rope.cpp
index 17ee6d2a684..fcc7d629cf7 100644
--- a/backends/cadence/generic/operators/op_rope.cpp
+++ b/backends/cadence/generic/operators/op_rope.cpp
@@ -12,8 +12,8 @@ namespace impl {
 namespace generic {
 namespace native {
 
-using ::executorch::aten::optional;
 using ::executorch::aten::Tensor;
+using std::optional;
 
 Tensor& rope_out(
     ET_UNUSED ::executorch::runtime::KernelRuntimeContext& ctx,
@@ -75,8 +75,8 @@ namespace impl {
 namespace generic {
 namespace native {
 
-using ::executorch::aten::optional;
 using ::executorch::aten::Tensor;
+using std::optional;
 
 Tensor& rope_rotate_stacked_halves_out(
     ET_UNUSED ::executorch::runtime::KernelRuntimeContext& ctx,
diff --git a/backends/cadence/generic/operators/op_rope.h b/backends/cadence/generic/operators/op_rope.h
index 638677bf118..d738cfda6c1 100644
--- a/backends/cadence/generic/operators/op_rope.h
+++ b/backends/cadence/generic/operators/op_rope.h
@@ -20,7 +20,7 @@ ::executorch::aten::Tensor& rope_out(
     const ::executorch::aten::Tensor& input,
     const ::executorch::aten::Tensor& sin_tensor,
     const ::executorch::aten::Tensor& cos_tensor,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& pos,
+    const std::optional<::executorch::aten::Tensor>& pos,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor& rope_rotate_stacked_halves_out(
@@ -28,7 +28,7 @@ ::executorch::aten::Tensor& rope_rotate_stacked_halves_out(
     const ::executorch::aten::Tensor& input,
     const ::executorch::aten::Tensor& sin_tensor,
     const ::executorch::aten::Tensor& cos_tensor,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& pos,
+    const std::optional<::executorch::aten::Tensor>& pos,
     ::executorch::aten::Tensor& out);
 
 } // namespace native
diff --git a/backends/cadence/generic/operators/op_softmax.cpp b/backends/cadence/generic/operators/op_softmax.cpp
index 97c64a22511..b680d1e2471 100644
--- a/backends/cadence/generic/operators/op_softmax.cpp
+++ b/backends/cadence/generic/operators/op_softmax.cpp
@@ -125,7 +125,7 @@ Tensor& _softmax_f32_f32_out(
     __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& X,
     int64_t dim,
-    __ET_UNUSED ::executorch::aten::optional<bool> half_to_float,
+    __ET_UNUSED std::optional<bool> half_to_float,
     Tensor& Y) {
   _softmax_out(ctx, X, dim, false, Y);
 
diff --git a/backends/cadence/generic/operators/op_softmax.h b/backends/cadence/generic/operators/op_softmax.h
index ec51b1d00c0..d83703117b0 100644
--- a/backends/cadence/generic/operators/op_softmax.h
+++ b/backends/cadence/generic/operators/op_softmax.h
@@ -26,7 +26,7 @@ ::executorch::aten::Tensor& _softmax_f32_f32_out(
     __ET_UNUSED ::executorch::runtime::KernelRuntimeContext& ctx,
     const ::executorch::aten::Tensor& X,
     int64_t dim,
-    __ET_UNUSED ::executorch::aten::optional<bool> half_to_float,
+    __ET_UNUSED std::optional<bool> half_to_float,
     ::executorch::aten::Tensor& Y);
 
 } // namespace native
diff --git a/backends/cadence/generic/operators/op_transposed_convolution.cpp b/backends/cadence/generic/operators/op_transposed_convolution.cpp
index 121b479e65f..b742ec635b2 100644
--- a/backends/cadence/generic/operators/op_transposed_convolution.cpp
+++ b/backends/cadence/generic/operators/op_transposed_convolution.cpp
@@ -16,12 +16,12 @@ namespace generic {
 namespace native {
 
 using ::executorch::aten::IntArrayRef;
-using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::quantize;
+using std::optional;
 
 // This implements a generic 2d transposed_conv kernel that operates on raw
 // pointers. The version handles both quantized and fp32 convolutions.
diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp
index 514813fbe05..ccd54e80698 100644
--- a/backends/cadence/hifi/operators/op_mean.cpp
+++ b/backends/cadence/hifi/operators/op_mean.cpp
@@ -17,8 +17,8 @@ using executorch::aten::RuntimeContext;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::ArrayRef;
+using std::optional;
 using torch::executor::Error;
-using torch::executor::optional;
 
 namespace impl {
 namespace HiFi {
diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp
index 5171c2908bc..9d363469f74 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp
@@ -238,7 +238,7 @@ void quantized_conv1d_nlc_per_tensor_out(
     int64_t output_zero_point,
     int64_t out_multiplier,
     int64_t out_shift,
-    __ET_UNUSED const ::executorch::aten::optional<Tensor>& offset,
+    __ET_UNUSED const std::optional<Tensor>& offset,
     Tensor& out) {
   // HiFi nnlib kernels only support dilation=1.
   // Fall back to generic implementation for dilation > 1.
diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
index ea3a756f995..86ef244711d 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
@@ -17,7 +17,7 @@ using Tensor = executorch::aten::Tensor;
 using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
 using ScalarType = executorch::aten::ScalarType;
 using ::executorch::aten::IntArrayRef;
-using ::executorch::aten::optional;
+using std::optional;
 
 namespace impl {
 namespace HiFi {
diff --git a/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp
index 4299990b52a..a8e2b42d77d 100644
--- a/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp
@@ -206,7 +206,7 @@ void quantized_depthwise_conv1d_nlc_per_tensor_out(
         output_zero_point,
         out_multiplier,
         out_shift,
-        ::executorch::aten::optional<Tensor>(),
+        std::optional<Tensor>(),
         out);
     return;
   }
diff --git a/backends/cadence/hifi/operators/op_quantized_matmul_out.h b/backends/cadence/hifi/operators/op_quantized_matmul_out.h
index c53a07b58aa..a567c7f650d 100644
--- a/backends/cadence/hifi/operators/op_quantized_matmul_out.h
+++ b/backends/cadence/hifi/operators/op_quantized_matmul_out.h
@@ -21,7 +21,7 @@ ::executorch::aten::Tensor& quantized_matmul_out(
     int64_t X_zero_point,
     const ::executorch::aten::Tensor& Y,
     int64_t Y_zero_point,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& bias,
+    const std::optional<::executorch::aten::Tensor>& bias,
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
diff --git a/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp b/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp
index 074ff29b301..907156af1f7 100644
--- a/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp
+++ b/backends/cadence/hifi/operators/op_softmax_f32_f32.cpp
@@ -22,7 +22,7 @@ inline Tensor& _softmax_f32_f32_out(
     KernelRuntimeContext& ctx,
     const Tensor& in,
     int64_t dim,
-    ::executorch::aten::optional<bool> half_to_float,
+    std::optional<bool> half_to_float,
     Tensor& out) {
   constexpr int kNnlibMaxDim = 16;
 
@@ -146,7 +146,7 @@ Tensor& softmax_f32_f32_out(
     KernelRuntimeContext& ctx,
     const Tensor& in,
     int64_t dim,
-    ::executorch::aten::optional<bool> half_to_float,
+    std::optional<bool> half_to_float,
     Tensor& out) {
   return _softmax_f32_f32_out(ctx, in, dim, half_to_float, out);
 }
diff --git a/backends/cadence/hifi/operators/operators.h b/backends/cadence/hifi/operators/operators.h
index 3ca505d40cb..fa6847f744b 100644
--- a/backends/cadence/hifi/operators/operators.h
+++ b/backends/cadence/hifi/operators/operators.h
@@ -72,7 +72,7 @@ void quantized_linear_out(
     const ::executorch::aten::Tensor& out_multiplier,
     const ::executorch::aten::Tensor& out_shift,
     int64_t out_zero_point,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
+    const std::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 void quantized_linear_per_tensor_out(
@@ -85,7 +85,7 @@ void quantized_linear_per_tensor_out(
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
+    const std::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 void quantized_conv2d_nhwc_out(
@@ -158,7 +158,7 @@ void quantized_conv2d_nhwc_per_tensor_out(
     int64_t output_zero_point,
     int64_t out_multiplier,
     int64_t out_shift,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
+    const std::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);
 
 ::executorch::aten::Tensor& cat_out(
diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp b/backends/cadence/vision/operators/op_quantized_conv_out.cpp
index be4b34bff03..aaba9f5696d 100644
--- a/backends/cadence/vision/operators/op_quantized_conv_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_conv_out.cpp
@@ -582,7 +582,7 @@ void quantized_conv2d_nhwc_per_tensor_out(
     int64_t output_zero_point,
     int64_t out_multiplier,
     int64_t out_shift,
-    ET_UNUSED const ::executorch::aten::optional<Tensor>& offset,
+    ET_UNUSED const std::optional<Tensor>& offset,
     Tensor& out) {
   quantized_conv_per_tensor_out(
       ctx,
diff --git a/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp b/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp
index 29aa8906414..c53f7f7667a 100644
--- a/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp
@@ -13,10 +13,10 @@ namespace impl {
 namespace vision {
 namespace native {
 
-using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 void quantized_fully_connected_out(
     __ET_UNUSED KernelRuntimeContext& ctx,
diff --git a/backends/cadence/vision/operators/op_quantized_linear_out.cpp b/backends/cadence/vision/operators/op_quantized_linear_out.cpp
index b6b7cdd17bc..7b3daed8ef6 100644
--- a/backends/cadence/vision/operators/op_quantized_linear_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_linear_out.cpp
@@ -84,7 +84,7 @@ void quantized_linear_out(
     const Tensor& out_multiplier,
     const Tensor& out_shift,
     int64_t out_zero_point,
-    __ET_UNUSED const executorch::aten::optional<Tensor>& offset,
+    __ET_UNUSED const std::optional<Tensor>& offset,
     Tensor& out) {
   // TODO: refactor to use switch case as quantized_linear_per_tensor_out
   if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
@@ -127,7 +127,7 @@ void quantized_linear_per_tensor_out(
     const int64_t out_multiplier,
     const int64_t out_shift,
     const int64_t out_zero_point,
-    __ET_UNUSED const executorch::aten::optional<Tensor>& offset,
+    __ET_UNUSED const std::optional<Tensor>& offset,
     Tensor& out) {
 #define typed_quantized_linear_per_tensor(ctype, dtype) \
   case executorch::aten::ScalarType::dtype: {           \
diff --git a/backends/cadence/vision/operators/op_quantized_matmul_out.cpp b/backends/cadence/vision/operators/op_quantized_matmul_out.cpp
index 54a303288c3..e63ae5bdda1 100644
--- a/backends/cadence/vision/operators/op_quantized_matmul_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_matmul_out.cpp
@@ -60,7 +60,7 @@ void inline _typed_quantized_matmul(
     int64_t X_zero_point,
     const Tensor& Y,
     int64_t Y_zero_point,
-    const executorch::aten::optional<Tensor>& bias,
+    const std::optional<Tensor>& bias,
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
@@ -114,7 +114,7 @@ void quantized_matmul_out(
     int64_t X_zero_point,
     const Tensor& Y,
     int64_t Y_zero_point,
-    const executorch::aten::optional<Tensor>& bias,
+    const std::optional<Tensor>& bias,
     int64_t out_multiplier,
     int64_t out_shift,
     int64_t out_zero_point,
diff --git a/backends/cadence/vision/operators/op_softmax.cpp b/backends/cadence/vision/operators/op_softmax.cpp
index 58ca33c6a0b..6b93709b226 100644
--- a/backends/cadence/vision/operators/op_softmax.cpp
+++ b/backends/cadence/vision/operators/op_softmax.cpp
@@ -50,7 +50,7 @@ Tensor& _softmax_out(
   // Adjust for negative dim
   dim = dim < 0 ? dim + executorch::runtime::nonzero_dim(in) : dim;
 
-  const executorch::aten::optional<int64_t>& dim_t = dim;
+  const std::optional<int64_t>& dim_t = dim;
   const size_t d = ET_NORMALIZE_IX(dim_t.value(), in.dim());
   const size_t size = in.size(d);
 
diff --git a/backends/cadence/vision/operators/operators.h b/backends/cadence/vision/operators/operators.h
index 8b5db4161eb..1c756c0b237 100644
--- a/backends/cadence/vision/operators/operators.h
+++ b/backends/cadence/vision/operators/operators.h
@@ -31,7 +31,7 @@ using ::executorch::runtime::getLeadingDims;
 inline __attribute__((always_inline)) void linear_(
     const ::executorch::aten::Tensor& input,
     const ::executorch::aten::Tensor& weight,
-    const ::executorch::aten::optional<::executorch::aten::Tensor>& bias,
+    const std::optional<::executorch::aten::Tensor>& bias,
     ::executorch::aten::Tensor& output) {
   const float* __restrict__ input_data = input.const_data_ptr<float>();
   const float* __restrict__ weight_data = weight.const_data_ptr<float>();
diff --git a/backends/cortex_m/TARGETS b/backends/cortex_m/TARGETS
index b84add05516..1b73bb03bfc 100644
--- a/backends/cortex_m/TARGETS
+++ b/backends/cortex_m/TARGETS
@@ -20,12 +20,23 @@ python_library(
     ],
 )
 
+python_library(
+    name = "cmsis_nn",
+    srcs = [
+        "library/__init__.py",
+        "library/cmsis_nn.py",
+    ],
+    deps = [
+        "fbsource//third-party/cmsis-nn:cmsis_nn_py",
+    ],
+)
+
 python_library(
     name = "target_config",
     srcs = [
         "target_config.py",
     ],
     deps = [
-        "fbsource//third-party/cmsis-nn:cmsis_nn_py",
+        ":cmsis_nn",
     ],
 )
diff --git a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
index 656309abcee..807cf18cebc 100644
--- a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
+++ b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
@@ -49,7 +49,7 @@ class CMSISScratchBufferContext final {
       Tensor& scratch_buffer,
       const Tensor& weights,
       const Tensor& weight_zero_point,
-      const torch::executor::optional<Tensor>& bias)
+      const std::optional<Tensor>& bias)
       : scratch_ptr_(scratch_buffer.mutable_data_ptr<int8_t>()),
         total_size_(scratch_buffer.size(0)),
         base_ptr_(reinterpret_cast<uint8_t*>(scratch_ptr_)),
diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp
index 3d4f19e10d0..13e8b132410 100644
--- a/backends/cortex_m/ops/op_quantized_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp
@@ -19,7 +19,7 @@ bool validate_conv2d_arguments(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const torch::executor::optional<Tensor>& bias,
+    const std::optional<Tensor>& bias,
     const Tensor& output,
     const Int64ArrayRef& stride,
     const Int64ArrayRef& padding,
@@ -103,7 +103,7 @@ Tensor& quantized_conv2d_out(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const torch::executor::optional<Tensor>& bias,
+    const std::optional<Tensor>& bias,
     const Int64ArrayRef stride,
     const Int64ArrayRef padding,
     const Int64ArrayRef dilation,
diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
index a8e1fc21ed7..0793606de44 100644
--- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
@@ -19,7 +19,7 @@ bool validate_depthwise_conv2d_arguments(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const torch::executor::optional<Tensor>& bias,
+    const std::optional<Tensor>& bias,
     const Tensor& output,
     const Int64ArrayRef& stride,
     const Int64ArrayRef& padding,
@@ -140,7 +140,7 @@ Tensor& quantized_depthwise_conv2d_out(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const torch::executor::optional<Tensor>& bias,
+    const std::optional<Tensor>& bias,
     const Int64ArrayRef stride,
     const Int64ArrayRef padding,
     const Int64ArrayRef dilation,
diff --git a/backends/cortex_m/ops/op_quantized_linear.cpp b/backends/cortex_m/ops/op_quantized_linear.cpp
index 7448058de8e..c92ec493cd5 100644
--- a/backends/cortex_m/ops/op_quantized_linear.cpp
+++ b/backends/cortex_m/ops/op_quantized_linear.cpp
@@ -18,8 +18,8 @@ Tensor& quantized_linear_out(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weights,
-    const torch::executor::optional<Tensor>& bias,
-    const torch::executor::optional<Tensor>& kernel_sum,
+    const std::optional<Tensor>& bias,
+    const std::optional<Tensor>& kernel_sum,
     const int64_t input_offset,
     const int64_t filter_offset,
     const int64_t output_offset,
diff --git a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
index e7ecbc7c7b4..04d57d4c693 100644
--- a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
@@ -21,7 +21,7 @@ bool validate_transpose_conv2d_arguments(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const torch::executor::optional<Tensor>& bias,
+    const std::optional<Tensor>& bias,
     const Tensor& output,
     const Tensor& requantize_multipliers,
     const Tensor& requantize_shifts) {
@@ -88,7 +88,7 @@ Tensor& quantized_transpose_conv2d_out(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& weight,
-    const torch::executor::optional<Tensor>& bias,
+    const std::optional<Tensor>& bias,
     const Int64ArrayRef stride,
     const Int64ArrayRef padding,
     const Int64ArrayRef output_padding,
diff --git a/backends/cortex_m/passes/BUCK b/backends/cortex_m/passes/BUCK
index 20444f16718..c792583f657 100644
--- a/backends/cortex_m/passes/BUCK
+++ b/backends/cortex_m/passes/BUCK
@@ -1,6 +1,7 @@
 load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target")
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -40,6 +41,7 @@ fbcode_target(_kind = runtime.python_library,
     deps=[
         "//caffe2:torch",
         "//executorch/backends/arm/_passes:passes",
+        "//executorch/backends/cortex_m:cmsis_nn",
         "//executorch/backends/cortex_m:target_config",
         "//executorch/backends/cortex_m/ops:ops",
         "//executorch/backends/cortex_m/passes:passes_utils",
diff --git a/backends/cortex_m/passes/__init__.py b/backends/cortex_m/passes/__init__.py
index 6d6783488fe..ec3d67c4d31 100644
--- a/backends/cortex_m/passes/__init__.py
+++ b/backends/cortex_m/passes/__init__.py
@@ -3,36 +3,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from importlib.util import find_spec
-
-
-def _missing_dependencies_error(missing: str) -> ModuleNotFoundError:
-    return ModuleNotFoundError(
-        "Cortex-M backend dependencies are not installed "
-        f"(missing: {missing}). Install ExecuTorch with "
-        "`pip install executorch[cortex_m]`, or if building from source run "
-        "`examples/arm/setup.sh --i-agree-to-the-contained-eula`."
-    )
-
-
-def _ensure_cortex_m_dependencies() -> None:
-    required_modules = {
-        "cmsis_nn": "cmsis_nn",
-    }
-    missing_packages = []
-    for module_name, package_name in required_modules.items():
-        try:
-            if find_spec(module_name) is None:
-                missing_packages.append(package_name)
-        except (ImportError, ValueError):
-            missing_packages.append(package_name)
-
-    if missing_packages:
-        raise _missing_dependencies_error(", ".join(missing_packages))
-
-
-_ensure_cortex_m_dependencies()
-
 from .cortex_m_pass import CortexMPass  # noqa  # usort: skip
 from .activation_fusion_pass import ActivationFusionPass  # noqa
 from .aten_to_cortex_m_pass import AtenToCortexMPass  # noqa
diff --git a/backends/cortex_m/passes/aten_to_cortex_m_pass.py b/backends/cortex_m/passes/aten_to_cortex_m_pass.py
index ecc7187797d..3f5a6055331 100644
--- a/backends/cortex_m/passes/aten_to_cortex_m_pass.py
+++ b/backends/cortex_m/passes/aten_to_cortex_m_pass.py
@@ -8,12 +8,12 @@
 import math
 from typing import cast, Optional
 
-import cmsis_nn  # type: ignore[import-not-found, import-untyped]
 import executorch.backends.cortex_m.ops.operators  # noqa
 import executorch.exir as exir
 import torch
 import torch.fx
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
+from executorch.backends.cortex_m.library import cmsis_nn
 
 from executorch.backends.cortex_m.passes.passes_utils import (
     build_activation_lut,
diff --git a/backends/cortex_m/passes/scratch_buffer_sizes.py b/backends/cortex_m/passes/scratch_buffer_sizes.py
index 95a9c441f61..b247e2be944 100644
--- a/backends/cortex_m/passes/scratch_buffer_sizes.py
+++ b/backends/cortex_m/passes/scratch_buffer_sizes.py
@@ -6,11 +6,11 @@
 from collections.abc import Callable
 from typing import Any, cast
 
-import cmsis_nn  # type: ignore[import-not-found, import-untyped]
 import executorch.backends.cortex_m.ops.operators  # noqa
 
 import torch
 import torch.fx
+from executorch.backends.cortex_m.library import cmsis_nn
 
 from executorch.exir.dialects._ops import ops as exir_ops
 
diff --git a/backends/cortex_m/target_config.py b/backends/cortex_m/target_config.py
index 23cb15c4a53..341ae612cb5 100644
--- a/backends/cortex_m/target_config.py
+++ b/backends/cortex_m/target_config.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -10,7 +11,7 @@
 from enum import auto, Enum
 from typing import Optional
 
-import cmsis_nn  # type: ignore[import-not-found, import-untyped]
+from executorch.backends.cortex_m.library import cmsis_nn
 
 
 class CortexM(Enum):
diff --git a/backends/cortex_m/test/misc/test_cmsis_pybind.py b/backends/cortex_m/test/misc/test_cmsis_pybind.py
index f85a4bacece..08a1d973234 100644
--- a/backends/cortex_m/test/misc/test_cmsis_pybind.py
+++ b/backends/cortex_m/test/misc/test_cmsis_pybind.py
@@ -1,5 +1,4 @@
 # Copyright 2026 Arm Limited and/or its affiliates.
-# All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -11,7 +10,7 @@
 
 def _import_cmsis_nn():
     try:
-        return importlib.import_module("cmsis_nn")
+        return importlib.import_module("executorch.backends.cortex_m.library.cmsis_nn")
     except Exception as exc:
         pytest.fail(f"Failed to resolve cmsis_nn: {exc}")
 
diff --git a/backends/cortex_m/test/misc/test_target_config.py b/backends/cortex_m/test/misc/test_target_config.py
index 3e648b0a81c..472d1927886 100644
--- a/backends/cortex_m/test/misc/test_target_config.py
+++ b/backends/cortex_m/test/misc/test_target_config.py
@@ -1,12 +1,13 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import cmsis_nn  # type: ignore[import-not-found, import-untyped]
 import pytest
 
+from executorch.backends.cortex_m.library import cmsis_nn
 from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig
 
 
diff --git a/backends/cortex_m/test/ops/test_avg_pool2d.py b/backends/cortex_m/test/ops/test_avg_pool2d.py
index 315d968188f..a2992b50905 100644
--- a/backends/cortex_m/test/ops/test_avg_pool2d.py
+++ b/backends/cortex_m/test/ops/test_avg_pool2d.py
@@ -93,7 +93,7 @@ def test_dialect_avg_pool2d(test_case, cortex_m_target):
         qtol=1,
     )
 
-    import cmsis_nn  # type: ignore[import-not-found, import-untyped]
+    from executorch.backends.cortex_m.library import cmsis_nn
 
     module = tester.get_artifact(StageType.RUN_PASSES).exported_program().module()
     pool_target = exir_ops.edge.cortex_m.quantized_avg_pool2d.default
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index f9f23a842f9..b328a05df54 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -61,29 +61,81 @@ def _is_cpu_clone_active() -> bool:
     return getattr(_CPU_CLONE_GUARD, "active", False)
 
 
+def _full_zeros_preserving_strides(x: torch.Tensor, device) -> torch.Tensor:
+    """Allocate a zero-filled tensor matching ``x``'s size/stride/dtype on ``device``.
+
+    Used to re-synthesize KV-cache buffers whose storage was freed (``resize_(0)``)
+    during the low-memory device move. KV content is all zeros, so this exactly
+    reproduces the buffer for both the lifted graph value and serialization.
+    """
+    # A zero-sized dim makes (size - 1) * stride negative, so empty shapes get 0.
+    span = sum((size - 1) * stride for size, stride in zip(x.size(), x.stride()))
+    needed = int(span) + 1 if x.numel() > 0 else 0
+    buf = torch.zeros(needed, dtype=x.dtype, device=device)
+    return torch.as_strided(buf, x.size(), x.stride())
+
+
+def _is_emptied(x) -> bool:
+    # Shape still reports elements, but the backing storage holds zero bytes.
+    if not isinstance(x, torch.Tensor):
+        return False
+    storage_freed = x.numel() > 0 and x.untyped_storage().nbytes() == 0
+    return storage_freed
+
+
 @contextlib.contextmanager
 def _compile_time_cpu_clones(target_device: torch.device):
     """Force AOTI's mutated-buffer clones onto CPU while preserving the
     serialized constants' target device."""
-    from torch._inductor import compile_fx as _cfx
+    from torch._inductor import compile_fx as _cfx, graph as _graph
     from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu as _Cpp
+    from torch._inductor.graph import GraphLowering as _GL
 
     orig_clone = _cfx.clone_preserve_strides
     orig_codegen_device = _Cpp.codegen_device
+    orig_get_const = _GL.get_original_value_of_constant
+    orig_is_same = _graph.is_same_tensor
+
+    def _is_same_skip_emptied(data, value):
+        # KV buffers freed via resize_(0) all have data_ptr 0, so the stock
+        # is_same_tensor would treat every same-shape KV constant as a duplicate
+        # and collapse the 60 layers' caches into one — the runtime needs each
+        # FQN's own buffer, so the collapsed ones load uninitialized garbage.
+        # Never dedup an emptied tensor.
+        if _is_emptied(data) or _is_emptied(value):
+            return False
+        return orig_is_same(data, value)
 
     def _cpu_clone_preserve_strides(x: torch.Tensor) -> torch.Tensor:
-        # `clone_preserve_strides` is shared by `_unlift_graph` (clones
-        # lifted buffers — can be safely kept on CPU) and by autotuning code
-        # in `triton_heuristics.py` (clones for benchmark — must stay on
-        # GPU for Triton). Discriminate by caller frame so we only force
-        # CPU clones for the buffer-lifting path.
+        # `clone_preserve_strides` is shared by `_unlift_graph` (clones lifted
+        # buffers — can be safely kept on CPU) and by autotuning code in
+        # `triton_heuristics.py` (clones for benchmark — must stay on GPU for
+        # Triton). Discriminate by caller frame so we only force CPU clones for
+        # the buffer-lifting path.
         import sys
 
         caller = sys._getframe(1).f_code.co_name
         if caller == "_unlift_graph":
+            # KV-cache buffers are emptied (storage resize_(0)) by the low-memory
+            # device move so they never occupy GPU memory during compile. Their
+            # content is all zeros, so re-synthesize zeros (on CPU, strides
+            # preserved) instead of cloning the now-empty storage.
+            if _is_emptied(x):
+                return _full_zeros_preserving_strides(x, "cpu")
             return orig_clone(x).cpu()
         return orig_clone(x)
 
+    def _get_const_synthesize_zeros(self, name):
+        # AOTI serializes each constant via get_original_value_of_constant ->
+        # _to_bytes. For KV buffers we freed with resize_(0) this would otherwise
+        # fall back to the empty-storage constant and write 0 bytes, producing a
+        # .ptd with an uninitialized cache. Re-synthesize the zeros so the blob
+        # holds a correctly-zeroed KV cache.
+        value = orig_get_const(self, name)
+        if _is_emptied(value):
+            return _full_zeros_preserving_strides(value, "cpu")
+        return value
+
     def _codegen_device_target_aware(self, device):
         # Translate accidental CPU device strings back to the model target
         # device only when a constant we forced to CPU is being serialized.
@@ -99,6 +151,8 @@ def _codegen_device_target_aware(self, device):
 
     _cfx.clone_preserve_strides = _cpu_clone_preserve_strides
     _Cpp.codegen_device = _codegen_device_target_aware
+    _GL.get_original_value_of_constant = _get_const_synthesize_zeros
+    _graph.is_same_tensor = _is_same_skip_emptied
     prev_active = getattr(_CPU_CLONE_GUARD, "active", False)
     _CPU_CLONE_GUARD.active = True
     try:
@@ -107,6 +161,107 @@ def _codegen_device_target_aware(self, device):
         _CPU_CLONE_GUARD.active = prev_active
         _cfx.clone_preserve_strides = orig_clone
         _Cpp.codegen_device = orig_codegen_device
+        _GL.get_original_value_of_constant = orig_get_const
+        _graph.is_same_tensor = orig_is_same
+
+
+def _is_kv_buffer(name, v) -> bool:
+    """Decide whether ``v`` is KV-cache *content* whose storage may be freed.
+
+    Anything accepted here is freed by the low-memory path
+    (``_move_to_device_resize_kv``) and later rebuilt as ZEROS, both in the
+    lifted graph and in the serialized ``.ptd`` (see
+    ``_full_zeros_preserving_strides`` / ``_get_const_synthesize_zeros``). That
+    round trip is lossless only for real KV content, which is all-zeros at
+    export time because caches start empty.
+
+    A name check alone is NOT enough: some KV-cache modules register non-zero
+    constants next to the cache. TurboQuant, for example, keeps its
+    codebook/rotation (``centroids``/``boundaries``/``rotation``/``rotation_T``)
+    as buffers on the ``kv_cache`` module, so their FQNs contain ``kv_cache``
+    too, and zeroing them silently corrupts the model (TQ4 dequant -> 0 ->
+    garbage). Requiring the tensor to be all-zeros keeps every non-zero buffer.
+    """
+    if not isinstance(v, torch.Tensor):
+        return False
+    if isinstance(v, torch.nn.Parameter) or "kv_cache" not in name:
+        return False
+    if v.numel() == 0 or v.is_meta:
+        return False
+    return bool(torch.count_nonzero(v) == 0)
+
+
+def _empty_strided_on_device(v, location):
+    """Build a tensor on ``location`` with v's shape/stride/dtype and no storage."""
+    out = torch.empty_strided(v.shape, v.stride(), dtype=v.dtype, device=location)
+    out.untyped_storage().resize_(0)  # drop the bytes; device + shape/stride stay
+    return out
+
+
+def _move_graph_nodes_to_device(graph_module, location):
+    """Retarget device kwargs, aten.to.device args and meta vals to ``location``."""
+    import torch.utils._pytree as pytree
+
+    def _relocate(val):
+        return val.to(location) if isinstance(val, torch.Tensor) else val
+
+    to_device_op = torch.ops.aten.to.device
+    for sub in graph_module.modules():
+        if isinstance(sub, torch.fx.GraphModule):
+            for node in sub.graph.nodes:
+                if "device" in node.kwargs:
+                    node.kwargs = dict(node.kwargs, device=location)
+                if node.op == "call_function" and node.target is to_device_op:
+                    retargeted = list(node.args)
+                    retargeted[1] = location
+                    node.args = tuple(retargeted)
+                node.meta["val"] = pytree.tree_map(_relocate, node.meta.get("val"))
+
+
+def _move_to_device_resize_kv(ep, location):
+    """Low-memory stand-in for ``move_to_device_pass`` that frees KV storage.
+
+    Follows the same steps as ``torch.export.passes.move_to_device_pass``, with
+    one difference: every buffer/constant accepted by ``_is_kv_buffer`` (FQN
+    contains ``kv_cache`` and the content is all zeros) is created on
+    ``location`` and its storage is released right away via ``resize_(0)``.
+    The tensor still reports ``device == location``, so the fake-tensor device
+    check on the ``index_copy`` cache update passes, yet no real KV bytes sit on
+    the device during the AOTI compile. The zeros are re-synthesized later at
+    the ``_unlift_graph`` clone (see ``_compile_time_cpu_clones``) and when the
+    ``.ptd`` constant is serialized. Allocation and free happen per tensor, so
+    the transient device peak is one KV buffer rather than the whole cache.
+    All other tensors are moved normally, so non-zero content is never lost.
+    """
+    import torch.utils._pytree as pytree
+
+    for fqn, value in ep.state_dict.items():
+        if isinstance(value, torch.nn.Parameter):
+            moved = torch.nn.Parameter(value.to(location), value.requires_grad)
+        elif _is_kv_buffer(fqn, value):
+            moved = _empty_strided_on_device(value, location)
+        else:
+            moved = value.to(location)
+        ep._state_dict[fqn] = moved
+
+    for fqn, value in ep.constants.items():
+        if not isinstance(value, torch.Tensor):
+            continue
+        if _is_kv_buffer(fqn, value):
+            ep._constants[fqn] = _empty_strided_on_device(value, location)
+        else:
+            ep._constants[fqn] = value.to(location)
+
+    if ep.example_inputs is not None:
+        args, kwargs = ep.example_inputs
+        ep._example_inputs = tuple(
+            pytree.tree_map_only(torch.Tensor, lambda t: t.to(location), tree)
+            for tree in (args, kwargs)
+        )
+
+    _move_graph_nodes_to_device(ep.graph_module, location)
+    ep.validate()
+    return ep
 
 
 @final
@@ -424,6 +579,29 @@ def _is_low_memory_mode(compile_specs: List[CompileSpec]) -> bool:
                 return spec.value.decode("utf-8").upper() == "ON"
         return False
 
+    @classmethod
+    def move_program_to_device(
+        cls,
+        edge_program,
+        device: str,
+        compile_specs: List[CompileSpec],
+    ):
+        """Relocate ``edge_program`` to ``device`` ahead of the AOTI compile.
+
+        Regular exports run the stock ``move_to_device_pass``. With low-memory
+        mode (``low_memory_mode="ON"``) ``_move_to_device_resize_kv`` is used
+        instead: KV-cache buffers, which can reach 10+ GiB at long context, land
+        on-device with their storage freed (``resize_(0)``). They take no device
+        memory during the autotune / cpp_wrapper compile, still satisfy the
+        device-match check on the cache update, and are re-synthesized as zeros
+        for the lifted graph and the serialized blob.
+        """
+        from torch.export.passes import move_to_device_pass
+
+        if cls._is_low_memory_mode(compile_specs):
+            return _move_to_device_resize_kv(edge_program, device)
+        return move_to_device_pass(edge_program, device)
+
     @classmethod
     def release_moved_tensors(
         cls,
diff --git a/backends/cuda/quantize_op_dispatch/int4_dispatch.py b/backends/cuda/quantize_op_dispatch/int4_dispatch.py
index c3b8921e2fe..1b8c370eecf 100644
--- a/backends/cuda/quantize_op_dispatch/int4_dispatch.py
+++ b/backends/cuda/quantize_op_dispatch/int4_dispatch.py
@@ -60,11 +60,29 @@ def _cuda(self, qdata, scale, zero, group_size):
     return _dequant_matmul(self, qdata, scale, zero, group_size)
 
 
+# Chunked dequant for the export GPU budget. The lm_head dequant (N = vocab_size,
+# e.g. 262144) runs through the int4_plain_mm custom op (M=1); AOTI executes that
+# op's CUDA impl during autotune / cpp_wrapper codegen, where it transiently holds
+# ~5 full-size bf16 temporaries (low/high/data/data-z/w_deq) — ~10 GiB for a
+# 262144-row weight even though the final w_deq is only ~2.6 GiB. Chunking along N
+# caps that at ~chunk rows. It is numerically identical (F.linear output rows are
+# independent), and because only the lm_head (custom-op) path crosses the N
+# threshold — never the M>4 prefill inline path — it never enters the runtime
+# graph: ZERO runtime / accuracy impact. Applied unconditionally to any weight
+# whose row count exceeds the threshold.
+_DEQUANT_N_THRESHOLD = 65536
+_DEQUANT_N_CHUNK = 32768
+
+
 def _dequant_matmul(x, qdata, scale, zero, group_size):
     """Dequant INT4 weights to input dtype and call F.linear.
 
     scale/zero are in the coalesced [N, n_groups] layout (baked into the
     weight constant at pack time), aligned row-for-row with qdata's [N, *].
+
+    Large weights (N > threshold, i.e. the lm_head) are chunked along N to bound
+    the dequant intermediate (see note above); smaller weights take the original
+    single-shot dequant.
     """
     N, K_half = qdata.shape
     K = K_half * 2
@@ -72,16 +90,24 @@ def _dequant_matmul(x, qdata, scale, zero, group_size):
     gs_half = group_size // 2
     dtype = x.dtype
 
-    p = qdata.to(torch.uint8).reshape(N, n_groups, gs_half)
-    low = (p & 0x0F).to(dtype)
-    high = ((p >> 4) & 0x0F).to(dtype)
-    data = torch.stack([low, high], dim=-1).reshape(N, n_groups, group_size)
-
-    s = scale.to(dtype).unsqueeze(-1)
-    z = zero.to(dtype).unsqueeze(-1)
-    w_deq = ((data - z) * s).reshape(N, K)
-
-    return F.linear(x, w_deq)
+    def _dq(qd, sc, ze, rows):
+        p = qd.to(torch.uint8).reshape(rows, n_groups, gs_half)
+        low = (p & 0x0F).to(dtype)
+        high = ((p >> 4) & 0x0F).to(dtype)
+        data = torch.stack([low, high], dim=-1).reshape(rows, n_groups, group_size)
+        s = sc.to(dtype).unsqueeze(-1)
+        z = ze.to(dtype).unsqueeze(-1)
+        w_deq = ((data - z) * s).reshape(rows, K)
+        return F.linear(x, w_deq)
+
+    if N <= _DEQUANT_N_THRESHOLD:
+        return _dq(qdata, scale, zero, N)
+
+    outs = []
+    for i in range(0, N, _DEQUANT_N_CHUNK):
+        j = min(i + _DEQUANT_N_CHUNK, N)
+        outs.append(_dq(qdata[i:j], scale[i:j], zero[i:j], j - i))
+    return torch.cat(outs, dim=-1)
 
 
 # ---------------------------------------------------------------------------
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index f7d095540ad..c3d7446eaa2 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -319,8 +319,12 @@ class ET_EXPERIMENTAL CudaBackend final
       }
     }
 
-    std::string so_blob_key =
-        method_name.empty() ? "so_blob" : method_name + "_so_blob";
+    std::string so_blob_key;
+    std::string weights_blob_key;
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        executorch::backends::aoti::resolve_blob_keys(
+            processed, method_name, so_blob_key, weights_blob_key),
+        "Malformed named-data key payload");
 
     const NamedDataMap* named_data_map = context.get_named_data_map();
     auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str());
@@ -394,11 +398,11 @@ class ET_EXPERIMENTAL CudaBackend final
     // methods are independent sub-graphs that may have FQN collisions
     // (e.g. parakeet).
     if (is_weight_sharing_across_methods_enabled()) {
-      ET_CHECK_OK_OR_RETURN_ERROR(
-          load_constants_with_cache(handle, named_data_map, method_name));
+      ET_CHECK_OK_OR_RETURN_ERROR(load_constants_with_cache(
+          handle, named_data_map, method_name, weights_blob_key));
     } else {
       ET_CHECK_OK_OR_RETURN_ERROR(
-          load_constants_legacy(handle, named_data_map, method_name));
+          load_constants_legacy(handle, named_data_map, weights_blob_key));
     }
 
     // Use shared CUDA stream if enabled via options, otherwise create one.
@@ -1011,13 +1015,14 @@ class ET_EXPERIMENTAL CudaBackend final
   Error load_constants_with_cache(
       cuda::CudaDelegateHandle* handle,
       const NamedDataMap* named_data_map,
-      const std::string& method_name) const {
+      const std::string& method_name,
+      const std::string& weights_blob_key) const {
     // Check if the required APIs are available
     if (!handle->get_num_constants || !handle->get_constant_name ||
         !handle->get_constant_original_fqn || !handle->extract_constants_map ||
         !handle->update_user_managed_constant_buffer_pairs) {
       // Fall back to the legacy path
-      return load_constants_legacy(handle, named_data_map, method_name);
+      return load_constants_legacy(handle, named_data_map, weights_blob_key);
     }
 
     // Step 1: Enumerate constants and partition into cached/uncached
@@ -1069,8 +1074,6 @@ class ET_EXPERIMENTAL CudaBackend final
     if (!uncached_fqns.empty()) {
       // Need to load from blob — use update_constants_from_blob for all,
       // then extract the new constants into the cache.
-      std::string weights_blob_key =
-          method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
       auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
 
       ET_CHECK_OR_RETURN_ERROR(
@@ -1190,9 +1193,7 @@ class ET_EXPERIMENTAL CudaBackend final
   Error load_constants_legacy(
       cuda::CudaDelegateHandle* handle,
       const NamedDataMap* named_data_map,
-      const std::string& method_name) const {
-    std::string weights_blob_key =
-        method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
+      const std::string& weights_blob_key) const {
     auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
     if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
       ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
diff --git a/backends/cuda/tests/test_cuda_partitioner.py b/backends/cuda/tests/test_cuda_partitioner.py
index 0ee345be08a..89c1204ea00 100644
--- a/backends/cuda/tests/test_cuda_partitioner.py
+++ b/backends/cuda/tests/test_cuda_partitioner.py
@@ -12,17 +12,18 @@
 from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 from executorch.exir.backend.partitioner import PartitionResult
 from executorch.exir.delegate import executorch_call_delegate
-from torch._export.utils import is_buffer
+from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
 from torch.export import export
+from torch.fx.passes.utils.fuser_utils import validate_partition
 
 
 class TestCudaPartitioner(unittest.TestCase):
     """
     Test CUDA partitioner functionality.
 
-    After CUDA partitioning, there should be exactly one partitioned graph that contains
-    all operators from the input graph. This means all operators should be tagged with
-    the same delegation tag, indicating they will all be executed by the CUDA backend.
+    A fully delegatable graph collapses to a single partition. When a
+    non-delegated node splits the delegatable ops, the partitioner emits one
+    convex partition per island.
     """
 
     def _get_partition_result(
@@ -178,12 +179,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         for node in partition_result.tagged_exported_program.graph.nodes:
             if node.op == "placeholder":
                 # Check if this is a constant (param, buffer, or lifted tensor constant)
-                from torch._export.utils import (
-                    is_buffer,
-                    is_lifted_tensor_constant,
-                    is_param,
-                )
-
                 is_constant = (
                     is_param(partition_result.tagged_exported_program, node)
                     or is_buffer(partition_result.tagged_exported_program, node)
@@ -216,8 +211,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             f"All constant placeholders should be tagged. Found untagged constants: {untagged_constants}",
         )
 
-        # Verify all tagged constants have the expected tag
-        expected_tag = "tag0"
+        # Verify all tagged constants share the (single) partition's tag.
+        self.assertEqual(len(partition_result.partition_tags), 1)
+        expected_tag = next(iter(partition_result.partition_tags))
         for node in constant_placeholders:
             actual_tag = node.meta.get("delegation_tag")
             self.assertEqual(
@@ -320,3 +316,143 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         self.assertNotIn("delegation_tag", buffer_placeholder.meta)
         self.assertNotIn("delegation_tag", delegate.meta)
         self.assertIn("delegation_tag", aten_node.meta)
+
+    def test_multiple_partitions_for_split_graph(self) -> None:
+        """Ops split by a non-delegated node must land in separate partitions.
+
+        A tag spanning both adds would straddle the delegate: non-convex, fusion fails.
+        """
+
+        class TwoAddModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                a = x + 1.0
+                return a + 2.0
+
+        exported_program = export(TwoAddModule(), (torch.randn(3, 4),), strict=True)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+
+        add_nodes = [
+            n
+            for n in graph.nodes
+            if n.op == "call_function" and n.target != operator.getitem
+        ]
+        first_add, second_add = add_nodes[0], add_nodes[1]
+
+        # Splice a stand-in lowered region (get_attr -> executorch_call_delegate ->
+        # getitem) after the first add and rewire the second add to read from it.
+        graph_module.lowered_module_0 = torch.nn.Module()
+        with graph.inserting_before(second_add):
+            lowered = graph.get_attr("lowered_module_0")
+            delegate = graph.call_function(
+                executorch_call_delegate, (lowered, first_add)
+            )
+            delegate_output = graph.call_function(operator.getitem, (delegate, 0))
+        second_add.replace_input_with(first_add, delegate_output)
+        graph.lint()
+
+        result = CudaPartitioner([]).partition(exported_program)
+
+        # One tag per add; the delegate call and its getitem must stay untagged.
+        self.assertEqual(len(result.partition_tags), 2)
+        self.assertIn("delegation_tag", first_add.meta)
+        self.assertIn("delegation_tag", second_add.meta)
+        self.assertNotEqual(
+            first_add.meta["delegation_tag"], second_add.meta["delegation_tag"]
+        )
+        self.assertNotIn("delegation_tag", delegate.meta)
+        self.assertNotIn("delegation_tag", delegate_output.meta)
+
+        # Each tag's node set must pass validate_partition on its own (convexity).
+        for tag in result.partition_tags:
+            tagged = [
+                n
+                for n in exported_program.graph.nodes
+                if n.meta.get("delegation_tag") == tag
+            ]
+            self.assertTrue(validate_partition(tagged))
+
+    def test_control_flow_get_attr_shares_op_tag(self) -> None:
+        """Branch get_attrs of a control-flow op must carry that op's tag.
+
+        get_attr nodes are not call_function nodes, so the capability partitioner
+        never claims them; they have to be lowered together with the op itself.
+        """
+
+        class CondModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.cond(x.sum() > 0, torch.sin, torch.cos, (x,))
+
+        program = export(CondModule(), (torch.randn(3, 4),), strict=True)
+        result = CudaPartitioner([]).partition(program)
+
+        cond_node = next(
+            n
+            for n in program.graph.nodes
+            if n.op == "call_function" and n.target is torch.ops.higher_order.cond
+        )
+        branch_get_attrs = [
+            arg
+            for arg in cond_node.args
+            if isinstance(arg, torch.fx.Node) and arg.op == "get_attr"
+        ]
+
+        # Both branch submodules reach the cond op through get_attr args.
+        self.assertEqual(len(branch_get_attrs), 2)
+        # The cond op is tagged, and every branch get_attr carries the same tag.
+        cond_tag = cond_node.meta["delegation_tag"]
+        self.assertIn(cond_tag, result.partition_tags)
+        for branch in branch_get_attrs:
+            self.assertEqual(branch.meta.get("delegation_tag"), cond_tag)
+
+    def test_shared_constant_across_partitions(self) -> None:
+        """A constant read by two partitions is claimed, not dropped.
+
+        tag_constant_data is expected to give it one partition's tag (lowering
+        duplicates it per consumer later); here we only check it stays tagged.
+        """
+
+        class SharedWeightModule(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.register_buffer("w", torch.randn(3, 4))
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return (x + self.w) + self.w
+
+        exported_program = export(
+            SharedWeightModule(), (torch.randn(3, 4),), strict=True
+        )
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+
+        add_nodes = [
+            n
+            for n in graph.nodes
+            if n.op == "call_function" and n.target != operator.getitem
+        ]
+        first_add, second_add = add_nodes[0], add_nodes[1]
+
+        # Put a stand-in lowered region between the two adds; both still read w.
+        graph_module.lowered_module_0 = torch.nn.Module()
+        with graph.inserting_before(second_add):
+            lowered = graph.get_attr("lowered_module_0")
+            delegate = graph.call_function(
+                executorch_call_delegate, (lowered, first_add)
+            )
+            delegate_output = graph.call_function(operator.getitem, (delegate, 0))
+        second_add.replace_input_with(first_add, delegate_output)
+        graph.lint()
+
+        result = CudaPartitioner([]).partition(exported_program)
+
+        # Two tags, and the buffer placeholder's tag must be one of them (not None).
+        self.assertEqual(len(result.partition_tags), 2)
+        buffer_placeholder = next(
+            n
+            for n in graph.nodes
+            if n.op == "placeholder" and is_buffer(exported_program, n)
+        )
+        self.assertIn(
+            buffer_placeholder.meta.get("delegation_tag"), result.partition_tags
+        )
diff --git a/backends/cuda/tests/test_tq4_sdpa.py b/backends/cuda/tests/test_tq4_sdpa.py
index 9cf1e9e2d57..f9543b1ff18 100644
--- a/backends/cuda/tests/test_tq4_sdpa.py
+++ b/backends/cuda/tests/test_tq4_sdpa.py
@@ -20,7 +20,6 @@
 import numpy as np
 import torch
 import torch.nn.functional as F
-
 from executorch.backends.cuda.cuda_backend import CudaBackend
 from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 from executorch.backends.cuda.triton.kernels.tq4_sdpa import tq4_sdpa
@@ -253,7 +252,7 @@ def test_gqa_prefill(self):
                 self._run_test(1, H_q, H_kv, 64, 64, 128, is_causal=True)
 
     def test_gqa_8x_head_dim_256(self):
-        """GQA 8:1 with head_dim=256 — matches Qwen 3.5 MoE config."""
+        """GQA 8:1 with head_dim=256."""
         self._run_test(1, 16, 2, 1, 128, 256)
         L = 64
         mask = torch.tril(torch.ones(1, 1, L, L, dtype=torch.bool, device="cuda"))
@@ -375,8 +374,8 @@ def test_float_mask_rejected(self):
                 float_mask,
             )
 
-    def test_qwen35_moe_config(self):
-        """Qwen 3.5 MoE: head_dim=256, GQA 16:2, decode + prefill."""
+    def test_config_hd256_gqa_16_2(self):
+        """head_dim=256, GQA 16:2, decode + prefill."""
         self._run_test(1, 16, 2, 1, 256, 256)
         self._run_test(1, 16, 2, 128, 128, 256, is_causal=True)
 
@@ -438,6 +437,437 @@ def test_output_shape_and_dtype(self):
                     self.assertEqual(out.shape, (1, H_q, Lq, D))
                     self.assertEqual(out.dtype, torch.bfloat16)
 
+    # ------------------------------------------------------------------
+    # 128k code path: kv_len clamp (decode) + mask_is_causal (prefill)
+    #
+    # Every test above calls tq4_sdpa WITHOUT kv_len and WITHOUT
+    # mask_is_causal, so they only exercise the kv_len=None fallback
+    # (full-Lk loop) at short KV. The cases below drive the actual
+    # long-context paths at two representative GQA shapes (head_dim=512
+    # GQA 8:4, and head_dim=256 GQA 16:2):
+    #   * the on-device kv_len scalar that bounds the KV loop to the
+    #     filled context (decode), and
+    #   * the mask_is_causal per-tile causal block-skip (prefill).
+    #
+    # "GARBAGE TAIL": in production the KV cache is a fixed buffer
+    # pre-allocated to max_seq_len (e.g. 131072). At any step only the
+    # first kv_len positions hold real K/V; the rest is stale /
+    # uninitialized memory that attention must ignore. We simulate that
+    # tail by writing large-magnitude (x1000) values into [kv_len:]. If
+    # the clamp / block-skip works the kernel never reads the tail and
+    # the output matches a reference built from [0, kv_len) only; if it
+    # is broken the huge tail values dominate the softmax and the cosine
+    # collapses to ~0. So the garbage tail is a built-in negative control
+    # (verified: dropping kv_len drives the cosine to ~-0.01 and fails).
+    #
+    # CAUSAL ALIGNMENT (top-left vs bottom-right): when L_q < L_kv (a
+    # chunked prefill / decode, where the Lq new queries sit at the END
+    # of a kv_len-long context) there are two ways to place the causal
+    # triangle. PyTorch F.sdpa(is_causal=True) uses TOP-LEFT alignment
+    # (query row i attends to keys [0, i]) -- wrong for a KV cache. This
+    # kernel (and a KV-cache decoder's mask builder) use BOTTOM-RIGHT
+    # alignment: query row i is absolute position (kv_len - Lq + i) and
+    # attends to keys [0, kv_len - Lq + i]. So the reference below builds
+    # an explicit bottom-right mask (q_pos >= cache_pos) rather than
+    # passing is_causal=True, which would otherwise mismatch the kernel.
+    # ------------------------------------------------------------------
+
+    def _run_long_kv_test(
+        self,
+        *,
+        H_q,
+        H_kv,
+        D,
+        Lq,
+        kv_len,
+        buffer_len,
+        causal=False,
+        garbage=True,
+        pass_kv_len=True,
+        min_cosine=0.99,
+        seed=42,
+    ):
+        """Drive tq4_sdpa over a buffer whose first ``kv_len`` positions are
+        real and whose ``[kv_len:]`` tail is large-magnitude garbage, then
+        compare against an fp32 reference built from the first ``kv_len``
+        positions only. Returns the cosine similarity after asserting it
+        exceeds ``min_cosine`` (and that the output has no NaN).
+
+        ``garbage=False`` leaves the tail as ordinary random data and
+        ``pass_kv_len=False`` passes ``kv_len=None`` to the kernel. Otherwise
+        the kernel sees the full garbage-tailed compressed buffer and the
+        on-device ``kv_len`` scalar (plus, for prefill, the bottom-right
+        causal mask) must confine attention to ``[0, kv_len)``.
+
+        ``causal=True`` builds a bottom-right-aligned mask (the Lq queries
+        are the last Lq positions of a kv_len-long context) and also sets
+        ``mask_is_causal``. ``F.sdpa(is_causal=True)`` is deliberately NOT
+        used for the reference: PyTorch aligns it top-left when L_q < L_kv.
+        """
+        torch.manual_seed(seed)
+        centroids, boundaries, rotation = _make_codebook_and_rotation(D)
+        centroids = centroids.cuda()
+        boundaries = boundaries.cuda()
+        rotation = rotation.cuda()
+
+        B = 1
+        k = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda")
+        v = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda")
+        if garbage and buffer_len > kv_len:
+            g = buffer_len - kv_len
+            k[:, :, kv_len:, :] = (
+                torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0
+            )
+            v[:, :, kv_len:, :] = (
+                torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0
+            )
+
+        q = torch.randn(B, H_q, Lq, D, dtype=torch.bfloat16, device="cuda")
+
+        k_packed, k_norms = _compress(k, boundaries, rotation)
+        v_packed, v_norms = _compress(v, boundaries, rotation)
+
+        attn_mask = None
+        if causal:
+            cache_pos = torch.arange(buffer_len, device="cuda")
+            q_pos = torch.arange(kv_len - Lq, kv_len, device="cuda").unsqueeze(1)
+            attn_mask = (q_pos >= cache_pos.unsqueeze(0)).view(1, 1, Lq, buffer_len)
+
+        kv_len_t = (
+            torch.tensor([kv_len], dtype=torch.int32, device="cuda")
+            if pass_kv_len
+            else None
+        )
+
+        out = self.tq4_sdpa(
+            q,
+            k_packed,
+            k_norms,
+            v_packed,
+            v_norms,
+            centroids,
+            rotation,
+            attn_mask=attn_mask,
+            is_causal=False,
+            scale=None,
+            kv_len=kv_len_t,
+            mask_is_causal=causal,
+        )
+
+        # Reference: the same decompress-then-fp32-SDPA path the other tests
+        # use (_reference_tq4_sdpa), but over ONLY the first kv_len positions
+        # so the garbage tail can never influence it. _compress is per-row,
+        # so compressing the sliced K/V here is bit-identical to the kernel's
+        # view of the full buffer sliced to [:, :, :kv_len]; the helper also
+        # handles the GQA repeat_interleave and mask broadcast internally.
+        ref_mask = attn_mask[:, :, :, :kv_len] if attn_mask is not None else None
+        ref, *_ = _reference_tq4_sdpa(
+            q,
+            k[:, :, :kv_len],
+            v[:, :, :kv_len],
+            centroids,
+            boundaries,
+            rotation,
+            attn_mask=ref_mask,
+        )
+
+        self.assertFalse(torch.isnan(out).any(), "NaN in output")
+        cos = _cosine_sim(out, ref)
+        self.assertGreater(
+            cos,
+            min_cosine,
+            f"Cosine {cos:.5f} < {min_cosine} "
+            f"(H_q={H_q} H_kv={H_kv} D={D} Lq={Lq} kv_len={kv_len} "
+            f"buffer={buffer_len} causal={causal} kv_len_passed={pass_kv_len})",
+        )
+        return cos
+
+    def _run_splitk_vs_fused_test(
+        self,
+        *,
+        H_q,
+        H_kv,
+        D,
+        Lq,
+        kv_len,
+        buffer_len,
+        B=1,
+        seed=42,
+    ):
+        """Verify split-K output matches fused kernel output for same inputs.
+
+        Calls tq4_sdpa with kv_len on the garbage-tailed buffer (expected split-K
+        path for Lq=1, kv_len>=256) and with kv_len=None on buffers sliced to
+        [:kv_len] (fused path); outputs must be NaN-free with cosine > 0.99.
+        """
+        torch.manual_seed(seed)
+        centroids, boundaries, rotation = _make_codebook_and_rotation(D)
+        centroids = centroids.cuda()
+        boundaries = boundaries.cuda()
+        rotation = rotation.cuda()
+
+        k = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda")
+        v = torch.randn(B, H_kv, buffer_len, D, dtype=torch.bfloat16, device="cuda")
+        # Add garbage tail to ensure split-K respects kv_len bound
+        if buffer_len > kv_len:
+            g = buffer_len - kv_len
+            k[:, :, kv_len:, :] = (
+                torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0
+            )
+            v[:, :, kv_len:, :] = (
+                torch.randn(B, H_kv, g, D, dtype=torch.bfloat16, device="cuda") * 1000.0
+            )
+
+        q = torch.randn(B, H_q, Lq, D, dtype=torch.bfloat16, device="cuda")
+
+        k_packed, k_norms = _compress(k, boundaries, rotation)
+        v_packed, v_norms = _compress(v, boundaries, rotation)
+
+        # Split-K path: with kv_len (triggers split-K for Lq=1, kv_len>=256)
+        kv_len_t = torch.tensor([kv_len], dtype=torch.int32, device="cuda")
+        out_splitk = self.tq4_sdpa(
+            q,
+            k_packed,
+            k_norms,
+            v_packed,
+            v_norms,
+            centroids,
+            rotation,
+            attn_mask=None,
+            is_causal=False,
+            scale=None,
+            kv_len=kv_len_t,
+            mask_is_causal=False,
+        )
+
+        # Fused kernel path: kv_len=None, so nothing bounds the KV loop. Slice the
+        # packed buffers to [:kv_len] so the garbage tail is never visible to it.
+        k_packed_sliced = k_packed[:, :, :kv_len, :]
+        k_norms_sliced = k_norms[:, :, :kv_len, :]
+        v_packed_sliced = v_packed[:, :, :kv_len, :]
+        v_norms_sliced = v_norms[:, :, :kv_len, :]
+
+        out_fused = self.tq4_sdpa(
+            q,
+            k_packed_sliced,
+            k_norms_sliced,
+            v_packed_sliced,
+            v_norms_sliced,
+            centroids,
+            rotation,
+            attn_mask=None,
+            is_causal=False,
+            scale=None,
+            kv_len=None,
+            mask_is_causal=False,
+        )
+
+        # Agreement is judged by cosine similarity (> 0.99), not exact equality.
+        self.assertFalse(torch.isnan(out_splitk).any(), "NaN in split-K output")
+        self.assertFalse(torch.isnan(out_fused).any(), "NaN in fused output")
+        cos = _cosine_sim(out_splitk, out_fused)
+        self.assertGreater(
+            cos,
+            0.99,
+            f"Split-K vs Fused cosine {cos:.5f} < 0.99 "
+            f"(B={B} H_q={H_q} H_kv={H_kv} D={D} kv_len={kv_len})",
+        )
+
+    def test_splitk_batch2(self):
+        """Split-K decode (Lq=1) with batch size B=2.
+
+        Exercises the per-batch indexing in the split-K and reduce kernels
+        (b = pid_bh // H_grid). With kv_len=512 in a 1024-entry buffer, the
+        split-K output must match the fused path (cosine > 0.99)."""
+        self._run_splitk_vs_fused_test(
+            H_q=16, H_kv=2, D=256, Lq=1, kv_len=512, buffer_len=1024, B=2
+        )
+
+    def test_splitk_noncontiguous_query(self):
+        """Split-K decode (Lq=1, B=2) with a non-contiguous query.
+
+        The host wrapper rotates Q (Q @ Pi^T) before launching the kernel,
+        so a strided query must yield the same result as its contiguous
+        copy. Builds a query whose last-dim stride is 2 by slicing a padded
+        buffer, then checks it matches the contiguous query (cosine > 0.999)."""
+        H_q, H_kv, D, kv_len, B = 16, 2, 256, 512, 2  # buffer == kv_len: no tail
+        torch.manual_seed(42)
+        centroids, boundaries, rotation = _make_codebook_and_rotation(D)
+        centroids = centroids.cuda()
+        boundaries = boundaries.cuda()
+        rotation = rotation.cuda()
+
+        k = torch.randn(B, H_kv, kv_len, D, dtype=torch.bfloat16, device="cuda")
+        v = torch.randn(B, H_kv, kv_len, D, dtype=torch.bfloat16, device="cuda")
+        k_packed, k_norms = _compress(k, boundaries, rotation)
+        v_packed, v_norms = _compress(v, boundaries, rotation)
+
+        q = torch.randn(B, H_q, 1, D, dtype=torch.bfloat16, device="cuda")
+        # View into a [B, H_q, 1, D, 2] buffer: same values as q, last-dim stride 2.
+        q_pad = torch.empty(B, H_q, 1, D, 2, dtype=torch.bfloat16, device="cuda")
+        q_pad[..., 0] = q
+        q_nc = q_pad[..., 0]
+        self.assertFalse(q_nc.is_contiguous(), "query should be non-contiguous")
+
+        kv_len_t = torch.tensor([kv_len], dtype=torch.int32, device="cuda")
+
+        def _run(query):  # identical K/V and kv_len; only the query layout varies
+            return self.tq4_sdpa(
+                query,
+                k_packed,
+                k_norms,
+                v_packed,
+                v_norms,
+                centroids,
+                rotation,
+                attn_mask=None,
+                is_causal=False,
+                scale=None,
+                kv_len=kv_len_t,
+                mask_is_causal=False,
+            )
+
+        out_contig = _run(q)
+        out_nc = _run(q_nc)
+
+        self.assertFalse(torch.isnan(out_nc).any(), "NaN in non-contiguous output")
+        cos = _cosine_sim(out_nc, out_contig)
+        self.assertGreater(
+            cos, 0.999, f"non-contiguous vs contiguous query cosine {cos:.5f}"
+        )
+
+    def test_kv_len_clamp_decode_hd512_gqa_8_4(self):
+        """Decode (Lq=1) kv_len clamp at a head_dim=512, GQA 8:4 shape.
+        N=8192 leaves a 24576-entry garbage tail in the 32768-entry buffer
+        (clamp guard); N=32768 fills the buffer (full-length loop)."""
+        for N in (8192, 32768):
+            with self.subTest(N=N):
+                self._run_long_kv_test(
+                    H_q=8, H_kv=4, D=512, Lq=1, kv_len=N, buffer_len=32768
+                )
+
+    def test_kv_len_clamp_decode_hd512_gqa_8_4_splitk(self):
+        """Split-K decode (Lq=1) at a head_dim=512, GQA 8:4 shape with long
+        KV. Verifies split-K output matches BOTH (a) the fp32 reference over
+        the first kv_len positions AND (b) the fused-kernel output, each to
+        cosine > 0.99 (not bit-exact). Garbage tail is the negative control."""
+        for N in (8192, 32768):
+            with self.subTest(N=N):
+                # Run with split-K (kv_len >= 256 triggers split-K)
+                _ = self._run_long_kv_test(
+                    H_q=8,
+                    H_kv=4,
+                    D=512,
+                    Lq=1,
+                    kv_len=N,
+                    buffer_len=32768,
+                    min_cosine=0.99,
+                )
+                # Also compare against the fused kernel: the helper reruns with
+                # kv_len=None on buffers sliced to kv_len and compares outputs
+                self._run_splitk_vs_fused_test(
+                    H_q=8, H_kv=4, D=512, Lq=1, kv_len=N, buffer_len=32768
+                )
+
+    def test_kv_len_clamp_decode_hd256_gqa_16_2(self):
+        """Decode (Lq=1) kv_len clamp at a head_dim=256, GQA 16:2 shape."""
+        for N in (8192, 32768):  # partially filled vs. fully filled 32768 buffer
+            with self.subTest(N=N):
+                self._run_long_kv_test(
+                    H_q=16, H_kv=2, D=256, Lq=1, kv_len=N, buffer_len=32768
+                )
+
+    def test_kv_len_clamp_decode_hd256_gqa_16_2_splitk(self):
+        """Split-K decode (Lq=1) at a head_dim=256, GQA 16:2 shape with long
+        KV. Verifies split-K output matches BOTH the fp32 reference AND the
+        fused kernel, each to cosine > 0.99."""
+        for N in (8192, 32768):
+            with self.subTest(N=N):
+                _ = self._run_long_kv_test(  # (a) check against fp32 reference
+                    H_q=16,
+                    H_kv=2,
+                    D=256,
+                    Lq=1,
+                    kv_len=N,
+                    buffer_len=32768,
+                    min_cosine=0.99,
+                )
+                self._run_splitk_vs_fused_test(  # (b) check against fused kernel
+                    H_q=16, H_kv=2, D=256, Lq=1, kv_len=N, buffer_len=32768
+                )
+
+    def test_mask_is_causal_prefill_hd512_gqa_8_4(self):
+        """Chunked prefill (Lq>1) with mask_is_causal at a head_dim=512,
+        GQA 8:4 shape. The Lq queries are the last Lq of a kv_len-long
+        context; the per-tile causal block-skip plus bottom-right mask must
+        match the fp32 causal reference over the first kv_len positions. A
+        garbage tail beyond kv_len also exercises the clamp."""
+        for Lq, kv_len, buf in ((256, 4096, 8192), (2048, 8192, 16384)):
+            with self.subTest(Lq=Lq, kv_len=kv_len):
+                self._run_long_kv_test(
+                    H_q=8,
+                    H_kv=4,
+                    D=512,
+                    Lq=Lq,
+                    kv_len=kv_len,
+                    buffer_len=buf,  # twice kv_len in both cases
+                    causal=True,
+                )
+
+    def test_mask_is_causal_prefill_hd256_gqa_16_2(self):
+        """Chunked prefill (Lq>1) with mask_is_causal at a head_dim=256,
+        GQA 16:2 shape; cases are (Lq, kv_len, buffer_len) triples."""
+        for Lq, kv_len, buf in ((256, 4096, 8192), (2048, 8192, 16384)):
+            with self.subTest(Lq=Lq, kv_len=kv_len):
+                self._run_long_kv_test(
+                    H_q=16,
+                    H_kv=2,
+                    D=256,
+                    Lq=Lq,
+                    kv_len=kv_len,
+                    buffer_len=buf,  # twice kv_len in both cases
+                    causal=True,
+                )
+
+    def test_kv_len_none_fallback_hd256_gqa_16_2(self):
+        """Regression: the kv_len=None fallback (HAS_KV_LEN False, full-Lk
+        loop) still matches the fp32 reference. This guards the original
+        behavior the kv_len feature must preserve for callers that pass
+        neither kv_len nor mask_is_causal."""
+        self._run_long_kv_test(
+            H_q=16,
+            H_kv=2,
+            D=256,
+            Lq=1,
+            kv_len=256,
+            buffer_len=256,  # equal to kv_len: the whole buffer is valid
+            garbage=False,
+            pass_kv_len=False,  # presumably calls the op with kv_len=None — confirm
+        )
+
+    @unittest.skipUnless(
+        os.environ.get("TQ4_RUN_128K") == "1",
+        "128k case is heavy for the 24GB CI runner; set TQ4_RUN_128K=1 to run",
+    )
+    def test_kv_len_clamp_128k(self):
+        """Full 131072-entry buffer (head_dim=256, GQA 16:2). (a) kv_len=8192
+        with a ~123k garbage tail — the clamp keeps decode O(context) and
+        never touches the tail; (b) kv_len=131072 — correctness at true 128k
+        scale. Gated behind TQ4_RUN_128K because the fp32 reference for (b)
+        needs >~6GB and CI runs on a 24GB A10G."""
+        self._run_long_kv_test(  # (a) short context inside the 131072 buffer
+            H_q=16, H_kv=2, D=256, Lq=1, kv_len=8192, buffer_len=131072
+        )
+        self._run_long_kv_test(  # (b) buffer completely filled, so no tail
+            H_q=16,
+            H_kv=2,
+            D=256,
+            Lq=1,
+            kv_len=131072,
+            buffer_len=131072,
+            garbage=False,
+        )
+
     # ------------------------------------------------------------------
     # Validation errors
     # ------------------------------------------------------------------
diff --git a/backends/cuda/triton/kernels/sdpa.py b/backends/cuda/triton/kernels/sdpa.py
index fb665e538bf..37989349ea7 100644
--- a/backends/cuda/triton/kernels/sdpa.py
+++ b/backends/cuda/triton/kernels/sdpa.py
@@ -45,6 +45,15 @@ def _is_power_of_2(n: int) -> bool:
     return n > 0 and (n & (n - 1)) == 0
 
 
+# L_kv (buffer length) at/above which sdpa() routes a kv_len-bounded decode
+# (L_q == 1) to the split-K flash-decoding kernel. Mirrors the threshold the
+# CUDA replacement pass uses to pick triton.sdpa_decode_splitk.
+_SPLITK_LKV_THRESHOLD = 256
+
+# FlashDecoding++ unified-max constant used by the split-K decode path.
+_DEFAULT_SPLITK_PHI = 5.0  # same value as sdpa_decode_splitk's phi default
+
+
 def _next_power_of_2(x: int) -> int:
     """Get the next power of 2 >= x, clamped to [16, 256].
 
@@ -160,6 +169,7 @@ def _sdpa_fwd_kernel_non_pow2(
     v_ptr,
     o_ptr,
     mask_ptr,
+    kv_len_ptr,
     B,
     H_grid,
     LQ,
@@ -191,6 +201,7 @@ def _sdpa_fwd_kernel_non_pow2(
     BLOCK_D: tl.constexpr,
     HAS_MASK: tl.constexpr,
     IS_CAUSAL: tl.constexpr,
+    HAS_KV_LEN: tl.constexpr,
     NUM_GROUPS: tl.constexpr,
     PACK_GQA: tl.constexpr,
 ):
@@ -254,9 +265,15 @@ def _sdpa_fwd_kernel_non_pow2(
 
     NEG_INF: tl.constexpr = float("-inf")
 
-    for start_n in tl.range(0, LK, BLOCK_N, num_stages=2):
+    # Bound the KV loop to valid (filled) positions; see pow2 body for details.
+    if HAS_KV_LEN:
+        kv_len = tl.load(kv_len_ptr)
+    else:
+        kv_len = LK
+
+    for start_n in tl.range(0, kv_len, BLOCK_N, num_stages=2):
         offs_n = start_n + tl.arange(0, BLOCK_N)
-        kv_col_mask = offs_n < LK
+        kv_col_mask = offs_n < kv_len
 
         k_ptrs = k_base + (offs_n[:, None] * stride_kl + offs_d[None, :] * stride_kd)
         k = tl.load(k_ptrs, mask=kv_col_mask[:, None] & d_mask[None, :], other=0.0)
@@ -332,6 +349,7 @@ def _sdpa_fwd_kernel_body(
     V_ptr,
     O_ptr,
     Mask_ptr,
+    KV_LEN_ptr,
     B,
     H_grid,
     Lq,
@@ -358,6 +376,7 @@ def _sdpa_fwd_kernel_body(
     sm_scale: tl.float32,
     HAS_MASK: tl.constexpr,
     IS_CAUSAL: tl.constexpr,
+    HAS_KV_LEN: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
     HEAD_DIM: tl.constexpr,
@@ -422,6 +441,18 @@ def _sdpa_fwd_kernel_body(
 
     offs_n_init = tl.arange(0, BLOCK_N)
 
+    # Bound the KV loop to the number of valid (filled) positions instead of the
+    # full pre-allocated buffer Lk. For decode this is input_pos+1; for a prefill
+    # chunk it is chunk_end. This makes full-attention (global) layers O(context)
+    # rather than O(max_seq_len) — the empty tail of the cache is never touched.
+    # kv_len is read from a GPU scalar so the bound updates across CUDA-graph
+    # replays (decode is graph-captured). When not provided (HAS_KV_LEN False) it
+    # falls back to Lk, preserving the original behavior exactly.
+    if HAS_KV_LEN:
+        kv_len = tl.load(KV_LEN_ptr)
+    else:
+        kv_len = Lk
+
     # Window-aware early-exit. A KV block that is fully masked (sliding-window
     # or causal) contributes nothing to the online softmax — every entry is
     # -inf, so p=0 and m_i/l_i/acc are left unchanged. We detect such blocks up
@@ -434,7 +465,7 @@ def _sdpa_fwd_kernel_body(
     if IS_CAUSAL:
         max_seq_pos = tl.max(seq_pos)
 
-    for start_n in tl.range(0, Lk, BLOCK_N):
+    for start_n in tl.range(0, kv_len, BLOCK_N):
         offs_n = start_n + offs_n_init
 
         # Decide whether any row in this tile actually attends to this KV block.
@@ -444,7 +475,7 @@ def _sdpa_fwd_kernel_body(
                 + (seq_pos[:, None] * stride_mq)
                 + (offs_n[None, :] * stride_mk)
             )
-            mn_mask = row_valid[:, None] & (offs_n[None, :] < Lk)
+            mn_mask = row_valid[:, None] & (offs_n[None, :] < kv_len)
             mask_block = tl.load(mask_ptrs, mask=mn_mask, other=False)
             block_active = tl.sum(mask_block.to(tl.int32)) > 0
         elif IS_CAUSAL:
@@ -461,7 +492,7 @@ def _sdpa_fwd_kernel_body(
                 + (offs_n[:, None] * stride_kn)
                 + (offs_d[None, :] * stride_kd)
             )
-            k_mask = (offs_n[:, None] < Lk) & (offs_d[None, :] < HEAD_DIM)
+            k_mask = (offs_n[:, None] < kv_len) & (offs_d[None, :] < HEAD_DIM)
             k = tl.load(k_ptrs, mask=k_mask, other=0.0).to(tl.bfloat16)
 
             qk = (tl.dot(q, tl.trans(k)).to(tl.float32) * sm_scale).to(tl.float32)
@@ -493,7 +524,7 @@ def _sdpa_fwd_kernel_body(
                 + (offs_n[:, None] * stride_vn)
                 + (offs_d[None, :] * stride_vd)
             )
-            v_mask = (offs_n[:, None] < Lk) & (offs_d[None, :] < HEAD_DIM)
+            v_mask = (offs_n[:, None] < kv_len) & (offs_d[None, :] < HEAD_DIM)
             v = tl.load(v_ptrs, mask=v_mask, other=0.0).to(tl.bfloat16)
 
             p_bf16 = p_f32.to(tl.bfloat16)
@@ -523,111 +554,64 @@ def _sdpa_fwd_kernel_body(
     tl.store(o_ptrs, acc.to(tl.bfloat16), mask=o_mask)
 
 
-@triton.autotune(
-    configs=[
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_warps=4, num_stages=3),
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_warps=8, num_stages=2),
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 256}, num_warps=8, num_stages=3),
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 32}, num_warps=4, num_stages=2),
-    ],
-    key=["Lq", "Lk", "HEAD_DIM", "HAS_MASK", "IS_CAUSAL", "NUM_GROUPS", "PACK_GQA"],
-)
-@triton.jit
-def _sdpa_fwd_kernel_m64(
-    Q_ptr,
-    K_ptr,
-    V_ptr,
-    O_ptr,
-    Mask_ptr,
-    B,
-    H_grid,
-    Lq,
-    Lk,
-    stride_qb,
-    stride_qh,
-    stride_qm,
-    stride_qd,
-    stride_kb,
-    stride_kh,
-    stride_kn,
-    stride_kd,
-    stride_vb,
-    stride_vh,
-    stride_vn,
-    stride_vd,
-    stride_ob,
-    stride_oh,
-    stride_om,
-    stride_od,
-    stride_mb,
-    stride_mq,
-    stride_mk,
-    sm_scale: tl.float32,
-    HAS_MASK: tl.constexpr,
-    IS_CAUSAL: tl.constexpr,
-    HEAD_DIM: tl.constexpr,
-    NUM_GROUPS: tl.constexpr,
-    PACK_GQA: tl.constexpr,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-):
-    _sdpa_fwd_kernel_body(
-        Q_ptr,
-        K_ptr,
-        V_ptr,
-        O_ptr,
-        Mask_ptr,
-        B,
-        H_grid,
-        Lq,
-        Lk,
-        stride_qb,
-        stride_qh,
-        stride_qm,
-        stride_qd,
-        stride_kb,
-        stride_kh,
-        stride_kn,
-        stride_kd,
-        stride_vb,
-        stride_vh,
-        stride_vn,
-        stride_vd,
-        stride_ob,
-        stride_oh,
-        stride_om,
-        stride_od,
-        stride_mb,
-        stride_mq,
-        stride_mk,
-        sm_scale,
-        HAS_MASK=HAS_MASK,
-        IS_CAUSAL=IS_CAUSAL,
-        BLOCK_M=BLOCK_M,
-        BLOCK_N=BLOCK_N,
-        HEAD_DIM=HEAD_DIM,
-        NUM_GROUPS=NUM_GROUPS,
-        PACK_GQA=PACK_GQA,
-    )
+# Prefill / standard-path tile configs. ONE autotuned kernel spanning BLOCK_M in
+# {16, 32, 64, 128}; `_sdpa_prefill_prune` drops configs whose fp32 accumulator
+# acc[BLOCK_M, HEAD_DIM] would spill registers for the runtime HEAD_DIM, so the
+# kernel stays high-occupancy across the power-of-2 HEAD_DIMs it serves. This
+# replaces the old fixed BLOCK_M=64 (m64) / BLOCK_M=32 (m32) wrappers + Python
+# CTA-count selector: at HEAD_DIM=512 the m64 path spilled acc[64,512] fp32
+# (128 KB/CTA -> ~280 reg spills -> ~30 TFLOP/s); the autotuner now picks a
+# non-spilling, well-pipelined tile per HEAD_DIM (e.g. BLOCK_M=32 at 512).
+_SDPA_PREFILL_CONFIGS = [
+    triton.Config({"BLOCK_M": 16, "BLOCK_N": 32}, num_warps=4, num_stages=2),
+    triton.Config({"BLOCK_M": 16, "BLOCK_N": 64}, num_warps=4, num_stages=3),
+    triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_warps=4, num_stages=2),
+    triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_warps=4, num_stages=3),
+    triton.Config({"BLOCK_M": 32, "BLOCK_N": 64}, num_warps=4, num_stages=2),
+    triton.Config({"BLOCK_M": 32, "BLOCK_N": 64}, num_warps=8, num_stages=3),
+    triton.Config({"BLOCK_M": 64, "BLOCK_N": 32}, num_warps=8, num_stages=2),
+    triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=2),
+    triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=8, num_stages=3),
+    triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_warps=8, num_stages=3),
+    triton.Config({"BLOCK_M": 128, "BLOCK_N": 64}, num_warps=8, num_stages=3),
+    triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_warps=8, num_stages=3),
+]
+
+
+def _sdpa_prefill_prune(configs, nargs, **kwargs):
+    """Drop configs whose fp32 acc[BLOCK_M, HEAD_DIM] would spill registers.
+
+    Keeps ``BLOCK_M * HEAD_DIM <= 4096 * num_warps`` (the measured A100 no-spill
+    boundary: HEAD_DIM=512 -> BLOCK_M<=32 at 4 warps / <=64 at 8 warps;
+    HEAD_DIM=128 -> BLOCK_M<=128 at 4 warps). HEAD_DIM is read from ``kwargs``,
+    then ``nargs``; if absent, ``configs`` is returned unpruned. The result is
+    never empty: if nothing fits, the config with the smallest
+    BLOCK_M / num_warps is kept. SMEM-OOR tiles are left to the autotuner.
+    """
+    head_dim = kwargs.get("HEAD_DIM")
+    if head_dim is None and nargs is not None:
+        head_dim = nargs.get("HEAD_DIM")
+    if head_dim is None:
+        return configs
+    kept = [c for c in configs if c.kwargs["BLOCK_M"] * head_dim <= 4096 * c.num_warps]
+    if not kept:  # needs HEAD_DIM > 1024 when given _SDPA_PREFILL_CONFIGS
+        kept = [min(configs, key=lambda c: c.kwargs["BLOCK_M"] / c.num_warps)]
+    return kept
 
 
 @triton.autotune(
-    configs=[
-        triton.Config({"BLOCK_M": 32, "BLOCK_N": 64}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_M": 32, "BLOCK_N": 128}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_M": 32, "BLOCK_N": 256}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_warps=4, num_stages=2),
-    ],
+    configs=_SDPA_PREFILL_CONFIGS,
     key=["Lq", "Lk", "HEAD_DIM", "HAS_MASK", "IS_CAUSAL", "NUM_GROUPS", "PACK_GQA"],
+    prune_configs_by={"early_config_prune": _sdpa_prefill_prune},
 )
 @triton.jit
-def _sdpa_fwd_kernel_m32(
+def _sdpa_fwd_kernel(
     Q_ptr,
     K_ptr,
     V_ptr,
     O_ptr,
     Mask_ptr,
+    KV_LEN_ptr,
     B,
     H_grid,
     Lq,
@@ -654,6 +638,7 @@ def _sdpa_fwd_kernel_m32(
     sm_scale: tl.float32,
     HAS_MASK: tl.constexpr,
     IS_CAUSAL: tl.constexpr,
+    HAS_KV_LEN: tl.constexpr,
     HEAD_DIM: tl.constexpr,
     NUM_GROUPS: tl.constexpr,
     PACK_GQA: tl.constexpr,
@@ -666,6 +651,7 @@ def _sdpa_fwd_kernel_m32(
         V_ptr,
         O_ptr,
         Mask_ptr,
+        KV_LEN_ptr,
         B,
         H_grid,
         Lq,
@@ -692,6 +678,7 @@ def _sdpa_fwd_kernel_m32(
         sm_scale,
         HAS_MASK=HAS_MASK,
         IS_CAUSAL=IS_CAUSAL,
+        HAS_KV_LEN=HAS_KV_LEN,
         BLOCK_M=BLOCK_M,
         BLOCK_N=BLOCK_N,
         HEAD_DIM=HEAD_DIM,
@@ -785,6 +772,8 @@ def _launch_pow2_kernel(
     is_causal: bool,
     num_groups: int,
     pack_gqa: bool,
+    kv_len_ptr: Optional[torch.Tensor] = None,
+    HAS_KV_LEN: bool = False,
 ) -> None:
     """Launch power-of-2 optimized SDPA kernel."""
     stride_qb, stride_qh, stride_qm, stride_qd = query.stride()
@@ -802,18 +791,18 @@ def _launch_pow2_kernel(
     def grid(meta):
         return (triton.cdiv(Lq_packed, meta["BLOCK_M"]), B * H_grid)
 
-    total_ctas_m64 = ((Lq_packed + 63) // 64) * (B * H_grid)
-    threshold = 4 * 84
-    kernel = (
-        _sdpa_fwd_kernel_m32 if total_ctas_m64 < threshold else _sdpa_fwd_kernel_m64
-    )
-
-    wrap_triton(kernel)[grid](
+    # Single autotuned kernel: the config set spans BLOCK_M in {16..128} and
+    # `_sdpa_prefill_prune` keeps only non-spilling tiles for this HEAD_DIM, so
+    # the autotuner picks a high-occupancy tile (small BLOCK_M for large HEAD_DIM
+    # or for small problems that need more CTAs, larger BLOCK_M otherwise) —
+    # subsuming the old CTA-count m32/m64 selector.
+    wrap_triton(_sdpa_fwd_kernel)[grid](
         query,
         key,
         value,
         out,
         Mask_ptr if HAS_MASK else 0,
+        kv_len_ptr if HAS_KV_LEN else 0,
         B,
         H_grid,
         L_q,
@@ -840,6 +829,7 @@ def grid(meta):
         sm_scale,
         HAS_MASK=HAS_MASK,
         IS_CAUSAL=is_causal,
+        HAS_KV_LEN=HAS_KV_LEN,
         HEAD_DIM=D,
         NUM_GROUPS=num_groups,
         PACK_GQA=pack_gqa,
@@ -863,6 +853,8 @@ def _launch_non_pow2_kernel(
     is_causal: bool,
     num_groups: int,
     pack_gqa: bool,
+    kv_len_ptr: Optional[torch.Tensor] = None,
+    HAS_KV_LEN: bool = False,
 ) -> None:
     """Launch non-power-of-2 SDPA kernel with dynamic HEAD_DIM masking."""
     stride_qb, stride_qh, stride_qm, stride_qd = query.stride()
@@ -902,6 +894,7 @@ def grid_non_pow2(meta):
         value,
         out,
         mask_ptr,
+        kv_len_ptr if HAS_KV_LEN else 0,
         B,
         H_grid,
         L_q,
@@ -933,6 +926,7 @@ def grid_non_pow2(meta):
         BLOCK_D=BLOCK_D,
         HAS_MASK=HAS_MASK,
         IS_CAUSAL=is_causal,
+        HAS_KV_LEN=HAS_KV_LEN,
         NUM_GROUPS=num_groups,
         PACK_GQA=pack_gqa,
         num_warps=num_warps,
@@ -950,6 +944,7 @@ def sdpa(
     is_causal: bool = False,
     scale: float = 0.0,
     enable_gqa: bool = False,
+    kv_len: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Triton fused Scaled Dot-Product Attention with GQA pack optimization.
@@ -967,6 +962,15 @@ def sdpa(
         is_causal: apply causal masking
         scale: attention scale (default: 1/sqrt(D))
         enable_gqa: allow H_q != H_kv (GQA/MQA)
+        kv_len: Optional GPU int scalar = number of valid (filled) KV positions.
+            When provided, the inner KV loop is bounded to ``kv_len`` instead of
+            the full pre-allocated ``L_kv``, making attention O(context) instead
+            of O(max_seq_len). It is read on-device (no host sync) so the bound
+            updates correctly under CUDA-graph replay (decode). For decode pass
+            ``input_pos + 1``; for a prefill chunk pass ``chunk_end``. When None
+            the loop runs over the full ``L_kv`` (original behavior). An L_q==1
+            decode with power-of-2 D and L_kv >= 256 that supplies it is routed
+            to the split-K flash-decoding kernel for occupancy.
     Returns:
         Output tensor [B, H_q, L_q, D], dtype torch.bfloat16
     """
@@ -984,6 +988,54 @@ def sdpa(
             "For decode (L_q < L_kv), use an explicit bool mask instead."
         )
 
+    out = torch.empty((B, H_q, L_q, D), device=query.device, dtype=query.dtype)
+    sm_scale = 1.0 / math.sqrt(D) if scale == 0.0 else scale
+    HAS_MASK, Mask_ptr, stride_mb, stride_mq, stride_mk = _prepare_mask_params(
+        attn_mask, B, L_q, L_kv
+    )
+
+    # Optional length bound: device int32 scalar, clamped to the buffer size for
+    # OOB safety. Reshaped to [1] so the kernel can ``tl.load`` element 0. No
+    # ``.item()`` — keeps it CUDA-graph-safe (value updates on replay).
+    HAS_KV_LEN = kv_len is not None
+    if HAS_KV_LEN:
+        kv_len_t = torch.clamp(
+            kv_len.reshape(1).to(torch.int32), max=int(L_kv)
+        ).contiguous()
+    else:
+        kv_len_t = None
+
+    # Split-K decode dispatch: L_q == 1 with a kv_len bound and a large KV
+    # buffer. Flash-decoding partitions the KV sequence across many CTAs for
+    # better occupancy (L_q=1 launches too few CTAs otherwise). The split is
+    # static (from buffer size L_kv, not the runtime kv_len value) so it is
+    # export/AOTI-traceable; the kernel still bounds each split's loop by kv_len
+    # on-device (CUDA-graph safe). Only taken when kv_len is supplied, so callers
+    # that don't pass kv_len keep the exact original (standard-kernel) dispatch.
+    if HAS_KV_LEN and L_q == 1 and _is_power_of_2(D) and L_kv >= _SPLITK_LKV_THRESHOLD:
+        _launch_decode_splitk(
+            query,
+            key,
+            value,
+            out,
+            B,
+            H_q,
+            H_kv,
+            L_kv,
+            D,
+            sm_scale,
+            HAS_MASK,
+            Mask_ptr,
+            stride_mb,
+            stride_mq,
+            stride_mk,
+            num_groups,
+            _DEFAULT_SPLITK_PHI,
+            kv_len_t,
+            HAS_KV_LEN,
+        )
+        return out
+
     # Decide whether to pack GQA based on tile utilization heuristic.
     # Use the actual BLOCK_M that the launched kernel will use:
     # - non-pow2 path always uses BLOCK_M=32
@@ -995,12 +1047,6 @@ def sdpa(
         block_m = 32 if total_ctas_m64 < 4 * 84 else 64
     pack_gqa = _should_pack_gqa(L_q, num_groups, block_m)
 
-    out = torch.empty((B, H_q, L_q, D), device=query.device, dtype=query.dtype)
-    sm_scale = 1.0 / math.sqrt(D) if scale == 0.0 else scale
-    HAS_MASK, Mask_ptr, stride_mb, stride_mq, stride_mk = _prepare_mask_params(
-        attn_mask, B, L_q, L_kv
-    )
-
     if _is_power_of_2(D):
         _launch_pow2_kernel(
             query,
@@ -1022,6 +1068,8 @@ def sdpa(
             is_causal,
             num_groups,
             pack_gqa,
+            kv_len_t,
+            HAS_KV_LEN,
         )
     else:
         _launch_non_pow2_kernel(
@@ -1041,6 +1089,8 @@ def sdpa(
             is_causal,
             num_groups,
             pack_gqa,
+            kv_len_t,
+            HAS_KV_LEN,
         )
 
     return out
@@ -1058,6 +1108,7 @@ def _sdpa_abstract(
     is_causal: bool = False,
     scale: float = 0.0,
     enable_gqa: bool = False,
+    kv_len: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Abstract/fake implementation for torch.export.
@@ -1104,6 +1155,7 @@ def _sdpa_decode_splitk_kernel(
     O_partial_ptr,
     L_partial_ptr,
     Mask_ptr,
+    KV_LEN_ptr,
     B,
     H_kv,
     Lk,
@@ -1133,6 +1185,7 @@ def _sdpa_decode_splitk_kernel(
     phi: tl.float32,
     chunk_size,
     HAS_MASK: tl.constexpr,
+    HAS_KV_LEN: tl.constexpr,
     BLOCK_N: tl.constexpr,
     HEAD_DIM: tl.constexpr,
     NUM_GROUPS: tl.constexpr,
@@ -1144,7 +1197,15 @@ def _sdpa_decode_splitk_kernel(
     h_kv = pid_bh % H_kv
 
     start_n = split_id * chunk_size
-    end_n = tl.minimum(start_n + chunk_size, Lk)
+    # Bound the decode KV sweep to the valid (filled) positions. Splits whose
+    # chunk starts past kv_len do no work (end_n <= start_n) and store the zero
+    # partials they were initialized with, so the reduce is unaffected. kv_len is
+    # read on-device (CUDA-graph safe); falls back to Lk when not provided.
+    if HAS_KV_LEN:
+        kv_len = tl.load(KV_LEN_ptr)
+    else:
+        kv_len = Lk
+    end_n = tl.minimum(start_n + chunk_size, kv_len)
 
     offs_d = tl.arange(0, HEAD_DIM)
     offs_g = tl.arange(0, BLOCK_G)
@@ -1293,6 +1354,8 @@ def _launch_decode_splitk(
     stride_mk: int,
     num_groups: int,
     phi: float,
+    kv_len_ptr: Optional[torch.Tensor] = None,
+    HAS_KV_LEN: bool = False,
 ) -> None:
     num_splits = min(max(triton.cdiv(L_kv, 256), 1), 128)
     chunk_size = triton.cdiv(L_kv, num_splits)
@@ -1319,6 +1382,7 @@ def _launch_decode_splitk(
         O_partial,
         L_partial,
         Mask_ptr if HAS_MASK else 0,
+        kv_len_ptr if HAS_KV_LEN else 0,
         B,
         H_kv,
         L_kv,
@@ -1348,6 +1412,7 @@ def _launch_decode_splitk(
         phi,
         chunk_size,
         HAS_MASK=HAS_MASK,
+        HAS_KV_LEN=HAS_KV_LEN,
         HEAD_DIM=D,
         NUM_GROUPS=num_groups,
         BLOCK_G=_next_power_of_2_unclamped(num_groups),
@@ -1387,6 +1452,7 @@ def sdpa_decode_splitk(
     scale: float = 0.0,
     enable_gqa: bool = False,
     phi: float = 5.0,
+    kv_len: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """Split-K flash-decoding SDPA for L_q=1 (decode step).
 
@@ -1396,6 +1462,10 @@ def sdpa_decode_splitk(
     Signature mirrors sdpa() for drop-in use with torch.cond dispatch.
     enable_gqa is accepted but ignored — GQA is handled natively via
     H_q // H_kv grouping; no packed-GQA tradeoff exists at L_q=1.
+
+    kv_len: optional GPU int scalar bounding the KV sweep to the valid
+    (filled) positions (O(context) instead of O(max_seq_len)). Read
+    on-device, CUDA-graph safe. When None, sweeps the full L_kv.
     """
     _validate_sdpa_inputs(query, key, value, dropout_p, enable_gqa)
 
@@ -1431,6 +1501,14 @@ def sdpa_decode_splitk(
         attn_mask, B, L_q, L_kv
     )
 
+    HAS_KV_LEN = kv_len is not None
+    if HAS_KV_LEN:
+        kv_len_t = torch.clamp(
+            kv_len.reshape(1).to(torch.int32), max=int(L_kv)
+        ).contiguous()
+    else:
+        kv_len_t = None
+
     _launch_decode_splitk(
         query,
         key,
@@ -1449,6 +1527,8 @@ def sdpa_decode_splitk(
         stride_mk,
         num_groups,
         phi,
+        kv_len_t,
+        HAS_KV_LEN,
     )
     return out
 
@@ -1464,6 +1544,7 @@ def _sdpa_decode_splitk_abstract(
     scale: float = 0.0,
     enable_gqa: bool = False,
     phi: float = 5.0,
+    kv_len: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype"
     B, H_q, L_q, D = query.shape
diff --git a/backends/cuda/triton/kernels/tq4_sdpa.py b/backends/cuda/triton/kernels/tq4_sdpa.py
index 10f02c7fa3c..427f2eef4eb 100644
--- a/backends/cuda/triton/kernels/tq4_sdpa.py
+++ b/backends/cuda/triton/kernels/tq4_sdpa.py
@@ -194,7 +194,7 @@ def _tq4_sdpa_fwd_kernel_body(
     # causal mask); otherwise the full kv_len bound is kept, which is safe for an
     # arbitrary mask.
     loop_end = kv_len
-    if MASK_IS_CAUSAL:
+    if MASK_IS_CAUSAL or IS_CAUSAL:
         max_q_pos = (kv_len - Lq) + tl.max(seq_pos)
         loop_end = tl.minimum(kv_len, max_q_pos + 1)
 
@@ -227,7 +227,12 @@ def _tq4_sdpa_fwd_kernel_body(
             qk = tl.where(mask_block, qk, float("-inf"))
 
         if IS_CAUSAL:
-            causal = offs_n[None, :] > seq_pos[:, None]
+            # Absolute causal-offset: a query row's KV position is
+            # (kv_len - Lq) + seq_pos, correct for chunked prefill (Lq < kv_len).
+            # For the square is_causal case (kv_len == Lq) it reduces to
+            # offs_n > seq_pos. This lets a caller that guarantees a standard
+            # causal mask skip the materialized mask read entirely.
+            causal = offs_n[None, :] > (kv_len - Lq) + seq_pos[:, None]
             qk = tl.where(causal, float("-inf"), qk)
 
         qk = tl.where(kv_valid[None, :], qk, float("-inf"))
@@ -283,138 +288,25 @@ def _tq4_sdpa_fwd_kernel_body(
 
 
 # ---------------------------------------------------------------------------
-# Autotuned kernel wrappers (M64 and M32)
+# Autotuned prefill kernel (single, no-spill)
 # ---------------------------------------------------------------------------
 
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_warps=4, num_stages=3),
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_warps=8, num_stages=2),
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 256}, num_warps=8, num_stages=3),
-        triton.Config({"BLOCK_M": 64, "BLOCK_N": 32}, num_warps=4, num_stages=2),
-    ],
-    key=["Lq", "Lk", "HEAD_DIM", "HAS_MASK", "IS_CAUSAL", "NUM_GROUPS", "PACK_GQA"],
-)
-@triton.jit
-def _tq4_sdpa_fwd_kernel_m64(
-    Q_ptr,
-    KP_ptr,
-    KN_ptr,
-    VP_ptr,
-    VN_ptr,
-    LUT_hi_ptr,
-    LUT_lo_ptr,
-    Mask_ptr,
-    O_ptr,
-    KV_LEN_ptr,
-    B,
-    H_grid,
-    Lq,
-    Lk,
-    stride_qb,
-    stride_qh,
-    stride_qm,
-    stride_qd,
-    stride_kpb,
-    stride_kph,
-    stride_kpn,
-    stride_kpd,
-    stride_knb,
-    stride_knh,
-    stride_knn,
-    stride_vpb,
-    stride_vph,
-    stride_vpn,
-    stride_vpd,
-    stride_vnb,
-    stride_vnh,
-    stride_vnn,
-    stride_ob,
-    stride_oh,
-    stride_om,
-    stride_od,
-    stride_mb,
-    stride_mq,
-    stride_mk,
-    sm_scale: tl.float32,
-    HAS_MASK: tl.constexpr,
-    IS_CAUSAL: tl.constexpr,
-    HAS_KV_LEN: tl.constexpr,
-    MASK_IS_CAUSAL: tl.constexpr,
-    HEAD_DIM: tl.constexpr,
-    HALF_D: tl.constexpr,
-    NUM_GROUPS: tl.constexpr,
-    PACK_GQA: tl.constexpr,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-):
-    _tq4_sdpa_fwd_kernel_body(
-        Q_ptr,
-        KP_ptr,
-        KN_ptr,
-        VP_ptr,
-        VN_ptr,
-        LUT_hi_ptr,
-        LUT_lo_ptr,
-        Mask_ptr,
-        O_ptr,
-        KV_LEN_ptr,
-        B,
-        H_grid,
-        Lq,
-        Lk,
-        stride_qb,
-        stride_qh,
-        stride_qm,
-        stride_qd,
-        stride_kpb,
-        stride_kph,
-        stride_kpn,
-        stride_kpd,
-        stride_knb,
-        stride_knh,
-        stride_knn,
-        stride_vpb,
-        stride_vph,
-        stride_vpn,
-        stride_vpd,
-        stride_vnb,
-        stride_vnh,
-        stride_vnn,
-        stride_ob,
-        stride_oh,
-        stride_om,
-        stride_od,
-        stride_mb,
-        stride_mq,
-        stride_mk,
-        sm_scale,
-        HAS_MASK=HAS_MASK,
-        IS_CAUSAL=IS_CAUSAL,
-        HAS_KV_LEN=HAS_KV_LEN,
-        MASK_IS_CAUSAL=MASK_IS_CAUSAL,
-        BLOCK_M=BLOCK_M,
-        BLOCK_N=BLOCK_N,
-        HEAD_DIM=HEAD_DIM,
-        HALF_D=HALF_D,
-        NUM_GROUPS=NUM_GROUPS,
-        PACK_GQA=PACK_GQA,
-    )
-
-
-@triton.autotune(
-    configs=[
-        triton.Config({"BLOCK_M": 32, "BLOCK_N": 64}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_M": 32, "BLOCK_N": 128}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_M": 32, "BLOCK_N": 256}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_M": 32, "BLOCK_N": 64}, num_warps=8, num_stages=2),
+        triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_warps=4, num_stages=3),
         triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_warps=4, num_stages=2),
+        # Extra BLOCK_N in {32,64} configs for smaller-SMEM GPUs (e.g. RTX 5090);
+        # correctness-safe (cos~1.0), never BLOCK_N=16 (numerically wrong).
+        triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_warps=8, num_stages=2),
+        triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_warps=4, num_stages=4),
+        triton.Config({"BLOCK_M": 32, "BLOCK_N": 64}, num_warps=8, num_stages=3),
     ],
     key=["Lq", "Lk", "HEAD_DIM", "HAS_MASK", "IS_CAUSAL", "NUM_GROUPS", "PACK_GQA"],
 )
 @triton.jit
-def _tq4_sdpa_fwd_kernel_m32(
+def _tq4_sdpa_prefill_kernel(
     Q_ptr,
     KP_ptr,
     KN_ptr,
@@ -565,15 +457,7 @@ def _launch_tq4_kernel(
     def grid(meta):
         return (triton.cdiv(Lq_packed, meta["BLOCK_M"]), B * H_grid)
 
-    total_ctas_m64 = ((Lq_packed + 63) // 64) * (B * H_grid)
-    threshold = 4 * 84
-    kernel = (
-        _tq4_sdpa_fwd_kernel_m32
-        if total_ctas_m64 < threshold
-        else _tq4_sdpa_fwd_kernel_m64
-    )
-
-    wrap_triton(kernel)[grid](
+    wrap_triton(_tq4_sdpa_prefill_kernel)[grid](
         q_rot,
         k_packed,
         k_norms,
@@ -840,6 +724,19 @@ def tq4_sdpa(
             pack_gqa,
         )
     else:
+        # Prefill path (N_Q > 1, plus the rare N_Q==1 && N_KV<256 fallthrough).
+        # When the caller guarantees a standard causal mask AND kv_len is known
+        # (MASK_IS_CAUSAL), use the kernel's analytic absolute causal-offset and
+        # skip loading the materialized mask — numerically identical, no mask HBM
+        # traffic. Causal is then applied via IS_CAUSAL (which also drives the
+        # per-tile loop-end clamp), so MASK_IS_CAUSAL is passed False to the
+        # launcher. Otherwise honor the explicit mask / is_causal as-is.
+        if MASK_IS_CAUSAL:
+            prefill_has_mask = False
+            prefill_is_causal = True
+        else:
+            prefill_has_mask = HAS_MASK
+            prefill_is_causal = is_causal
         _launch_tq4_kernel(
             q_rot,
             k_packed,
@@ -858,13 +755,13 @@ def tq4_sdpa(
             N_KV,
             D,
             sm_scale,
-            HAS_MASK,
+            prefill_has_mask,
             HAS_KV_LEN,
-            MASK_IS_CAUSAL,
+            False,
             stride_mb,
             stride_mq,
             stride_mk,
-            is_causal,
+            prefill_is_causal,
             num_groups,
             pack_gqa,
         )
@@ -884,17 +781,14 @@ def tq4_sdpa(
 
 @triton.autotune(
     configs=[
-        triton.Config({"BLOCK_N": 32}, num_warps=2, num_stages=1),
-        triton.Config({"BLOCK_N": 32}, num_warps=4, num_stages=1),
-        triton.Config({"BLOCK_N": 64}, num_warps=2, num_stages=1),
-        triton.Config({"BLOCK_N": 64}, num_warps=4, num_stages=1),
-        triton.Config({"BLOCK_N": 64}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_N": 128}, num_warps=4, num_stages=1),
-        triton.Config({"BLOCK_N": 128}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_N": 128}, num_warps=4, num_stages=3),
+        triton.Config({"BLOCK_N": 32}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_N": 64}, num_warps=8, num_stages=3),
         triton.Config({"BLOCK_N": 128}, num_warps=8, num_stages=2),
-        triton.Config({"BLOCK_N": 256}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_N": 256}, num_warps=8, num_stages=2),
+        # Extra BLOCK_N in {32,64} configs for smaller-SMEM GPUs (e.g. RTX 5090);
+        # correctness-safe (cos~1.0), never BLOCK_N=16 (numerically wrong).
+        triton.Config({"BLOCK_N": 32}, num_warps=8, num_stages=2),
+        triton.Config({"BLOCK_N": 64}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_N": 32}, num_warps=4, num_stages=3),
     ],
     key=["Lk", "HEAD_DIM", "NUM_GROUPS", "HAS_MASK", "PACK_GQA"],
 )
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py
index 6ec8ee80688..ff8cbb660cb 100644
--- a/backends/nxp/backend/edge_program_converter.py
+++ b/backends/nxp/backend/edge_program_converter.py
@@ -39,6 +39,7 @@
     exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter,  # noqa F405
     exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter,  # noqa F405
     exir_ops.edge.aten.convolution.default: ConvolutionConverter,  # noqa F405
+    exir_ops.edge.aten.exp.default: ExpConverter,  # noqa F405
     exir_ops.edge.aten.hardtanh.default: HardTanhConverter,  # noqa F405
     exir_ops.edge.aten.leaky_relu.default: LeakyReluConverter,  # noqa F405
     exir_ops.edge.aten.log.default: LogConverter,  # noqa F405
diff --git a/backends/nxp/backend/graph_utils.py b/backends/nxp/backend/graph_utils.py
index 88cd996d6fd..f5d8e16475c 100644
--- a/backends/nxp/backend/graph_utils.py
+++ b/backends/nxp/backend/graph_utils.py
@@ -56,7 +56,7 @@ def get_output_shape(node: Node) -> tuple[torch.Size] | torch.Size | None:
 
 
 def is_clamp_preserved_under_quantization(
-    node: Node, min_val: int = 0, max_val: int | None = None
+    node: Node, min_val: float = 0, max_val: float | None = None
 ) -> bool:
     """
     Checks if Clamp/ReLU/HardTanh is preserved under quantization and did
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
index 5f19b2e48dc..93ba24e61bd 100755
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
@@ -31,6 +31,9 @@
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.convolution_converter import (
     ConvolutionConverter,
 )
+from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.exp_converter import (
+    ExpConverter,
+)
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.getitem_converter import (
     GetItemConverter,
 )
@@ -111,6 +114,7 @@
     "CloneConverter",
     "ConstantPadNDConverter",
     "ConvolutionConverter",
+    "ExpConverter",
     "GetItemConverter",
     "HardTanhConverter",
     "LeakyReluConverter",
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py
index 25cf6074701..a1e8c19e9bd 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clamp_converter.py
@@ -42,17 +42,6 @@
 from torch.nn import Parameter
 
 
-def _is_convertible_to_relu(node):
-    bounds = ClampConverter._get_clamp_bounds(node)
-    bounds = tuple(v if v is not None and math.isfinite(v) else None for v in bounds)
-
-    # Some specific bounds can be replaced with single op ReLU.
-    if bounds not in ClampConverter.RELU_COMPATIBLE_BOUNDS.values():
-        return False
-
-    return True
-
-
 class ClampConverter(NodeConverter):
     RELU_COMPATIBLE_BOUNDS = {
         "ReluN1To1": (-1, 1),
@@ -70,12 +59,25 @@ class ClampConverter(NodeConverter):
 
     # noinspection PyShadowingBuiltins
     @staticmethod
-    def _get_clamp_bounds(clamp_node: Node) -> tuple[float | None, float | None]:
+    def _get_bounds(node: Node) -> tuple[float | None, float | None]:
         """Extract min and max bounds from `aten.clamp.default` node."""
-        min = try_get_arg(clamp_node, 1)
-        max = try_get_arg(clamp_node, 2)
+        min = try_get_arg(node, 1)
+        max = try_get_arg(node, 2)
         return min, max
 
+    @classmethod
+    def _is_convertible_to_relu(cls, node):
+        bounds = cls._get_bounds(node)
+        bounds = tuple(
+            v if v is not None and math.isfinite(v) else None for v in bounds
+        )
+
+        # Some specific bounds can be replaced with single op ReLU.
+        if bounds not in cls.RELU_COMPATIBLE_BOUNDS.values():
+            return False
+
+        return True
+
     @staticmethod
     def _is_supported_in_IR(
         node: Node,
@@ -100,20 +102,21 @@ def _io_quant_is_same(node: Node):
         dq_params = dequant.args[1:]
         return all(q == dq for q, dq in zip(q_params, dq_params))
 
-    @staticmethod
+    @classmethod
     def _is_supported_on_target(
+        cls,
         node: Node,
         neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        relu_compatible = _is_convertible_to_relu(node)
-        bounds = ClampConverter._get_clamp_bounds(node)
+        relu_compatible = cls._is_convertible_to_relu(node)
+        bounds = cls._get_bounds(node)
 
         if all(b is None or math.isinf(b) for b in bounds):
             return False
 
-        io_quant_consistent = ClampConverter._io_quant_is_same(node)
+        io_quant_consistent = cls._io_quant_is_same(node)
         quant_supported = NodeConverter.uses_quantization_type_for_io(
             node,
             supported_types=[torch.int8, torch.uint8],
@@ -138,19 +141,20 @@ def supports_partitioning_result(
         neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
     ) -> bool:
-        bounds = cls._get_clamp_bounds(node)
+        bounds = cls._get_bounds(node)
 
         # Neutron cannot delegate a partition where ReLU or ReLU6 is the only operator
         # and at the same time the node does not satisfy delegation requirements.
-        # In contrast, ReLUN1To1 and ReLU0To1 are supported and delegated successfuly.
+        # In contrast, ReLUN1To1 and ReLU0To1 are supported and delegated successfully.
         if bounds in cls.RELU_COMPATIBLE_BOUNDS.values():
             is_alone_in_partition = cls.is_node_alone_in_partition(
                 node, partition_list, filter_fn=is_not_qdq_node
             )
             if is_alone_in_partition:
+                # noinspection PyTypeChecker
                 return is_clamp_preserved_under_quantization(
                     node,
-                    min_val=bounds[0],
+                    min_val=bounds[0] if bounds[0] is not None else 0,
                     max_val=bounds[1],
                 )
 
@@ -167,9 +171,9 @@ def convert(self, node: Node):
             ) -> Tensor
         """
         self.assert_convertible(node)
-        to_relu = _is_convertible_to_relu(node)
+        to_relu = self._is_convertible_to_relu(node)
 
-        bounds = self._get_clamp_bounds(node)
+        bounds = self._get_bounds(node)
         bounds = tuple(
             v if v is not None and math.isfinite(v) else None for v in bounds
         )
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py
index f67851895c2..0159143c5f7 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/hardtanh_converter.py
@@ -3,43 +3,16 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from executorch.backends.nxp.backend.ir.converter.node_converter import (
-    CustomDelegationOptions,
-    is_not_qdq_node,
-    NodeConverter,
-    Partition,
-)
-from executorch.backends.nxp.backend.ir.lib.tflite.BuiltinOperator import (
-    BuiltinOperator,
-)
-from executorch.backends.nxp.backend.neutron_operator_support import (
-    activation_supported_on_target,
+
+from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.clamp_converter import (
+    ClampConverter,
 )
-from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.fx import Node
-from torch.nn import Parameter
-
 
-class HardTanhConverter(NodeConverter):
-
-    # Maps possible input parameters of HardTanh to equivalent ReLU-based operators supported by TFLite.
-    SUPPORTED_MODES_MAP = {
-        (0.0, 6.0): BuiltinOperator.RELU6,
-        (-1.0, 1.0): BuiltinOperator.RELU_N1_TO_1,
-        (0.0, 1.0): BuiltinOperator.RELU_0_TO_1,
-        (0.0, float("inf")): BuiltinOperator.RELU,
-    }
-
-    # Maps possible modes of HardTanh to equivalent ReLU bounds.
-    SUPPORTED_BOUNDS_MAP = {
-        "ReluN1To1": (-1.0, 1.0),
-        "Relu0To1": (0.0, 1.0),
-        "Relu6": (0.0, 6.0),
-        "Relu": (0.0, float("inf")),
-    }
 
+class HardTanhConverter(ClampConverter):
     @staticmethod
-    def _get_hardtanh_bounds(node: Node) -> tuple[float, float]:
+    def _get_bounds(node: Node) -> tuple[float | None, float | None]:
         args = node.args
 
         match len(args):
@@ -62,51 +35,3 @@ def _get_hardtanh_bounds(node: Node) -> tuple[float, float]:
                 )
 
         return min_val, max_val
-
-    @staticmethod
-    def _is_supported_in_IR(
-        node: Node,
-        parameters_mapping: dict[str, Parameter],
-        custom_delegation_options: CustomDelegationOptions,
-    ) -> bool:
-        bounds = HardTanhConverter._get_hardtanh_bounds(node)
-        return bounds in HardTanhConverter.SUPPORTED_MODES_MAP
-
-    @classmethod
-    def supports_partitioning_result(
-        cls,
-        node: Node,
-        partition_list: list[Partition],
-        custom_delegation_options: CustomDelegationOptions,
-        neutron_target_spec: NeutronTargetSpec,
-        parameters_mapping: dict[str, Parameter],
-    ) -> bool:
-        bounds = HardTanhConverter._get_hardtanh_bounds(node)
-
-        # Neutron cannot delegate a partition where ReLU or ReLU6 is the only operator
-        # and at the same time the node does not satisfy delegation requirements.
-        # In contrast, ReLUN1To1 and ReLU0To1 are supported and delegated successfuly.
-        if bounds in [
-            cls.SUPPORTED_BOUNDS_MAP["Relu"],
-            cls.SUPPORTED_BOUNDS_MAP["Relu6"],
-        ]:
-            is_alone_in_partition = cls.is_node_alone_in_partition(
-                node, partition_list, filter_fn=is_not_qdq_node
-            )
-            if is_alone_in_partition:
-                return activation_supported_on_target(node)
-
-        return True
-
-    def convert(self, node: Node):
-        """Convert 'aten::hardtanh' to its supported ReLU equivalent."""
-        self.assert_convertible(node)
-
-        t_op = self._create_tflite_op_with_io_tensors(node)
-
-        bounds = HardTanhConverter._get_hardtanh_bounds(node)
-
-        op = self.SUPPORTED_MODES_MAP[bounds]
-        t_op.opcode_index = self.builder.op_code_index_for_op_type(op)
-
-        self.builder.append_operators([t_op])
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
index a76abfbef91..8674bf697c7 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
@@ -5,6 +5,9 @@
 
 import torch
 
+from executorch.backends.nxp.backend.data_format import DataFormat
+from executorch.backends.nxp.backend.ir.converter.conversion import translator
+from executorch.backends.nxp.backend.ir.converter.conversion.common import OpsList
 from executorch.backends.nxp.backend.ir.converter.conversion.translator import (
     create_channels_last_to_channels_first_permutation,
 )
@@ -89,10 +92,15 @@ def _is_supported_in_IR(
     def _to_pos_dim(d: int, rank: int):
         return d + rank if d < 0 else d
 
+    @staticmethod
+    def _normalize_dim(dim: list[int], rank: int) -> list[int]:
+        # convert negative index to positive
+        return [MeanDimConverter._to_pos_dim(d, rank) for d in dim]
+
     @staticmethod
     def _normalize_and_to_channel_last_dim(dim: list[int], rank: int) -> list[int]:
         # convert negative index to positive
-        dim = [MeanDimConverter._to_pos_dim(d, rank) for d in dim]
+        dim = MeanDimConverter._normalize_dim(dim, rank)
 
         perm = create_channels_last_to_channels_first_permutation(rank, True)
         dim = [perm[d] for d in dim]
@@ -106,6 +114,114 @@ def _get_attrs(node: Node) -> tuple[list[int], bool]:
         keepdim = node.args[2] if len(node.args) >= 3 else False
         return dim, keepdim
 
+    def _get_dim_and_handle_io_formats(
+        self, ops: OpsList, dim: list[int], keep_dim: bool
+    ):
+        t_op = ops.middle_op
+        x = t_op.tmp_inputs[0]
+        y = t_op.tmp_outputs[0]
+
+        channels_last_input = x.tensor_format.is_channels_last()
+        channels_last_output = y.tensor_format.is_channels_last()
+        formatless_input = not channels_last_input
+        formatless_output = not channels_last_output
+
+        dim = self._normalize_dim(dim, x.rank)
+
+        if keep_dim:
+            # The rank is preserved and the io formats should always be equal.
+            assert (
+                x.tensor_format == y.tensor_format
+            ), "NXP backend: There is a bug in `mean.dim` format inference."
+
+            # Just adjust the dim to match the input format.
+            if channels_last_input:
+                dim = self._normalize_and_to_channel_last_dim(dim, x.rank)
+
+        else:
+            # `keep_dim = False`, so the output rank != input rank, and the operator changes the tensor format.
+
+            if channels_last_input and formatless_output:
+                if 1 in dim:
+                    # If we are reducing over the channels, the channels dimension gets removed and the output ends up
+                    #  exactly equal in channels last and channels first, regardless of which other dimensions are
+                    #  removed. Therefore, we can just adjust the `dim` and we don't need to insert any `Transpose` ops.
+                    dim = self._normalize_and_to_channel_last_dim(dim, x.rank)
+                elif all(spatial_dim in dim for spatial_dim in range(2, x.rank)):
+                    # All spatial dims are reduced, leaving only channels and (optionally) batch. So the result is
+                    #  equal in channels first and channels last as long as we adjust the `dim` to match a channels last
+                    #  input (similarly to the case above).
+                    dim = self._normalize_and_to_channel_last_dim(dim, x.rank)
+                else:
+                    # If the channels dimension is preserved, we must transpose the input to channels first (to match
+                    #  the edge model) and we must keep the `dim` unchanged (referencing channels first dimensions).
+                    #  Otherwise, the output would not match the input.
+                    to_channels_first_perm = (
+                        translator.create_channels_last_to_channels_first_permutation(
+                            x.rank
+                        )
+                    )
+                    ops.add_pre(
+                        self.builder.create_transpose_operator_before(
+                            t_op, 0, to_channels_first_perm
+                        )
+                    )
+                    t_op.tmp_inputs[0].tensor_format = DataFormat.CHANNELS_FIRST
+
+            elif formatless_input and channels_last_output:
+                # We need to apply the `mean` with the original `dim`, which will produce a channels first output.
+                #  Then, we need to append a `Transpose` operator to make the output channels last.
+                to_channels_last_perm = (
+                    translator.create_channels_first_to_channels_last_permutation(
+                        y.rank, True
+                    )
+                )
+                ops.add_post(
+                    self.builder.create_transpose_operator_after(
+                        t_op, 0, to_channels_last_perm
+                    )
+                )
+                t_op.tmp_outputs[0].tensor_format = DataFormat.CHANNELS_FIRST
+
+            elif formatless_input and formatless_output:
+                # No action needed.
+                pass
+
+            else:  # channels_last_input and channels_last_output
+                # This case cannot currently occur, as it would require the case:
+                #       channels last 4D -> mean -> channels_last 3D
+                #  which cannot currently happen as the 3D conv/pooling/... is supported by adding `view_copy` nodes in
+                #  the edge dialect and converting the node to 4D, and the `view_copy` nodes prevent the propagation of
+                #  the format to the `mean.dim` output.
+                # Therefore, the implementation cannot be tested. But from experience with other operators, it should
+                #  work correctly. We just need to add 2 `Transpose` ops to make the IO channels first, and keep the
+                #  `dim` unchanged.
+                to_channels_first_perm = (
+                    translator.create_channels_last_to_channels_first_permutation(
+                        x.rank
+                    )
+                )
+                ops.add_pre(
+                    self.builder.create_transpose_operator_before(
+                        t_op, 0, to_channels_first_perm
+                    )
+                )
+                t_op.tmp_inputs[0].tensor_format = DataFormat.CHANNELS_FIRST
+
+                to_channels_last_perm = (
+                    translator.create_channels_first_to_channels_last_permutation(
+                        y.rank, True
+                    )
+                )
+                ops.add_post(
+                    self.builder.create_transpose_operator_after(
+                        t_op, 0, to_channels_last_perm
+                    )
+                )
+                t_op.tmp_outputs[0].tensor_format = DataFormat.CHANNELS_FIRST
+
+        return dim
+
     def convert(self, node: Node):
         """Convert the 'mean.dim' operator to NeutronIR 'Mean'.
         The ExecuTorch schema is:
@@ -123,10 +239,9 @@ def convert(self, node: Node):
 
         t_op = self._create_tflite_op_with_io_tensors(node)
         t_op.builtin_options = mean_options.Mean(keepdim)
-        x = t_op.tmp_inputs[0]
 
-        if x.tensor_format.is_channels_last():
-            dim = self._normalize_and_to_channel_last_dim(dim, x.rank)
+        ops = OpsList(middle_op=t_op)
+        dim = self._get_dim_and_handle_io_formats(ops, dim, keepdim)
 
         convert_axes_from_attribute(t_op, self.builder, dim)
-        self.builder.append_operators([t_op])
+        self.builder.append_operators(ops.flatten())
diff --git a/backends/nxp/backend/ir/converter/quantization_utils.py b/backends/nxp/backend/ir/converter/quantization_utils.py
index ba4ad14222b..f3fe868ae83 100755
--- a/backends/nxp/backend/ir/converter/quantization_utils.py
+++ b/backends/nxp/backend/ir/converter/quantization_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2025 NXP
+# Copyright 2023-2026 NXP
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -135,11 +135,12 @@ def set_quantization_parameters_to_tensor(
 def quantize_int8(
     data: np.ndarray, scale: List[float], zero_point: List[int]
 ) -> np.ndarray:
+    # noinspection PyTypeChecker
     return quantize(data, zero_point=zero_point, scale=scale)
 
 
 def quantize(
-    value: np.ndarray | int,
+    value: np.ndarray | float,
     zero_point: List[int] | int,
     scale: List[float] | float,
     quant_min: int = -128,
diff --git a/backends/nxp/backend/node_format_inference.py b/backends/nxp/backend/node_format_inference.py
index 65e34b7fbde..030873c88ab 100644
--- a/backends/nxp/backend/node_format_inference.py
+++ b/backends/nxp/backend/node_format_inference.py
@@ -9,10 +9,27 @@
 import torch
 
 from executorch.backends.nxp.backend.data_format import DataFormat, NXP_NODE_FORMAT
-
-from executorch.backends.nxp.backend.edge_helper import is_channels_last_dim_order
+from executorch.backends.nxp.backend.edge_helper import (
+    is_channels_last_dim_order,
+    try_get_arg,
+)
 from executorch.backends.nxp.backend.edge_program_converter import functions_converters
-from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.backends.nxp.tests.ops_aliases import (
+    AdaptiveAvgPool2D,
+    AvgPool2D,
+    Convolution,
+    DequantizePerChannel,
+    DequantizePerTensor,
+    GetItem,
+    MaxPool2D,
+    MaxPool2DWithIndices,
+    MeanDim,
+    PermuteCopy,
+    QuantizePerTensor,
+    UpsampleBilinear2D,
+    UpsampleNearest2D,
+    ViewCopy,
+)
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from torch.export import ExportedProgram
 from torch.fx import Node
@@ -25,21 +42,22 @@ class NodeFormatInference:
     # The op in the dictionary is mapped to a dictionary, which holds indices to input nodes
     # that are always channels first.
     ops_with_channels_first_nodes = {
-        exir_ops.edge.aten._adaptive_avg_pool2d.default: {"inputs": [0]},
+        AdaptiveAvgPool2D: {"inputs": [0]},
         torch.ops.aten.adaptive_avg_pool2d.default: {"inputs": [0]},
-        exir_ops.edge.aten.avg_pool2d.default: {"inputs": [0]},
-        exir_ops.edge.aten.convolution.default: {"inputs": [0, 1]},
-        exir_ops.edge.aten.max_pool2d_with_indices.default: {"inputs": [0]},
-        exir_ops.edge.aten.max_pool2d.default: {"inputs": [0]},
-        exir_ops.edge.aten.upsample_bilinear2d.vec: {"inputs": [0]},
-        exir_ops.edge.aten.upsample_nearest2d.vec: {"inputs": [0]},
+        AvgPool2D: {"inputs": [0]},
+        Convolution: {"inputs": [0, 1]},
+        MaxPool2DWithIndices: {"inputs": [0]},
+        MaxPool2D: {"inputs": [0]},
+        UpsampleBilinear2D: {"inputs": [0]},
+        UpsampleNearest2D: {"inputs": [0]},
     }
 
     # A set of Edge Aten ops, which have the ability to change the format (for example - input nodes
     # are channels first but output is formatless).
     ops_that_can_change_tensor_format = {
-        exir_ops.edge.aten.view_copy.default,
-        exir_ops.edge.aten.permute_copy.default,
+        ViewCopy,
+        PermuteCopy,
+        MeanDim,
     }
 
     _type_changed_during_last_run: bool
@@ -71,10 +89,10 @@ def __init__(self, edge_program: ExportedProgram, only_for_op_support_check=Fals
         self._type_changed_during_last_run = False
 
         self._known_targets = list(functions_converters) + [
-            exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
-            exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
-            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
-            operator.getitem,
+            DequantizePerTensor,
+            DequantizePerChannel,
+            QuantizePerTensor,
+            GetItem,
         ]
 
     def identify_node_formats(self):
@@ -104,10 +122,7 @@ def _infer_format_of_nodes(self, node: Node):
             self._handle_node_which_uses_channels_first_format(node)
 
         elif op_type in self.ops_that_can_change_tensor_format:
-            if op_type in [
-                exir_ops.edge.aten.view_copy.default,
-                exir_ops.edge.aten.permute_copy.default,
-            ]:
+            if op_type in [ViewCopy, PermuteCopy]:
                 # Try to assign the `formatless` format to the input and output. The converter will then handle the
                 #  transition.
                 # Note: If the format for the input/output has already been assigned as channels first, it will NOT be
@@ -119,10 +134,28 @@ def _infer_format_of_nodes(self, node: Node):
                     self._node_inputs[node][0], DataFormat.FORMATLESS
                 )
 
+            elif op_type == MeanDim:
+                # The operator schema is:
+                #  mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+                keep_dim = try_get_arg(node, 2) or False
+                if keep_dim:
+                    # The operator preserves the rank, so we can handle it as an operator that can use any node format.
+                    self._handle_node_which_can_use_any_node_format(node)
+                else:
+                    # The operator removes dimensions, so the IO must be marked as `formatless` (unless overridden by
+                    #  channels first of course).
+                    self._assign_format_to_node(
+                        self._node_outputs[node][0], DataFormat.FORMATLESS
+                    )
+                    self._assign_format_to_node(
+                        self._node_inputs[node][0], DataFormat.FORMATLESS
+                    )
+
             else:
                 logger.error(
                     f"Node format inference for node type: {op_type} not found!"
                 )
+
         elif node.op != "call_function" or (
             hasattr(node, "target") and node.target in self._known_targets
         ):
diff --git a/backends/nxp/neutron_partitioner.py b/backends/nxp/neutron_partitioner.py
index d4262b3a9f6..9cc174b97e0 100644
--- a/backends/nxp/neutron_partitioner.py
+++ b/backends/nxp/neutron_partitioner.py
@@ -212,6 +212,7 @@ def tag_qdq_clusters(self, nodes: list[torch.fx.Node]):
     exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter,  # noqa F405
     exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter,  # noqa F405
     exir_ops.edge.aten.convolution.default: ConvolutionConverter,  # noqa F405
+    exir_ops.edge.aten.exp.default: ExpConverter,  # noqa F405
     exir_ops.edge.aten.hardtanh.default: HardTanhConverter,  # noqa F405
     exir_ops.edge.aten.leaky_relu.default: LeakyReluConverter,  # noqa F405
     exir_ops.edge.aten.log.default: LogConverter,  # noqa F405
@@ -436,7 +437,13 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
 
         graph_module.recompile()
 
-        operators_not_to_delegate = self.delegation_spec[1][3].value.decode().split(",")
+        # Look the compile spec up by key instead of by position, so that adding or
+        # reordering specs in `NeutronCompileSpecBuilder.build()` cannot silently break this.
+        operators_not_to_delegate = next(
+            spec.value.decode().split(",")
+            for spec in self.delegation_spec[1]
+            if spec.key == "operators_not_to_delegate"
+        )
         logging.info(f"Operators not to delegate: {operators_not_to_delegate}")
 
         parameters_mapping = EdgeProgramToIRConverter.map_inputs_to_parameters(
diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py
index f28eb34064c..1a84a418e92 100644
--- a/backends/nxp/nxp_backend.py
+++ b/backends/nxp/nxp_backend.py
@@ -9,8 +9,9 @@
 #
 
 import logging
+import os
 import struct
-from typing import final, List, Optional
+from typing import final
 
 import numpy as np
 import torch
@@ -45,10 +46,11 @@ class NeutronCompileSpecBuilder:
     config: NeutronTargetSpec
 
     def __init__(self):
-        self.compile_spec: List[CompileSpec] = []
+        self.compile_spec: list[CompileSpec] = []
         self.compiler_flags = []
         self.output_format = None
-        self.operators_not_to_delegate: List[str] = []
+        self.intermediates_dir = None
+        self.operators_not_to_delegate: list[str] = []
         self.use_neutron_for_format_conversion = True
         self.fetch_constants_to_sram = False
         self.dump_kernel_selection_code = False
@@ -62,8 +64,9 @@ def _replace_colons(self, operator: str) -> str:
     def neutron_compile_spec(
         self,
         config: str,
-        extra_flags: Optional[str] = None,
-        operators_not_to_delegate: Optional[List[str]] = None,
+        intermediates_dir: str | None = None,
+        extra_flags: str | None = None,
+        operators_not_to_delegate: list[str] | None = None,
         use_neutron_for_format_conversion: bool = True,
         fetch_constants_to_sram: bool = False,
         dump_kernel_selection_code: bool = False,
@@ -71,6 +74,7 @@ def neutron_compile_spec(
         """Generate compile spec for Neutron NPU
 
         :param config: Neutron accelerator configuration, e.g. "imxrt700"
+        :param intermediates_dir: Directory to store intermediate artifact files.
         :param extra_flags: Extra flags for the Neutron compiler
         :param operators_not_to_delegate: List of operators that should not be delegated
         :param use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to
@@ -83,6 +87,7 @@ def neutron_compile_spec(
         """
 
         self.config = NeutronTargetSpec(config)
+        self.intermediates_dir = intermediates_dir
 
         assert (
             self.output_format is None
@@ -113,6 +118,7 @@ def build(self):
                 CompileSpec("output_format", "tflite".encode()),
                 CompileSpec("compile_flags", " ".join(self.compiler_flags).encode()),
                 CompileSpec("target", self.config.get_name().encode()),
+                CompileSpec("intermediates_dir", f"{self.intermediates_dir}".encode()),
                 CompileSpec(
                     "operators_not_to_delegate",
                     ",".join(self.operators_not_to_delegate).encode(),
@@ -136,17 +142,19 @@ def build(self):
 
 def generate_neutron_compile_spec(
     config: str,  # The target platform. For example "imxrt700".
-    system_config: Optional[str] = None,
-    extra_flags: Optional[str] = None,
-    operators_not_to_delegate: Optional[List[str]] = None,
+    system_config: str | None = None,
+    extra_flags: str | None = None,
+    intermediates_dir: str | None = None,
+    operators_not_to_delegate: list[str] | None = None,
     use_neutron_for_format_conversion: bool = True,
     fetch_constants_to_sram: bool = False,
     dump_kernel_selection_code: bool = False,
-) -> List[CompileSpec]:
+) -> list[CompileSpec]:
     return (
         NeutronCompileSpecBuilder()
         .neutron_compile_spec(
             config,
+            intermediates_dir=intermediates_dir,
             extra_flags=extra_flags,
             operators_not_to_delegate=operators_not_to_delegate,
             use_neutron_for_format_conversion=use_neutron_for_format_conversion,
@@ -163,7 +171,7 @@ class NeutronBackend(BackendDetails):
     @staticmethod
     def preprocess(  # noqa C901
         edge_program: ExportedProgram,
-        compile_spec: List[CompileSpec],
+        compile_spec: list[CompileSpec],
     ) -> PreprocessResult:
         logging.info("NeutronBackend::preprocess")
 
@@ -173,6 +181,7 @@ def preprocess(  # noqa C901
         compile_flags = []
         binary = bytes()
         target = ""
+        intermediates_dir = "None"
         use_neutron_for_format_conversion = None
         fetch_constants_to_sram = False
         dump_kernel_selection_code = None
@@ -181,6 +190,8 @@ def preprocess(  # noqa C901
                 output_format = spec.value.decode()
             if spec.key == "target":
                 target = spec.value.decode()
+            if spec.key == "intermediates_dir":
+                intermediates_dir = spec.value.decode()
             if spec.key == "compile_flags":
                 compile_flags.append(spec.value.decode())
             if spec.key == "use_neutron_for_format_conversion":
@@ -194,6 +205,10 @@ def preprocess(  # noqa C901
         if not output_format:
             raise RuntimeError("output format is required")
 
+        # Validate intermediates_dir; an unset value arrives as the string "None".
+        if intermediates_dir != "None" and not os.path.isdir(intermediates_dir):
+            raise ValueError("intermediates_dir is not a directory path.")
+
         for node in edge_program.graph.nodes:
             if node.op == "call_function":
                 logging.debug(f"Operator to be processed: {node.target}")
@@ -228,16 +243,22 @@ def preprocess(  # noqa C901
                 fetch_constants_to_sram,
             )
 
-            # Dump the tflite file if logging level is enabled
-            if logging.root.isEnabledFor(logging.DEBUG):
-                import os
-
+            # Dump the tflite file if intermediates_dir is set
+            if intermediates_dir != "None":
                 logging.debug(
-                    f"Serializing converted graph with tag {delegation_tag} to {os.getcwd()}"
+                    f"Serializing converted graph with tag {delegation_tag} to {intermediates_dir}"
                 )
-                with open(f"{delegation_tag}_pure.et.tflite", "wb") as f:
+                with open(
+                    os.path.join(intermediates_dir, f"{delegation_tag}_pure.et.tflite"),
+                    "wb",
+                ) as f:
                     f.write(bytes(tflite_model))
-                with open(f"{delegation_tag}_neutron.et.tflite", "wb") as f:
+                with open(
+                    os.path.join(
+                        intermediates_dir, f"{delegation_tag}_neutron.et.tflite"
+                    ),
+                    "wb",
+                ) as f:
                     f.write(bytes(neutron_model))
 
             binary = PayloadComposer().get_binary_payload(io_formats, neutron_model)
diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py
index 048172ea212..94ee8e8656a 100644
--- a/backends/nxp/quantizer/neutron_quantizer.py
+++ b/backends/nxp/quantizer/neutron_quantizer.py
@@ -25,6 +25,7 @@
     Conv2dPattern,
     ConvTranspose2dPattern,
     DropoutPattern,
+    ExpPattern,
     FlattenPattern,
     HardTanhInPlacePattern,
     HardTanhPattern,
@@ -270,6 +271,7 @@ def __init__(self, neutron_target_spec: NeutronTargetSpec, is_qat: bool = False)
                     ConvTranspose2dPattern(self, is_qat=is_qat), static_qconfig
                 ),
                 OpQuantizer(DropoutPattern(is_qat=is_qat), static_qconfig),
+                OpQuantizer(ExpPattern(is_qat=is_qat), static_qconfig),
                 OpQuantizer(FlattenPattern(is_qat=is_qat), static_qconfig),
                 OpQuantizer(HardTanhPattern(is_qat=is_qat), static_qconfig),
                 OpQuantizer(HardTanhInPlacePattern(is_qat=is_qat), static_qconfig),
diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py
index 9e21e4f1660..d6cf1d7e063 100644
--- a/backends/nxp/quantizer/patterns.py
+++ b/backends/nxp/quantizer/patterns.py
@@ -11,7 +11,10 @@
 
 import torch
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.clamp_converter import (
-    _is_convertible_to_relu,
+    ClampConverter,
+)
+from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.hardtanh_converter import (
+    HardTanhConverter,
 )
 from executorch.backends.nxp.quantizer.utils import (
     get_bias_qparams,
@@ -438,7 +441,7 @@ def get_anchors(
     ) -> PartitionAnchors | None:
         node = fused_partition[0].nodes[-1]
 
-        if not _is_convertible_to_relu(node):
+        if not ClampConverter._is_convertible_to_relu(node):
             return SharedSpecPattern.get_shared_spec_anchors(gm, fused_partition)
         else:
             return SingleInputBasicPattern.get_single_input_anchors(gm, fused_partition)
@@ -709,6 +712,15 @@ def partition_types(self):
         return [torch.ops.aten.dropout.default]
 
 
+class ExpPattern(SharedSpecPattern):
+    """
+    Quantizer for the `aten.exp.default` operator.
+    """
+
+    def partition_types(self):
+        return [torch.ops.aten.exp.default]
+
+
 class FlattenPattern(SharedSpecPattern):
     """
     Quantizer for Flatten operator.
@@ -726,11 +738,21 @@ class HardTanhPattern(SingleInputBasicPattern):
     def partition_types(self):
         return [torch.ops.aten.hardtanh.default]
 
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule]
+    ) -> PartitionAnchors | None:
+        # The anchor layout depends on whether the last node is convertible to ReLU.
+        last_node = fused_partition[0].nodes[-1]
+        if HardTanhConverter._is_convertible_to_relu(last_node):
+            return SingleInputBasicPattern.get_single_input_anchors(gm, fused_partition)
+
+        return SharedSpecPattern.get_shared_spec_anchors(gm, fused_partition)
+
     def replacement_op(self):
         raise AssertionError()
 
 
-class HardTanhInPlacePattern(SingleInputBasicPattern):
+class HardTanhInPlacePattern(HardTanhPattern):
     """
     Quantizer for HardTanh operator with param inplace=True.
     """
@@ -738,21 +760,6 @@ class HardTanhInPlacePattern(SingleInputBasicPattern):
     def partition_types(self):
         return [torch.ops.aten.hardtanh_.default]
 
-    def get_anchors(
-        self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule]
-    ) -> PartitionAnchors | None:
-        node = fused_partition[0].nodes[-1]
-
-        return PartitionAnchors(
-            inputs=[(node, NodeArgsIdx(0))],
-            weights=[],
-            biases=[],
-            output=[(node,)],
-        )
-
-    def replacement_op(self):
-        raise AssertionError()
-
 
 class LeakyReluPattern(SingleInputBasicPattern):
     """Quantizer for the `aten.leaky_relu.default` operator."""
diff --git a/backends/nxp/run_unittests.sh b/backends/nxp/run_unittests.sh
index 78e35d2617a..66e51c39a1d 100755
--- a/backends/nxp/run_unittests.sh
+++ b/backends/nxp/run_unittests.sh
@@ -11,6 +11,6 @@ EXECUTORCH_DIR=$(dirname $(dirname $SCRIPT_DIR))
 cd $EXECUTORCH_DIR
 
 # '-c /dev/null' is used to ignore root level pytest.ini.
-pytest -c /dev/null backends/nxp/tests/
+pytest -c /dev/null -n "logical" backends/nxp/tests/
 
 python -m unittest discover -s backends/nxp/tests/ -v
diff --git a/backends/nxp/tests/conftest.py b/backends/nxp/tests/conftest.py
index 34fe343ca6a..af2011a8000 100644
--- a/backends/nxp/tests/conftest.py
+++ b/backends/nxp/tests/conftest.py
@@ -35,4 +35,4 @@ def pytest_sessionstart(session):
 
     # Remove all cached test files
     shutil.rmtree(outputs_dir.OUTPUTS_DIR, ignore_errors=True)
-    os.mkdir(outputs_dir.OUTPUTS_DIR)
+    os.makedirs(outputs_dir.OUTPUTS_DIR, exist_ok=True)
diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py
index 5cfcb37c8a8..44a96010593 100644
--- a/backends/nxp/tests/executorch_pipeline.py
+++ b/backends/nxp/tests/executorch_pipeline.py
@@ -180,6 +180,7 @@ def to_quantized_edge_program(
     operators_not_to_delegate: list[str] = None,
     get_calibration_inputs_fn: GetCalibrationInputsFn = get_random_calibration_inputs,
     target: str = "imxrt700",
+    intermediates_dir: str | None = None,
     use_qat: bool = False,
     train_fn: Callable[[torch.fx.GraphModule], None] | None = None,
     remove_quant_io_ops: bool = False,
@@ -217,6 +218,7 @@ def to_quantized_edge_program(
     preserve_ops = [torch.ops.aten.prelu.default]
     compile_spec = generate_neutron_compile_spec(
         target,
+        intermediates_dir=intermediates_dir,
         operators_not_to_delegate=operators_not_to_delegate,
         use_neutron_for_format_conversion=use_neutron_for_format_conversion,
         fetch_constants_to_sram=fetch_constants_to_sram,
@@ -266,6 +268,7 @@ def to_quantized_edge_program(
 def to_quantized_executorch_program(
     model: torch.nn.Module,
     input_spec: Iterable[ModelInputSpec] | tuple[int, ...] | list[tuple[int, ...]],
+    intermediates_dir: str | None = None,
     use_qat: bool = False,
     train_fn: Callable[[torch.fx.GraphModule], None] | None = None,
     use_neutron_for_format_conversion: bool = True,
@@ -287,6 +290,7 @@ def to_quantized_executorch_program(
     edge_program_manager = to_quantized_edge_program(
         model,
         input_spec,
+        intermediates_dir=intermediates_dir,
         use_qat=use_qat,
         train_fn=train_fn,
         use_neutron_for_format_conversion=use_neutron_for_format_conversion,
diff --git a/backends/nxp/tests/generic_tests/test_cifarnet.py b/backends/nxp/tests/generic_tests/test_cifarnet.py
index 1d795c938fe..c874ba24e47 100644
--- a/backends/nxp/tests/generic_tests/test_cifarnet.py
+++ b/backends/nxp/tests/generic_tests/test_cifarnet.py
@@ -34,7 +34,7 @@ def cifar_test_files(tmp_path_factory):
 
 
 @pytest.mark.parametrize("channels_last", [False, True])
-def test_cifarnet(mocker, cifar_test_files, channels_last):
+def test_cifarnet(mocker, request, cifar_test_files, channels_last):
     model = (
         CifarNet(
             pth_file=os.path.join(
@@ -64,9 +64,10 @@ def test_cifarnet(mocker, cifar_test_files, channels_last):
     lower_run_compare(
         model,
         [input_spec],
+        BaseGraphVerifier(1, non_dlg_nodes),
+        request,
         dataset_creator=CopyDatasetCreator(cifar_test_files),
         output_comparator=comparator,
-        dlg_model_verifier=BaseGraphVerifier(1, non_dlg_nodes),
         mocker=mocker,
         # Run the channels last reference in PyTorch as the ExecuTorch CPU model contains incorrectly
         #  lowered channels last convolution weights, which cause incorrect inference results. The issue
@@ -79,7 +80,7 @@ def test_cifarnet(mocker, cifar_test_files, channels_last):
     )
 
 
-def test_cifarnet_qat(mocker, cifar_test_files):
+def test_cifarnet_qat(mocker, request, cifar_test_files):
     model = CifarNet().get_eager_model().eval()
 
     input_shape = (1, 3, 32, 32)
@@ -94,9 +95,10 @@ def test_cifarnet_qat(mocker, cifar_test_files):
     lower_run_compare(
         model,
         input_shape,
+        BaseGraphVerifier(1, non_dlg_nodes),
+        request,
         dataset_creator=CopyDatasetCreator(cifar_test_files),
         output_comparator=comparator,
-        dlg_model_verifier=BaseGraphVerifier(1, non_dlg_nodes),
         mocker=mocker,
         use_qat=True,
     )
diff --git a/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py b/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py
index fcd0aae2130..3415b79a39d 100644
--- a/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py
+++ b/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py
@@ -208,7 +208,7 @@ class TestConvertDivToMul:
         ids=lambda is_scalar: "scalar" if is_scalar else "tensor",
     )
     def test__static__full_pipeline(
-        self, mocker, input_shape: tuple[int, ...], is_scalar: bool
+        self, mocker, request, input_shape: tuple[int, ...], is_scalar: bool
     ):
         if is_scalar:
             divisor = np.random.uniform(0.01, 15)
@@ -231,5 +231,6 @@ def test__static__full_pipeline(
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset_creator,
         )
diff --git a/backends/nxp/tests/generic_tests/test_integration.py b/backends/nxp/tests/generic_tests/test_integration.py
index fe157b44c48..edefd905dbf 100644
--- a/backends/nxp/tests/generic_tests/test_integration.py
+++ b/backends/nxp/tests/generic_tests/test_integration.py
@@ -19,7 +19,7 @@ def test_conv_fc_softmax__to_executorch_program(use_qat):
     model = ConvFCSoftmaxModule()
     input_shape = (1, 4, 5, 5)
 
-    exec_prog = to_quantized_executorch_program(model, input_shape, use_qat)
+    exec_prog = to_quantized_executorch_program(model, input_shape, use_qat=use_qat)
 
     program = exec_prog.exported_program()
     assert (
diff --git a/backends/nxp/tests/generic_tests/test_quantized_input_data.py b/backends/nxp/tests/generic_tests/test_quantized_input_data.py
index 8b2f6823e8d..a9f9f3e47e6 100644
--- a/backends/nxp/tests/generic_tests/test_quantized_input_data.py
+++ b/backends/nxp/tests/generic_tests/test_quantized_input_data.py
@@ -17,7 +17,7 @@
 from executorch.backends.nxp.tests.ops_aliases import AvgPool2D, MulTensor
 
 
-def test__single_quantized_inputs(mocker):
+def test__single_quantized_inputs(mocker, request):
     input_spec = ModelInputSpec((2, 4, 6, 7))
     model = AvgPool2dModule(False, 0)
     graph_verifier = DetailedGraphVerifier(
@@ -29,19 +29,19 @@ def test__single_quantized_inputs(mocker):
         model,
         [input_spec],
         graph_verifier,
+        request,
         remove_quant_io_ops=True,
     )
 
-    assert (
-        OUTPUTS_DIR / "test__single_quantized_inputs" / "dataset_quant" / "0000.bin"
-    ).exists()
+    test_name = nsys_testing.get_test_name(request)
+    assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000.bin").exists()
 
     # Check outputs are in quantized int8 format
     output_tensor_spec = output_tensor_spec_spy.spy_return
     assert output_tensor_spec[0].dtype == torch.int8
 
 
-def test__single_quantized_inputs_edge_python_reference(mocker):
+def test__single_quantized_inputs_edge_python_reference(mocker, request):
     input_spec = ModelInputSpec((2, 4, 6, 7))
     model = AvgPool2dModule(False, 0)
     graph_verifier = DetailedGraphVerifier(
@@ -53,23 +53,20 @@ def test__single_quantized_inputs_edge_python_reference(mocker):
         model,
         [input_spec],
         graph_verifier,
+        request,
         reference_model=ReferenceModel.QUANTIZED_EDGE_PYTHON,
         remove_quant_io_ops=True,
     )
 
-    assert (
-        OUTPUTS_DIR
-        / "test__single_quantized_inputs_edge_python_reference"
-        / "dataset_quant"
-        / "0000.bin"
-    ).exists()
+    test_name = nsys_testing.get_test_name(request)
+    assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000.bin").exists()
 
     # Check outputs are in quantized int8 format
     output_tensor_spec = output_tensor_spec_spy.spy_return
     assert output_tensor_spec[0].dtype == torch.int8
 
 
-def test__multiple_quantized_inputs(mocker):
+def test__multiple_quantized_inputs(mocker, request):
     x_input_spec = ModelInputSpec((1, 4, 8, 8))
     model = MulTensorModule()
     graph_verifier = DetailedGraphVerifier(
@@ -81,23 +78,19 @@ def test__multiple_quantized_inputs(mocker):
         model,
         [x_input_spec, x_input_spec],
         graph_verifier,
+        request,
         remove_quant_io_ops=True,
     )
 
-    assert (
-        OUTPUTS_DIR
-        / "test__multiple_quantized_inputs"
-        / "dataset_quant"
-        / "0000"
-        / "00.bin"
-    ).exists()
+    test_name = nsys_testing.get_test_name(request)
+    assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000" / "00.bin").exists()
 
     # Check outputs are in quantized int8 format
     output_tensor_spec = output_tensor_spec_spy.spy_return
     assert output_tensor_spec[0].dtype == torch.int8
 
 
-def test__multiple_quantized_inputs_edge_python_reference(mocker):
+def test__multiple_quantized_inputs_edge_python_reference(mocker, request):
     x_input_spec = ModelInputSpec((1, 4, 8, 8))
     model = MulTensorModule()
     graph_verifier = DetailedGraphVerifier(
@@ -109,17 +102,13 @@ def test__multiple_quantized_inputs_edge_python_reference(mocker):
         model,
         [x_input_spec, x_input_spec],
         graph_verifier,
+        request,
         reference_model=ReferenceModel.QUANTIZED_EDGE_PYTHON,
         remove_quant_io_ops=True,
     )
 
-    assert (
-        OUTPUTS_DIR
-        / "test__multiple_quantized_inputs_edge_python_reference"
-        / "dataset_quant"
-        / "0000"
-        / "00.bin"
-    ).exists()
+    test_name = nsys_testing.get_test_name(request)
+    assert (OUTPUTS_DIR / test_name / "dataset_quant" / "0000" / "00.bin").exists()
 
     # Check outputs are in quantized int8 format
     output_tensor_spec = output_tensor_spec_spy.spy_return
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py
index ebe782c5a98..d42ef4c6e7d 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_abs_converter.py
@@ -68,7 +68,7 @@ def _get_dataset_creator():
         dataset = RandomDatasetCreator(low=low, high=high)
         return dataset
 
-    def test__basic_nsys_inference(self, mocker):
+    def test__basic_nsys_inference(self, mocker, request):
         input_shape = (2, 3, 6, 7)
         model = AbsModule()
         graph_verifier = DetailedGraphVerifier(
@@ -80,10 +80,11 @@ def test__basic_nsys_inference(self, mocker):
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset_creator,
         )
 
-    def test__basic_nsys_inference__big(self, mocker):
+    def test__basic_nsys_inference__big(self, mocker, request):
         # some operators have delegation requirement that size must be < 4096
         input_shape = (4097, 1)
         model = AbsModule()
@@ -96,5 +97,6 @@ def test__basic_nsys_inference__big(self, mocker):
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset_creator,
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py
index 8b8f2da8c4e..9646c04a3f2 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_adaptive_avg_pool2d_converter.py
@@ -44,7 +44,9 @@ class TestAdaptiveAvgPool2D:
             ),
         ],
     )
-    def test__basic_nsys_inference(self, mocker, use_qat, input_shape, output_size):
+    def test__basic_nsys_inference(
+        self, mocker, request, use_qat, input_shape, output_size
+    ):
         model = AdaptiveAvgPool2dModule(output_size)
         graph_verifier = DetailedGraphVerifier(
             mocker,
@@ -60,6 +62,7 @@ def test__basic_nsys_inference(self, mocker, use_qat, input_shape, output_size):
             model,
             input_shape,
             graph_verifier,
+            request,
             RandomDatasetCreator(low=-1, high=1),
             output_comparator=output_comparator,
             use_qat=use_qat,
@@ -69,7 +72,7 @@ def test__basic_nsys_inference(self, mocker, use_qat, input_shape, output_size):
         strict=True,
         reason="Known Neutron bad compute issue. Will be fixed in Neutron SW 3.1.2.",
     )
-    def test__know_neutron_issue(self, mocker):
+    def test__know_neutron_issue(self, mocker, request):
         input_shape = (2, 3, 10, 15)
         output_size = (5, 5)
         model = AdaptiveAvgPool2dModule(output_size)
@@ -86,11 +89,12 @@ def test__know_neutron_issue(self, mocker):
             model,
             input_shape,
             graph_verifier,
+            request,
             RandomDatasetCreator(low=-1, high=1),
             output_comparator=output_comparator,
         )
 
-    def test__kernel_size_and_stride_limit(self, mocker):
+    def test__kernel_size_and_stride_limit(self, mocker, request):
         input_shape = (1, 3, 4, 4096)  # input_size = (1, 4096)
         output_size = (
             2,
@@ -114,6 +118,7 @@ def test__kernel_size_and_stride_limit(self, mocker):
             model,
             input_shape,
             graph_verifier,
+            request,
             RandomDatasetCreator(low=-1, high=1),
             output_comparator=output_comparator,
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
index 3ede2cfaadd..6ac96e41cd1 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
@@ -16,6 +16,9 @@
 )
 from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
 from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
+from executorch.backends.nxp.tests.model_output_comparator import (
+    AllCloseOutputComparator,
+)
 from executorch.backends.nxp.tests.models import AddTensorConvModule, AddTensorModule
 from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
 from executorch.backends.nxp.tests.ops_aliases import (
@@ -38,67 +41,49 @@ class TestAddTensor:
         [
             pytest.param((1,), id="1D."),
             pytest.param((6, 5), id="2D."),
+            pytest.param((6, 82), id="2D alt."),
             pytest.param((1, 4, 7), id="3D."),
+            pytest.param((1, 68, 7), id="3D alt."),
             pytest.param((2, 4, 3, 15), id="4D."),
-            pytest.param(
-                (6, 82),
-                id="2D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
-            ),
-            pytest.param(
-                (1, 68, 7),
-                id="3D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
-            ),
-            pytest.param(
-                (1, 4, 9, 11, 4),
-                id="5D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
-            ),
+            pytest.param((1, 4, 9, 11, 4), id="5D."),
         ],
     )
-    def test__basic_nsys_inference(self, x_input_shape, mocker):
+    def test__basic_nsys_inference(self, mocker, request, x_input_shape):
         x_input_spec = ModelInputSpec(x_input_shape)
         model = AddTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
+            request,
             dataset_creator,
+            comparator,
+            remove_quant_io_ops=True,
         )
 
-    @pytest.mark.parametrize(
-        "x_input_shape",
-        [
-            pytest.param((1,), id="1D."),
-            pytest.param((6, 5), id="2D."),
-            pytest.param((1, 4, 7), id="3D."),
-            pytest.param((2, 4, 3, 15), id="4D."),
-            pytest.param(
-                (1, 4, 9, 11, 4),
-                id="5D.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
-            ),
-        ],
-    )
-    def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
-        x_input_spec = ModelInputSpec(x_input_shape)
+    def test__basic_nsys_inference_qat(self, mocker, request):
+        x_input_spec = ModelInputSpec((1, 4, 7))
         model = AddTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
+            request,
             dataset_creator,
+            comparator,
+            remove_quant_io_ops=True,
             use_qat=True,
         )
 
@@ -108,6 +93,10 @@ def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
             pytest.param(
                 [ModelInputSpec((4, 6)), ModelInputSpec((1, 6))], id="2 inputs 2D."
             ),
+            pytest.param(
+                [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))],
+                id="2 inputs 2D alt.",
+            ),
             pytest.param(
                 [ModelInputSpec((5, 3, 4)), ModelInputSpec((1, 3, 1))],
                 id="2 inputs 3D.",
@@ -115,25 +104,24 @@ def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
             pytest.param(
                 [ModelInputSpec((4,)), ModelInputSpec((4, 4))], id="2 inputs 1D + 2D."
             ),
-            pytest.param(
-                [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))],
-                id="2 inputs 2D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
-            ),
         ],
     )
-    def test__broadcast(self, input_spec, mocker):
+    def test__broadcast(self, mocker, request, input_spec):
         model = AddTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             input_spec,
             graph_verifier,
+            request,
             dataset_creator,
+            comparator,
+            remove_quant_io_ops=True,
         )
 
     @pytest.mark.parametrize(
@@ -172,7 +160,7 @@ def test__broadcast_unsupported(self, input_spec):
             ),
         ],
     )
-    def test__w_conv(self, x_input_shape, mocker):
+    def test__w_conv(self, mocker, request, x_input_shape):
         model = AddTensorConvModule()
 
         n, c, h, w = x_input_shape
@@ -187,7 +175,11 @@ def test__w_conv(self, x_input_shape, mocker):
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
 
         lower_run_compare(
-            model, [x_input_spec, y_input_spec], graph_verifier, dataset_creator
+            model,
+            [x_input_spec, y_input_spec],
+            graph_verifier,
+            request,
+            dataset_creator,
         )
 
     @pytest.mark.parametrize(
@@ -198,13 +190,12 @@ def test__w_conv(self, x_input_shape, mocker):
                 id="2 inputs 4D + 4D.",
             ),
             pytest.param(
-                [ModelInputSpec((1, 4, 5, 67)), ModelInputSpec((1, 8, 5, 1))],
-                id="2 inputs 4D + 4D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+                [ModelInputSpec((1, 4, 1, 67)), ModelInputSpec((1, 8, 5, 67))],
+                id="2 inputs 4D + 4D same width.",
             ),
         ],
     )
-    def test__w_conv_broadcast(self, input_spec, mocker):
+    def test__w_conv_broadcast(self, mocker, request, input_spec):
         model = AddTensorConvModule()
 
         graph_verifier = DetailedGraphVerifier(
@@ -213,12 +204,16 @@ def test__w_conv_broadcast(self, input_spec, mocker):
             expected_non_delegated_ops={},
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             input_spec,
             graph_verifier,
+            request,
             dataset_creator,
+            comparator,
+            remove_quant_io_ops=True,
         )
 
     @pytest.mark.parametrize(
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py
index 120c3899ed4..3db1158d637 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py
@@ -41,16 +41,16 @@ def forward(self, x):
 
 
 class TestAvgPool2D:
-    def test__basic_nsys_inference(self, mocker):
+    def test__basic_nsys_inference(self, mocker, request):
         input_shape = (2, 4, 6, 7)
         model = AvgPool2dModule(False, 0)
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shape, graph_verifier)
+        lower_run_compare(model, input_shape, graph_verifier, request)
 
-    def test__basic_nsys_inference_qat(self, mocker):
+    def test__basic_nsys_inference_qat(self, mocker, request):
         input_shape = (2, 9, 6, 15)
         model = AvgPool2dModule(False, 0)
         graph_verifier = DetailedGraphVerifier(
@@ -61,10 +61,11 @@ def test__basic_nsys_inference_qat(self, mocker):
             model,
             input_shape,
             graph_verifier,
+            request,
             use_qat=True,
         )
 
-    def test__kernel_size_limit(self, mocker):
+    def test__kernel_size_limit(self, mocker, request):
         kernel_size = (1, 4096)
         input_shape = (1, 4) + kernel_size
         model = AvgPool2dModule(False, 0, kernel_size)
@@ -72,7 +73,7 @@ def test__kernel_size_limit(self, mocker):
             mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shape, graph_verifier)
+        lower_run_compare(model, input_shape, graph_verifier, request)
 
     def test__kernel_size_limit_exceeded(self):
         kernel_size = (1, 4097)  # Exceeds the kernel size limit.
@@ -87,7 +88,7 @@ def test__kernel_size_limit_exceeded(self):
         )
         assert graph_contains_any_of_ops(delegated_ep.graph, [AvgPool2D])
 
-    def test__stride_limit(self, mocker):
+    def test__stride_limit(self, mocker, request):
         stride = 4096
         input_shape = (1, 4, 1, 4096)
         model = AvgPool2dModule(False, 0, 1, stride)
@@ -95,7 +96,7 @@ def test__stride_limit(self, mocker):
             mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shape, graph_verifier)
+        lower_run_compare(model, input_shape, graph_verifier, request)
 
     def test__stride_limit_exceeded(self):
         stride = 4097  # Exceeds the stride limit.
@@ -114,7 +115,7 @@ def test__stride_limit_exceeded(self):
 class TestAvgPool1D:
 
     # Just a basic test to verify that the operator gets extended to the 2D variant correctly.
-    def test__basic_nsys_inference(self, mocker):
+    def test__basic_nsys_inference(self, mocker, request):
         input_shape = (2, 4, 6)  # The old flow limited the batch size to 1.
         model = AvgPool1DModule()
         graph_verifier = DetailedGraphVerifier(
@@ -123,4 +124,4 @@ def test__basic_nsys_inference(self, mocker):
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shape, graph_verifier)
+        lower_run_compare(model, input_shape, graph_verifier, request)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py
index 9bb1f30ee60..b28a431e3ca 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_cat_converter.py
@@ -56,7 +56,7 @@ def forward(self, *inputs: torch.Tensor):
 
 class TestCat:
 
-    def test__qat(self, mocker, use_qat):
+    def test__qat(self, mocker, request, use_qat):
         input_shape = (2, 3, 5)
         num_inputs = 2
 
@@ -66,11 +66,11 @@ def test__qat(self, mocker, use_qat):
             mocker, expected_delegated_ops={Cat: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shapes, graph_verifier, use_qat=use_qat)
+        lower_run_compare(model, input_shapes, graph_verifier, request, use_qat=use_qat)
 
     @pytest.mark.parametrize("dim", list(range(-3, 3)), ids=lambda dim: f"dim={dim}")
     @pytest.mark.parametrize("num_inputs", [2, 5], ids=lambda n: f"n={n}")
-    def test__same_shapes(self, mocker, dim, num_inputs):
+    def test__same_shapes(self, mocker, request, dim, num_inputs):
         input_shape = (2, 3, 5)
         input_shapes = [ModelInputSpec(input_shape) for _ in range(num_inputs)]
 
@@ -79,11 +79,11 @@ def test__same_shapes(self, mocker, dim, num_inputs):
             mocker, expected_delegated_ops={Cat: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shapes, graph_verifier)
+        lower_run_compare(model, input_shapes, graph_verifier, request)
 
     @pytest.mark.parametrize("dim", [0, -3, 2, -1], ids=lambda dim: f"dim={dim}")
     @pytest.mark.parametrize("num_inputs", [2, 5], ids=lambda n: f"n={n}")
-    def test__same_shapes__channels_first(self, mocker, dim, num_inputs):
+    def test__same_shapes__channels_first(self, mocker, request, dim, num_inputs):
         input_shape = (2, 3, 4, 5)
         input_shapes = [ModelInputSpec(input_shape) for _ in range(num_inputs)]
 
@@ -94,12 +94,12 @@ def test__same_shapes__channels_first(self, mocker, dim, num_inputs):
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shapes, graph_verifier)
+        lower_run_compare(model, input_shapes, graph_verifier, request)
 
     @pytest.mark.parametrize("dim", [0, -1], ids=lambda dim: f"dim={dim}")
     @pytest.mark.parametrize("rank", [2, 3, 4], ids=lambda rank: f"rank={rank}")
     @pytest.mark.parametrize("num_inputs", [2, 3], ids=lambda n: f"n={n}")
-    def test__different_shapes(self, mocker, dim, rank, num_inputs):
+    def test__different_shapes(self, mocker, request, dim, rank, num_inputs):
         # The input shapes can only differ in the `dim` dimension. So we can just assign a different one for each input.
         # e.g. [(2, 3, 4), (3, 3, 4), (4, 3, 4), (5, 3, 4), (6, 3, 4)]
         base_shape = [i + 2 for i in range(rank)]
@@ -113,11 +113,11 @@ def test__different_shapes(self, mocker, dim, rank, num_inputs):
             mocker, expected_delegated_ops={Cat: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_shapes, graph_verifier)
+        lower_run_compare(model, input_shapes, graph_verifier, request)
 
     @pytest.mark.parametrize("dim", [1, -1], ids=lambda dim: f"dim={dim}")
     @pytest.mark.parametrize("num_inputs", [2, 5], ids=lambda n: f"n={n}")
-    def test__different_shapes__channels_first(self, mocker, dim, num_inputs):
+    def test__different_shapes__channels_first(self, mocker, request, dim, num_inputs):
         # The input shapes can only differ in the `dim` dimension. So we can just assign a different one for each input.
         # e.g. [(1, 3, 4, 5), (2, 3, 4, 5)]
         base_shape = (2, 3, 4, 5)
@@ -133,7 +133,7 @@ def test__different_shapes__channels_first(self, mocker, dim, num_inputs):
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shapes, graph_verifier)
+        lower_run_compare(model, input_shapes, graph_verifier, request)
 
     def test__single_input__alone_in_partition__not_delegated(self):
         # The operator is a noop, and there is no other op in the model. The Neutron Converter would produce an empty
@@ -149,7 +149,7 @@ def test__single_input__alone_in_partition__not_delegated(self):
         )
         assert graph_contains_any_of_ops(delegated_ep.graph, [Cat])
 
-    def test__single_input__not_alone_in_partition__delegated(self, mocker):
+    def test__single_input__not_alone_in_partition__delegated(self, mocker, request):
         # The operator is a noop, but there is another op in the model, so they are both delegated.
         input_shape = [ModelInputSpec((2, 3, 4, 5))]
 
@@ -160,4 +160,4 @@ def test__single_input__not_alone_in_partition__delegated(self, mocker):
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shape, graph_verifier)
+        lower_run_compare(model, input_shape, graph_verifier, request)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py
index e0ae44b61f8..bd296bb856f 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_clamp_converter.py
@@ -24,9 +24,6 @@
 )
 from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
 from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
-from executorch.backends.nxp.tests.model_output_comparator import (
-    NumericalStatsOutputComparator,
-)
 from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
 from executorch.backends.nxp.tests.ops_aliases import (
     AddTensor,
@@ -68,6 +65,35 @@ def forward(self, x):
 
 
 class TestClamp:
+
+    @pytest.mark.parametrize(
+        "min, max",
+        [
+            pytest.param(-1, 2, id="min = -1, max = 2 (Max/Min)"),
+            pytest.param(0.0, None, id="min = 0, max = None (Relu)"),
+        ],
+    )
+    def test__qat(self, mocker, request, min, max, use_qat):
+        """Run `lower_run_compare` on `AddClampModule` with the `use_qat` fixture value."""
+        input_shape = (2, 7, 2)  # Indivisible by num_macs
+        model = AddClampModule(min, max)
+
+        x_input_spec = ModelInputSpec(input_shape)
+        graph_verifier = DetailedGraphVerifier(
+            mocker,
+            expected_delegated_ops={AddTensor: 1, Clamp: 1},
+            expected_non_delegated_ops={},
+        )
+
+        # Forward the fixture value; otherwise `use_qat` is accepted but never used.
+        lower_run_compare(
+            model=model,
+            input_spec=[x_input_spec],
+            request=request,
+            dlg_model_verifier=graph_verifier,
+            use_qat=use_qat,
+        )
+
     @pytest.mark.parametrize(
         "min, max",
         [
@@ -90,12 +116,11 @@ class TestClamp:
             pytest.param(0.0, None, id="min = 0, max = None (Relu)"),
         ],
     )
-    def test_convert_clamp__full_pipeline(self, mocker, min, max, use_qat):
+    def test_convert_clamp__full_pipeline(self, mocker, request, min, max):
         input_shape = (2, 7, 2)  # Indivisible by num_macs
         model = AddClampModule(min, max)
 
         x_input_spec = ModelInputSpec(input_shape)
-        comparator = NumericalStatsOutputComparator()
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={
@@ -109,8 +134,7 @@ def test_convert_clamp__full_pipeline(self, mocker, min, max, use_qat):
             model=model,
             input_spec=[x_input_spec],
             dlg_model_verifier=graph_verifier,
-            output_comparator=comparator,
-            use_qat=use_qat,
+            request=request,
         )
 
     @pytest.mark.parametrize(
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py
index 9ffa69139f6..32bbf93fae4 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_constant_pad_nd_converter.py
@@ -34,7 +34,7 @@ class TestConstantPadND:
     """
 
     # noinspection PyMethodMayBeStatic
-    def assert_delegated(self, model, input_shape, mocker, use_qat=False):
+    def assert_delegated(self, model, input_shape, mocker, request, use_qat=False):
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={ConstantPadND: 1},
@@ -45,15 +45,16 @@ def assert_delegated(self, model, input_shape, mocker, use_qat=False):
             model,
             input_shape,
             graph_verifier,
+            request,
             use_qat=use_qat,
         )
 
     def assert_delegated_and_output_shape_equals(
-        self, model, input_shape, expected_output_shape, mocker
+        self, model, input_shape, expected_output_shape, mocker, request
     ):
         model_builder_spy = mocker.spy(ModelBuilder, "finish")
 
-        self.assert_delegated(model, input_shape, mocker)
+        self.assert_delegated(model, input_shape, mocker, request)
 
         neutron_ir_subgraph = model_builder_spy.call_args[0][0].get_sub_graph()
         assert neutron_ir_subgraph.outputs.tmp_outputs[0].shape.vector == list(
@@ -74,12 +75,14 @@ def assert_delegated_and_output_shape_equals(
             pytest.param((1, 2, 3, 4, 5), tuple(range(4)), id="5D, padding W, D"),
         ],
     )
-    def test__basic_nsys_inference(self, mocker, input_shape, paddings, use_qat):
+    def test__basic_nsys_inference(
+        self, mocker, request, input_shape, paddings, use_qat
+    ):
         # These test cases are also supported by the old flow.
         model = ConstantPadNDModule(paddings)
-        self.assert_delegated(model, input_shape, mocker, use_qat)
+        self.assert_delegated(model, input_shape, mocker, request, use_qat)
 
-    def test__channels_padding(self, mocker):
+    def test__channels_padding(self, mocker, request):
         input_shape = (2, 4, 6)
         # These paddings will be applied to the last dimension, which is the channels as the input is formatless.
         paddings = (1, 1)
@@ -87,25 +90,25 @@ def test__channels_padding(self, mocker):
         model = ConstantPadNDModule(paddings)
 
         self.assert_delegated_and_output_shape_equals(
-            model, input_shape, expected_output_shape, mocker
+            model, input_shape, expected_output_shape, mocker, request
         )
 
-    def test__batch_padding(self, mocker):
+    def test__batch_padding(self, mocker, request):
         input_shape = (2, 4, 6)
         paddings = (0, 0, 0, 0, 1, 1)  # Padding applied to the batch dimension.
         expected_output_shape = (4, 4, 6)  # Padded batch.
         model = ConstantPadNDModule(paddings)
 
         self.assert_delegated_and_output_shape_equals(
-            model, input_shape, expected_output_shape, mocker
+            model, input_shape, expected_output_shape, mocker, request
         )
 
     @pytest.mark.parametrize("constant", [0.0, -13.37])
-    def test__specific_constant(self, mocker, constant):
+    def test__specific_constant(self, mocker, request, constant):
         input_shape = (2, 4, 6)
         paddings = (1, 1)
         model = ConstantPadNDModule(paddings, constant)
-        self.assert_delegated(model, input_shape, mocker)
+        self.assert_delegated(model, input_shape, mocker, request)
 
     @pytest.mark.parametrize(
         "input_shape, paddings",
@@ -115,7 +118,7 @@ def test__specific_constant(self, mocker, constant):
             pytest.param((1, 2, 6, 8), (0, 1, 2, 3, 1, 1), id="4D, padding H, W"),
         ],
     )
-    def test__channels_first(self, mocker, input_shape, paddings):
+    def test__channels_first(self, mocker, request, input_shape, paddings):
         model = ConstantPadNDConvModule(paddings)
         graph_verifier = DetailedGraphVerifier(
             mocker,
@@ -123,4 +126,4 @@ def test__channels_first(self, mocker, input_shape, paddings):
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shape, graph_verifier)
+        lower_run_compare(model, input_shape, graph_verifier, request)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py
index 67d3add978c..3799aa91623 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py
@@ -4,22 +4,31 @@
 # LICENSE file in the root directory of this source tree.
 
 import numpy as np
+
+# noinspection PyUnusedImports
 import pytest
 import torch
 
 from executorch.backends.nxp.backend.edge_program_converter import (
     EdgeProgramToIRConverter,
 )
+from executorch.backends.nxp.backend.ir.converter.builder.aten_model_builder_director import (
+    AtenModelBuilderDirector,
+)
+from executorch.backends.nxp.backend.ir.lib.tflite.BuiltinOperator import (
+    BuiltinOperator as Ops,
+)
+from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator
 from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
-from executorch.backends.nxp.tests.executors import (
-    convert_run_compare,
-    graph_contains_any_of_ops,
-    ToChannelFirstPreprocess,
-    ToChannelLastPreprocess,
+from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
+from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
+from executorch.backends.nxp.tests.models import Conv2dWithActivation, HardTanhModule
+from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
+from executorch.backends.nxp.tests.ops_aliases import (
+    Convolution,
+    ExecutorchDelegateCall,
+    HardTanh,
 )
-from executorch.backends.nxp.tests.models import Conv2dWithActivation
-from executorch.exir.dialects._ops import ops as exir_ops
-from torch.export import ExportedProgram
 from executorch.backends.nxp.tests.use_qat import *  # noqa F403
 
 
@@ -29,91 +38,237 @@ def reseed_model_per_test_run():
     np.random.seed(23)
 
 
-ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate
-HardTanh = exir_ops.edge.aten.hardtanh.default
-HardTanh_ = exir_ops.edge.aten.hardtanh_.default
+class AddHardTanhModule(HardTanhModule):
+    def forward(self, x):
+        # Add the input to itself, then apply the parent `HardTanhModule.forward`.
+        doubled = x + x
+        return super().forward(doubled)
 
 
-@pytest.mark.parametrize("input_shape", [(1, 3, 128, 128)])
-@pytest.mark.parametrize("inplace", [True, False])
-def test_relu6_quant(mocker, input_shape: tuple[int], inplace: bool, use_qat: bool):
-    # The torch.nn.Relu6 inherits from torch.nn.Hardtanh, and hence represented as HardTanh in ATen.
-    # Testing the hardtanh originated from torch.nn.Relu6 op.
-    model = Conv2dWithActivation(
-        activation=torch.nn.ReLU6(inplace=inplace), in_channels=input_shape[1]
-    )
+class TestHardTanh:
+    # noinspection PyMethodMayBeStatic
+    def assert_delegated(
+        self,
+        model,
+        input_shape,
+        mocker,
+        request,
+        use_qat=False,
+        expected_delegated_ops=None,
+    ):
+        graph_verifier = DetailedGraphVerifier(
+            mocker,
+            expected_delegated_ops=(
+                expected_delegated_ops
+                if expected_delegated_ops is not None
+                else {HardTanh: 1}
+            ),
+            expected_non_delegated_ops={},
+        )
 
-    converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+        # Create a RandomDatasetCreator that covers also negative numbers to properly test the operator.
+        dataset_creator = RandomDatasetCreator(low=-2, high=2)
 
-    quantized_program = to_quantized_edge_program(
-        model, input_shape, use_qat=use_qat, use_neutron_for_format_conversion=False
-    ).exported_program()
+        lower_run_compare(
+            model,
+            input_shape,
+            graph_verifier,
+            request,
+            dataset_creator,
+            use_qat=use_qat,
+        )
 
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
-    exported_program: ExportedProgram = converter_spy.call_args.args[1]
+    @pytest.mark.parametrize(
+        "activation_range",
+        [
+            (-1, 3),
+            (0, float("inf")),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "inplace", [True, False], ids=lambda ip: "Inplace" if ip else "Not inplace"
+    )
+    def test__qat(
+        self, mocker, request, activation_range: tuple[float, float], use_qat, inplace
+    ):
+        input_shape = (23,)
+        model = HardTanhModule(*activation_range, inplace)
 
-    assert not graph_contains_any_of_ops(quantized_program.graph, [HardTanh, HardTanh_])
-    assert graph_contains_any_of_ops(quantized_program.graph, [ExecutorchDelegateCall])
+        self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat)
 
-    input_data = (np.random.random(input_shape) * 50).astype(np.int8)
-    convert_run_compare(
-        exported_program,
-        tfl_model=tflite_flatbuffers_model,
-        tflite_input_preprocess=ToChannelLastPreprocess(),
-        tflite_output_preprocess=ToChannelFirstPreprocess(),
-        input_data=input_data,
-        atol=2.0,
+    @pytest.mark.parametrize(
+        "inplace", [True, False], ids=lambda ip: "Inplace" if ip else "Not inplace"
     )
+    def test__from_relu6__after_conv(self, mocker, request, inplace: bool):
+        # The torch.nn.Relu6 inherits from torch.nn.Hardtanh, and hence represented as HardTanh in ATen.
+        # Testing the hardtanh originated from torch.nn.Relu6 op.
+        input_shape = (1, 3, 4, 5)
+        model = Conv2dWithActivation(
+            activation=torch.nn.ReLU6(inplace=inplace),
+            in_channels=input_shape[1],
+            out_channels=2,
+        )
 
+        self.assert_delegated(
+            model,
+            input_shape,
+            mocker,
+            request,
+            expected_delegated_ops={HardTanh: 1, Convolution: 1},
+        )
 
-@pytest.mark.parametrize("input_shape", [(1, 3, 16, 16), (1, 3, 32, 32)])
-@pytest.mark.parametrize(
-    "activation_range",
-    [
-        (0.0, 6.0),
-        (-1.0, 1.0),
-        (0.0, 1.0),
-        (0.0, float("inf")),
-        (0, 6),
-        (-1, 1),
-        (0, 1),
-        (0, float("inf")),
-    ],
-)
-@pytest.mark.parametrize("inplace", [True, False])
-def test_custom_hardtanh_quant(
-    mocker,
-    input_shape: tuple[int],
-    activation_range: tuple[float, float],
-    inplace: bool,
-    use_qat: bool,
-):
-    # TODO(13063): This test suffers from non-ideal testing random quantization, because we always use range <0,1>.
-    #  We should update (decrease atol) when the Conv/Linear + Activation fuse at quantization is in place.
-    min_val, max_val = activation_range
-    model = Conv2dWithActivation(
-        activation=torch.nn.Hardtanh(min_val=min_val, max_val=max_val, inplace=inplace),
-        in_channels=input_shape[1],
+    @pytest.mark.parametrize(
+        "activation_range",
+        [
+            (0.0, 6.0),
+            (-1.0, 1),
+            (0, 1),
+            (0.0, float("inf")),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "inplace", [True, False], ids=lambda ip: "Inplace" if ip else "Not inplace"
     )
+    def test__hardtanh__mappable_to_relu__after_conv(
+        self,
+        mocker,
+        request,
+        activation_range: tuple[float, float],
+        inplace: bool,
+    ):
+        input_shape = (1, 3, 4, 5)
+        model = Conv2dWithActivation(
+            activation=torch.nn.Hardtanh(*activation_range, inplace),
+            in_channels=input_shape[1],
+            out_channels=2,
+        )
 
-    converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+        self.assert_delegated(
+            model,
+            input_shape,
+            mocker,
+            request,
+            expected_delegated_ops={HardTanh: 1, Convolution: 1},
+        )
+
+    @pytest.mark.parametrize(
+        "activation_range",
+        [
+            (-1, 3),
+            (2.27, 3.14),
+            (-0.1, 0),
+            (float("-inf"), 1.23),
+        ],
+    )
+    def test__hardtanh__not_mappable_to_relu(
+        self,
+        mocker,
+        request,
+        activation_range: tuple[float, float],
+    ):
+        input_shape = (23,)
+        model = HardTanhModule(*activation_range)
 
-    quantized_program = to_quantized_edge_program(
-        model, input_shape, use_qat=use_qat, use_neutron_for_format_conversion=False
-    ).exported_program()
+        self.assert_delegated(model, input_shape, mocker, request)
 
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
-    exported_program: ExportedProgram = converter_spy.call_args.args[1]
+    def test__unsupported_bounds(self):
+        # TODO ONLY WHEN ALONE IN PARTITION
+        input_shape = (2, 7, 2)
+        min_value, max_value = float("-inf"), float("inf")
+        model = HardTanhModule(min_value, max_value)
 
-    assert not graph_contains_any_of_ops(quantized_program.graph, [HardTanh, HardTanh_])
-    assert graph_contains_any_of_ops(quantized_program.graph, [ExecutorchDelegateCall])
+        delegated_ep = to_quantized_edge_program(model, input_shape).exported_program()
 
-    input_data = (np.random.random(input_shape) * 50).astype(np.int8)
-    convert_run_compare(
-        exported_program,
-        tfl_model=tflite_flatbuffers_model,
-        tflite_input_preprocess=ToChannelLastPreprocess(),
-        tflite_output_preprocess=ToChannelFirstPreprocess(),
-        input_data=input_data,
-        atol=2.0,
+        # Make sure the `hardtanh` was NOT delegated.
+        assert graph_contains_any_of_ops(delegated_ep.graph, [HardTanh])
+
+    @pytest.mark.parametrize(
+        "activation_range",
+        [
+            pytest.param((None, float("inf")), id="min = None, max = inf"),
+            pytest.param((float("inf"), None), id="min = inf, max = None"),
+        ],
+    )
+    def test__invalid_bounds(self, activation_range):
+        # PyTorch rejects these bounds at construction, so our handling cannot be tested.
+        with pytest.raises(TypeError, match="'<=' not supported between instances of"):
+            HardTanhModule(*activation_range)
+
+    @pytest.mark.parametrize(
+        "min, max, expected_neutron_ir_ops",
+        [
+            pytest.param(
+                0.1,
+                0.5,
+                [Ops.ADD, Ops.MAXIMUM, Ops.MINIMUM],
+                id="min = 0.1, max = 0.5 (Max/Min)",
+            ),
+            pytest.param(
+                0.0, 1.0, [Ops.ADD, Ops.RELU_0_TO_1], id="min = 0, max = 1 (Relu0To1)"
+            ),
+            pytest.param(
+                -1.0,
+                1.0,
+                [Ops.ADD, Ops.RELU_N1_TO_1],
+                id="min = -1, max = 1 (ReluN1To1)",
+            ),
+            pytest.param(
+                0.0,
+                float("inf"),
+                [Ops.ADD, Ops.RELU],
+                id="min = 0, max = infinity (Relu)",
+            ),
+            pytest.param(
+                0,
+                1.0,
+                [Ops.ADD, Ops.RELU_0_TO_1],
+                id="min = 0 (int), max = 1 (Relu0To1)",
+            ),
+            pytest.param(
+                0,
+                6.0,
+                [Ops.ADD, Ops.RELU6],
+                id="min = 0, max = 6 (Relu6)",
+            ),
+        ],
     )
+    def test_convert_clamp__relu_vs_maxmin(
+        self, mocker, min, max, expected_neutron_ir_ops
+    ):
+        """Check which Neutron IR ops the delegated `hardtanh` is converted to."""
+        input_shape = (23,)
+        model = AddHardTanhModule(min, max)
+
+        converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+        neutron_ir_spy = mocker.spy(AtenModelBuilderDirector, "finish")
+
+        delegated_ep = to_quantized_edge_program(model, input_shape).exported_program()
+
+        # Make sure the `hardtanh` was delegated.
+        assert graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall])
+        assert not graph_contains_any_of_ops(delegated_ep.graph, [HardTanh])
+
+        intermediate_ep = converter_spy.call_args.args[1]
+        # Build the node list once; both nodes are picked by position from the end.
+        nodes = list(intermediate_ep.graph.nodes)
+        quant_node, dequant_node = nodes[-2], nodes[-4]
+        neutron_ir_internal_ops = [
+            op.builtin_code for op in neutron_ir_spy.spy_return.operator_codes.vector
+        ]
+
+        assert graph_contains_any_of_ops(intermediate_ep.graph, [HardTanh])
+        assert (
+            len(neutron_ir_internal_ops) == len(expected_neutron_ir_ops) + 1
+        )  # Transpose
+        assert all(op in neutron_ir_internal_ops for op in expected_neutron_ir_ops)
+
+        if len(expected_neutron_ir_ops) == 3:
+            # Min/Max variant should have same input and output quantization
+            assert all(
+                q == dq for q, dq in zip(quant_node.args[1:], dequant_node.args[1:])
+            )
+        else:
+            # The Relu variants must not have identical input and output quantization.
+            assert not all(
+                q == dq for q, dq in zip(quant_node.args[1:], dequant_node.args[1:])
+            )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py
index 81dbe9aa0fb..567cf85ebe5 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_leaky_relu_converter.py
@@ -34,7 +34,7 @@ def forward(self, x):
 
 class TestLeakyRelu:
     # noinspection PyMethodMayBeStatic
-    def assert_delegated(self, model, input_shape, mocker, use_qat=False):
+    def assert_delegated(self, model, input_shape, mocker, request, use_qat=False):
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={LeakyRelu: 1},
@@ -48,6 +48,7 @@ def assert_delegated(self, model, input_shape, mocker, use_qat=False):
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset_creator,
             use_qat=use_qat,
         )
@@ -63,28 +64,29 @@ def assert_delegated(self, model, input_shape, mocker, use_qat=False):
         ],
         ids=lambda shape: f"{len(shape)}D",
     )
-    def test__default_alpha__input_shapes(self, mocker, input_shape):
+    def test__default_alpha__input_shapes(self, mocker, request, input_shape):
         model = LeakyReluModule()
-        self.assert_delegated(model, input_shape, mocker)
+        self.assert_delegated(model, input_shape, mocker, request)
 
-    def test__default_alpha__qat(self, mocker, use_qat):
+    def test__default_alpha__qat(self, mocker, request, use_qat):
         model = LeakyReluModule()
         input_shape = (23,)
-        self.assert_delegated(model, input_shape, mocker, use_qat)
+        self.assert_delegated(model, input_shape, mocker, request, use_qat)
 
     @pytest.mark.parametrize(
         "alpha",
         [0.01, 3.14159, 0, 1, float("inf")],
         ids=lambda alpha: f"alpha = {alpha}",
     )
-    def test__specific_alpha(self, mocker, alpha):
+    def test__specific_alpha(self, mocker, request, alpha):
         model = LeakyReluModule(negative_slope=alpha)
-        self.assert_delegated(model, (23,), mocker)
+        self.assert_delegated(model, (23,), mocker, request)
 
-    def test__inplace(self, mocker):
+    def test__inplace(self, mocker, request):
         model = LeakyReluModule(inplace=True)
         self.assert_delegated(
             model,
             (23,),
             mocker,
+            request,
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py
index 3e1d066103a..0b7fe88cffc 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_log_converter.py
@@ -35,7 +35,7 @@ def forward(self, x):
 
 
 class TestLog:
-    def test__basic_nsys_inference(self, mocker):
+    def test__basic_nsys_inference(self, mocker, request):
         # Use 256 elements so that, after quantization to int8, the input can
         # cover the full discrete range [-128, 127].
         # The dataset is generated as a linear float ramp and later quantized,
@@ -49,6 +49,7 @@ def test__basic_nsys_inference(self, mocker):
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset_creator=LinearRampDatasetCreator(low=0.0, high=1.0),
         )
 
@@ -60,7 +61,7 @@ def test__basic_nsys_inference(self, mocker):
             pytest.param((1, 3, 16, 16), id="4D"),
         ],
     )
-    def test__basic_nsys_inference__qat(self, mocker, input_shape, use_qat):
+    def test__basic_nsys_inference__qat(self, mocker, request, input_shape, use_qat):
         model = LogModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={Log: 1}, expected_non_delegated_ops={}
@@ -69,6 +70,7 @@ def test__basic_nsys_inference__qat(self, mocker, input_shape, use_qat):
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset_creator=RandomDatasetCreator(low=1.0, high=10.0),
             use_qat=use_qat,
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py
index c95b3cd3b8d..55a47146bfc 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py
@@ -51,14 +51,14 @@ def reseed_model_per_test_run():
 
 class TestMaxPool2D:
     # noinspection PyMethodMayBeStatic
-    def assert_delegated(self, model, input_shape, mocker):
+    def assert_delegated(self, model, input_shape, mocker, request):
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={MaxPool2DWithIndices: 1, GetItem: 1},
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shape, graph_verifier)
+        lower_run_compare(model, input_shape, graph_verifier, request)
 
     # noinspection PyMethodMayBeStatic
     def assert_not_delegated(self, model, input_shape):
@@ -70,12 +70,12 @@ def assert_not_delegated(self, model, input_shape):
         )
         assert graph_contains_any_of_ops(delegated_ep.graph, [MaxPool2DWithIndices])
 
-    def test__basic_nsys_inference(self, mocker):
+    def test__basic_nsys_inference(self, mocker, request):
         input_shape = (2, 4, 6, 7)  # The old flow limited the batch size to 1.
         model = MaxPool2dModule()
-        self.assert_delegated(model, input_shape, mocker)
+        self.assert_delegated(model, input_shape, mocker, request)
 
-    def test__basic_nsys_inference_qat(self, mocker):
+    def test__basic_nsys_inference_qat(self, mocker, request):
         input_shape = (2, 11, 7, 16)  # The old flow limited the batch size to 1.
         model = MaxPool2dModule()
         graph_verifier = DetailedGraphVerifier(
@@ -88,20 +88,21 @@ def test__basic_nsys_inference_qat(self, mocker):
             model,
             input_shape,
             graph_verifier,
+            request,
             use_qat=True,
         )
 
-    def test__large_kernel_size(self, mocker):
+    def test__large_kernel_size(self, mocker, request):
         kernel_size = (1, 5000)
         input_shape = (1, 4) + kernel_size
         model = MaxPool2dModule(kernel_size, stride=1)
-        self.assert_delegated(model, input_shape, mocker)
+        self.assert_delegated(model, input_shape, mocker, request)
 
-    def test__stride_limit__no_padding(self, mocker):
+    def test__stride_limit__no_padding(self, mocker, request):
         stride = 4096
         input_shape = (1, 4, 1, 4096)
         model = MaxPool2dModule(1, stride=stride)
-        self.assert_delegated(model, input_shape, mocker)
+        self.assert_delegated(model, input_shape, mocker, request)
 
     def test__stride_limit_exceeded__no_padding(self):
         stride = 4097  # Exceeds the stride limit.
@@ -109,12 +110,12 @@ def test__stride_limit_exceeded__no_padding(self):
         model = MaxPool2dModule(1, stride=stride)
         self.assert_not_delegated(model, input_shape)
 
-    def test__stride_limit__padding(self, mocker):
+    def test__stride_limit__padding(self, mocker, request):
         padding = 1
         stride = 4096
         input_shape = (1, 2, 3, stride)
         model = MaxPool2dModule(3, stride=stride, padding=padding)
-        self.assert_delegated(model, input_shape, mocker)
+        self.assert_delegated(model, input_shape, mocker, request)
 
     def test__stride_limit_exceeded__padding(self):
         padding = 1
@@ -126,7 +127,7 @@ def test__stride_limit_exceeded__padding(self):
     @pytest.mark.skip(
         reason="Large padding requires large kernel size which results in an extremely slow test."
     )
-    def test__padding_limit(self, mocker):
+    def test__padding_limit(self, mocker, request):
         # As the padding is added wia a `Pad` operator (not the `MaxPool` arguments), there is no limit to the padded
         #  value. But as padding can be at most half of the kernel size (PyTorch requirement) and kernel size is limited
         #  to 4096, padding of 2048 is the limit.
@@ -134,16 +135,16 @@ def test__padding_limit(self, mocker):
         kernel_size = padding * 2
         input_shape = (1, 1, 2, 3)
         model = MaxPool2dModule(kernel_size, padding=padding)
-        self.assert_delegated(model, input_shape, mocker)
+        self.assert_delegated(model, input_shape, mocker, request)
 
-    def test__padding__max_pool_limit_exceeded(self, mocker):
+    def test__padding__max_pool_limit_exceeded(self, mocker, request):
         # NeutronIR `MaxPool` padding is limited to 32. But as it is added by the `Pad` operator instead, there is no
         #  limit. This tests ensures the `MaxPool` padding limit is not a problem.
         padding = 33
         kernel_size = padding * 2
         input_shape = (1, 2, 3, 4)
         model = MaxPool2dModule(kernel_size, padding=padding)
-        self.assert_delegated(model, input_shape, mocker)
+        self.assert_delegated(model, input_shape, mocker, request)
 
     def test__padding_to_kernel_ratio_exceeded(self):
         # Both PyTorch and Neutron require the padding to be at most half of the kernel size.
@@ -160,7 +161,7 @@ def test__padding_to_kernel_ratio_exceeded(self):
 class TestMaxPool1D:
 
     # Just a basic test to verify that the operator gets extended to the 2D variant correctly.
-    def test__basic_nsys_inference__view_not_delegated(self, mocker):
+    def test__basic_nsys_inference__view_not_delegated(self, mocker, request):
         input_shape = (2, 4, 6)  # The old flow limited the batch size to 1.
         model = MaxPool1DModule()
 
@@ -170,4 +171,4 @@ def test__basic_nsys_inference__view_not_delegated(self, mocker):
             expected_non_delegated_ops={},
         )
 
-        lower_run_compare(model, input_shape, graph_verifier)
+        lower_run_compare(model, input_shape, graph_verifier, request)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py
index 8195581c0f6..f84471169ea 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py
@@ -9,6 +9,18 @@
 import pytest
 import torch
 
+from executorch.backends.nxp.backend.ir.converter.builder.model_builder import (
+    ModelBuilder,
+)
+from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.max_pool_2d_options import (
+    MaxPool2D,
+)
+from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.mean_options import (
+    Mean,
+)
+from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.transpose_options import (
+    Transpose,
+)
 from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator
 from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
 from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
@@ -50,71 +62,81 @@ def forward(self, x):
 
 
 class MaxPoolMeanDimModule(torch.nn.Module):
+    @staticmethod
+    def noop_max_pool_2d(x):
+        """Call `torch.max_pool2d` that is a NoOp, but it enforces the ChannelsFirst format in the `NodeFormatInference`."""
+        return torch.max_pool2d(x, kernel_size=1)
+
     def __init__(self, dim, keepdim):
         super().__init__()
         self.dim, self.keepdim = dim, keepdim
 
     def forward(self, x):
-        x = torch.max_pool2d(
-            x, kernel_size=1
-        )  # NoOp, but it enforces the channels first format.
-        return torch.mean(x, dim=self.dim, keepdim=self.keepdim)
+        x = self.noop_max_pool_2d(x)
+        x = torch.mean(x, dim=self.dim, keepdim=self.keepdim)
+        return x
 
 
-class TestMeanDim:
+class MeanDimMaxPoolModule(MaxPoolMeanDimModule):
+    def forward(self, x):
+        x = torch.mean(x, dim=self.dim, keepdim=self.keepdim)
+        x = self.noop_max_pool_2d(x)
+        return x
 
-    # noinspection PyMethodMayBeStatic
-    def assert_delegated(
-        self,
-        model,
-        input_shape,
+
+def assert_delegated(
+    model,
+    input_shape,
+    mocker,
+    request,
+    use_qat=False,
+    expected_delegated_ops=None,
+):
+    if expected_delegated_ops is None:
+        expected_delegated_ops = {MeanDim: 1}
+
+    graph_verifier = DetailedGraphVerifier(
         mocker,
-        use_qat=False,
-        atol=None,
-        expected_delegated_ops=None,
-    ):
-        if expected_delegated_ops is None:
-            expected_delegated_ops = {MeanDim: 1}
+        expected_delegated_ops=expected_delegated_ops,
+        expected_non_delegated_ops={},
+    )
 
-        graph_verifier = DetailedGraphVerifier(
-            mocker,
-            expected_delegated_ops=expected_delegated_ops,
-            expected_non_delegated_ops={},
-        )
+    # Cover also negative values to thoroughly test the operator.
+    dataset_creator = RandomDatasetCreator(low=-2, high=2)
 
-        # Cover also negative values to thoroughly test the operator.
-        dataset_creator = RandomDatasetCreator(low=-2, high=2)
+    remove_quant_io_ops = True  # Use quantized dataset.
+    output_comparator = AllCloseOutputComparator(atol=1)  # Allow single bit error.
 
-        kwargs = {"atol": atol} if atol is not None else {}
-        output_comparator = AllCloseOutputComparator(**kwargs)
+    lower_run_compare(
+        model,
+        input_shape,
+        graph_verifier,
+        request,
+        dataset_creator,
+        output_comparator,
+        use_qat=use_qat,
+        remove_quant_io_ops=remove_quant_io_ops,
+    )
 
-        lower_run_compare(
-            model,
-            input_shape,
-            graph_verifier,
-            dataset_creator,
-            output_comparator,
-            use_qat=use_qat,
-        )
 
-    # noinspection PyMethodMayBeStatic
-    def assert_not_delegated(self, model, input_shape):
-        delegated_ep = to_quantized_edge_program(model, input_shape).exported_program()
+def assert_not_delegated(model, input_shape):
+    delegated_ep = to_quantized_edge_program(model, input_shape).exported_program()
+
+    # Make sure the `mean` was NOT delegated.
+    assert not graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall])
+    assert graph_contains_any_of_ops(delegated_ep.graph, [MeanDim])
 
-        # Make sure the `mean` was NOT delegated.
-        assert not graph_contains_any_of_ops(
-            delegated_ep.graph, [ExecutorchDelegateCall]
-        )
-        assert graph_contains_any_of_ops(delegated_ep.graph, [MeanDim])
+
+class TestMeanDim:
 
     @pytest.fixture(params=[True, False], ids=lambda keep_dim: f"keep_dim = {keep_dim}")
     def keep_dim(self, request):
         return request.param
 
-    def test__basic_nsys_inference__qat(self, mocker, use_qat, keep_dim):
+    def test__basic_nsys_inference__qat(self, mocker, request, use_qat, keep_dim):
         input_shape = (23,)
         model = MeanDimModule(0, keep_dim)
-        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat)
+        assert_delegated(model, input_shape, mocker, request, use_qat=use_qat)
 
     @pytest.mark.parametrize(
         "input_shape, dim",
@@ -128,12 +150,9 @@ def test__basic_nsys_inference__qat(self, mocker, use_qat, keep_dim):
             pytest.param((3, 1, 4, 1, 5), 0, id="5D, dim = 0."),
         ],
     )
-    def test__single_dims(self, mocker, input_shape, dim, keep_dim):
+    def test__single_dims(self, mocker, request, input_shape, dim, keep_dim):
         model = MeanDimModule(dim, keep_dim)
-        # Relatively large error, but it is actually equal to the output scale, so it is a single bit error.
-        # TODO Replace with quantized dataset testing and `atol = 1`.
-        atol = 0.014
-        self.assert_delegated(model, input_shape, mocker, atol=atol)
+        assert_delegated(model, input_shape, mocker, request)
 
     @pytest.mark.parametrize(
         "input_shape, dim",
@@ -145,12 +164,9 @@ def test__single_dims(self, mocker, input_shape, dim, keep_dim):
             pytest.param((3, 1, 4, 1, 5), (3, -5, -4), id="5D, dim = (3, -5 ,-4)."),
         ],
     )
-    def test__tuple_dims(self, mocker, input_shape, dim, keep_dim):
+    def test__tuple_dims(self, mocker, request, input_shape, dim, keep_dim):
         model = MeanDimModule(dim, keep_dim)
-        # Relatively large error, but it is actually equal to the output scale, so it is a single bit error.
-        # TODO Replace with quantized dataset testing and `atol = 1`.
-        atol = 0.015
-        self.assert_delegated(model, input_shape, mocker, atol=atol)
+        assert_delegated(model, input_shape, mocker, request)
 
     @pytest.mark.parametrize(
         "input_shape, dim",
@@ -162,7 +178,7 @@ def test__tuple_dims(self, mocker, input_shape, dim, keep_dim):
     def test__noop__only_node__not_delegated(self, input_shape, dim):
         keep_dim = True  # Reduction over a dimension of size `1` with `keep_dim=True` is a no-op.
         model = MeanDimModule(dim, keep_dim)
-        self.assert_not_delegated(model, input_shape)
+        assert_not_delegated(model, input_shape)
 
     @pytest.mark.parametrize(
         "input_shape, dim",
@@ -171,13 +187,14 @@ def test__noop__only_node__not_delegated(self, input_shape, dim):
             pytest.param((3, 1, 4, 1, 5), -2, id="5D, dim = -2."),
         ],
     )
-    def test__noop__not_only_node__delegated(self, mocker, input_shape, dim):
+    def test__noop__not_only_node__delegated(self, mocker, request, input_shape, dim):
         keep_dim = True  # Reduction over a dimension of size `1` with `keep_dim=True` is a no-op.
         model = MeanDimAddModule(dim, keep_dim)
-        self.assert_delegated(
+        assert_delegated(
             model,
             input_shape,
             mocker,
+            request,
             expected_delegated_ops={MeanDim: 1, AddTensor: 1},
         )
 
@@ -186,44 +203,207 @@ def test__noop__not_only_node__delegated(self, mocker, input_shape, dim):
         [
             pytest.param((3, 1, 4), 1, id="3D, dim = 1."),
             pytest.param((3, 1, 4, 1, 5), -2, id="5D, dim = -2."),
+            pytest.param((1, 7, 3, 3), [0], id="4D, dim = [0]."),
         ],
     )
-    def test__no_reduction__keepdim_false__delegated(self, mocker, input_shape, dim):
+    def test__no_reduction__keepdim_false__delegated(
+        self, mocker, request, input_shape, dim
+    ):
         # These cases reduce over a dimension of size 1.
         # When `keep_dim=True` the node is a noop, and it's not delegated (see `test__noop__only_node__not_delegated`),
         # but with `keep_dim=False` it changes the shape so it's not a noop and is therefore delegated successfully.
         keep_dim = False
         model = MeanDimModule(dim, keep_dim)
-        self.assert_delegated(model, input_shape, mocker)
+        assert_delegated(model, input_shape, mocker, request)
 
-    @pytest.mark.parametrize(
-        "input_shape, dim",
-        [((1, 7, 3, 3), 1)],
-        ids=lambda val: f"shape={val}" if isinstance(val, tuple) else f"dim={val}",
-    )
-    @pytest.mark.parametrize(
-        "keep_dim",
-        [
-            pytest.param(True),
-            pytest.param(
-                False,
-                marks=pytest.mark.xfail(
-                    strict=True, reason="Known format inference bug (EIEX-937)."
-                ),
-            ),
-        ],
-        ids=lambda kd: f"keep_dim={kd}",
-    )
-    def test__channels_first__keep_dim__true(self, mocker, input_shape, dim, keep_dim):
+    def test__channels_first__keep_dim__true(self, mocker, request):
         # Just 1 test case to verify correct handling of the `dim`.
         # Most cases fall into the single bit error case, and since this test uses 2 operators, the error accumulates
         #  and the final error is larger. We cannot with 100% certainty say that the error is only caused by the single
         #  bit errors and not related to the format. That's why only this 1 case with no errors is used.
-
-        model = MaxPoolMeanDimModule(dim, keep_dim)
-        self.assert_delegated(
+        input_shape, dim = (1, 7, 3, 3), 1
+        model = MaxPoolMeanDimModule(dim, True)
+        assert_delegated(
             model,
             input_shape,
             mocker,
+            request,
             expected_delegated_ops={MaxPool2DWithIndices: 1, GetItem: 1, MeanDim: 1},
         )
+
+    class TestKeepDimFalseFormatHandling:
+        """When `keep_dim = False`, the `mean.dim` operator changes the rank, so the format has to be explicitly
+        handled. The tests in this class focus on the related edge cases.
+        """
+
+        def _assert_neutron_ir_model_has_ops(
+            self, model_builder_finish_spy, expected_ops
+        ):
+            assert (
+                model_builder_finish_spy.call_count == 1
+            ), "Conversion to Neutron IR happened multiple times."
+
+            neutron_ir_ops = model_builder_finish_spy.spy_return.sub_graphs[
+                0
+            ].operators.vector
+            assert len(neutron_ir_ops) == len(
+                expected_ops
+            ), "Neutron IR model doesn't have the expected number of ops."
+
+            for op, expected_op in zip(neutron_ir_ops, expected_ops, strict=True):
+                assert isinstance(
+                    op.builtin_options, expected_op
+                ), f"Expected {expected_op}, got {op}."
+
+        @pytest.mark.parametrize(
+            "dim",
+            [
+                1,
+                [0, -3],
+                (-4, 1, 2),
+                [-3, 3],
+                [1, 2, 3],
+            ],
+            ids=lambda dim: f"dim={dim}",
+        )
+        def test__channels_first_input__reducing_channels(self, mocker, request, dim):
+            # If the channels dimension is reduced (removed), the `mean` output will always be equal in channels first
+            #  and channels last, so no extra `Transpose` op is added between the `MaxPool2D` and the `Mean`.
+            input_shape = (1, 7, 3, 3)
+            model = MaxPoolMeanDimModule(dim, False)
+
+            model_builder_finish_spy = mocker.spy(ModelBuilder, "finish")
+            assert_delegated(
+                model,
+                input_shape,
+                mocker,
+                request,
+                expected_delegated_ops={
+                    MaxPool2DWithIndices: 1,
+                    GetItem: 1,
+                    MeanDim: 1,
+                },
+            )
+            self._assert_neutron_ir_model_has_ops(
+                model_builder_finish_spy,
+                expected_ops=[
+                    Transpose,
+                    MaxPool2D,
+                    Mean,
+                ],
+            )
+
+        @pytest.mark.parametrize(
+            "dim",
+            [
+                (2, 3),
+                [1, -2, 3],
+                [-1, -2, 0],
+            ],
+            ids=lambda dim: f"dim={dim}",
+        )
+        def test__channels_first_input__reducing_all_spatial_dims(
+            self, mocker, request, dim
+        ):
+            # If all the spatial dimensions are reduced (removed), the `mean` output will always be equal in channels
+            #  first and channels last, so no extra `Transpose` op is added between the `MaxPool2D` and the `Mean`.
+            input_shape = (1, 7, 3, 3)
+            model = MaxPoolMeanDimModule(dim, False)
+
+            model_builder_finish_spy = mocker.spy(ModelBuilder, "finish")
+            assert_delegated(
+                model,
+                input_shape,
+                mocker,
+                request,
+                expected_delegated_ops={
+                    MaxPool2DWithIndices: 1,
+                    GetItem: 1,
+                    MeanDim: 1,
+                },
+            )
+            self._assert_neutron_ir_model_has_ops(
+                model_builder_finish_spy,
+                expected_ops=[
+                    Transpose,
+                    MaxPool2D,
+                    Mean,
+                ],
+            )
+
+        @pytest.mark.xfail(strict=True, reason="Known Neutron bug (AIR-14726).")
+        @pytest.mark.parametrize(
+            "dim",
+            [
+                0,
+                (2,),
+                [-1, 0],
+            ],
+            ids=lambda dim: f"dim={dim}",
+        )
+        def test__channels_first_input__not_reducing_channels_or_all_spatial_dims(
+            self, mocker, request, dim
+        ):
+            # If the channels dimension is not reduced, a `Transpose` operator must be added to make the input channels
+            #  first in Neutron IR.
+
+            input_shape = (1, 7, 3, 3)
+            model = MaxPoolMeanDimModule(dim, False)
+
+            model_builder_finish_spy = mocker.spy(ModelBuilder, "finish")
+            assert_delegated(
+                model,
+                input_shape,
+                mocker,
+                request,
+                expected_delegated_ops={
+                    MaxPool2DWithIndices: 1,
+                    GetItem: 1,
+                    MeanDim: 1,
+                },
+            )
+
+            self._assert_neutron_ir_model_has_ops(
+                model_builder_finish_spy,
+                expected_ops=[
+                    Transpose,
+                    MaxPool2D,
+                    Transpose,  # The necessary `Transpose` operator.
+                    Mean,
+                ],
+            )
+
+        @pytest.mark.parametrize(
+            "input_shape, dim",
+            [
+                pytest.param((2, 3, 4, 5, 6), 0, id="dim=0, 5D->4D"),
+                pytest.param((2, 3, 4, 5, 6), [-3], id="dim=[-3], 5D->4D"),
+                pytest.param((1, 2, 3, 4, 5, 6), (1, -1), id="dim=(1, -1), 6D->4D"),
+            ],
+            ids=lambda dim: f"dim={dim}",
+        )
+        def test__channels_first_output(self, mocker, request, input_shape, dim):
+            model = MeanDimMaxPoolModule(dim, False)
+
+            model_builder_finish_spy = mocker.spy(ModelBuilder, "finish")
+            assert_delegated(
+                model,
+                input_shape,
+                mocker,
+                request,
+                expected_delegated_ops={
+                    MaxPool2DWithIndices: 1,
+                    GetItem: 1,
+                    MeanDim: 1,
+                },
+            )
+
+            self._assert_neutron_ir_model_has_ops(
+                model_builder_finish_spy,
+                expected_ops=[
+                    Mean,
+                    Transpose,  # The necessary `Transpose` operator.
+                    MaxPool2D,
+                    Transpose,
+                ],
+            )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py
index 897c3efd850..d112ff1e1ac 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py
@@ -41,7 +41,7 @@ class TestMulTensor:
             pytest.param((1, 4, 8, 8), id="4D."),
         ],
     )
-    def test__basic_nsys_inference(self, x_input_shape, mocker):
+    def test__basic_nsys_inference(self, mocker, request, x_input_shape):
         x_input_spec = ModelInputSpec(x_input_shape)
         model = MulTensorModule()
         graph_verifier = DetailedGraphVerifier(
@@ -52,6 +52,7 @@ def test__basic_nsys_inference(self, x_input_shape, mocker):
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
+            request,
         )
 
     @pytest.mark.parametrize(
@@ -61,7 +62,7 @@ def test__basic_nsys_inference(self, x_input_shape, mocker):
             pytest.param((1, 4, 8, 8), id="4D."),
         ],
     )
-    def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
+    def test__basic_nsys_inference_qat(self, mocker, request, x_input_shape):
         x_input_spec = ModelInputSpec(x_input_shape)
         model = MulTensorModule()
         graph_verifier = DetailedGraphVerifier(
@@ -72,6 +73,7 @@ def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
+            request,
             use_qat=True,
         )
 
@@ -90,13 +92,13 @@ def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
             ),
         ],
     )
-    def test__correct_broadcast(self, input_spec, mocker):
+    def test__correct_broadcast(self, input_spec, mocker, request):
         model = MulTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={MulTensor: 1}, expected_non_delegated_ops={}
         )
 
-        lower_run_compare(model, input_spec, graph_verifier)
+        lower_run_compare(model, input_spec, graph_verifier, request)
 
     @pytest.mark.parametrize(
         "input_spec",
@@ -134,7 +136,7 @@ def test__incorrect_broadcast(self, input_spec):
             ),
         ],
     )
-    def test__w_conv(self, x_input_shape, mocker):
+    def test__w_conv(self, mocker, request, x_input_shape):
         model = MulTensorConvModule()
 
         n, c, h, w = x_input_shape
@@ -151,6 +153,7 @@ def test__w_conv(self, x_input_shape, mocker):
             model,
             [x_input_spec, y_input_spec],
             graph_verifier,
+            request,
         )
 
     @pytest.mark.parametrize(
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py
index 31436a3f200..bdfd1e9da25 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py
@@ -77,7 +77,13 @@ def forward(self, x):
 class TestPermuteCopy:
     # noinspection PyMethodMayBeStatic
     def assert_delegated(
-        self, model, input_shape, mocker, expected_delegated_ops=None, use_qat=False
+        self,
+        model,
+        input_shape,
+        mocker,
+        request,
+        expected_delegated_ops=None,
+        use_qat=False,
     ):
         graph_verifier = DetailedGraphVerifier(
             mocker,
@@ -89,6 +95,7 @@ def assert_delegated(
             model,
             input_shape,
             graph_verifier,
+            request,
             use_qat=use_qat,
         )
 
@@ -115,18 +122,18 @@ def _special_4d_permutations() -> list[ParameterSet]:
             pytest.param((3, 2, 1, 0), id="reverse"),
         ]
 
-    def test__qat(self, mocker, use_qat):
+    def test__qat(self, mocker, request, use_qat):
         input_shape = (2, 3, 5, 7)
         permutation = (0, 2, 3, 1)  # NCHW -> NHWC
         model = PermuteModule(permutation)
-        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat)
+        self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat)
 
     @pytest.mark.parametrize(
         "permutation",
         _all_permutations_for_rank(3),
         ids=lambda perm: f"permutation = {perm}",
     )
-    def test__all_permutations__3d(self, mocker, permutation: tuple[int]):
+    def test__all_permutations__3d(self, mocker, request, permutation: tuple[int]):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5)
         model = PermuteModule(permutation)
@@ -135,14 +142,14 @@ def test__all_permutations__3d(self, mocker, permutation: tuple[int]):
             #  would result in an empty graph, which is not allowed. Therefore, it's not delegated.
             self.assert_not_delegated(model, input_shape)
         else:
-            self.assert_delegated(model, input_shape, mocker)
+            self.assert_delegated(model, input_shape, mocker, request)
 
     @pytest.mark.parametrize(
         "permutation",
         _all_permutations_for_rank(4),
         ids=lambda perm: f"permutation = {perm}",
     )
-    def test__all_permutations__4d(self, mocker, permutation: tuple[int]):
+    def test__all_permutations__4d(self, mocker, request, permutation: tuple[int]):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5, 7)
         model = PermuteModule(permutation)
@@ -151,43 +158,55 @@ def test__all_permutations__4d(self, mocker, permutation: tuple[int]):
             #  would result in an empty graph, which is not allowed. Therefore, it's not delegated.
             self.assert_not_delegated(model, input_shape)
         else:
-            self.assert_delegated(model, input_shape, mocker)
+            self.assert_delegated(model, input_shape, mocker, request)
 
     @pytest.mark.parametrize("permutation", _special_4d_permutations())
     def test__all_permutations__4d__channels_first_input(
-        self, mocker, permutation: tuple[int]
+        self, mocker, request, permutation: tuple[int]
     ):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5, 7)
         model = MaxPoolPermuteModule(permutation)
         expected_delegated_ops = {MaxPool2DWithIndices: 1, GetItem: 1, PermuteCopy: 1}
         self.assert_delegated(
-            model, input_shape, mocker, expected_delegated_ops=expected_delegated_ops
+            model,
+            input_shape,
+            mocker,
+            request,
+            expected_delegated_ops=expected_delegated_ops,
         )
 
     @pytest.mark.parametrize("permutation", _special_4d_permutations())
     def test__all_permutations__4d__channels_first_output(
-        self, mocker, permutation: tuple[int]
+        self, mocker, request, permutation: tuple[int]
     ):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5, 7)
         model = PermuteMaxPoolModule(permutation)
         expected_delegated_ops = {MaxPool2DWithIndices: 1, GetItem: 1, PermuteCopy: 1}
         self.assert_delegated(
-            model, input_shape, mocker, expected_delegated_ops=expected_delegated_ops
+            model,
+            input_shape,
+            mocker,
+            request,
+            expected_delegated_ops=expected_delegated_ops,
         )
 
     @pytest.mark.parametrize("perm1", _special_4d_permutations())
     @pytest.mark.parametrize("perm2", _special_4d_permutations())
     def test__all_permutations__4d__channels_first_io(
-        self, mocker, perm1: tuple[int], perm2: tuple[int]
+        self, mocker, request, perm1: tuple[int], perm2: tuple[int]
     ):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5, 7)
         model = PermuteMaxPoolPermuteModule(perm1, perm2)
         expected_delegated_ops = {MaxPool2DWithIndices: 1, GetItem: 1, PermuteCopy: 2}
         self.assert_delegated(
-            model, input_shape, mocker, expected_delegated_ops=expected_delegated_ops
+            model,
+            input_shape,
+            mocker,
+            request,
+            expected_delegated_ops=expected_delegated_ops,
         )
 
     @pytest.mark.parametrize(
@@ -200,7 +219,7 @@ def test__all_permutations__4d__channels_first_io(
             pytest.param((4, 2, 3, 0, 1), id="perm = (4, 2, 3, 0, 1)"),
         ],
     )
-    def test__5d(self, mocker, permutation):
+    def test__5d(self, mocker, request, permutation):
         # Avoid dimensions of size 1 and multiples of `num_macs` for a thorough test.
         input_shape = (2, 3, 5, 3, 5)
         model = PermuteModule(permutation)
@@ -209,4 +228,4 @@ def test__5d(self, mocker, permutation):
             #  would result in an empty graph, which is not allowed. Therefore, it's not delegated.
             self.assert_not_delegated(model, input_shape)
         else:
-            self.assert_delegated(model, input_shape, mocker)
+            self.assert_delegated(model, input_shape, mocker, request)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py
index ab42560f075..ca2abd18f32 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_relu_converter.py
@@ -62,7 +62,7 @@ def forward(self, x):
         return self.relu(x)
 
 
-class TestReLUNewNeutronFlow:
+class TestReLU:
     @pytest.mark.parametrize(
         ["model", "input_shape"],
         [
@@ -98,7 +98,7 @@ class TestReLUNewNeutronFlow:
             ),
         ],
     )
-    def test_relu_conversion__full_pipeline(self, mocker, model, input_shape):
+    def test_relu_conversion__full_pipeline(self, mocker, request, model, input_shape):
         model = model()  # Avoid model creation at import time
         is_conv_module = not hasattr(model, "linear")
 
@@ -108,19 +108,20 @@ def test_relu_conversion__full_pipeline(self, mocker, model, input_shape):
                 {Convolution: 1, Relu: 1} if is_conv_module else {AddMm: 1, Relu: 1}
             ),
             expected_non_delegated_ops={},
-            ops_to_ignore=[
+            ops_to_ignore={
                 PermuteCopy,
                 ViewCopy,
                 QuantizePerTensor,
                 DequantizePerTensor,
                 DequantizePerChannel,
-            ],
+            },
         )
 
         lower_run_compare(
             model,
             input_shape,
             graph_verifier,
+            request,
         )
 
     @pytest.mark.parametrize(
@@ -136,7 +137,9 @@ def test_relu_conversion__full_pipeline(self, mocker, model, input_shape):
             ),
         ],
     )
-    def test_relu_conversion__non_delegated_with_old_flow(self, mocker, input_shape):
+    def test_relu_conversion__non_delegated_with_old_flow(
+        self, mocker, request, input_shape
+    ):
         verifier = DetailedGraphVerifier(
             mocker=mocker,
             expected_delegated_ops={Relu: 1},
@@ -146,8 +149,9 @@ def test_relu_conversion__non_delegated_with_old_flow(self, mocker, input_shape)
         lower_run_compare(
             ReLUModule(),
             input_shape,
-            dlg_model_verifier=verifier,
-            dataset_creator=RandomDatasetCreator(low=-1, high=1),
+            verifier,
+            request,
+            RandomDatasetCreator(low=-1, high=1),
         )
 
     @pytest.mark.parametrize(
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py
index 75a32254a1d..bdd41d1eab0 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_sigmoid_converter.py
@@ -30,7 +30,9 @@ def reseed_model_per_test_run():
 
 class TestSigmoid:
     # noinspection PyMethodMayBeStatic
-    def assert_delegated(self, model, input_shape, mocker, use_qat=False, atol=None):
+    def assert_delegated(
+        self, model, input_shape, mocker, request, use_qat=False, atol=None
+    ):
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={Sigmoid: 1},
@@ -47,15 +49,16 @@ def assert_delegated(self, model, input_shape, mocker, use_qat=False, atol=None)
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset_creator,
             output_comparator,
             use_qat=use_qat,
         )
 
-    def test__basic_nsys_inference__qat(self, mocker, use_qat):
+    def test__basic_nsys_inference__qat(self, mocker, request, use_qat):
         input_shape = (23,)
         model = nn.Sigmoid()
-        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat)
+        self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat)
 
     @pytest.mark.parametrize(
         "input_shape",
@@ -68,13 +71,13 @@ def test__basic_nsys_inference__qat(self, mocker, use_qat):
         ],
         ids=lambda shape: f"{len(shape)}D",
     )
-    def test__input_shapes(self, mocker, input_shape):
+    def test__input_shapes(self, mocker, request, input_shape):
         model = nn.Sigmoid()
 
         output_scale = 1.0 / 256.0
         lowering_spy = mocker.spy(NeutronPartitioner, "partition")
         self.assert_delegated(
-            model, input_shape, mocker, atol=output_scale
+            model, input_shape, mocker, request, atol=output_scale
         )  # Allow single bit error.
 
         # Verify that the `atol` is indeed equal to the output scale.
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py
index cb0ec09bcce..98cc924ee85 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py
@@ -41,7 +41,9 @@ def _slice_id(prefix, input_shape, dims, starts, ends):
         return f"{prefix}rank={len(input_shape)}_dims={str(dims)}_starts={str(starts)}_ends={str(ends)}"
 
     @staticmethod
-    def assert_delegated_and_correct(model, input_shape, num_slices, mocker, use_qat):
+    def assert_delegated_and_correct(
+        model, input_shape, num_slices, mocker, request, use_qat
+    ):
         graph_verifier = DetailedGraphVerifier(
             mocker,
             expected_delegated_ops={SliceCopy: num_slices},
@@ -54,6 +56,7 @@ def assert_delegated_and_correct(model, input_shape, num_slices, mocker, use_qat
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset,
             comparator,
             use_qat=use_qat,
@@ -182,12 +185,14 @@ def assert_not_delegated(model, input_shape):
             ),
         ],
     )
-    def test_nsys_inference__basic(self, input_shape, dims, starts, ends, mocker):
+    def test_nsys_inference__basic(
+        self, input_shape, dims, starts, ends, mocker, request
+    ):
         model = SliceTensorModule(dims, starts, ends)
 
         num_slices = len(dims)
         self.assert_delegated_and_correct(
-            model, input_shape, num_slices, mocker, use_qat=False
+            model, input_shape, num_slices, mocker, request, use_qat=False
         )
 
     @pytest.mark.parametrize(
@@ -209,7 +214,9 @@ def test_nsys_inference__basic(self, input_shape, dims, starts, ends, mocker):
             ),
         ],
     )
-    def test_nsys_inference__reduction(self, input_shape, dims, starts, ends, mocker):
+    def test_nsys_inference__reduction(
+        self, input_shape, dims, starts, ends, mocker, request
+    ):
         model = SliceTensorModule(dims, starts, ends)
 
         slice_lengths = [e - s for s, e in zip(starts, ends)]
@@ -219,7 +226,7 @@ def test_nsys_inference__reduction(self, input_shape, dims, starts, ends, mocker
         else:
             num_slices = len(dims)
             self.assert_delegated_and_correct(
-                model, input_shape, num_slices, mocker, use_qat=False
+                model, input_shape, num_slices, mocker, request, use_qat=False
             )
 
     @pytest.mark.parametrize(
@@ -241,12 +248,14 @@ def test_nsys_inference__reduction(self, input_shape, dims, starts, ends, mocker
             ),
         ],
     )
-    def test_nsys_inference__clipped(self, input_shape, dims, starts, ends, mocker):
+    def test_nsys_inference__clipped(
+        self, input_shape, dims, starts, ends, mocker, request
+    ):
         model = SliceTensorModule(dims, starts, ends)
 
         num_slices = len(dims)
         self.assert_delegated_and_correct(
-            model, input_shape, num_slices, mocker, use_qat=False
+            model, input_shape, num_slices, mocker, request, use_qat=False
         )
 
     @pytest.mark.parametrize(
@@ -269,13 +278,13 @@ def test_nsys_inference__clipped(self, input_shape, dims, starts, ends, mocker):
         ],
     )
     def test_nsys_inference__normalization(
-        self, input_shape, dims, starts, ends, mocker
+        self, input_shape, dims, starts, ends, mocker, request
     ):
         model = SliceTensorModule(dims, starts, ends)
 
         num_slices = len(dims)
         self.assert_delegated_and_correct(
-            model, input_shape, num_slices, mocker, use_qat=False
+            model, input_shape, num_slices, mocker, request, use_qat=False
         )
 
     @pytest.mark.parametrize(
@@ -304,12 +313,14 @@ def test_nsys_inference__normalization(
             ),
         ],
     )
-    def test_nsys_inference__big(self, input_shape, dims, starts, ends, mocker):
+    def test_nsys_inference__big(
+        self, input_shape, dims, starts, ends, mocker, request
+    ):
         model = SliceTensorModule(dims, starts, ends)
 
         num_slices = len(dims)
         self.assert_delegated_and_correct(
-            model, input_shape, num_slices, mocker, use_qat=False
+            model, input_shape, num_slices, mocker, request, use_qat=False
         )
 
     @pytest.mark.parametrize(
@@ -336,7 +347,7 @@ def test_nsys_inference__identity(self, input_shape, dims, starts, ends):
 
         self.assert_model_without_slices(model, input_shape)
 
-    def test_nsys_inference__with_conv(self, mocker):
+    def test_nsys_inference__with_conv(self, mocker, request):
         input_shape = (11, 13, 5, 7)
         in_channels = input_shape[1]
         out_channels = 19
@@ -360,12 +371,13 @@ def test_nsys_inference__with_conv(self, mocker):
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset,
             comparator,
             use_qat=False,
         )
 
-    def test_nsys_inference__qat(self, mocker):
+    def test_nsys_inference__qat(self, mocker, request):
         input_shape = (7, 13, 7, 9)
         dims = (0, 1, 2, 3)
         starts = (1, 2, 3, 2)
@@ -375,5 +387,5 @@ def test_nsys_inference__qat(self, mocker):
 
         num_slices = len(dims)
         self.assert_delegated_and_correct(
-            model, input_shape, num_slices, mocker, use_qat=True
+            model, input_shape, num_slices, mocker, request, use_qat=True
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
index 9638f8fe0ec..e71ff7e8af5 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
@@ -16,6 +16,9 @@
 )
 from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
 from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
+from executorch.backends.nxp.tests.model_output_comparator import (
+    AllCloseOutputComparator,
+)
 from executorch.backends.nxp.tests.models import SubTensorConvModule, SubTensorModule
 from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
 from executorch.backends.nxp.tests.ops_aliases import (
@@ -38,76 +41,50 @@ class TestSubTensor:
         [
             pytest.param((1,), id="1D."),
             pytest.param((6, 5), id="2D."),
+            pytest.param((6, 82), id="2D alt."),
             pytest.param((1, 4, 7), id="3D."),
-            pytest.param(
-                (6, 82),
-                id="2D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
-            ),
-            pytest.param(
-                (1, 68, 7),
-                id="3D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
-            ),
-            pytest.param(
-                (2, 4, 3, 15),
-                id="4D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
-            ),
-            pytest.param(
-                (1, 4, 9, 11, 4),
-                id="5D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
-            ),
+            pytest.param((1, 68, 7), id="3D alt."),
+            pytest.param((2, 4, 3, 15), id="4D."),
+            pytest.param((1, 4, 9, 11, 4), id="5D."),
         ],
     )
-    def test__basic_nsys_inference(self, x_input_shape, mocker):
+    def test__basic_nsys_inference(self, mocker, request, x_input_shape):
         x_input_spec = ModelInputSpec(x_input_shape)
         model = SubTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
+            request,
             dataset_creator,
+            comparator,
+            remove_quant_io_ops=True,
         )
 
-    @pytest.mark.parametrize(
-        "x_input_shape",
-        [
-            pytest.param((1,), id="1D."),
-            pytest.param((6, 5), id="2D."),
-            pytest.param((2, 4, 3, 15), id="4D."),
-            pytest.param(
-                (1, 4, 7),
-                id="3D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
-            ),
-            pytest.param(
-                (1, 4, 9, 11, 4),
-                id="5D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
-            ),
-        ],
-    )
-    def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
-        x_input_spec = ModelInputSpec(x_input_shape)
+    def test__basic_nsys_inference_qat(self, mocker, request):
+        x_input_spec = ModelInputSpec((2, 4, 3, 15))
         model = SubTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             [x_input_spec, x_input_spec],
             graph_verifier,
+            request,
             dataset_creator,
+            comparator,
             use_qat=True,
+            remove_quant_io_ops=True,
         )
 
     @pytest.mark.parametrize(
@@ -117,32 +94,34 @@ def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
                 [ModelInputSpec((4, 6)), ModelInputSpec((1, 6))], id="2 inputs 2D."
             ),
             pytest.param(
-                [ModelInputSpec((4,)), ModelInputSpec((4, 4))], id="2 inputs 1D + 2D."
+                [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))],
+                id="2 inputs 2D alt.",
             ),
             pytest.param(
-                [ModelInputSpec((5, 3, 4)), ModelInputSpec((1, 3, 1))],
-                id="2 inputs 3D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+                [ModelInputSpec((4,)), ModelInputSpec((4, 4))], id="2 inputs 1D + 2D."
             ),
             pytest.param(
-                [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))],
-                id="2 inputs 2D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+                [ModelInputSpec((5, 3, 4)), ModelInputSpec((1, 3, 1))],
+                id="2 inputs 3D.",
             ),
         ],
     )
-    def test__broadcast(self, input_spec, mocker):
+    def test__broadcast(self, mocker, request, input_spec):
         model = SubTensorModule()
         graph_verifier = DetailedGraphVerifier(
             mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={}
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             input_spec,
             graph_verifier,
+            request,
             dataset_creator,
+            comparator,
+            remove_quant_io_ops=True,
         )
 
     @pytest.mark.parametrize(
@@ -181,7 +160,7 @@ def test__broadcast_unsupported(self, input_spec):
             ),
         ],
     )
-    def test__w_conv(self, x_input_shape, mocker):
+    def test__w_conv(self, mocker, request, x_input_shape):
         model = SubTensorConvModule()
 
         n, c, h, w = x_input_shape
@@ -199,6 +178,7 @@ def test__w_conv(self, x_input_shape, mocker):
             model,
             [x_input_spec, y_input_spec],
             graph_verifier,
+            request,
             dataset_creator,
         )
 
@@ -211,12 +191,11 @@ def test__w_conv(self, x_input_shape, mocker):
             ),
             pytest.param(
                 [ModelInputSpec((1, 4, 5, 5)), ModelInputSpec((1, 8, 5, 1))],
-                id="2 inputs 4D + 4D incorrect.",
-                marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+                id="2 inputs 4D + 4D same height.",
             ),
         ],
     )
-    def test__w_conv_broadcast(self, input_spec, mocker):
+    def test__w_conv_broadcast(self, mocker, request, input_spec):
         model = SubTensorConvModule()
         graph_verifier = DetailedGraphVerifier(
             mocker,
@@ -224,12 +203,16 @@ def test__w_conv_broadcast(self, input_spec, mocker):
             expected_non_delegated_ops={},
         )
         dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+        comparator = AllCloseOutputComparator(atol=1)
 
         lower_run_compare(
             model,
             input_spec,
             graph_verifier,
+            request,
             dataset_creator,
+            comparator,
+            remove_quant_io_ops=True,
         )
 
     @pytest.mark.parametrize(
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py
index 6336308e40b..51b7ee484a7 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py
@@ -36,6 +36,7 @@ def assert_delegated(
         model,
         input_shape,
         mocker,
+        request,
         use_qat=False,
         expected_delegated_ops=None,
     ):
@@ -55,6 +56,7 @@ def assert_delegated(
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset_creator,
             use_qat=use_qat,
         )
@@ -63,10 +65,10 @@ def assert_delegated(
     def inplace(self, request):
         return request.param
 
-    def test__qat__inplace(self, mocker, use_qat, inplace):
+    def test__qat__inplace(self, mocker, request, use_qat, inplace):
         shape = (23,)
         model = TanhModule(inplace)
-        self.assert_delegated(model, shape, mocker, use_qat=use_qat)
+        self.assert_delegated(model, shape, mocker, request, use_qat=use_qat)
 
     @pytest.mark.parametrize(
         "shape",
@@ -79,16 +81,20 @@ def test__qat__inplace(self, mocker, use_qat, inplace):
         ],
         ids=lambda shape: f"{len(shape)}D",
     )
-    def test__shapes(self, mocker, shape):
+    def test__shapes(self, mocker, request, shape):
         model = TanhModule()
-        self.assert_delegated(model, shape, mocker)
+        self.assert_delegated(model, shape, mocker, request)
 
-    def test__with_convolution(self, mocker):
+    def test__with_convolution(self, mocker, request):
         input_shape = (1, 3, 12, 16)
         channels = input_shape[1]
         model = Conv2dWithActivation(
             activation=torch.tanh, in_channels=channels, out_channels=channels
         )
         self.assert_delegated(
-            model, input_shape, mocker, expected_delegated_ops={Tanh: 1, Convolution: 1}
+            model,
+            input_shape,
+            mocker,
+            request,
+            expected_delegated_ops={Tanh: 1, Convolution: 1},
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py b/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py
index c4a698f4bfb..f9b2269751f 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_upsample_bilinear2d.py
@@ -59,6 +59,7 @@ def assert_delegated(
         model,
         input_shape,
         mocker,
+        request,
         use_qat=False,
         atol=None,
         expected_delegated_ops=None,
@@ -82,6 +83,7 @@ def assert_delegated(
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset_creator,
             output_comparator,
             use_qat=use_qat,
@@ -96,21 +98,25 @@ def assert_not_delegated(self, model, input_shape):
         )
         assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleBilinear2D])
 
-    def test__qat__align_corners(self, mocker, use_qat):
+    def test__qat__align_corners(self, mocker, request, use_qat):
         align_corners = True
         input_shape = (1, 2, 3, 4)
         output_size = (5, 7)
         model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
         atol = 0.015  # ~= output scale -> single bit error.
-        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol)
+        self.assert_delegated(
+            model, input_shape, mocker, request, use_qat=use_qat, atol=atol
+        )
 
-    def test__qat__not_align_corners(self, mocker, use_qat):
+    def test__qat__not_align_corners(self, mocker, request, use_qat):
         align_corners = False
         input_shape = (1, 2, 3, 4)
         output_size = (6, 8)
         model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
         atol = 0.015  # ~= output scale -> single bit error.
-        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol)
+        self.assert_delegated(
+            model, input_shape, mocker, request, use_qat=use_qat, atol=atol
+        )
 
     @pytest.mark.parametrize(
         "input_shape, output_size",
@@ -125,11 +131,13 @@ def test__qat__not_align_corners(self, mocker, use_qat):
             pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"),
         ],
     )
-    def test__not_align_corners__output_size(self, mocker, input_shape, output_size):
+    def test__not_align_corners__output_size(
+        self, mocker, request, input_shape, output_size
+    ):
         align_corners = False
         model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
         atol = 0.016  # ~= output scale -> single bit error.
-        self.assert_delegated(model, input_shape, mocker, atol=atol)
+        self.assert_delegated(model, input_shape, mocker, request, atol=atol)
 
     def test__not_align_corners__output_size__unsupported(self):
         align_corners = False
@@ -151,11 +159,11 @@ def test__not_align_corners__output_size__unsupported(self):
             pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"),
         ],
     )
-    def test__not_align_corners__scales(self, mocker, input_shape, scale):
+    def test__not_align_corners__scales(self, mocker, request, input_shape, scale):
         align_corners = False
         model = UpsampleBilinearModule(scale=scale, align_corners=align_corners)
         atol = 0.016  # ~= output scale -> single bit error.
-        self.assert_delegated(model, input_shape, mocker, atol=atol)
+        self.assert_delegated(model, input_shape, mocker, request, atol=atol)
 
     def test__not_align_corners__scales__unsupported(self):
         align_corners = False
@@ -183,11 +191,13 @@ def test__not_align_corners__scales__unsupported(self):
             ),
         ],
     )
-    def test__align_corners__output_size(self, mocker, input_shape, output_size):
+    def test__align_corners__output_size(
+        self, mocker, request, input_shape, output_size
+    ):
         align_corners = True
         model = UpsampleBilinearModule(size=output_size, align_corners=align_corners)
         atol = 0.016  # ~= output scale -> single bit error.
-        self.assert_delegated(model, input_shape, mocker, atol=atol)
+        self.assert_delegated(model, input_shape, mocker, request, atol=atol)
 
     def test__align_corners__output_size__unsupported(self):
         align_corners = True
@@ -240,11 +250,11 @@ def test__align_corners__output_size__input_size_equal_to_one(self):
             ),
         ],
     )
-    def test__align_corners__scales(self, mocker, input_shape, scale):
+    def test__align_corners__scales(self, mocker, request, input_shape, scale):
         align_corners = True
         model = UpsampleBilinearModule(scale=scale, align_corners=align_corners)
         atol = 0.016  # ~= output scale -> single bit error.
-        self.assert_delegated(model, input_shape, mocker, atol=atol)
+        self.assert_delegated(model, input_shape, mocker, request, atol=atol)
 
     def test__align_corners__scales__unsupported(self):
         align_corners = True
@@ -259,7 +269,7 @@ def test__noop__alone_in_partition__not_delegated(self):
         model = UpsampleBilinearModule(scale=scale)
         self.assert_not_delegated(model, input_shape)
 
-    def test__noop__not_alone_in_partition__delegated(self, mocker):
+    def test__noop__not_alone_in_partition__delegated(self, mocker, request):
         input_shape = (1, 2, 3, 4)
         scale = 1
         model = UpsampleBilinearAddModule(scale=scale)
@@ -267,5 +277,6 @@ def test__noop__not_alone_in_partition__delegated(self, mocker):
             model,
             input_shape,
             mocker,
+            request,
             expected_delegated_ops={UpsampleBilinear2D: 1, AddTensor: 1},
         )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py b/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py
index 438a580f6e8..b3e28a7b2f8 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_upsample_nearest2d.py
@@ -53,6 +53,7 @@ def assert_delegated(
         model,
         input_shape,
         mocker,
+        request,
         use_qat=False,
         expected_delegated_ops=None,
     ):
@@ -72,6 +73,7 @@ def assert_delegated(
             model,
             input_shape,
             graph_verifier,
+            request,
             dataset_creator,
             use_qat=use_qat,
         )
@@ -85,11 +87,11 @@ def assert_not_delegated(self, model, input_shape):
         )
         assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleNearest2D])
 
-    def test__qat(self, mocker, use_qat):
+    def test__qat(self, mocker, request, use_qat):
         input_shape = (1, 2, 3, 4)
         output_size = (6, 8)
         model = UpsampleNearestModule(size=output_size)
-        self.assert_delegated(model, input_shape, mocker, use_qat=use_qat)
+        self.assert_delegated(model, input_shape, mocker, request, use_qat=use_qat)
 
     @pytest.mark.parametrize(
         "input_shape, output_size",
@@ -105,9 +107,9 @@ def test__qat(self, mocker, use_qat):
             pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"),
         ],
     )
-    def test__output_size(self, mocker, input_shape, output_size):
+    def test__output_size(self, mocker, request, input_shape, output_size):
         model = UpsampleNearestModule(size=output_size)
-        self.assert_delegated(model, input_shape, mocker)
+        self.assert_delegated(model, input_shape, mocker, request)
 
     def test__output_size__unsupported(self):
         input_shape = (1, 2, 3, 4)
@@ -131,9 +133,9 @@ def test__output_size__unsupported(self):
             pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"),
         ],
     )
-    def test__scales(self, mocker, input_shape, scale):
+    def test__scales(self, mocker, request, input_shape, scale):
         model = UpsampleNearestModule(scale=scale)
-        self.assert_delegated(model, input_shape, mocker)
+        self.assert_delegated(model, input_shape, mocker, request)
 
     def test__scales__unsupported(self):
         input_shape = (1, 2, 3, 4)
@@ -147,7 +149,7 @@ def test__noop__alone_in_partition__not_delegated(self):
         model = UpsampleNearestModule(scale=scale)
         self.assert_not_delegated(model, input_shape)
 
-    def test__noop__not_alone_in_partition__delegated(self, mocker):
+    def test__noop__not_alone_in_partition__delegated(self, mocker, request):
         input_shape = (1, 2, 3, 4)
         scale = 1
         model = UpsampleNearestAddModule(scale=scale)
@@ -155,5 +157,6 @@ def test__noop__not_alone_in_partition__delegated(self, mocker):
             model,
             input_shape,
             mocker,
+            request,
             expected_delegated_ops={UpsampleNearest2D: 1, AddTensor: 1},
         )
diff --git a/backends/nxp/tests/model_output_comparator.py b/backends/nxp/tests/model_output_comparator.py
index f0dd7cd2d60..5563703ae20 100644
--- a/backends/nxp/tests/model_output_comparator.py
+++ b/backends/nxp/tests/model_output_comparator.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import abc
+import logging
 import os
 from abc import abstractmethod
 from pathlib import Path
@@ -15,6 +16,7 @@
 from executorch.backends.nxp.backend.ir.converter.conversion.translator import (
     torch_type_to_numpy_type,
 )
+from executorch.backends.nxp.tests.utils import archive_test_dir, store_txt_input_tensor
 
 
 class BaseOutputComparator(abc.ABC):
@@ -35,6 +37,11 @@ def compare_results(self, cpu_results_dir, npu_results_dir, output_tensor_spec):
         :param npu_results_dir: Path to directory with NPU (delegated) results.
         :param output_tensor_spec: List of output tensor specifications.
         """
+        if logging.root.isEnabledFor(logging.DEBUG):
+            diff_cpu_npu_results_dir = os.path.join(
+                os.path.dirname(cpu_results_dir), "diff_cpu_npu_results"
+            )
+
         sample_dirs = [
             os.path.join(cpu_results_dir, file) for file in os.listdir(cpu_results_dir)
         ]
@@ -65,7 +72,28 @@ def compare_results(self, cpu_results_dir, npu_results_dir, output_tensor_spec):
                 )
                 npu_output_tensors.append((output_tensor_name, npu_tensor))
 
-            self.compare_sample(sample_dir, cpu_output_tensors, npu_output_tensors)
+                if logging.root.isEnabledFor(logging.DEBUG):
+                    # Store diff results if logging level is enabled
+                    diff_cpu_npu_tensor = np.abs(cpu_tensor - npu_tensor)
+                    os.makedirs(
+                        os.path.join(diff_cpu_npu_results_dir, sample_dir),
+                        exist_ok=True,
+                    )
+                    diff_cpu_npu_tensor_path = os.path.join(
+                        diff_cpu_npu_results_dir, sample_dir, output_tensor_name
+                    )
+                    diff_cpu_npu_tensor.tofile(diff_cpu_npu_tensor_path)
+
+                    # Store text tensor results
+                    store_txt_input_tensor(cpu_tensor_path, tensor_spec)
+                    store_txt_input_tensor(npu_tensor_path, tensor_spec)
+                    store_txt_input_tensor(diff_cpu_npu_tensor_path, tensor_spec)
+
+        # We need to archive the test_dir before comparison, as comparison can cause AssertionError exception
+        test_dir = os.path.dirname(cpu_results_dir)
+        if logging.root.isEnabledFor(logging.DEBUG):
+            archive_test_dir(test_dir)
+        self.compare_sample(sample_dir, cpu_output_tensors, npu_output_tensors)
 
     @abstractmethod
     def compare_sample(
diff --git a/backends/nxp/tests/nsys_testing.py b/backends/nxp/tests/nsys_testing.py
index 7631ee20ca1..d5ff3680f38 100644
--- a/backends/nxp/tests/nsys_testing.py
+++ b/backends/nxp/tests/nsys_testing.py
@@ -3,19 +3,22 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import datetime
 import functools
-import inspect
 import logging
 import os.path
+import re
 import shutil
 import subprocess
 from copy import deepcopy
 from enum import Enum
+from importlib.metadata import version
 from os import environ, mkdir
 from typing import Callable, Iterable
 
 import numpy as np
 import torch
+import yaml
 from executorch.backends.nxp.backend.edge_helper import is_channels_last_dim_order
 from executorch.backends.nxp.backend.ir.converter.conversion import translator
 from executorch.backends.nxp.backend.ir.converter.conversion.translator import (
@@ -40,10 +43,11 @@
     AllCloseOutputComparator,
 )
 from executorch.backends.nxp.tests.outputs_dir_importer import outputs_dir
-from executorch.backends.nxp.tests.utils import save_pte_program
+from executorch.backends.nxp.tests.utils import save_pte_program, store_txt_input_tensor
 from executorch.devtools.visualization.visualization_utils import (
     visualize_with_clusters,
 )
+from pytest import FixtureRequest
 from pytest_mock import MockerFixture
 from torch.export import ExportedProgram
 from torch.fx import GraphModule
@@ -55,6 +59,7 @@
 NSYS_CONFIG_PATH = test_config.NSYS_CONFIG_PATH
 NSYS_FIRMWARE_PATH = test_config.NSYS_FIRMWARE_PATH
 NEUTRON_TEST_PATH = test_config.NEUTRON_TEST_PATH
+PROJECT_DIR = test_config.PROJECT_DIR
 
 
 class ReferenceModel(Enum):
@@ -119,6 +124,7 @@ def wrapper(*args, **kwargs):
         delegated_program = to_quantized_executorch_program(
             model,
             input_spec,
+            intermediates_dir=test_dir,
             dataset_dir=calibration_dataset_dir,
             delegate_to_npu=True,
             use_qat=use_qat,
@@ -126,6 +132,7 @@ def wrapper(*args, **kwargs):
             operators_not_to_delegate=operators_not_to_delegate,
             remove_quant_io_ops=remove_quant_io_ops,
         )
+
     except RuntimeError as e:
         if "Model converted with neutron-converter has" in str(e) and hasattr(
             dlg_model_verifier, "check_num_delegated_nodes"
@@ -391,6 +398,7 @@ def lower_run_compare(
     model: torch.nn.Module,
     input_spec: Iterable[ModelInputSpec] | tuple[int, ...],
     dlg_model_verifier: GraphVerifier,
+    request: FixtureRequest,
     dataset_creator=None,
     output_comparator=None,
     mocker: MockerFixture = None,
@@ -408,11 +416,12 @@ def lower_run_compare(
     :param model: Executed PyTorch model.
     :param input_spec: Model input specification. Can be either tuple of ints - single float32 input model - or Iterable
         of ModelInputSpec.
+    :param dlg_model_verifier: Graph verifier instance.
+    :param request: PyTest request needed for correct test name extraction.
     :param dataset_creator: Creator that should fill provided `dataset_dir` with model input samples.
     :param output_comparator: Comparator of results produced by NPU and CPU runs of the program.
-    :param dlg_model_verifier: Graph verifier instance.
-    :param reference_model: Version of the model which will be run to obtain reference output data.
     :param mocker: Mocker instance used by visualizer.
+    :param reference_model: Version of the model which will be run to obtain reference output data.
     :param use_qat: If True, applies quantization-aware training before conversion (without the QAT training).
     :param train_fn: Train/finetune function for QAT training. Is used only when `use_qat=True`.
     :param operators_not_to_delegate: list of operators not to delegate.
@@ -430,7 +439,7 @@ def lower_run_compare(
     model_to_delegate = model
     model_to_not_delegate = deepcopy(model)
 
-    test_name = _get_caller_name()
+    test_name = get_test_name(request)
     test_dir = os.path.join(OUTPUTS_DIR, test_name)
 
     shutil.rmtree(test_dir, ignore_errors=True)
@@ -538,6 +547,11 @@ def lower_run_compare(
 
     output_tensor_spec = _get_program_output_spec(delegated_program)
 
+    if logging.root.isEnabledFor(logging.DEBUG):
+        _generate_txt_test_data(
+            calibration_dataset_dir, testing_dataset_dir, list(input_spec)
+        )
+        dump_debug_test_summary(test_name, test_dir)
     npu_results_dir = os.path.join(test_dir, "results_npu")
     cpu_results_dir = os.path.join(test_dir, "results_cpu")
     output_comparator.compare_results(
@@ -549,10 +563,12 @@ def lower_run_compare_ptq_qat(
     model: torch.nn.Module,
     input_spec: list[ModelInputSpec] | tuple,
     dlg_model_verifier: GraphVerifier,
+    request: FixtureRequest,
     train_fn: Callable[[torch.fx.GraphModule], None],
     dataset_creator=None,
     output_comparator=None,
     mocker: MockerFixture = None,
+    operators_not_to_delegate: list[str] = None,
 ):
     """
     Run provided program twice and compare it's results.
@@ -562,10 +578,12 @@ def lower_run_compare_ptq_qat(
     :param input_spec: Model input specification. Can be either tuple - single float32 input model - or list
         of ModelInputSpec.
     :param dlg_model_verifier: Graph verifier instance.
+    :param request: PyTest request needed for correct test name extraction.
     :param train_fn: Train/finetune function for QAT training.
     :param dataset_creator: Creator that should fill provided `dataset_dir` with model input samples.
     :param output_comparator: Comparator of results produced by NPU and CPU runs of the program.
     :param mocker: Mocker instance used by visualizer.
+    :param operators_not_to_delegate: list of operators not to delegate.
     """
     assert_NSYS()
 
@@ -577,7 +595,7 @@ def lower_run_compare_ptq_qat(
     model_ptq = model
     model_qat = deepcopy(model)
 
-    test_name = _get_caller_name()
+    test_name = get_test_name(request)
     test_dir = os.path.join(OUTPUTS_DIR, test_name)
 
     shutil.rmtree(test_dir, ignore_errors=True)
@@ -606,6 +624,7 @@ def lower_run_compare_ptq_qat(
         ptq_results_dir,
         mocker,
         use_qat=False,
+        operators_not_to_delegate=operators_not_to_delegate,
     )
 
     _ = _run_delegated_executorch_program(
@@ -620,10 +639,14 @@ def lower_run_compare_ptq_qat(
         mocker,
         use_qat=True,
         train_fn=train_fn,
+        operators_not_to_delegate=operators_not_to_delegate,
     )
 
     output_tensor_spec = _get_program_output_spec(delegated_program_ptq)
 
+    if logging.root.isEnabledFor(logging.DEBUG):
+        dump_debug_test_summary(test_name, test_dir)
+        shutil.make_archive(test_dir, "zip", test_dir)
     ptq_results_dir = os.path.join(test_dir, "results_ptq")
     qat_results_dir = os.path.join(test_dir, "results_qat")
     output_comparator.compare_results(
@@ -657,13 +680,13 @@ def _parse_input_quant_params(
     return q_params
 
 
-def _get_caller_name():
-    test_function_names = ["lower_run_compare", "lower_run_compare_ptq_qat"]
-    for idx, frame in enumerate(inspect.stack()):
-        if frame.function in test_function_names:
-            # Look one index above to get caller
-            return inspect.stack()[idx + 1].function
-    return None
+def get_test_name(request):
+    """Turn the pytest node id of `request` into a filesystem-safe directory name."""
+    # The node id identifies the test including its test class and params.
+    node_id = request.node.nodeid.lstrip(":")
+    # Replace characters that are not acceptable in a directory name by "_".
+    sanitized = re.sub(r'[<>:"/\\|?* ,()`]', "_", node_id)
+    return sanitized.strip(" .")
 
 
 def execute_cmd(cmd, cwd="."):
@@ -725,3 +748,60 @@ def _get_program_output_spec(exported_program) -> list[torch.Tensor]:
     output_tensors_spec = list(exported_program.graph.output_node().meta["val"])
 
     return output_tensors_spec
+
+
+def get_executorch_git_info() -> dict[str, str]:
+    """Query git (current branch, short HEAD hash) in PROJECT_DIR via execute_cmd."""
+    git = f"git -C {PROJECT_DIR}"
+    branch, _, _ = execute_cmd(f"{git} branch --show-current")
+    commit, _, _ = execute_cmd(f"{git} rev-parse --short HEAD")
+    return {"git_branch": branch, "git_commit": commit}
+
+
+def dump_debug_test_summary(test_name: str, test_dir: str):
+    """Write `summary.yaml` into `test_dir`: test name, timestamp, git and package info."""
+    git_info = get_executorch_git_info()
+    summary = dict(
+        test_name=test_name,
+        date_time=datetime.datetime.now().isoformat(),
+        git_branch=git_info["git_branch"],
+        git_commit=git_info["git_commit"],
+        eiq_neutron_sdk_version=version("eiq_neutron_sdk"),
+        eiq_nsys_version=version("eiq_nsys"),
+    )
+    with open(os.path.join(test_dir, "summary.yaml"), "w") as summary_file:
+        yaml.dump(summary, summary_file)
+
+
+def _generate_txt_test_data(
+    calibration_dataset_dir: str,
+    testing_dataset_dir: str,
+    input_tensor_spec: list[ModelInputSpec],
+):
+    """Write a `.txt` variant next to every input tensor file of the datasets."""
+    # The testing dataset can point to the calibration dataset; visit it only once.
+    dataset_paths = (
+        [calibration_dataset_dir, testing_dataset_dir]
+        if calibration_dataset_dir != testing_dataset_dir
+        else [testing_dataset_dir]
+    )
+    for d_path in dataset_paths:
+        quant_dataset = d_path.endswith("dataset_quant")
+        # Multiple-input datasets keep one directory per sample; single-input
+        # datasets keep the tensor files directly in `d_path`.
+        entries = [os.path.join(d_path, name) for name in sorted(os.listdir(d_path))]
+        sample_dirs = [entry for entry in entries if os.path.isdir(entry)]
+        if len(sample_dirs) == 0:
+            for input_tensor_path in entries:
+                store_txt_input_tensor(
+                    input_tensor_path, input_tensor_spec[0], quant_dataset
+                )
+            continue
+        for sample_dir in sample_dirs:
+            # os.listdir() returns names in arbitrary order, so sort them to keep the
+            # positional file -> spec pairing stable.
+            # NOTE(review): assumes sorted file names follow the input order - confirm.
+            for idx, input_tensor_name in enumerate(sorted(os.listdir(sample_dir))):
+                input_tensor_path = os.path.join(sample_dir, input_tensor_name)
+                tensor_spec = input_tensor_spec[idx]
+                store_txt_input_tensor(input_tensor_path, tensor_spec, quant_dataset)
diff --git a/backends/nxp/tests/ops_aliases.py b/backends/nxp/tests/ops_aliases.py
index 46002ba8883..da50d4dc0d9 100644
--- a/backends/nxp/tests/ops_aliases.py
+++ b/backends/nxp/tests/ops_aliases.py
@@ -26,11 +26,13 @@
 DequantizePerChannel = exir_ops.edge.quantized_decomposed.dequantize_per_channel.default
 DequantizePerTensor = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
 ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate
+Exp = exir_ops.edge.aten.exp.default
 GetItem = operator.getitem
 HardTanh = exir_ops.edge.aten.hardtanh.default
 HardTanh_ = exir_ops.edge.aten.hardtanh_.default
 LeakyRelu = exir_ops.edge.aten.leaky_relu.default
 Log = exir_ops.edge.aten.log.default
+MaxPool2D = exir_ops.edge.aten.max_pool2d.default
 MaxPool2DWithIndices = exir_ops.edge.aten.max_pool2d_with_indices.default
 MeanDim = exir_ops.edge.aten.mean.dim
 MulTensor = exir_ops.edge.aten.mul.Tensor
diff --git a/backends/nxp/tests/utils.py b/backends/nxp/tests/utils.py
index c210d9db8bc..00b7c364a31 100644
--- a/backends/nxp/tests/utils.py
+++ b/backends/nxp/tests/utils.py
@@ -7,11 +7,19 @@
 
 import logging
 import os
+import shutil
 
+import numpy as np
+
+from executorch.backends.nxp.backend.ir.converter.conversion.translator import (
+    torch_type_to_numpy_type,
+)
+from executorch.backends.nxp.tests.executorch_pipeline import ModelInputSpec
 from executorch.devtools.visualization.visualization_utils import (
     visualize_with_clusters,
 )
 from executorch.exir import ExecutorchProgramManager
+from torch._subclasses import FakeTensor
 
 
 def save_pte_program(
@@ -32,3 +40,27 @@ def save_pte_program(
 
     visualize_with_clusters(prog.exported_program(), visualize_file_name, False)
     return filename
+
+
+def change_filepath_extension(path: str, extension: str) -> str:
+    # Swap (or append) the extension of `path`; `extension` is passed without the dot.
+    return ".".join((os.path.splitext(path)[0], extension))
+
+
+def store_txt_input_tensor(
+    input_tensor_path: str,
+    tensor_spec: ModelInputSpec | FakeTensor,
+    quant_dataset: bool = False,
+):
+    """Write a text dump of the raw tensor file next to it, with a `.txt` extension."""
+    np_dtype = np.int8 if quant_dataset else torch_type_to_numpy_type(tensor_spec.dtype)
+    flat_tensor = np.fromfile(input_tensor_path, dtype=np_dtype)
+    txt_path = change_filepath_extension(input_tensor_path, "txt")
+    with open(txt_path, "w") as txt_file:
+        txt_file.write(f"Flattened tensor shape:{flat_tensor.shape}")
+        txt_file.write(f"\nOriginal tensor shape:{list(tensor_spec.shape)}\n")
+        txt_file.write(np.array2string(flat_tensor, threshold=np.iinfo(np.int32).max))
+
+
+def archive_test_dir(test_dir: str):  # Zips the contents of `test_dir` into `<test_dir>.zip`.
+    shutil.make_archive(base_name=test_dir, format="zip", root_dir=test_dir)
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
index ca853de6f86..3336a394510 100644
--- a/backends/qualcomm/_passes/__init__.py
+++ b/backends/qualcomm/_passes/__init__.py
@@ -14,6 +14,7 @@
 from .convert_mha_to_sha import ConvertMhaToSha
 from .convert_square_to_pow import ConvertSquareToPow
 from .decompose_acos import DecomposeAcos
+from .decompose_addmm import DecomposeAddmm
 from .decompose_any import DecomposeAny
 from .decompose_atan2 import DecomposeAtan2
 from .decompose_binary_alpha import DecomposeBinaryAlpha
@@ -26,6 +27,7 @@
 from .decompose_floor_divide import DecomposeFloorDivide
 from .decompose_glu import DecomposeGlu
 from .decompose_hardsigmoid import DecomposeHardsigmoid
+from .decompose_hyperbolic_variants import DecomposeHyperbolicVariants
 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm
 from .decompose_log_variants import DecomposeLogVariants
 from .decompose_maxpool3d import DecomposeMaxPool3d
@@ -76,6 +78,7 @@
     ConvertMhaToSha,
     ConvertSquareToPow,
     DecomposeAcos,
+    DecomposeAddmm,
     DecomposeAny,
     DecomposeAtan2,
     DecomposeBinaryAlpha,
@@ -87,6 +90,7 @@
     DecomposeFill,
     DecomposeFloorDivide,
     DecomposeGlu,
+    DecomposeHyperbolicVariants,
     DecomposeHardsigmoid,
     DecomposeLinalgVectorNorm,
     DecomposeLogVariants,
diff --git a/backends/qualcomm/_passes/decompose_acos.py b/backends/qualcomm/_passes/decompose_acos.py
index f83b18f11fc..d546cf6d92d 100644
--- a/backends/qualcomm/_passes/decompose_acos.py
+++ b/backends/qualcomm/_passes/decompose_acos.py
@@ -9,7 +9,7 @@
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from executorch.exir.pass_base import ExportPass, PassResult
 
-from .utils import copy_meta, get_const_node
+from .utils import copy_meta, create_const_node
 
 
 class DecomposeAcos(ExportPass):
@@ -52,7 +52,7 @@ def call(self, graph_module: torch.fx.GraphModule):
             )
 
             if is_edge and pi_half_node is None:
-                pi_half_node = get_const_node(
+                pi_half_node = create_const_node(
                     graph, graph_module, "_pi_half_constant", pi_half, node
                 )
 
diff --git a/backends/qualcomm/_passes/decompose_atan2.py b/backends/qualcomm/_passes/decompose_atan2.py
index 0f54e555e03..a411f997b61 100644
--- a/backends/qualcomm/_passes/decompose_atan2.py
+++ b/backends/qualcomm/_passes/decompose_atan2.py
@@ -9,7 +9,7 @@
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from executorch.exir.pass_base import ExportPass, PassResult
 
-from .utils import copy_meta, create_node, get_const_node
+from .utils import copy_meta, create_const_node, create_node
 
 
 class DecomposeAtan2(ExportPass):
@@ -68,7 +68,7 @@ def _get_constants(self, graph, graph_module, node, is_edge, const_cache):
 
             def make_const(name, val):
                 if name not in const_cache:
-                    const_cache[name] = get_const_node(
+                    const_cache[name] = create_const_node(
                         graph, graph_module, name, val, node
                     )
                 return const_cache[name]
diff --git a/backends/qualcomm/_passes/decompose_log_variants.py b/backends/qualcomm/_passes/decompose_log_variants.py
index 2b394806b68..904900dd205 100644
--- a/backends/qualcomm/_passes/decompose_log_variants.py
+++ b/backends/qualcomm/_passes/decompose_log_variants.py
@@ -11,7 +11,7 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 
-from .utils import copy_meta, get_const_node
+from .utils import copy_meta, create_const_node
 
 
 class DecomposeLogVariants(ExportPass):
@@ -50,7 +50,7 @@ def _decompose_log_n(self, node, graph, graph_module, const_cache, n):
             div_op = exir_ops.edge.aten.div.Tensor
             attr_name = f"_log_base_{n}_constant"
             if attr_name not in const_cache:
-                const_cache[attr_name] = get_const_node(
+                const_cache[attr_name] = create_const_node(
                     graph, graph_module, attr_name, math.log(n), node
                 )
             div_arg = const_cache[attr_name]
@@ -81,7 +81,7 @@ def _decompose_log_p(self, node, graph, graph_module, const_cache, p):
             log_op = exir_ops.edge.aten.log.default
             attr_name = f"_log1p_addend_{p}_constant"
             if attr_name not in const_cache:
-                const_cache[attr_name] = get_const_node(
+                const_cache[attr_name] = create_const_node(
                     graph, graph_module, attr_name, p, node
                 )
             add_arg = const_cache[attr_name]
diff --git a/backends/qualcomm/_passes/decompose_remainder.py b/backends/qualcomm/_passes/decompose_remainder.py
index 4e5ea739856..a6c260d217b 100644
--- a/backends/qualcomm/_passes/decompose_remainder.py
+++ b/backends/qualcomm/_passes/decompose_remainder.py
@@ -10,7 +10,7 @@
 from executorch.exir.pass_base import ExportPass, PassResult
 from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix
 
-from .utils import copy_meta, get_const_node
+from .utils import copy_meta, create_const_node
 
 
 class DecomposeRemainder(ExportPass):
@@ -69,7 +69,7 @@ def call(self, graph_module: torch.fx.GraphModule):
                         attr_name = get_new_attr_name_with_prefix("_remainder_const_")(
                             graph_module
                         )
-                        const_cache[x_arg] = get_const_node(
+                        const_cache[x_arg] = create_const_node(
                             graph, graph_module, attr_name, x_arg, node
                         )
                     x_node = const_cache[x_arg]
@@ -82,7 +82,7 @@ def call(self, graph_module: torch.fx.GraphModule):
                         attr_name = get_new_attr_name_with_prefix("_remainder_const_")(
                             graph_module
                         )
-                        const_cache[y_arg] = get_const_node(
+                        const_cache[y_arg] = create_const_node(
                             graph, graph_module, attr_name, y_arg, node
                         )
                     y_node = const_cache[y_arg]
diff --git a/backends/qualcomm/_passes/decompose_var.py b/backends/qualcomm/_passes/decompose_var.py
index 923fae4977f..c89929fa50e 100644
--- a/backends/qualcomm/_passes/decompose_var.py
+++ b/backends/qualcomm/_passes/decompose_var.py
@@ -10,7 +10,7 @@
 from executorch.exir.pass_base import ExportPass, PassResult
 from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix
 
-from .utils import copy_meta, get_const_node
+from .utils import copy_meta, create_const_node
 
 
 class DecomposeVar(ExportPass):
@@ -155,7 +155,7 @@ def call(self, graph_module: torch.fx.GraphModule):
                                 attr_name = get_new_attr_name_with_prefix(
                                     "_var_scale_const_"
                                 )(graph_module)
-                                const_cache[cache_key] = get_const_node(
+                                const_cache[cache_key] = create_const_node(
                                     graph, graph_module, attr_name, scale, node
                                 )
                             scale_node = const_cache[cache_key]
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
index e3e4b8c8e51..7efb4a293e1 100644
--- a/backends/qualcomm/_passes/qnn_pass_manager.py
+++ b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -20,6 +20,7 @@
     ConvertMhaToSha,
     ConvertSquareToPow,
     DecomposeAcos,
+    DecomposeAddmm,
     DecomposeAny,
     DecomposeAtan2,
     DecomposeBinaryAlpha,
@@ -31,6 +32,7 @@
     DecomposeFill,
     DecomposeFloorDivide,
     DecomposeGlu,
+    DecomposeHyperbolicVariants,
     DecomposeLinalgVectorNorm,
     DecomposeLogVariants,
     DecomposeMaxPool3d,
@@ -122,12 +124,14 @@ def get_default_pass_activations(cls):
             (AnnotateUnbind, True),
             (ConvertBmmToMatmul, False),
             (DecomposeAcos, True),
+            (DecomposeAddmm, True),
             (DecomposeAny, True),
             (DecomposeAtan2, True),
             (DecomposeColIm, True),
             (DecomposeCDist, True),
             (DecomposeDivMode, True),
             (DecomposeFill, True),
+            (DecomposeHyperbolicVariants, True),
             (DecomposeLogVariants, True),
             (DecomposeMaxPool3d, True),
             (DecomposeMinMaxDim, True),
@@ -160,6 +164,7 @@ def get_annotation_passes(cls):
             RecomposeRmsNorm,
             ReplaceArangeArgs,
             DecomposeAcos,
+            DecomposeAddmm,
             DecomposeAtan2,
             DecomposeBinaryAlpha,
             DecomposeCDist,
@@ -179,6 +184,7 @@ def get_annotation_passes(cls):
             DecomposeExpM1,
             DecomposeFill,
             DecomposeGlu,
+            DecomposeHyperbolicVariants,
             DecomposeRemainder,
             DecomposeSelectScatter,
             DecomposeLinalgVectorNorm,
@@ -275,12 +281,14 @@ def get_passes_dependency_for_capture_program(cls):
             AnnotateUnbind: [RemoveRedundancy],
             ConvertBmmToMatmul: [RecomposePixelUnshuffle],
             DecomposeAcos: [RemoveRedundancy],
+            DecomposeAddmm: [RemoveRedundancy],
             DecomposeAny: [RemoveRedundancy],
             DecomposeAtan2: [RemoveRedundancy],
             DecomposeColIm: [FoldQDQ],
             DecomposeCDist: [RemoveRedundancy],
             DecomposeDivMode: [RemoveRedundancy],
             DecomposeFill: [RemoveRedundancy],
+            DecomposeHyperbolicVariants: [RemoveRedundancy],
             DecomposeLinalgVectorNorm: [RemoveRedundancy],
             DecomposeLogVariants: [RemoveRedundancy],
             DecomposeMaxPool3d: [RemoveRedundancy],
diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py
index 92a75703bbd..2a580ab11a4 100755
--- a/backends/qualcomm/_passes/utils.py
+++ b/backends/qualcomm/_passes/utils.py
@@ -343,7 +343,7 @@ def append_qdq(
     return dq_node
 
 
-def get_const_node(
+def create_const_node(
     graph: torch.fx.Graph,
     graph_module: torch.fx.GraphModule,
     attr_name: str,
diff --git a/backends/qualcomm/aot/wrappers/targets.bzl b/backends/qualcomm/aot/wrappers/targets.bzl
index 89f8efdea3e..1ea7e6679d5 100644
--- a/backends/qualcomm/aot/wrappers/targets.bzl
+++ b/backends/qualcomm/aot/wrappers/targets.bzl
@@ -1,6 +1,7 @@
 load(
     "@fbsource//tools/build_defs:default_platform_defs.bzl",
     "ANDROID",
+    "CXX",
 )
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbsource//xplat/executorch/backends/qualcomm/third-party:third_party_libs.bzl", "qnn_third_party_dep")
@@ -20,7 +21,7 @@ def define_common_targets():
             "*.h",
         ]),
         define_static_target = True,
-        platforms = [ANDROID],
+        platforms = [ANDROID, CXX],
         visibility = ["PUBLIC"],
         deps = [
             qnn_third_party_dep("api"),
diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md
index 7b7f4ef8139..b8d86b9d6da 100644
--- a/backends/qualcomm/builders/README.md
+++ b/backends/qualcomm/builders/README.md
@@ -498,11 +498,16 @@ The following PyTorch operators are supported through decomposition or annotatio
 | PyTorch Op | Decomposition Pass |
 |---|---|
 | `aten.acos` | `DecomposeAcos` |
+| `aten.acosh` | `DecomposeHyperbolicVariants` |
+| `aten.addmm` | `DecomposeAddmm` |
 | `aten.adaptive_avg_pool1d`, `aten.avg_pool1d` | `AnnotateAvgPool1D` |
 | `aten.any` | `DecomposeAny` |
+| `aten.asinh` | `DecomposeHyperbolicVariants` |
 | `aten.atan2.default`, `aten.atan2.out` | `DecomposeAtan2` |
+| `aten.atanh` | `DecomposeHyperbolicVariants` |
 | `aten.add` (with alpha), `aten.sub` (with alpha) | `DecomposeBinaryAlpha` |
 | `aten.cdist`, `aten._cdist_forward` | `DecomposeCDist` |
+| `aten.cosh` | `DecomposeHyperbolicVariants` |
 | `aten.div.Tensor_mode` | `DecomposeDivMode` |
 | `aten.div.Scalar_mode` | `LiftConstantScalarOperands` → `DecomposeDivMode` |
 | `aten.im2col`, `aten.col2im` | `DecomposeColIm` |
@@ -522,6 +527,7 @@ The following PyTorch operators are supported through decomposition or annotatio
 | `aten.roll` | `DecomposeRoll` |
 | `aten.select_scatter` | `DecomposeSelectScatter` |
 | `aten.silu` | `DecomposeSilu` |
+| `aten.sinh` | `DecomposeHyperbolicVariants` |
 | `aten.tan` | `DecomposeTan` |
 | `aten.threshold` | `DecomposeThreshold` |
 | `aten.triu` | `DecomposeTriu` |
diff --git a/backends/qualcomm/debugger/README.md b/backends/qualcomm/debugger/README.md
index 8300920d1d5..09b4c1918df 100644
--- a/backends/qualcomm/debugger/README.md
+++ b/backends/qualcomm/debugger/README.md
@@ -156,6 +156,8 @@ After `build_executorch_binary()`, the debugger holds:
 
 Ensure `dump_intermediate_outputs` is enabled in your `QnnConfig` (or pass `--dump_intermediate_outputs` via CLI). Only run **one inference** for debugging — multiple executions are not supported.
 
+**Note:** Intermediate tensor dumping is not currently supported in direct mode on HTP/LPAI backends.
+
 ```python
 from executorch.examples.qualcomm.utils import SimpleADB
 
@@ -266,7 +268,7 @@ python -m examples.qualcomm.util_scripts.qnn_intermediate_debugger_demo -b build
 3. Does not support graphs with partitions (partial delegation).
 4. Does not support LLM models.
 5. Does not support graphs with multiple methods.
-
+6. Intermediate tensor dumping is not currently supported in direct mode on HTP/LPAI backends.
 
 ## ExecuTorch QNN HTP Heap Profiling
 
diff --git a/backends/qualcomm/export_utils.py b/backends/qualcomm/export_utils.py
index 28b7952ef33..bcba08ecc5a 100644
--- a/backends/qualcomm/export_utils.py
+++ b/backends/qualcomm/export_utils.py
@@ -276,6 +276,10 @@ def __init__(
         self.skip_push = qnn_config.skip_push
         self.backend_library_paths = {}
 
+        if self.direct_build_folder and self.dump_intermediate_outputs:
+            raise ValueError(
+                "Per-tensor dumping is currently not supported in direct mode."
+            )
         if self.direct_build_folder:
             direct_general_artifacts = [
                 f"{self.build_path}/examples/qualcomm/direct_executor_runner/libqnn_executorch_stub.so",
@@ -437,9 +441,8 @@ def execute(
                         f"--input_list_path {self.input_list_filename}",
                         f"--etdump_path {self.etdump_path}",
                         "--shared_buffer" if self.shared_buffer else "",
-                        f"--debug_output_path {self.debug_output_path}",
                         (
-                            "--dump_intermediate_outputs"
+                            f"--debug_output_path {self.debug_output_path} --dump_intermediate_outputs"
                             if self.dump_intermediate_outputs
                             else ""
                         ),
diff --git a/backends/qualcomm/quantizer/annotators/htp_rules.py b/backends/qualcomm/quantizer/annotators/htp_rules.py
index 0c5be07fcdc..ca8abb246bf 100644
--- a/backends/qualcomm/quantizer/annotators/htp_rules.py
+++ b/backends/qualcomm/quantizer/annotators/htp_rules.py
@@ -1077,7 +1077,11 @@ def annotate(node: Node, quantization_config: QuantizationConfig) -> None:
 
 
 @register_annotator(
-    [torch.ops.aten.bmm.default, torch.ops.aten.matmul.default],
+    [
+        torch.ops.aten.bmm.default,
+        torch.ops.aten.matmul.default,
+        torch.ops.aten.mm.default,
+    ],
     QnnConstants.OpMatMul.op_name,
 )
 class MatMul(GeneralOpDef):
diff --git a/backends/qualcomm/quantizer/annotators/lpai_rules.py b/backends/qualcomm/quantizer/annotators/lpai_rules.py
index 2623e4a6524..6e5b343c5c7 100644
--- a/backends/qualcomm/quantizer/annotators/lpai_rules.py
+++ b/backends/qualcomm/quantizer/annotators/lpai_rules.py
@@ -601,7 +601,11 @@ def annotate(node: Node, quantization_config: QuantizationConfig) -> None:
 
 
 @register_annotator(
-    [torch.ops.aten.bmm.default, torch.ops.aten.matmul.default],
+    [
+        torch.ops.aten.bmm.default,
+        torch.ops.aten.matmul.default,
+        torch.ops.aten.mm.default,
+    ],
     QnnConstants.OpMatMul.op_name,
 )
 class MatMul(GeneralOpDef):
diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl
index 335f4a5c4cb..5ad312020be 100644
--- a/backends/qualcomm/runtime/targets.bzl
+++ b/backends/qualcomm/runtime/targets.bzl
@@ -1,6 +1,7 @@
 load(
     "@fbsource//tools/build_defs:default_platform_defs.bzl",
     "ANDROID",
+    "CXX",
 )
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbsource//xplat/executorch/backends/qualcomm/third-party:third_party_libs.bzl", "qnn_third_party_dep")
@@ -21,7 +22,7 @@ def define_common_targets():
             "Logging.h",
         ],
         define_static_target = True,
-        platforms = [ANDROID],
+        platforms = [ANDROID, CXX],
         visibility = ["PUBLIC"],
         deps = [
             qnn_third_party_dep("api"),
@@ -91,7 +92,7 @@ def define_common_targets():
             ),
             define_static_target = True,
             link_whole = True,  # needed for executorch/examples/models/llama:main to register QnnBackend
-            platforms = [ANDROID],
+            platforms = [ANDROID, CXX],
             visibility = ["PUBLIC"],
             resources = ({
                 "qnn_lib": qnn_third_party_dep("qnn_offline_compile_libs"),
diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl
index a53e5823aff..c98a8bc83ac 100644
--- a/backends/qualcomm/targets.bzl
+++ b/backends/qualcomm/targets.bzl
@@ -1,6 +1,7 @@
 load(
     "@fbsource//tools/build_defs:default_platform_defs.bzl",
     "ANDROID",
+    "CXX",
 )
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbsource//xplat/executorch/backends/qualcomm/third-party:third_party_libs.bzl", "qnn_third_party_dep")
@@ -69,7 +70,7 @@ def define_common_targets():
         },
         exported_external_deps = ["flatbuffers-api"],
         define_static_target = True,
-        platforms = [ANDROID],
+        platforms = [ANDROID, CXX],
     )
 
     runtime.cxx_library(
@@ -87,5 +88,32 @@ def define_common_targets():
         exported_deps = [
             ":schema",
         ],
-        platforms = [ANDROID],
+        platforms = [ANDROID, CXX],
+    )
+
+    # Host-side AOT variant of qnn_executorch_backend. Pulls in the QNN
+    # offline-compile libraries as a Buck resource (via :runtime, which
+    # itself depends on qnn_third_party_dep("qnn_offline_compile_libs")),
+    # so a host-side gtest or runner can dlopen the QNN libraries
+    # without a manual path setup.
+    #
+    # Mirrors qnn_executorch_backend's structure but swaps the on-device
+    # runtime_android_build dep for the host runtime, which bundles the
+    # x86 simulator libraries as a Buck resource.
+    runtime.cxx_library(
+        name = "qnn_executorch_backend_aot",
+        srcs = [],
+        headers = [],
+        define_static_target = True,
+        visibility = ["PUBLIC"],
+        deps = [
+            qnn_third_party_dep("api"),
+            "//executorch/runtime/backend:interface",
+            "//executorch/runtime/core:core",
+            "//executorch/backends/qualcomm/runtime:runtime",
+        ],
+        exported_deps = [
+            ":schema",
+        ],
+        platforms = [ANDROID, CXX],
     )
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
index 9f043ea56a9..0201edb6dee 100644
--- a/backends/qualcomm/tests/models.py
+++ b/backends/qualcomm/tests/models.py
@@ -49,6 +49,14 @@ def forward(self, x):
         return torch.acos(x)
 
 
+class Acosh(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return torch.acosh(x)
+
+
 class AcosMultiNode(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -144,6 +152,16 @@ def forward(self, x):
         return 10 + x
 
 
+class AddMM(torch.nn.Module):
+    def __init__(self, alpha=1, beta=1):
+        super().__init__()
+        self.alpha = alpha
+        self.beta = beta
+
+    def forward(self, bias, input, mat2):
+        return torch.addmm(bias, input, mat2, alpha=self.alpha, beta=self.beta)
+
+
 class Any(torch.nn.Module):
     def __init__(self, dim=None, keepdim=False):
         super().__init__()
@@ -247,6 +265,14 @@ def forward(self, x, y):
         return squeeze_out, conv_out
 
 
+class Asinh(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return torch.asinh(x)
+
+
 class Asin(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -279,6 +305,14 @@ def forward(self, x1, y1, x2, y2):
         return torch.atan2(x1, y1), torch.atan2(x2, y2)
 
 
+class Atanh(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return torch.atanh(x)
+
+
 class AvgPool1D(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -989,6 +1023,14 @@ def forward(self, x):
         return torch.cos(x)
 
 
+class Cosh(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return torch.cosh(x)
+
+
 class CumSum(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -2290,6 +2332,14 @@ def forward(self, x):
         return torch.sin(x)
 
 
+class Sinh(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return torch.sinh(x)
+
+
 class SimpleModel(torch.nn.Module):
     def __init__(self, kernel_size=3):
         super().__init__()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 914afa077e4..fcb365292ee 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -150,6 +150,11 @@ def test_qnn_backend_acos(self):
                         index += 1
                         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_acosh(self):
+        module = Acosh()  # noqa: F405
+        sample_input = (torch.tensor([1.0, 1.5, 2.0, 3.0, 5.0, 10.0]).reshape(2, 3),)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_adaptive_avg_pool1d(self):
         module = AdaptiveAvgPool1D()  # noqa: F405
         sample_input = (torch.randn(1, 512, 7),)
@@ -190,6 +195,30 @@ def test_qnn_backend_adaptive_max_pool2d(self):
             with self.subTest(i=i):
                 self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_addmm(self):
+        test_comb = [
+            {
+                QCOM_MODULE: [AddMM()],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)),
+                ],
+            },
+            {
+                QCOM_MODULE: [AddMM(alpha=2, beta=3)],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)),
+                ],
+            },
+        ]
+
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        index += 1
+                        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_alias(self):
         module = Alias()  # noqa: F405
         sample_input = (torch.randn(1, 10),)
@@ -300,6 +329,11 @@ def test_qnn_backend_argmin(self):
                     case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS]
                 )
 
+    def test_qnn_backend_asinh(self):
+        module = Asinh()  # noqa: F405
+        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
+        self.lower_module_and_test_output(module, sample_input)
+
     @unittest.expectedFailure
     def test_qnn_backend_asin(self):
         sample_input = (torch.rand(3, 4) * 2 - 1,)
@@ -351,6 +385,11 @@ def test_qnn_backend_atan2(self):
                         index += 1
                         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_atanh(self):
+        module = Atanh()  # noqa: F405
+        sample_input = (torch.tensor([-0.9, -0.5, -0.1, 0.1, 0.5, 0.9]).reshape(2, 3),)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_avg_pool1d(self):
         module = AvgPool1D()  # noqa: F405
         sample_input = (torch.randn(1, 512, 7),)
@@ -613,6 +652,11 @@ def test_qnn_backend_cos(self):
         sample_input = (torch.randn(2, 5, 1, 3),)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_cosh(self):
+        module = Cosh()  # noqa: F405
+        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_cumsum(self):
         sample_input = ()
         test_comb = [
@@ -2147,6 +2191,11 @@ def test_qnn_backend_sin(self):
         sample_input = (torch.randn(2, 5, 1, 3),)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_sinh(self):
+        module = Sinh()  # noqa: F405
+        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_select_copy(self):
         module = SelectCopy()  # noqa: F405
         sample_input = (torch.randn([1, 3, 3, 3]),)
@@ -2925,6 +2974,12 @@ def test_qnn_backend_acos(self):
                         module = self.get_qdq_module(module, sample_input)
                         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_acosh(self):
+        module = Acosh()  # noqa: F405
+        sample_input = (torch.tensor([1.0, 1.5, 2.0, 3.0, 5.0, 10.0]).reshape(2, 3),)
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_adaptive_avg_pool1d(self):
         module = AdaptiveAvgPool1D()  # noqa: F405
         sample_input = (torch.randn(1, 512, 7),)
@@ -2969,6 +3024,31 @@ def test_qnn_backend_adaptive_max_pool2d(self):
                 module_one = self.get_qdq_module(module, sample_input)
                 self.lower_module_and_test_output(module_one, sample_input)
 
+    def test_qnn_backend_addmm(self):
+        test_comb = [
+            {
+                QCOM_MODULE: [AddMM()],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)),
+                ],
+            },
+            {
+                QCOM_MODULE: [AddMM(alpha=2, beta=3)],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)),
+                ],
+            },
+        ]
+
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        index += 1
+                        qdq_module = self.get_qdq_module(module, sample_input)
+                        self.lower_module_and_test_output(qdq_module, sample_input)
+
     def test_qnn_backend_alias(self):
         module = Alias()  # noqa: F405
         sample_input = (torch.randn(1, 10),)
@@ -3093,6 +3173,12 @@ def test_qnn_backend_asin(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_asinh(self):
+        module = Asinh()  # noqa: F405
+        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_atan(self):
         sample_input = (torch.randn(3, 4),)
         module = Atan()  # noqa: F405
@@ -3132,6 +3218,12 @@ def test_qnn_backend_atan2(self):
                         qdq_module = self.get_qdq_module(module, sample_input)
                         self.lower_module_and_test_output(qdq_module, sample_input)
 
+    def test_qnn_backend_atanh(self):
+        module = Atanh()  # noqa: F405
+        sample_input = (torch.tensor([-0.9, -0.5, -0.1, 0.1, 0.5, 0.9]).reshape(2, 3),)
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_avg_pool1d(self):
         module = AvgPool1D()  # noqa: F405
         sample_input = (torch.randn(1, 512, 7),)
@@ -3478,6 +3570,12 @@ def test_qnn_backend_cos(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_cosh(self):
+        module = Cosh()  # noqa: F405
+        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_cumsum(self):
         module = CumSum()  # noqa: F405
         sample_input = (torch.randn(4),)
@@ -5262,6 +5360,12 @@ def test_qnn_backend_sin(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_sinh(self):
+        module = Sinh()  # noqa: F405
+        sample_input = (torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]).reshape(2, 3),)
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_slice_copy(self):
         modules = [
             SliceCopyDefaultParameter(),  # noqa: F405
@@ -6118,6 +6222,10 @@ def test_qnn_backend_dump_intermediate_outputs_topk(self):
         )
 
     def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
+        if self.direct_build_folder:
+            self.skipTest(
+                "Direct mode does not support per-tensor dumping (HTP/LPAI backends)."
+            )
         backend_options = generate_htp_compiler_spec(use_fp16=True)
         TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset_table[TestQNN.soc_model],
@@ -6848,20 +6956,38 @@ def output_callback(log_msg):
         )
 
     def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
-        backend_options = generate_htp_compiler_spec(use_fp16=False)
+        # TODO: Support per-tensor dumping in LPAI direct mode.
+        if self.direct_build_folder:
+            self.skipTest(
+                "Direct mode does not support per-tensor dumping (HTP/LPAI backends)."
+            )
+        match get_backend_type(self.backend):
+            case QnnExecuTorchBackendType.kHtpBackend:
+                backend_options = generate_htp_compiler_spec(use_fp16=False)
+                expected_compared_events = 14
+            case QnnExecuTorchBackendType.kLpaiBackend:
+                backend_options = generate_lpai_compiler_spec(
+                    target_env=self.get_lpai_target_env()
+                )
+                # I/O q/dq nodes fall back to CPU via FoldQDQ LPAI workaround
+                # and are excluded from QNN etdump; update after first LPAI run
+                expected_compared_events = 17
+            case _:
+                raise ValueError("Backend is not implemented yet")
         TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset_table[TestQNN.soc_model],
             backend_options=backend_options,
             dump_intermediate_outputs=True,
         )
         module = SimpleModel()  # noqa: F405
+        torch.manual_seed(8)
         sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
-        module = self.get_qdq_module(module, sample_input)
+        qdq_module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(
-            module,
+            qdq_module,
             sample_input,
             expected_partitions=1,
-            expected_compared_events=14,
+            expected_compared_events=expected_compared_events,
         )
 
     def test_qnn_backend_dump_intermediate_outputs_topk(self):
@@ -8073,7 +8199,6 @@ def test_static_llm_model(self):  # noqa: C901
             "1024",
             "--max_context_len",
             "1024",
-            "--skip_user_prompt_calibration",
         ]
 
         match self.static_llm_eval_method:
@@ -8123,10 +8248,17 @@ def test_static_llm_model(self):  # noqa: C901
                     ]
                 )
             case _:
-                cmds.remove("--skip_user_prompt_calibration")
                 logging.warning(
                     "No llm eval method chosen. Only generate model output."
                 )
+                cmds.extend(
+                    [
+                        "--calib_tasks",
+                        "wikitext",
+                        "--calib_limit",
+                        "1",
+                    ]
+                )
 
         if is_llama_model:
             cmds.extend(
@@ -8299,6 +8431,10 @@ def test_codegen2_1b(self):
             "128",
             "--max_context_len",
             "128",
+            "--calib_tasks",
+            "wikitext",
+            "--calib_limit",
+            "1",
         ]
         self.add_default_cmds(cmds)
 
@@ -8360,6 +8496,10 @@ def test_llama_stories_260k(self):
             "128",
             "--max_context_len",
             "128",
+            "--calib_tasks",
+            "wikitext",
+            "--calib_limit",
+            "1",
         ]
         self.add_default_cmds(cmds)
 
@@ -8423,6 +8563,10 @@ def test_llama_stories_110m(self):
             "128",
             "--max_context_len",
             "128",
+            "--calib_tasks",
+            "wikitext",
+            "--calib_limit",
+            "1",
         ]
         if self.use_fp16:
             cmds.append("--use_fp16")
@@ -8576,7 +8720,7 @@ class VLMSpecs(MLLMSpecs):
     def setUp(self):
         self.alm_specs = {
             "granite_speech_3_3-2b": TestExampleMultimodalityScript.ALMSpecs(
-                max_seq_len=512,
+                max_seq_len=1024,
                 sm8650_token_rate=5,
                 sm8750_token_rate=8,
                 encoder_pte_size=900_000_000,  # 900MB
@@ -8588,7 +8732,7 @@ def setUp(self):
         }
         self.vlm_specs = {
             "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs(
-                max_seq_len=128,
+                max_seq_len=1024,
                 sm8650_token_rate=50,
                 sm8750_token_rate=55,
                 encoder_pte_size=110_000_000,  # 110MB
@@ -8598,7 +8742,7 @@ def setUp(self):
                 golden_image_feature="city",
             ),
             "internvl3_1b": TestExampleMultimodalityScript.VLMSpecs(
-                max_seq_len=320,
+                max_seq_len=1024,
                 sm8650_token_rate=11,
                 sm8750_token_rate=13,
                 encoder_pte_size=425_000_000,  # 425MB
@@ -8650,6 +8794,8 @@ def test_static_asr(self):
             "kv",
             "--max_seq_len",
             f"{alm_specs.max_seq_len}",
+            "--calib_samples",
+            "./examples/qualcomm/oss_scripts/llama/assets/samples/audio.json",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
@@ -8733,6 +8879,8 @@ def test_static_vlm(self):
             "kv",
             "--max_seq_len",
             f"{vlm_specs.max_seq_len}",
+            "--calib_samples",
+            "./examples/qualcomm/oss_scripts/llama/assets/samples/vision.json",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
diff --git a/backends/transforms/postpone_permute_below_squeeze_view.py b/backends/transforms/postpone_permute_below_squeeze_view.py
index f676e19fb65..e0e9a3ec198 100644
--- a/backends/transforms/postpone_permute_below_squeeze_view.py
+++ b/backends/transforms/postpone_permute_below_squeeze_view.py
@@ -1,12 +1,12 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
 
-import copy
 from typing import cast, List
 
 import torch
@@ -108,7 +108,7 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool:
                 # view_node_shape is almost same as permute_node_shape
                 # except it has one more dim somewhere
                 # and the extra dim has value of 1.
-                new_view_shape = copy.deepcopy(pred_shape)
+                new_view_shape = list(pred_shape)
                 new_view_shape.insert(index, 1)
                 new_permute_dims = [x + 1 if x >= index else x for x in permute_dims]
                 new_permute_dims.insert(index, index)
@@ -132,7 +132,7 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool:
                 # and the extra dim has value of 1.
                 # Convert permute_dims to list of ints
                 index_to_remove = permute_dims[index]
-                new_view_shape = copy.deepcopy(pred_shape)
+                new_view_shape = list(pred_shape)
                 del new_view_shape[index_to_remove]
                 new_permute_dims = [
                     x - 1 if x > index_to_remove else x for x in permute_dims
diff --git a/backends/transforms/test/test_permute_optimization_passes.py b/backends/transforms/test/test_permute_optimization_passes.py
index dd356aad8a2..550446da562 100644
--- a/backends/transforms/test/test_permute_optimization_passes.py
+++ b/backends/transforms/test/test_permute_optimization_passes.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -25,6 +26,8 @@
 from executorch.backends.transforms.replace_nop_transpose_or_permute_with_view import (
     ReplaceNopTransposeOrPermuteWithViewPass,
 )
+
+from executorch.exir import EdgeCompileConfig, to_edge
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import PassResult
 from torch.utils import _pytree as pytree
@@ -477,6 +480,38 @@ def test_permute4_view3_chains(self) -> None:
             "PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView",
         )
 
+    def test_postpone_permute_with_symbolic_shapes(self) -> None:
+        class DynamicPermuteViewModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                y = x.view(x.shape[0], 12, 64)
+                y = y.permute(1, 0, 2)
+                y = y.view(1, 12, x.shape[0], 64)
+                return y.permute(0, 1, 3, 2)
+
+        exported_program = torch.export.export(
+            DynamicPermuteViewModule(),
+            (torch.randn(3, 1, 768),),
+            dynamic_shapes={"x": {0: torch.export.Dim("batch", min=1, max=8)}},
+        )
+        edge_program = to_edge(
+            exported_program,
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        graph_module = edge_program.exported_program().graph_module
+
+        result = cast(
+            PassResult,
+            PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView().call(graph_module),
+        )
+
+        self.assertTrue(result.modified)
+        self.assertEqual(
+            count_node(result.graph_module, exir_ops.edge.aten.view_copy.default), 2
+        )
+        self.assertEqual(
+            count_node(result.graph_module, exir_ops.edge.aten.permute_copy.default), 2
+        )
+
     def test_negative_not_squeeze_like(self) -> None:
         """View that reshapes (not just squeeze/unsqueeze) should NOT be reordered."""
         builder = GraphBuilder()
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
index 05bdd9431c8..44fbc4bc8f6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
@@ -9,10 +9,36 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
+//
+// Resize
+//
+
+// resize_args = { block_config_ref } (present but not read by this function).
+//
+// Elementwise binary op with broadcasting: the output is resized to the
+// broadcast of the current sizes of in_a and in_b. Without a resize function
+// the DynamicDispatchNode freezes the output at the build-time upper bound.
+// Mirrors the fp32 resize_binary_op_node (inputs: args[1].refs[0] and [1]).
+void resize_q8ta_binary_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef in_a = args.at(1).refs.at(0);
+  const ValueRef in_b = args.at(1).refs.at(1);
+
+  const std::vector<int64_t> a_sizes = graph->sizes_of(in_a);
+  const std::vector<int64_t> b_sizes = graph->sizes_of(in_b);
+  graph->virtual_resize(
+      out, calculate_broadcasted_output_size(a_sizes, b_sizes));
+}
+
 //
 // Dispatch nodes
 //
@@ -111,7 +137,7 @@ void add_q8ta_binary_node(
       // Resize args
       {block_config_ref},
       // Resizing Logic
-      nullptr));
+      resize_q8ta_binary_node));
 }
 
 //
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
index f6e89bef03d..b9f17021ea0 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
@@ -13,6 +13,7 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
@@ -218,6 +219,51 @@ ValueRef prepack_quantized_conv2d_weight(
   return packed_weight;
 }
 
+//
+// Resize
+//
+
+// resize_args = { input, kernel_size, stride, padding, dilation }
+//
+// The q8ta_conv2d output is statically allocated at the build-time upper-bound
+// shape. Without this resize function the DynamicDispatchNode would never
+// virtual_resize the output on trigger_resize(), so a dynamic-shape graph would
+// keep the conv output at its upper bound. E.g. a 238-row input written into
+// a 241-row buffer leaves garbage rows, which GroupNorm's global statistics
+// then smear across the whole tensor. Only H/W are recomputed from the current
+// input; N and C are shape-independent and stay as currently allocated.
+void resize_q8ta_conv2d_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef in = resize_args.at(0);
+  const ValueRef kernel_size = resize_args.at(1);
+  const ValueRef stride = resize_args.at(2);
+  const ValueRef padding = resize_args.at(3);
+  const ValueRef dilation = resize_args.at(4);
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
+
+  // Compute H/W from the current input with the shared conv-output helper.
+  // Kernel dims come from the kernel_size IntList (kernel_size_only=true).
+  // The 4th list entry is read only as an optional ceil_mode; passing dilation
+  // (a non-bool) there makes it resolve to false. transposed=false.
+  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
+      *graph,
+      in_sizes,
+      kernel_size,
+      /*kernel_size_only=*/true,
+      {stride, padding, dilation, dilation},
+      /*transposed=*/false);
+
+  std::vector<int64_t> new_sizes = graph->sizes_of(out);
+  const size_t ndim = new_sizes.size();
+  new_sizes.at(ndim - 2) = out_hw.at(0);
+  new_sizes.at(ndim - 1) = out_hw.at(1);
+  graph->virtual_resize(out, new_sizes);
+}
+
 //
 // Dispatch nodes
 //
@@ -327,8 +373,10 @@ void add_q8ta_conv2d_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args
-      {}));
+      // Resize args: { input, kernel_size, stride, padding, dilation }
+      {packed_int8_input, kernel_size, stride, padding, dilation},
+      // Resize function: propagate dynamic H/W to the output.
+      resize_q8ta_conv2d_node));
 }
 
 //
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h
index f463589c50a..5d16cb3b78c 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h
@@ -123,7 +123,12 @@ void add_q8ta_conv2d_pw_node(
     const ValueRef packed_bias,
     const uint32_t activation_type,
     const ValueRef packed_int8_output,
-    const int32_t groups = 1);
+    const int32_t groups = 1,
+    const ValueRef conv_input = kDummyValueRef,
+    const ValueRef kernel_size = kDummyValueRef,
+    const ValueRef stride = kDummyValueRef,
+    const ValueRef padding = kDummyValueRef,
+    const ValueRef dilation = kDummyValueRef);
 
 std::vector<int64_t> calculate_q8ta_im2col_sizes(
     ComputeGraph* graph,
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp
index e690ff435a8..914ca1a23ef 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp
@@ -12,6 +12,7 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
@@ -172,6 +173,45 @@ ValueRef prepack_quantized_conv2d_dw_weight(
   return packed_weight;
 }
 
+//
+// Resize
+//
+
+// resize_args = { input, kernel_size, stride, padding, dilation }
+//
+// Depthwise conv output H/W follows the same formula as a regular conv (channel
+// count is unchanged: groups == in_channels == out_channels). Without this the
+// DynamicDispatchNode freezes the output at the build-time upper bound. N/C are
+// shape-independent and stay as currently allocated. Mirrors the regular q8ta
+// conv resize (resize_q8ta_conv2d_node).
+void resize_q8ta_conv2d_dw_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef out = args.at(0).refs.at(0); // first ref of first arg group
+  const ValueRef in = resize_args.at(0);
+  const ValueRef kernel_size = resize_args.at(1);
+  const ValueRef stride = resize_args.at(2);
+  const ValueRef padding = resize_args.at(3);
+  const ValueRef dilation = resize_args.at(4);
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
+
+  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
+      *graph,
+      in_sizes,
+      kernel_size,
+      /*kernel_size_only=*/true,
+      {stride, padding, dilation, dilation}, // 4th slot repeats dilation
+      /*transposed=*/false);
+
+  std::vector<int64_t> new_sizes = graph->sizes_of(out); // leading dims kept
+  const size_t ndim = new_sizes.size();
+  new_sizes.at(ndim - 2) = out_hw.at(0); // H
+  new_sizes.at(ndim - 1) = out_hw.at(1); // W
+  graph->virtual_resize(out, new_sizes);
+}
+
 //
 // Dispatch nodes
 //
@@ -258,10 +298,10 @@ void add_conv2d_dw_q8ta_q8csw_q8to_4w4c_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args
-      {},
+      // Resize args: { input, kernel_size, stride, padding, dilation }
+      {packed_int8_input, kernel_size, stride, padding, dilation},
       // Resizing Logic
-      nullptr));
+      resize_q8ta_conv2d_dw_node));
 }
 
 void add_q8ta_conv2d_dw_node(
@@ -363,8 +403,10 @@ void add_q8ta_conv2d_dw_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args
-      {}));
+      // Resize args: { input, kernel_size, stride, padding, dilation }
+      {packed_int8_input, kernel_size, stride, padding, dilation},
+      // Resizing Logic
+      resize_q8ta_conv2d_dw_node));
 }
 
 //
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp
index b43fe9eacc6..9aa6e7b05d1 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp
@@ -13,6 +13,7 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
@@ -95,6 +96,59 @@ std::vector<int64_t> calculate_q8ta_im2col_sizes(
   return {K, H, W};
 }
 
+//
+// Resize
+//
+
+// resize_args = { input, kernel_size, stride, padding, dilation, groups }
+//
+// The im2col scratch tensor is [K, H_out, align_up_4(W_out)] where K (the
+// flattened conv window, channel/kernel-derived) is shape-independent and
+// H_out/W_out are the conv output spatial dims. The downstream PW GEMM that
+// consumes this scratch is resized separately (it preserves H/W). Without this,
+// the scratch freezes at the build-time upper bound and feeds garbage rows into
+// the GEMM. Recompute H_out/W_out from the CURRENT input (NOT the conv output
+// tensor, which may itself still be frozen at this point in the resize order).
+void resize_q8ta_im2col_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef im2col_out = args.at(0).refs.at(0); // scratch being resized
+  const ValueRef in = resize_args.at(0);
+  const ValueRef kernel_size = resize_args.at(1);
+  const ValueRef stride = resize_args.at(2);
+  const ValueRef padding = resize_args.at(3);
+  const ValueRef dilation = resize_args.at(4);
+  const ValueRef groups = resize_args.at(5);
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
+
+  // Conv output H/W from the current input.
+  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
+      *graph,
+      in_sizes,
+      kernel_size,
+      /*kernel_size_only=*/true,
+      {stride, padding, dilation, dilation}, // 4th slot repeats dilation
+      /*transposed=*/false);
+  const int64_t out_height = out_hw.at(0);
+  const int64_t out_width = out_hw.at(1);
+
+  // K (flattened conv window) is shape-independent — recompute from channels +
+  // kernel exactly as calculate_q8ta_im2col_sizes does.
+  const int64_t in_channels = utils::val_at(-3, in_sizes);
+  const int64_t groups_val = graph->extract_scalar<int64_t>(groups);
+  const int64_t in_channels_per_group = in_channels / groups_val;
+  const auto kernel_size_list = graph->get_int_list(kernel_size);
+  const int64_t flattened_kernel_len = utils::align_up_4(
+      in_channels_per_group * kernel_size_list->at(0) *
+      kernel_size_list->at(1));
+  const int64_t K = flattened_kernel_len * groups_val; // all groups
+  const int64_t W = utils::align_up_4(out_width); // width rounded up to 4
+
+  graph->virtual_resize(im2col_out, {K, out_height, W}); // {K, H_out, W_pad}
+}
+
 //
 // Dispatch nodes
 //
@@ -168,10 +222,11 @@ void add_q8ta_im2col_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args
-      {},
-      // Resizing Logic
-      nullptr));
+      // Resize args: { input, kernel_size, stride, padding, dilation, groups }
+      {packed_int8_input, kernel_size, stride, padding, dilation, groups},
+      // Resizing Logic: recompute the im2col scratch dims from the current
+      // input
+      resize_q8ta_im2col_node));
 }
 
 //
@@ -272,7 +327,14 @@ void q8ta_conv2d_im2col(
       packed_bias,
       activation_type_val,
       packed_int8_output,
-      groups_val);
+      groups_val,
+      // Original activation + conv geometry so the PW output H/W is recomputed
+      // from the true conv result, not the width-padded im2col scratch.
+      packed_int8_input,
+      kernel_size,
+      stride,
+      padding,
+      dilation);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp
index 7a2380f728a..4fb7f0fa775 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp
@@ -11,6 +11,7 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
@@ -181,6 +182,69 @@ ValueRef prepack_quantized_conv2d_pw_weight(
   return packed_weight;
 }
 
+//
+// Resize
+//
+
+// resize_args = { input }
+//
+// Standalone 1x1 pointwise conv: stride 1, padding 0, dilation 1, so the output
+// H/W equals the input activation H/W. Without this resize the output would
+// freeze at the build-time upper bound. N/C are shape-independent and stay as
+// currently allocated.
+void resize_q8ta_conv2d_pw_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef out = args.at(0).refs.at(0); // first ref of first arg group
+  const ValueRef in = resize_args.at(0); // the node's activation input
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
+  std::vector<int64_t> new_sizes = graph->sizes_of(out); // leading dims kept
+  const size_t out_ndim = new_sizes.size();
+  const size_t in_ndim = in_sizes.size(); // indexed separately from out_ndim
+  // Copy H (dim -2) and W (dim -1) from the input; keep output N/C.
+  new_sizes.at(out_ndim - 2) = in_sizes.at(in_ndim - 2);
+  new_sizes.at(out_ndim - 1) = in_sizes.at(in_ndim - 1);
+  graph->virtual_resize(out, new_sizes);
+}
+
+// resize_args = { conv_input, kernel_size, stride, padding, dilation }
+//
+// im2col-path PW conv. Here the PW node's bound input is the im2col scratch
+// tensor sized {K, H_out, align_up_4(W_out)} — its width is rounded up to a
+// multiple of 4 for texel alignment, so it must NOT be used to size the output.
+// Recompute the TRUE conv H_out/W_out from the ORIGINAL activation + conv
+// geometry, exactly as resize_q8ta_conv2d_node does. N/C are shape-independent
+// and stay as currently allocated.
+void resize_q8ta_conv2d_pw_im2col_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef out = args.at(0).refs.at(0); // first ref of first arg group
+  const ValueRef conv_input = resize_args.at(0); // original activation
+  const ValueRef kernel_size = resize_args.at(1);
+  const ValueRef stride = resize_args.at(2);
+  const ValueRef padding = resize_args.at(3);
+  const ValueRef dilation = resize_args.at(4);
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(conv_input);
+
+  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
+      *graph,
+      in_sizes,
+      kernel_size,
+      /*kernel_size_only=*/true,
+      {stride, padding, dilation, dilation}, // 4th slot repeats dilation
+      /*transposed=*/false);
+
+  std::vector<int64_t> new_sizes = graph->sizes_of(out); // leading dims kept
+  const size_t ndim = new_sizes.size();
+  new_sizes.at(ndim - 2) = out_hw.at(0); // H
+  new_sizes.at(ndim - 1) = out_hw.at(1); // W
+  graph->virtual_resize(out, new_sizes);
+}
+
 //
 // Dispatch nodes
 //
@@ -199,7 +263,12 @@ void add_q8ta_conv2d_pw_node(
     const ValueRef packed_bias,
     const uint32_t activation_type,
     const ValueRef packed_int8_output,
-    const int32_t groups) {
+    const int32_t groups,
+    const ValueRef conv_input,
+    const ValueRef kernel_size,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation) {
   VK_CHECK_COND(q8ta_conv2d_check_4w4c_packed_dim_info(
       graph.packed_dim_info_of(packed_int8_input)));
   VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info(
@@ -251,6 +320,21 @@ void add_q8ta_conv2d_pw_node(
       graph.hashed_layout_of(packed_int8_input),
   };
 
+  // The im2col path passes the original activation + conv geometry so the
+  // output H/W can be recomputed from the true conv result (the bound input is
+  // the width-padded im2col scratch and must not size the output). The
+  // standalone 1x1 PW conv passes only its real activation input, whose H/W the
+  // output matches directly.
+  std::vector<ValueRef> resize_args;
+  ExecuteNode::ResizeFunction resize_fn;
+  if (conv_input == kDummyValueRef) {
+    resize_args = {packed_int8_input};
+    resize_fn = resize_q8ta_conv2d_pw_node;
+  } else {
+    resize_args = {conv_input, kernel_size, stride, padding, dilation};
+    resize_fn = resize_q8ta_conv2d_pw_im2col_node;
+  }
+
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
@@ -266,7 +350,8 @@ void add_q8ta_conv2d_pw_node(
       param_buffers,
       push_constants,
       spec_constants,
-      {}));
+      resize_args,
+      resize_fn));
 }
 
 //
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp
index bdbdaa14fec..7e3c4166e3c 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dTransposed.cpp
@@ -13,10 +13,50 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
+// resize_args = { input, kernel_size, stride, padding, dilation,
+//                 output_padding }
+//
+// Transposed conv output H/W uses the transposed formula
+//   out = (in - 1) * stride - 2 * padding + dilation * (kernel - 1)
+//         + output_padding + 1
+// (computed by calc_out_sizes_hw's transposed=true path, where the 4th args
+// slot is output_padding). Channels stay as allocated. Without this the
+// DynamicDispatchNode freezes the output at the build-time upper bound. Mirrors
+// the fp32 transposed path of resize_conv2d_node.
+void resize_q8ta_conv2d_transposed_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef out = args.at(0).refs.at(0); // first ref of first arg group
+  const ValueRef in = resize_args.at(0);
+  const ValueRef kernel_size = resize_args.at(1);
+  const ValueRef stride = resize_args.at(2);
+  const ValueRef padding = resize_args.at(3);
+  const ValueRef dilation = resize_args.at(4);
+  const ValueRef output_padding = resize_args.at(5);
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
+
+  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
+      *graph,
+      in_sizes,
+      kernel_size,
+      /*kernel_size_only=*/true,
+      {stride, padding, dilation, output_padding}, // 4th slot: output_padding
+      /*transposed=*/true);
+
+  std::vector<int64_t> new_sizes = graph->sizes_of(out); // leading dims kept
+  const size_t ndim = new_sizes.size();
+  new_sizes.at(ndim - 2) = out_hw.at(0); // H
+  new_sizes.at(ndim - 1) = out_hw.at(1); // W
+  graph->virtual_resize(out, new_sizes);
+}
+
 // Dedicated workgroup size functions for transposed convolution.
 // Unlike regular conv2d, transposed conv with stride > 1 causes branch
 // divergence along the height dimension (different rows have different
@@ -83,6 +123,7 @@ void add_q8ta_conv2d_transposed_node(
     const ValueRef stride,
     const ValueRef padding,
     const ValueRef dilation,
+    const ValueRef output_padding,
     const ValueRef groups,
     const uint32_t activation_type,
     const ValueRef packed_int8_output) {
@@ -175,8 +216,16 @@ void add_q8ta_conv2d_transposed_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args
-      {}));
+      // Resize args: { input, kernel_size, stride, padding, dilation,
+      // output_padding }
+      {packed_int8_input,
+       kernel_size,
+       stride,
+       padding,
+       dilation,
+       output_padding},
+      // Resizing Logic
+      resize_q8ta_conv2d_transposed_node));
 }
 
 void q8ta_conv2d_transposed(
@@ -195,7 +244,9 @@ void q8ta_conv2d_transposed(
   const ValueRef kernel_size = args.at(idx++);
   const ValueRef stride = args.at(idx++);
   const ValueRef padding = args.at(idx++);
-  args.at(idx++); // output_padding: only affects output size, not shader
+  // output_padding does not affect the shader, but it IS needed to compute the
+  // transposed-conv output H/W on resize (dynamic shapes).
+  const ValueRef output_padding = args.at(idx++);
   const ValueRef dilation = args.at(idx++);
   const ValueRef groups = args.at(idx++);
   const ValueRef activation = args.at(idx++);
@@ -255,6 +306,7 @@ void q8ta_conv2d_transposed(
       stride,
       padding,
       dilation,
+      output_padding,
       groups,
       activation_type_val,
       packed_int8_output);
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp
index 210bd0cd78b..92daf9d8ac5 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taLinear.cpp
@@ -63,6 +63,34 @@ utils::uvec3 q8ta_linear_local_wg_size(
       graph, shader, global_workgroup_size, args, resize_args);
 }
 
+//
+// Resize
+//
+
+// resize_args = {}
+//
+// Quantized linear/matmul: output = [*input.shape[:-1], out_features]. The
+// leading/M dims follow the input; out_features (the last dim) is
+// weight-derived and shape-independent, so it stays as currently allocated.
+// Without this the DynamicDispatchNode freezes the output (incl. the M dim) at
+// the build-time upper bound. Mirrors the fp32 resize_linear_qw_node shape
+// logic, generalized to arbitrary input rank.
+void resize_q8ta_linear_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args; // unused: both shapes are reachable through args
+  const ValueRef out = args.at(0).refs.at(0); // first ref of first arg group
+  const ValueRef in = args.at(1).refs.at(0); // first ref of second arg group
+
+  std::vector<int64_t> new_sizes = graph->sizes_of(in);
+  const std::vector<int64_t> out_sizes = graph->sizes_of(out);
+  // Start from the input's shape, then restore the output's own last dim
+  // (out_features, weight-derived) so only the leading dims follow the input.
+  new_sizes.at(new_sizes.size() - 1) = out_sizes.at(out_sizes.size() - 1);
+  graph->virtual_resize(out, new_sizes);
+}
+
 //
 // Dispatch node
 //
@@ -135,7 +163,7 @@ void add_q8ta_linear_node(
       // Resize args
       {},
       // Resizing Logic
-      nullptr));
+      resize_q8ta_linear_node));
 }
 
 //
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp
index bca36444725..fb0ffcab14c 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp
@@ -13,6 +13,21 @@
 
 namespace vkcompute {
 
+// quantize / dequantize are elementwise: output shape == input shape. Without a
+// resize function the DynamicDispatchNode freezes the output at the build-time
+// upper bound, so on a dynamic-shape graph (e.g. a 238-row input fed to a
+// 241-allocated graph) the FIRST quantize_per_tensor freezes everything
+// downstream at 241. Propagate the input's current sizes to the output.
+void resize_q8ta_qdq_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args; // unused here; sizes are read through args only
+  const ValueRef out = args.at(0).refs.at(0); // first ref of first arg group
+  const ValueRef in = args.at(1).refs.at(0); // first ref of second arg group
+  graph->virtual_resize(out, graph->sizes_of(in)); // output takes input sizes
+}
+
 void add_q8ta_quantize_node(
     ComputeGraph& graph,
     const ValueRef fp_input,
@@ -80,7 +95,9 @@ void add_q8ta_quantize_node(
        inp_block_config.as_packed_int(),
        outp_block_config.as_packed_int()},
       // Resize args
-      {block_config_ref}));
+      {block_config_ref},
+      // Resize function: output shape == input shape (elementwise).
+      resize_q8ta_qdq_node));
 }
 
 void add_q8ta_dequantize_node(
@@ -150,7 +167,9 @@ void add_q8ta_dequantize_node(
        outp_block_config.as_packed_int(),
        inp_block_config.as_packed_int()},
       // Resize args
-      {block_config_ref}));
+      {block_config_ref},
+      // Resize function: output shape == input shape (elementwise).
+      resize_q8ta_qdq_node));
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
index f7454b6b93a..8709e4bdc2c 100644
--- a/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
+++ b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
@@ -53,7 +53,13 @@ static std::string pick_conv2d_dw_shader_with_selector(
   if (is_3x3) {
     kernel_name += "_output_tile_3x3";
     if (impl_selector == "b1x1") {
-      kernel_name += "_b1x1";
+      // The _b1x1 batch-tile variant exists only for the non-sned family;
+      // sned (stride != dilation) shaders are not batch-tiled. Match
+      // pick_conv2d_dw_shader and only append it when stride == dilation,
+      // otherwise fall back to the un-suffixed sned shader.
+      if (stride_equals_dilation) {
+        kernel_name += "_b1x1";
+      }
     } else if (impl_selector == "b4x2") {
       // b4x2 is the default (no suffix)
     } else {
diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp
index 12d4ed61b76..a282ebfb0ff 100644
--- a/backends/vulkan/test/custom_ops/utils.cpp
+++ b/backends/vulkan/test/custom_ops/utils.cpp
@@ -1366,6 +1366,10 @@ ComputeGraph setup_compute_graph(
     int op_invocations_per_execute) {
   GraphConfig config;
   config.enable_querypool = true;
+  // Default-on (opt-out via TestCase::set_force_resize(false)): force every
+  // DynamicDispatchNode to run its resize function on each execute(),
+  // exercising the op's resize formula even when input shapes are unchanged.
+  config.force_resize = test_case.get_force_resize();
   ComputeGraph graph(config);
 
   std::vector<ValueRef> input_values;
diff --git a/backends/vulkan/test/custom_ops/utils.h b/backends/vulkan/test/custom_ops/utils.h
index d8fc36a5142..81bad5e9df0 100644
--- a/backends/vulkan/test/custom_ops/utils.h
+++ b/backends/vulkan/test/custom_ops/utils.h
@@ -603,6 +603,22 @@ class TestCase {
     return target_execute_time_us_;
   }
 
+  // Controls GraphConfig::force_resize for the ComputeGraph built for this
+  // test case. When true, every DynamicDispatchNode runs its resize function
+  // on each execute() even when no input shape changed. Because the output is
+  // already allocated at the swept shape, the resize must recompute that same
+  // shape from the current input; a wrong resize formula resizes the output
+  // to a mismatched shape and surfaces as a test failure. Defaults to true
+  // (opt-out), so every custom_ops test exercises its resize formulas across
+  // the swept shapes. Call set_force_resize(false) for the rare op whose
+  // resize fn is intentionally not shape-preserving under a fixed allocation.
+  void set_force_resize(bool force_resize) {
+    force_resize_ = force_resize;
+  }
+  bool get_force_resize() const {
+    return force_resize_;
+  }
+
   void add_input_spec(const ValueSpec& spec) {
     inputs_.push_back(spec);
   }
@@ -648,6 +664,7 @@ class TestCase {
     shader_filter_ = kDefaultShaderFilter;
     op_invocations_per_execute_ = 0;
     target_execute_time_us_ = kDefaultTargetExecuteTimeUs;
+    force_resize_ = true;
   }
 
  private:
@@ -660,6 +677,7 @@ class TestCase {
   std::vector<std::string> shader_filter_;
   int op_invocations_per_execute_ = 0; // 0 = adaptive
   int target_execute_time_us_ = kDefaultTargetExecuteTimeUs;
+  bool force_resize_ = true;
 };
 
 //
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 957862935a4..f7cd85f9758 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -38,6 +38,10 @@ set(WEBGPU_SRCS
     runtime/ops/sdpa/Sdpa.cpp
     runtime/ops/select_as_symint/SelectAsSymint.cpp
     runtime/ops/quantized_linear/QuantizedLinear.cpp
+    runtime/ops/mul/BinaryOp.cpp
+    runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp
+    runtime/ops/rope/RotaryEmbedding.cpp
+    runtime/ops/prepack/Prepack.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
@@ -138,7 +142,6 @@ endfunction()
 
 if(EXECUTORCH_BUILD_WEBGPU_TEST)
   add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp)
-  add_webgpu_native_test(webgpu_rms_norm_test test/native/test_rms_norm.cpp)
   add_webgpu_native_test(
     webgpu_dispatch_order_test test/native/test_dispatch_order.cpp
   )
@@ -148,4 +151,38 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
   add_webgpu_native_test(
     webgpu_update_cache_test test/native/test_update_cache.cpp
   )
+
+  # Manifest-driven op-test framework: a generic gtest driver (webgpu_op_test) +
+  # its device-free util unit test. GTest needs -DEXECUTORCH_BUILD_TESTS=ON.
+  if(NOT TARGET GTest::gtest)
+    find_package(GTest QUIET)
+  endif()
+  if(TARGET GTest::gtest)
+    # Reuse add_webgpu_native_test for the backend link + frameworks + flags;
+    # add only driver_util, GTest, and the header-only nlohmann/json include.
+    add_webgpu_native_test(webgpu_op_test test/op_tests/op_test_driver.cpp)
+    target_sources(webgpu_op_test PRIVATE test/op_tests/driver_util.cpp)
+    target_link_libraries(webgpu_op_test PRIVATE GTest::gtest)
+    target_include_directories(
+      webgpu_op_test
+      PRIVATE "${EXECUTORCH_ROOT}/third-party/json/single_include"
+    )
+
+    # Device-free util unit test: no backend/Dawn link (pure manifest/tolerance
+    # helpers), so it does NOT use the native-test helper.
+    add_executable(
+      webgpu_op_test_util_test test/op_tests/test_driver_util.cpp
+                               test/op_tests/driver_util.cpp
+    )
+    target_include_directories(
+      webgpu_op_test_util_test
+      PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
+              "${EXECUTORCH_ROOT}/third-party/json/single_include"
+    )
+    target_link_libraries(
+      webgpu_op_test_util_test PRIVATE GTest::gtest GTest::gtest_main
+    )
+    target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions)
+    set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17)
+  endif()
 endif()
diff --git a/backends/webgpu/runtime/WebGPUBackend.cpp b/backends/webgpu/runtime/WebGPUBackend.cpp
index aed769da4a4..ceca89d1710 100644
--- a/backends/webgpu/runtime/WebGPUBackend.cpp
+++ b/backends/webgpu/runtime/WebGPUBackend.cpp
@@ -98,20 +98,21 @@ Error WebGPUBackend::execute(
   const size_t num_outputs = graph->output_ids().size();
 
   // Copy inputs from EValue tensors to GPU buffers
-  std::vector<std::pair<const void*, size_t>> inputs;
+  std::vector<InputData> inputs;
   inputs.reserve(num_inputs);
   for (size_t i = 0; i < num_inputs; i++) {
     const auto& tensor = args[i]->toTensor();
-    inputs.emplace_back(tensor.const_data_ptr(), tensor.nbytes());
+    const bool host_is_int64 =
+        tensor.scalar_type() == executorch::aten::ScalarType::Long;
+    inputs.push_back({tensor.const_data_ptr(), tensor.nbytes(), host_is_int64});
   }
-  graph->copy_inputs(inputs);
-
   // Fail loud as a runtime Error so a throw never crosses the backend boundary.
   try {
+    graph->copy_inputs(inputs);
     graph->update_symints_from_inputs(inputs);
     graph->propagate_resize();
   } catch (const std::exception& e) {
-    ET_LOG(Error, "WebGPU symint refresh/resize failed: %s", e.what());
+    ET_LOG(Error, "WebGPU input copy / symint refresh failed: %s", e.what());
     return Error::Internal;
   }
 
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index 1c977d130dd..b7fb4313400 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -26,6 +26,10 @@ namespace executorch::backends::webgpu {
 
 namespace {
 
+// Op name the AOT exporter emits for a prepacked constant (must match the
+// serialized schema); compared in the prepack pre-scan below.
+constexpr const char* kPrepackOpName = "et_vk.prepack.default";
+
 size_t vk_datatype_size(vkgraph::VkDataType dtype) {
   switch (dtype) {
     case vkgraph::VkDataType::BOOL:
@@ -45,6 +49,19 @@ size_t vk_datatype_size(vkgraph::VkDataType dtype) {
   }
 }
 
+bool vk_datatype_is_int(vkgraph::VkDataType dtype) {
+  switch (dtype) {
+    case vkgraph::VkDataType::BOOL:
+    case vkgraph::VkDataType::UINT8:
+    case vkgraph::VkDataType::INT8:
+    case vkgraph::VkDataType::INT32:
+    case vkgraph::VkDataType::INT64:
+      return true; // BOOL is grouped with the integer types here
+    default:
+      return false; // every other VkDataType value
+  }
+}
+
 } // namespace
 
 WebGPUGraph::WebGPUGraph() = default;
@@ -61,7 +78,7 @@ WGPUBuffer WebGPUGraph::create_scratch_buffer(size_t nbytes) {
 }
 
 void WebGPUGraph::update_symints_from_inputs(
-    const std::vector<std::pair<const void*, size_t>>& inputs) {
+    const std::vector<InputData>& inputs) {
   for (const auto& src : symint_sources_) {
     int pos = -1;
     for (size_t i = 0; i < input_ids_.size(); i++) {
@@ -100,8 +117,8 @@ void WebGPUGraph::update_symints_from_inputs(
     // Reads the [0,..,index,..,0] element; symint sources are scalar-ish.
     const int64_t offset = static_cast<int64_t>(index) * stride;
     // elem_size back-derived from build-time numel (sources are static-shaped).
-    const void* host = inputs[pos].first;
-    const size_t elem_size = inputs[pos].second / static_cast<size_t>(numel);
+    const void* host = inputs[pos].data;
+    const size_t elem_size = inputs[pos].nbytes / static_cast<size_t>(numel);
     int32_t val;
     if (elem_size == sizeof(int64_t)) {
       val = static_cast<int32_t>(static_cast<const int64_t*>(host)[offset]);
@@ -217,6 +234,10 @@ void WebGPUGraph::build(
 
   const auto* graph = vkgraph::GetVkGraph(flatbuffer_data);
 
+  // .pte byte sources for prepack-time constant materialization (build-only).
+  constant_data_ = constant_data;
+  named_data_map_ = named_data_map;
+
   // Phase 1: Create all values
   const auto* values = graph->values();
   const int num_vals = values ? values->size() : 0;
@@ -226,6 +247,42 @@ void WebGPUGraph::build(
   ints_.resize(num_vals, 0);
   doubles_.resize(num_vals, 0.0);
   bools_.resize(num_vals, false);
+  value_lists_.resize(num_vals);
+
+  // Pre-scan the op chain: a constant may be DEFERRED (no eager GPU buffer; the
+  // prepack node materializes it once) only if it is a prepack source AND never
+  // a direct arg of a non-prepack op. ValueList args are expanded so a constant
+  // reached through a list still counts as a direct use.
+  std::unordered_set<int> prepack_src_ids;
+  std::unordered_set<int> direct_use_ids;
+  const auto* chain_prescan = graph->chain();
+  if (chain_prescan) {
+    for (unsigned ci = 0; ci < chain_prescan->size(); ci++) {
+      const auto* oc = chain_prescan->Get(ci);
+      const bool is_prepack = oc->name()->str() == kPrepackOpName;
+      const auto* a = oc->args();
+      if (!a) {
+        continue;
+      }
+      for (unsigned j = 0; j < a->size(); j++) {
+        int id = static_cast<int>(a->Get(j));
+        if (is_prepack && j == 0) {
+          prepack_src_ids.insert(id);
+        } else if (!is_prepack) {
+          direct_use_ids.insert(id);
+          const auto* v = values ? values->Get(id) : nullptr;
+          if (v && v->value_type() == vkgraph::GraphTypes::ValueList) {
+            const auto* items = v->value_as_ValueList()->items();
+            if (items) {
+              for (unsigned k = 0; k < items->size(); k++) {
+                direct_use_ids.insert(static_cast<int>(items->Get(k)));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
 
   for (int i = 0; i < num_vals; i++) {
     const auto* val = values->Get(i);
@@ -248,56 +305,57 @@ void WebGPUGraph::build(
             numel *= dims->Get(j);
           }
         }
-        tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype());
+        tensor.elem_size = vk_datatype_size(vk_tensor->datatype());
+        tensor.is_int = vk_datatype_is_int(vk_tensor->datatype());
+        tensor.nbytes = numel * tensor.elem_size;
 
         int constant_id = vk_tensor->constant_id();
         int mem_obj_id = vk_tensor->mem_obj_id();
 
-        // Constants always get dedicated buffers regardless of mem_obj_id
+        // Constants are dedicated. Every constant is recorded as a
+        // ConstantSource and materialized via materialize_constant (one
+        // CPU->GPU write); a constant consumed ONLY via prepack is deferred
+        // (no eager buffer -- its prepack node performs that one write).
         if (constant_id >= 0 || mem_obj_id < 0) {
           tensor_mem_obj_ids_[i] = -1;
-          WGPUBufferDescriptor buf_desc = {};
-          buf_desc.size = std::max(tensor.nbytes, size_t(4));
-          buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
-              WGPUBufferUsage_CopySrc;
-          buf_desc.mappedAtCreation = false;
-          tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
-
-          if (constant_id >= 0 && constant_data && tensor.nbytes > 0) {
+
+          if (constant_id >= 0) {
             const auto* constants = graph->constants();
-            if (constants &&
-                constant_id < static_cast<int>(constants->size())) {
-              const auto* vk_bytes = constants->Get(constant_id);
-              if (vk_bytes->offset() != UINT64_MAX) {
-                const uint8_t* src = constant_data + vk_bytes->offset();
-                wgpuQueueWriteBuffer(
-                    queue_, tensor.buffer, 0, src, tensor.nbytes);
-              } else if (
-                  vk_bytes->named_key() != nullptr &&
-                  named_data_map != nullptr) {
-                // Constant stored in the PTE named-data map.
-                auto buf =
-                    named_data_map->get_data(vk_bytes->named_key()->c_str());
-                if (!buf.ok()) {
-                  throw std::runtime_error(
-                      std::string("WebGPU: named constant '") +
-                      vk_bytes->named_key()->c_str() +
-                      "' not found in NamedDataMap");
-                }
-                if (buf->size() < tensor.nbytes) {
-                  throw std::runtime_error(
-                      std::string("WebGPU: named constant '") +
-                      vk_bytes->named_key()->c_str() + "' undersized: have " +
-                      std::to_string(buf->size()) + " bytes, need " +
-                      std::to_string(tensor.nbytes));
-                }
-                wgpuQueueWriteBuffer(
-                    queue_, tensor.buffer, 0, buf->data(), tensor.nbytes);
-                buf->Free();
-              } else {
-                throw std::runtime_error(
-                    "WebGPU: constant has no inline offset and no named-data key");
-              }
+            if (!constants ||
+                constant_id >= static_cast<int>(constants->size())) {
+              throw std::runtime_error(
+                  "WebGPU: constant_id set but the constants table is missing "
+                  "or the id is out of range");
+            }
+            const auto* vk_bytes = constants->Get(constant_id);
+            ConstantSource cs;
+            cs.nbytes = tensor.nbytes;
+            if (vk_bytes->offset() != UINT64_MAX) {
+              cs.inline_offset = vk_bytes->offset();
+            } else if (vk_bytes->named_key() != nullptr) {
+              cs.named_key = vk_bytes->named_key()->str();
+            } else {
+              throw std::runtime_error(
+                  "WebGPU: constant has no inline offset and no named-data key");
+            }
+            constant_sources_[i] = std::move(cs);
+          }
+
+          // Defer constants consumed solely via prepack: skip the eager buffer.
+          const bool defer = constant_id >= 0 &&
+              prepack_src_ids.count(i) != 0 && direct_use_ids.count(i) == 0;
+          if (!defer) {
+            WGPUBufferDescriptor buf_desc = {};
+            buf_desc.size = std::max(tensor.nbytes, size_t(4));
+            buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+                WGPUBufferUsage_CopySrc;
+            buf_desc.mappedAtCreation = false;
+            tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
+
+            // Same single CPU->GPU write the prepack node uses (no
+            // duplication).
+            if (constant_id >= 0) {
+              materialize_constant(i, tensor.buffer);
             }
           }
         } else {
@@ -348,6 +406,16 @@ void WebGPUGraph::build(
         add_uniform_buffer_bytes(kSymIntUniformBytes);
         break;
       }
+      case vkgraph::GraphTypes::ValueList: {
+        value_types_[i] = ValueType::ValueList;
+        const auto* items = val->value_as_ValueList()->items();
+        if (items) {
+          for (unsigned j = 0; j < items->size(); j++) {
+            value_lists_[i].push_back(static_cast<int>(items->Get(j)));
+          }
+        }
+        break;
+      }
       default:
         value_types_[i] = ValueType::Null;
         break;
@@ -424,6 +492,47 @@ void WebGPUGraph::build(
       webgpu_operator_registry().get_op_fn(op_name)(*this, args);
     }
   }
+
+  // Prepack nodes (Phase 3) materialized their constants directly into the
+  // consumer buffers via materialize_constant; no separate copy pass needed.
+  // The .pte bytes are freed right after build() returns (WebGPUBackend
+  // processed->Free()), so clear the build-only source pointers.
+  constant_data_ = nullptr;
+  named_data_map_ = nullptr;
+}
+
+void WebGPUGraph::materialize_constant(int const_value_id, WGPUBuffer dst) {
+  auto it = constant_sources_.find(const_value_id);
+  if (it == constant_sources_.end()) {
+    throw std::runtime_error(
+        "WebGPU: no source recorded for constant id " +
+        std::to_string(const_value_id));
+  }
+  const ConstantSource& cs = it->second;
+  if (cs.nbytes == 0) {
+    return; // zero-byte constant: nothing to upload
+  }
+  if (cs.inline_offset != UINT64_MAX) { // bytes live inside constant_data_
+    if (constant_data_ == nullptr) { // e.g. called after build() cleared it
+      throw std::runtime_error("WebGPU: inline constant data is null");
+    }
+    wgpuQueueWriteBuffer(
+        queue_, dst, 0, constant_data_ + cs.inline_offset, cs.nbytes);
+  } else if (!cs.named_key.empty() && named_data_map_ != nullptr) {
+    auto buf = named_data_map_->get_data(cs.named_key.c_str());
+    if (!buf.ok()) {
+      throw std::runtime_error(
+          "WebGPU: named constant '" + cs.named_key + "' not found");
+    }
+    if (buf->size() < cs.nbytes) { // the write below would over-read buf
+      throw std::runtime_error(
+          "WebGPU: named constant '" + cs.named_key + "' undersized");
+    }
+    wgpuQueueWriteBuffer(queue_, dst, 0, buf->data(), cs.nbytes);
+    buf->Free(); // release the fetched named-data buffer
+  } else { // empty key, or a named key while named_data_map_ is null
+    throw std::runtime_error("WebGPU: constant has no source");
+  }
 }
 
 WGPUShaderModule WebGPUGraph::get_or_create_shader(
@@ -484,16 +593,47 @@ WGPUBindGroupLayout WebGPUGraph::get_or_create_bgl(
   return bgl;
 }
 
-void WebGPUGraph::copy_inputs(
-    const std::vector<std::pair<const void*, size_t>>& inputs) {
+void WebGPUGraph::copy_inputs(const std::vector<InputData>& inputs) {
   for (size_t i = 0; i < inputs.size() && i < input_ids_.size(); i++) {
-    if (inputs[i].second == 0) {
+    const InputData& in = inputs[i];
+    if (in.nbytes == 0) {
       continue;
     }
     int tid = input_ids_[i];
     const auto& tensor = tensors_[tid];
-    wgpuQueueWriteBuffer(
-        queue_, tensor.buffer, 0, inputs[i].first, inputs[i].second);
+
+    // Fast path: host and GPU byte sizes are equal, so upload the bytes as-is.
+    if (in.nbytes == tensor.nbytes) {
+      wgpuQueueWriteBuffer(queue_, tensor.buffer, 0, in.data, tensor.nbytes);
+      continue;
+    }
+
+    // Narrow int64 host indices into the int32 buffer (mirrors Vulkan).
+    const bool buffer_is_int32 = tensor.is_int && tensor.elem_size == 4;
+    if (in.host_is_int64 && buffer_is_int32 && in.nbytes == tensor.nbytes * 2) {
+      const size_t numel = tensor.nbytes / 4;
+      const int64_t* src = static_cast<const int64_t*>(in.data);
+      std::vector<int32_t> narrowed(numel);
+      for (size_t e = 0; e < numel; e++) {
+#ifndef NDEBUG
+        // Index tensors (tokens/positions) are far below int32 range in
+        // practice; debug builds throw if the narrowing would be lossy.
+        if (static_cast<int32_t>(src[e]) != src[e]) {
+          throw std::runtime_error("WebGPU: int64 index overflows int32");
+        }
+#endif
+        narrowed[e] = static_cast<int32_t>(src[e]); // unchecked in release
+      }
+      wgpuQueueWriteBuffer(
+          queue_, tensor.buffer, 0, narrowed.data(), tensor.nbytes);
+      continue;
+    }
+
+    throw std::runtime_error( // neither the fast path nor narrowing applied
+        "WebGPU: unsupported input copy for input " + std::to_string(i) +
+        " (host " + std::to_string(in.nbytes) + " bytes" +
+        (in.host_is_int64 ? " int64" : "") + " vs buffer " +
+        std::to_string(tensor.nbytes) + " bytes)");
   }
 }
 
@@ -715,10 +855,11 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const {
   for (size_t i = 0; i < value_types_.size(); i++) {
     if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
       stats.num_tensors++;
-      // Shared tensors are tracked via shared_buffer_sizes_
+      // Shared tensors are tracked via shared_buffer_sizes_; a deferred
+      // prepack-routed constant has no buffer (no GPU memory) -> not counted.
       bool is_shared =
           i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0;
-      if (!is_shared) {
+      if (!is_shared && tensors_[i].buffer != nullptr) {
         stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes;
       }
     }
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index 3cff09ecb6d..3572f751a06 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -25,6 +25,16 @@ struct WebGPUTensor {
   WGPUBuffer buffer = nullptr;
   std::vector<int64_t> dims;
   size_t nbytes = 0;
+  // Serialized (GPU-side) element type, used to narrow wider host inputs.
+  size_t elem_size = 0;
+  bool is_int = false;
+};
+
+// Host-side view of one graph input, passed to copy_inputs.
+struct InputData {
+  const void* data = nullptr; // host bytes to upload; presumably not owned
+  size_t nbytes = 0; // byte length of data; 0 => copy_inputs skips the input
+  bool host_is_int64 = false; // host dtype is int64; enables int32 narrowing
 };
 
 struct WebGPUDispatch {
@@ -40,6 +50,15 @@ struct OutputCopy {
   size_t nbytes = 0;
 };
 
+// CPU-side record of where a constant's bytes live (inline offset or named
+// key) plus its size; mirrors Vulkan's TensorRef (sizes + a data reference,
+// not a live GPU tensor). Read by materialize_constant to upload the bytes.
+struct ConstantSource {
+  uint64_t inline_offset = UINT64_MAX; // offset into constant_data_; else key
+  std::string named_key; // non-empty => fetch from named_data_map_
+  size_t nbytes = 0; // bytes to upload; 0 makes materialize_constant a no-op
+};
+
 struct ExecuteConfig {
   size_t chunk_size = 0;
   size_t initial_chunk_size = 0;
@@ -75,7 +94,7 @@ class WebGPUGraph {
       const executorch::runtime::NamedDataMap* named_data_map = nullptr);
 
   // Copy input tensor data from host pointers into GPU buffers.
-  void copy_inputs(const std::vector<std::pair<const void*, size_t>>& inputs);
+  void copy_inputs(const std::vector<InputData>& inputs);
 
   // Execute all recorded dispatches.
   void execute();
@@ -109,6 +128,10 @@ class WebGPUGraph {
   bool get_bool(int id) const {
     return bools_[id];
   }
+  // Member value ids of a serialized ValueList (op multi-output list).
+  const std::vector<int>& get_value_list(int id) const {
+    return value_lists_[id]; // unchecked index; empty if id is not a list
+  }
 
   // Live-scalar (SymInt) API; mirrors the Vulkan SymInt/ParamsBuffer UBO.
   // set_symint writes the buffer + marks dirty only if the value changed.
@@ -138,8 +161,7 @@ class WebGPUGraph {
   }
 
   // Execute-time select_as_symint read; mirrors Vulkan select_as_symint_impl.
-  void update_symints_from_inputs(
-      const std::vector<std::pair<const void*, size_t>>& inputs);
+  void update_symints_from_inputs(const std::vector<InputData>& inputs);
 
   // Per-SymInt resize hook; mirrors Vulkan DynamicDispatchNode::trigger_resize.
   void add_resize_hook(int symint_id, std::function<void(WebGPUGraph&)> fn) {
@@ -167,6 +189,11 @@ class WebGPUGraph {
     dispatches_.push_back(dispatch);
   }
 
+  // Upload recorded constant const_value_id into dst with one CPU->GPU write;
+  // throws if its bytes are unavailable. Build-time only (the .pte bytes are
+  // freed after build()). Mirrors Vulkan prepack_standard.
+  void materialize_constant(int const_value_id, WGPUBuffer dst);
+
   void add_uniform_buffer_bytes(size_t bytes) {
     uniform_buffer_bytes_ += bytes;
   }
@@ -206,7 +233,16 @@ class WebGPUGraph {
     return static_cast<int>(value_types_.size());
   }
 
-  enum class ValueType { Tensor, Int, Double, Bool, Null, String, SymInt };
+  enum class ValueType {
+    Tensor,
+    Int,
+    Double,
+    Bool,
+    Null,
+    String,
+    SymInt,
+    ValueList // member ids are kept in value_lists_
+  };
 
   ValueType get_value_type(int id) const {
     return value_types_[id];
@@ -224,6 +260,7 @@ class WebGPUGraph {
   std::vector<int64_t> ints_;
   std::vector<double> doubles_;
   std::vector<bool> bools_;
+  std::vector<std::vector<int>> value_lists_;
 
   // SymInt (live scalar): id -> {live Uniform buffer, current value}, sparse.
   struct SymIntSlot {
@@ -263,6 +300,13 @@ class WebGPUGraph {
 
   std::vector<WebGPUDispatch> dispatches_;
 
+  // Per-constant byte sources (offset/named-key + size), keyed by value id and
+  // read by materialize_constant. constant_data_/named_data_map_ point at the
+  // .pte bytes and are valid only during build().
+  const uint8_t* constant_data_ = nullptr;
+  const executorch::runtime::NamedDataMap* named_data_map_ = nullptr;
+  std::unordered_map<int, ConstantSource> constant_sources_;
+
   ExecuteConfig execute_config_;
 
   // Caches for reusing GPU objects across dispatches.
diff --git a/backends/webgpu/runtime/WebGPUUtils.h b/backends/webgpu/runtime/WebGPUUtils.h
index 690ea72ebf7..39eb3caa28b 100644
--- a/backends/webgpu/runtime/WebGPUUtils.h
+++ b/backends/webgpu/runtime/WebGPUUtils.h
@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <cstring>
 #include <stdexcept>
 #include <string>
 
@@ -48,4 +49,25 @@ inline uint32_t compute_1d_workgroup_count(
   return count;
 }
 
+// Create a uniform buffer pre-filled with `size` bytes from `data`; may throw.
+inline WGPUBuffer
+make_uniform(WGPUDevice device, const void* data, size_t size) {
+  WGPUBufferDescriptor desc = {};
+  desc.size = size; // NOTE(review): WebGPU wants size % 4 == 0 when mapped
+  desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
+  desc.mappedAtCreation = true; // writable right away via GetMappedRange
+  WGPUBuffer buf = wgpuDeviceCreateBuffer(device, &desc);
+  if (!buf) {
+    throw std::runtime_error("make_uniform: buffer creation failed");
+  }
+  void* ptr = wgpuBufferGetMappedRange(buf, 0, size);
+  if (!ptr) {
+    wgpuBufferRelease(buf); // don't leak the buffer on the error path
+    throw std::runtime_error("make_uniform: mapped range is null");
+  }
+  std::memcpy(ptr, data, size); // data must hold at least `size` bytes
+  wgpuBufferUnmap(buf);
+  return buf; // presumably the caller releases it — TODO confirm ownership
+}
+
 } // namespace executorch::backends::webgpu::utils
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
index 28d4e8fef91..84b5349ef2d 100644
--- a/backends/webgpu/scripts/test_webgpu_native_ci.sh
+++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -8,8 +8,9 @@
 # Build + run the WebGPU native test executables on Dawn (Tint) + SwiftShader.
 # This is the substantive op-coverage gate: unlike the python operators suite
 # (which only delegates add.Tensor to WebGPU, the rest CPU-fallback), these
-# executables run rms_norm / multi-dispatch ordering / scratch through the real
-# WebGPU backend on Dawn.
+# executables run quantized_linear / SDPA / update_cache / multi-dispatch
+# ordering / scratch through the real WebGPU backend on Dawn. (Simple ops —
+# add / rms_norm / the misc ops — run through the cases.py op-test framework.)
 #
 # Assumes the Dawn env is already sourced (Dawn_DIR + VK_ICD_FILENAMES +
 # LD_LIBRARY_PATH) via .ci/scripts/setup-webgpu-linux-deps.sh. For local runs:
@@ -17,9 +18,9 @@
 #   bash backends/webgpu/scripts/test_webgpu_native_ci.sh
 #
 # Builds whatever native test targets are present in the landed tree (NOT a fixed
-# list). This stack lands: webgpu_native_test, webgpu_rms_norm_test (base) +
-# webgpu_dispatch_order_test, webgpu_scratch_buffer_test (D107576199) +
-# webgpu_update_cache_test (D107547307). SDPA executables join once they land.
+# list): webgpu_native_test (base) + webgpu_dispatch_order_test,
+# webgpu_scratch_buffer_test (D107576199) + webgpu_update_cache_test
+# (D107547307). SDPA executables join once they land.
 
 set -e
 
@@ -37,22 +38,31 @@ fi
 cd "${EXECUTORCH_ROOT}"
 
 # ── Exports for the model-driven executables (best-effort) ───────────────────
-# native_test + rms_norm + dispatch_order read .pte/golden inputs via env/dir and
-# self-skip if absent; scratch is standalone (generates its own inputs).
-PTE_MODEL="/tmp/webgpu_add_test.pte"
-PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
-RMS_NORM_DIR="/tmp/rmsn"
-RMS_NORM_OK=1
+# native_test (quantized_linear/SDPA/update_cache) + dispatch_order read .pte/
+# golden inputs via env/dir and self-skip if absent; scratch is standalone.
+# native_test itself is gated below on the executorch wheel being importable.
 DISPATCH_ORDER_DIR="/tmp/dispatch_order"
 DISPATCH_ORDER_OK=1
 UPDATE_CACHE_DIR="/tmp/update_cache"
 UPDATE_CACHE_OK=1
-
-$PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
-export_add_model('${PTE_MODEL}')
-export_chained_add_model('${PTE_CHAINED_MODEL}')
-" || echo "WARN: add export failed; webgpu_native_test self-skips models whose .pte is absent"
+EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte"
+EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin"
+EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin"
+EMBEDDING_LLAMA1B_MODEL="/tmp/webgpu_embedding_q4gsw_llama1b.pte"
+EMBEDDING_LLAMA1B_INDICES="/tmp/webgpu_embedding_q4gsw_llama1b_indices.bin"
+EMBEDDING_LLAMA1B_GOLDEN="/tmp/webgpu_embedding_q4gsw_llama1b_golden.bin"
+ROPE_MODEL="/tmp/webgpu_rope.pte"
+ROPE_XQ_GOLDEN="/tmp/webgpu_rope_xq_golden.bin"
+ROPE_XK_GOLDEN="/tmp/webgpu_rope_xk_golden.bin"
+ROPE_DECODE_MODEL="/tmp/webgpu_rope_decode.pte"
+ROPE_DECODE_XQ_GOLDEN="/tmp/webgpu_rope_decode_xq_golden.bin"
+ROPE_DECODE_XK_GOLDEN="/tmp/webgpu_rope_decode_xk_golden.bin"
+PREPACK_MODEL="/tmp/webgpu_prepack.pte"
+PREPACK_GOLDEN="/tmp/webgpu_prepack_golden.bin"
+PREPACK2_MODEL="/tmp/webgpu_prepack_two_const.pte"
+PREPACK2_GOLDEN="/tmp/webgpu_prepack_two_const_golden.bin"
+PREPACK_TIED_MODEL="/tmp/webgpu_prepack_tied_const.pte"
+PREPACK_TIED_GOLDEN="/tmp/webgpu_prepack_tied_const_golden.bin"
 
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import export_all_quantized_linear_models
@@ -60,9 +70,23 @@ export_all_quantized_linear_models('/tmp')
 " || echo "WARN: q4gsw export failed; required configs will FAIL in webgpu_native_test"
 
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
-export_rms_norm_cases('${RMS_NORM_DIR}')
-" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; }
+from executorch.backends.webgpu.test.ops.embedding_q4gsw.test_embedding_q4gsw import export_embedding_q4gsw_model
+export_embedding_q4gsw_model('${EMBEDDING_MODEL}', '${EMBEDDING_GOLDEN}', '${EMBEDDING_INDICES}')
+export_embedding_q4gsw_model('${EMBEDDING_LLAMA1B_MODEL}', '${EMBEDDING_LLAMA1B_GOLDEN}', '${EMBEDDING_LLAMA1B_INDICES}', 'llama1b')
+" || echo "WARN: embedding_q4gsw export failed; embedding configs will FAIL in webgpu_native_test"
+
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.rope.test_rope import export_rope_model
+export_rope_model('${ROPE_MODEL}', '${ROPE_XQ_GOLDEN}', '${ROPE_XK_GOLDEN}')
+export_rope_model('${ROPE_DECODE_MODEL}', '${ROPE_DECODE_XQ_GOLDEN}', '${ROPE_DECODE_XK_GOLDEN}', 'decode')
+" || echo "WARN: rope export failed; apply_rotary_emb configs will FAIL in webgpu_native_test"
+
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.prepack.test_prepack import export_prepack_model, export_prepack_two_const_model, export_prepack_tied_const_model
+export_prepack_model('${PREPACK_MODEL}', '${PREPACK_GOLDEN}')
+export_prepack_two_const_model('${PREPACK2_MODEL}', '${PREPACK2_GOLDEN}')
+export_prepack_tied_const_model('${PREPACK_TIED_MODEL}', '${PREPACK_TIED_GOLDEN}')
+" || echo "WARN: prepack export failed; prepack configs will FAIL in webgpu_native_test"
 
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
@@ -112,7 +136,7 @@ cmake \
     "${EXECUTORCH_ROOT}"
 
 # ── Build + run every native test target that exists in this tree ────────────
-TARGETS=(webgpu_native_test webgpu_rms_norm_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test)
+TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test)
 BIN_DIR="${BUILD_DIR}/backends/webgpu"
 
 # Which targets are defined depends on which diffs are landed (native_test +
@@ -141,20 +165,35 @@ for t in "${TARGETS[@]}"; do
 done
 
 echo "=== Run native tests on Dawn + SwiftShader ==="
-# native_test is model-driven; only run it if the export produced its .pte
-# (CI's setup-linux.sh provides the executorch wheel so exports succeed; a bare
-# local run without the wheel self-skips here rather than hard-failing on load).
-if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then
-  env WEBGPU_TEST_MODEL="${PTE_MODEL}" \
-      WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
-      WEBGPU_TEST_SDPA_DIR=/tmp/ \
+# webgpu_native_test hosts the quantized_linear / SDPA / update_cache / symint
+# sweeps. Gate on the executorch wheel being importable (the proxy for "the
+# exports above ran"): CI has the wheel so they ran; a bare local run without it
+# skips here rather than hard-failing the required-config guards.
+if [[ -x "${BIN_DIR}/webgpu_native_test" ]] &&
+  "${PYTHON_EXECUTABLE}" -c "import executorch" 2>/dev/null; then
+  env WEBGPU_TEST_SDPA_DIR=/tmp/ \
       WEBGPU_TEST_QUANTIZED_LINEAR_DIR=/tmp/ \
+      WEBGPU_TEST_EMBEDDING_Q4GSW_MODEL="${EMBEDDING_MODEL}" \
+      WEBGPU_TEST_EMBEDDING_Q4GSW_INDICES="${EMBEDDING_INDICES}" \
+      WEBGPU_TEST_EMBEDDING_Q4GSW_GOLDEN="${EMBEDDING_GOLDEN}" \
+      WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_MODEL="${EMBEDDING_LLAMA1B_MODEL}" \
+      WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_INDICES="${EMBEDDING_LLAMA1B_INDICES}" \
+      WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_GOLDEN="${EMBEDDING_LLAMA1B_GOLDEN}" \
+      WEBGPU_TEST_ROPE_MODEL="${ROPE_MODEL}" \
+      WEBGPU_TEST_ROPE_XQ_GOLDEN="${ROPE_XQ_GOLDEN}" \
+      WEBGPU_TEST_ROPE_XK_GOLDEN="${ROPE_XK_GOLDEN}" \
+      WEBGPU_TEST_ROPE_DECODE_MODEL="${ROPE_DECODE_MODEL}" \
+      WEBGPU_TEST_ROPE_DECODE_XQ_GOLDEN="${ROPE_DECODE_XQ_GOLDEN}" \
+      WEBGPU_TEST_ROPE_DECODE_XK_GOLDEN="${ROPE_DECODE_XK_GOLDEN}" \
+      WEBGPU_TEST_PREPACK_MODEL="${PREPACK_MODEL}" \
+      WEBGPU_TEST_PREPACK_GOLDEN="${PREPACK_GOLDEN}" \
+      WEBGPU_TEST_PREPACK2_MODEL="${PREPACK2_MODEL}" \
+      WEBGPU_TEST_PREPACK2_GOLDEN="${PREPACK2_GOLDEN}" \
+      WEBGPU_TEST_PREPACK_TIED_MODEL="${PREPACK_TIED_MODEL}" \
+      WEBGPU_TEST_PREPACK_TIED_GOLDEN="${PREPACK_TIED_GOLDEN}" \
       "${BIN_DIR}/webgpu_native_test"
 else
-  echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)"
-fi
-if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then
-  "${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}"
+  echo "(skipping webgpu_native_test: executorch wheel absent — exports did not run)"
 fi
 if [[ "${UPDATE_CACHE_OK}" == "1" && -x "${BIN_DIR}/webgpu_update_cache_test" ]]; then
   "${BIN_DIR}/webgpu_update_cache_test" "${UPDATE_CACHE_DIR}"
@@ -165,3 +204,25 @@ fi
 [[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"
 
 echo "=== WebGPU native tests on Dawn: all run targets passed ==="
+
+# ── Op-test codegen framework: generate manifest → build → run (Dawn+SwiftShader) ──
+# Reconfigure the SAME build dir adding GTest (EXECUTORCH_BUILD_TESTS=ON), then run
+# every op in cases.py against its torch golden. Self-skips if the generator can't run.
+OP_TEST_DIR="/tmp/webgpu_op_tests"
+if $PYTHON_EXECUTABLE -m executorch.backends.webgpu.test.op_tests.generate_op_tests \
+    --output "${OP_TEST_DIR}"; then
+  echo "=== Reconfigure with GTest + build/run op-test framework ==="
+  cmake -DEXECUTORCH_BUILD_TESTS=ON -B "${BUILD_DIR}" "${EXECUTORCH_ROOT}"
+  OP_DEFINED="$(cmake --build "${BUILD_DIR}" --target help 2>/dev/null || true)" # target list; a failed listing is tolerated
+  if printf '%s\n' "${OP_DEFINED}" | grep -qw webgpu_op_test_util_test; then # build/run only if this tree defines it
+    cmake --build "${BUILD_DIR}" --target webgpu_op_test_util_test -j"${NPROC}"
+    "${BIN_DIR}/webgpu_op_test_util_test"
+  fi
+  if printf '%s\n' "${OP_DEFINED}" | grep -qw webgpu_op_test; then # -w: does not match webgpu_op_test_util_test
+    cmake --build "${BUILD_DIR}" --target webgpu_op_test -j"${NPROC}"
+    "${BIN_DIR}/webgpu_op_test" --manifest "${OP_TEST_DIR}/manifest.json"
+  fi
+  echo "=== WebGPU op-test framework on Dawn: passed ===" # NOTE(review): also printed when neither target was defined
+else
+  echo "WARN: op-test manifest generation failed (needs the executorch wheel); skipping"
+fi
diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh
index 6681499c055..5ea465e853b 100755
--- a/backends/webgpu/test/test_build_webgpu.sh
+++ b/backends/webgpu/test/test_build_webgpu.sh
@@ -26,36 +26,18 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/test_wgsl_codegen.py" -v
 
 echo "=== Step 1: Run Python export tests ==="
 $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v
-# Non-fatal: a rms_norm pytest failure skips the rms_norm native test below
-# rather than aborting the whole run.
-RMS_NORM_PYTEST_OK=1
-$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v \
-    || RMS_NORM_PYTEST_OK=0
+$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v
 
 # ── Step 2: Export .pte model ─────────────────────────────────────────────────
 
 echo "=== Step 2: Export test models ==="
-PTE_MODEL="/tmp/webgpu_add_test.pte"
-PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
-RMS_NORM_DIR="/tmp/rmsn"
 DISPATCH_ORDER_DIR="/tmp/dispatch_order"
 PTE_UPDATE_CACHE_MODEL="/tmp/webgpu_update_cache_test.pte"
 cd "${EXECUTORCH_ROOT}"
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
-export_add_model('${PTE_MODEL}')
-export_chained_add_model('${PTE_CHAINED_MODEL}')
-"
-$PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
 export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
 "
-if [[ "${RMS_NORM_PYTEST_OK}" == "1" ]]; then
-  $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
-export_rms_norm_cases('${RMS_NORM_DIR}')
-" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_PYTEST_OK=0; }
-fi
 
 echo "=== Export update_cache model ==="
 UPDATE_CACHE_OK=1
@@ -113,7 +95,6 @@ cmake \
     "${EXECUTORCH_ROOT}"
 
 cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC}
-cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_rms_norm_test -j${NPROC}
 cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_dispatch_order_test -j${NPROC}
 cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_scratch_buffer_test -j${NPROC}
 
@@ -125,18 +106,10 @@ else
   echo "(skipping update_cache native test: export did not complete)"
 fi
 env \
-    WEBGPU_TEST_MODEL="${PTE_MODEL}" \
-    WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
     ${UPDATE_CACHE_ENV_VAR} \
     WEBGPU_TEST_SDPA_DIR=/tmp/ \
     "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test"
 
-if [[ "${RMS_NORM_PYTEST_OK}" == "1" ]]; then
-  "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_rms_norm_test" "${RMS_NORM_DIR}"
-else
-  echo "(skipping rms_norm native test: pytest or export did not complete)"
-fi
-
 "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
 "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_scratch_buffer_test"
 
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index ef643d33482..8d987578aa1 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -24,118 +24,6 @@ using namespace executorch::backends::webgpu;
 using namespace executorch::extension;
 using namespace executorch::runtime;
 
-static bool test_single_add(const std::string& model_path) {
-  printf("\n--- Test: single add (1024x1024) ---\n");
-
-  Module module(model_path);
-  auto err = module.load_forward();
-  if (err != Error::Ok) {
-    printf("FAIL: could not load forward method (error %d)\n", (int)err);
-    return false;
-  }
-  printf("Model loaded: %s\n", model_path.c_str());
-
-  constexpr int dim = 1024;
-  constexpr int size = dim * dim;
-
-  std::vector<float> a_data(size);
-  std::vector<float> b_data(size);
-  for (int i = 0; i < size; i++) {
-    a_data[i] = static_cast<float>(i) * 1.0f;
-    b_data[i] = static_cast<float>(i) * 2.0f;
-  }
-
-  auto a = make_tensor_ptr({dim, dim}, std::vector<float>(a_data));
-  auto b = make_tensor_ptr({dim, dim}, std::vector<float>(b_data));
-
-  auto result = module.forward({EValue(a), EValue(b)});
-  if (!result.ok()) {
-    printf("FAIL: forward failed (error %d)\n", (int)result.error());
-    return false;
-  }
-
-  const auto& outputs = result.get();
-  if (outputs.empty() || !outputs[0].isTensor()) {
-    printf("FAIL: no tensor output\n");
-    return false;
-  }
-
-  const auto& out_tensor = outputs[0].toTensor();
-  const float* out_data = out_tensor.const_data_ptr<float>();
-
-  float max_error = 0.0f;
-  int check_count = std::min(size, 1024);
-  for (int i = 0; i < check_count; i++) {
-    float expected = a_data[i] + b_data[i];
-    float error = std::abs(out_data[i] - expected);
-    max_error = std::max(max_error, error);
-  }
-
-  printf("Max error: %e (checked %d elements)\n", max_error, check_count);
-  if (max_error > 1e-3f) {
-    printf("FAIL: max error exceeds tolerance 1e-3\n");
-    return false;
-  }
-  printf("PASS: single add test\n");
-  return true;
-}
-
-static bool test_chained_add(const std::string& model_path) {
-  printf("\n--- Test: chained add (1024x1024, 5 ops) ---\n");
-
-  Module module(model_path);
-  auto err = module.load_forward();
-  if (err != Error::Ok) {
-    printf("FAIL: could not load forward method (error %d)\n", (int)err);
-    return false;
-  }
-  printf("Model loaded: %s\n", model_path.c_str());
-
-  constexpr int dim = 1024;
-  constexpr int size = dim * dim;
-
-  std::vector<float> x_data(size);
-  std::vector<float> y_data(size);
-  for (int i = 0; i < size; i++) {
-    x_data[i] = static_cast<float>(i % 100) * 0.01f;
-    y_data[i] = static_cast<float>(i % 50) * 0.02f;
-  }
-
-  auto x = make_tensor_ptr({dim, dim}, std::vector<float>(x_data));
-  auto y = make_tensor_ptr({dim, dim}, std::vector<float>(y_data));
-
-  auto result = module.forward({EValue(x), EValue(y)});
-  if (!result.ok()) {
-    printf("FAIL: forward failed (error %d)\n", (int)result.error());
-    return false;
-  }
-
-  const auto& outputs = result.get();
-  if (outputs.empty() || !outputs[0].isTensor()) {
-    printf("FAIL: no tensor output\n");
-    return false;
-  }
-
-  // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y
-  const auto& out_tensor = outputs[0].toTensor();
-  const float* out_data = out_tensor.const_data_ptr<float>();
-
-  float max_error = 0.0f;
-  for (int i = 0; i < size; i++) {
-    float expected = 3.0f * x_data[i] + 3.0f * y_data[i];
-    float error = std::abs(out_data[i] - expected);
-    max_error = std::max(max_error, error);
-  }
-
-  printf("Max error: %e (checked %d elements)\n", max_error, size);
-  if (max_error > 1e-3f) {
-    printf("FAIL: max error exceeds tolerance 1e-3\n");
-    return false;
-  }
-  printf("PASS: chained add test\n");
-  return true;
-}
-
 #ifdef WGPU_BACKEND_ENABLE_PROFILING
 // Capacity-overrun must throw; runs without a device or TimestampQuery.
 static bool test_query_pool_overrun_throws() {
@@ -407,7 +295,112 @@ static float q4gsw_ramp(int i) {
   return static_cast<float>((i % 17) - 8) / 16.0f;
 }
 
-// Per-element dual tolerance (abs OR rel), parameterized like sdpa_within_tol.
+// Fwd decl of the per-element abs-OR-rel tolerance helper (defined below).
+static bool quant_within_tol(
+    const float* out,
+    const float* golden,
+    int n,
+    float atol,
+    float rtol,
+    float* ma,
+    float* mr);
+
+static std::vector<int32_t> load_indices(
+    const std::string& path,
+    size_t numel) {
+  // Raw little-endian int32 indices from the export .py; empty on failure.
+  FILE* fp = std::fopen(path.c_str(), "rb");
+  if (fp == nullptr) {
+    return {};
+  }
+  std::vector<int32_t> indices(numel);
+  const size_t got = std::fread(indices.data(), sizeof(int32_t), numel, fp);
+  std::fclose(fp);
+  if (got != numel) {
+    return {};
+  }
+  return indices;
+}
+
+static bool test_embedding_q4gsw(
+    const std::string& model_path,
+    const std::string& indices_path,
+    const std::string& golden_path,
+    int num_indices,
+    int embed,
+    const char* label) {
+  // q4gsw embedding-gather vs torch golden; shapes per test_embedding_q4gsw.py.
+  const int out_numel = num_indices * embed;
+  printf(
+      "\n--- Test: embedding_q4gsw (%s: indices=%d, embed=%d) ---\n",
+      label,
+      num_indices,
+      embed);
+
+  Module module(model_path);
+  auto err = module.load_forward();
+  if (err != Error::Ok) {
+    printf("FAIL: could not load forward method (error %d)\n", (int)err);
+    return false;
+  }
+  printf("Model loaded: %s\n", model_path.c_str());
+
+  std::vector<int32_t> idx32 = load_indices(indices_path, num_indices);
+  std::vector<float> golden = load_golden(golden_path, out_numel);
+  if (idx32.empty() || golden.empty()) {
+    printf(
+        "FAIL: could not load indices %s / golden %s\n",
+        indices_path.c_str(),
+        golden_path.c_str());
+    return false;
+  }
+
+  // int64 at the program boundary; copy_inputs narrows to the int32 buffer.
+  std::vector<int64_t> idx64(idx32.begin(), idx32.end());
+  auto idx = make_tensor_ptr({num_indices}, std::move(idx64));
+
+  auto result = module.forward({EValue(idx)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+  const auto& out_tensor = outputs[0].toTensor();
+  if (out_tensor.numel() != out_numel) {
+    printf(
+        "FAIL: output numel %zu != expected %d\n",
+        (size_t)out_tensor.numel(),
+        out_numel);
+    return false;
+  }
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  float max_abs_err = 0.0f, max_rel_err = 0.0f;
+  const bool pass = quant_within_tol(
+      out_data,
+      golden.data(),
+      out_numel,
+      1e-3f,
+      1e-3f,
+      &max_abs_err,
+      &max_rel_err);
+  printf(
+      "Max abs error: %e   Max rel error: %e (checked %d elements)\n",
+      max_abs_err,
+      max_rel_err,
+      out_numel);
+  if (!pass) {
+    printf("FAIL: embedding_q4gsw exceeds tolerance 1e-3 (abs AND rel)\n");
+    return false;
+  }
+  printf("PASS: embedding_q4gsw test\n");
+  return true;
+}
+
 static bool quant_within_tol(
     const float* out,
     const float* golden,
@@ -432,6 +425,185 @@ static bool quant_within_tol(
   return ok;
 }
 
+static bool test_rope(
+    const std::string& model_path,
+    const std::string& xq_golden_path,
+    const std::string& xk_golden_path,
+    int S,
+    int NH,
+    int NKV,
+    int HD,
+    const char* label) {
+  // Llama interleaved RoPE vs torch goldens; shapes/ramps per test_rope.py.
+  const int xq_numel = S * NH * HD;
+  const int xk_numel = S * NKV * HD;
+  const int freqs_numel = S * (HD / 2);
+  printf(
+      "\n--- Test: apply_rotary_emb (%s: S=%d,NH=%d,NKV=%d,HD=%d) ---\n",
+      label,
+      S,
+      NH,
+      NKV,
+      HD);
+
+  Module module(model_path);
+  auto err = module.load_forward();
+  if (err != Error::Ok) {
+    printf("FAIL: could not load forward method (error %d)\n", (int)err);
+    return false;
+  }
+  printf("Model loaded: %s\n", model_path.c_str());
+
+  // Load goldens up front so an unloadable golden fails before forward().
+  std::vector<float> gq = load_golden(xq_golden_path, xq_numel);
+  std::vector<float> gk = load_golden(xk_golden_path, xk_numel);
+  if (gq.empty() || gk.empty()) {
+    printf(
+        "FAIL: could not load goldens %s / %s\n",
+        xq_golden_path.c_str(),
+        xk_golden_path.c_str());
+    return false;
+  }
+
+  // ((i % mod) - off) / 16: exact in fp32, matches test_rope.py::_ramp.
+  auto ramp = [](int i, int mod, int off) {
+    return static_cast<float>((i % mod) - off) / 16.0f;
+  };
+  std::vector<float> xq(xq_numel), xk(xk_numel), fc(freqs_numel),
+      fs(freqs_numel);
+  for (int i = 0; i < xq_numel; i++) {
+    xq[i] = ramp(i, 17, 8);
+  }
+  for (int i = 0; i < xk_numel; i++) {
+    xk[i] = ramp(i, 13, 6);
+  }
+  for (int i = 0; i < freqs_numel; i++) {
+    fc[i] = ramp(i, 11, 5);
+    fs[i] = ramp(i, 7, 3);
+  }
+
+  // The host buffers are not read again, so move them instead of copying.
+  auto xqt = make_tensor_ptr({1, S, NH, HD}, std::move(xq));
+  auto xkt = make_tensor_ptr({1, S, NKV, HD}, std::move(xk));
+  auto fct = make_tensor_ptr({S, HD / 2}, std::move(fc));
+  auto fst = make_tensor_ptr({S, HD / 2}, std::move(fs));
+
+  auto result =
+      module.forward({EValue(xqt), EValue(xkt), EValue(fct), EValue(fst)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+  const auto& outputs = result.get();
+  // [0]=xq_out, [1]=xk_out by position; numel check flags a swap if NH != NKV.
+  if (outputs.size() < 2 || !outputs[0].isTensor() || !outputs[1].isTensor()) {
+    printf("FAIL: expected 2 tensor outputs, got %zu\n", outputs.size());
+    return false;
+  }
+  const auto& xq_t = outputs[0].toTensor();
+  const auto& xk_t = outputs[1].toTensor();
+  if (xq_t.numel() != xq_numel || xk_t.numel() != xk_numel) {
+    printf(
+        "FAIL: output numels [%zu,%zu] != expected [%d,%d]\n",
+        (size_t)xq_t.numel(),
+        (size_t)xk_t.numel(),
+        xq_numel,
+        xk_numel);
+    return false;
+  }
+  const float* xq_out = xq_t.const_data_ptr<float>();
+  const float* xk_out = xk_t.const_data_ptr<float>();
+
+  // Per-element abs-OR-rel on xq and xk (shared helper, defined above).
+  float maq = 0.0f, mrq = 0.0f, mak = 0.0f, mrk = 0.0f;
+  const bool pass_q =
+      quant_within_tol(xq_out, gq.data(), xq_numel, 1e-3f, 1e-3f, &maq, &mrq);
+  const bool pass_k =
+      quant_within_tol(xk_out, gk.data(), xk_numel, 1e-3f, 1e-3f, &mak, &mrk);
+  const float max_abs_err = std::max(maq, mak);
+  const float max_rel_err = std::max(mrq, mrk);
+
+  printf(
+      "Max abs error: %e   Max rel error: %e (checked %d elements)\n",
+      max_abs_err,
+      max_rel_err,
+      xq_numel + xk_numel);
+  if (!(pass_q && pass_k)) {
+    printf("FAIL: apply_rotary_emb exceeds tolerance 1e-3 (abs AND rel)\n");
+    return false;
+  }
+  printf("PASS: apply_rotary_emb test\n");
+  return true;
+}
+
+static bool test_prepack(
+    const std::string& model_path,
+    const std::string& golden_path,
+    const std::string& label = "x + const w") {
+  // et_vk.prepack copy vs golden; unrun copy leaves zeros. See test_prepack.py.
+  constexpr int n = 4;
+  constexpr int numel = n * n;
+  printf("\n--- Test: prepack (%s, %dx%d) ---\n", label.c_str(), n, n);
+
+  Module module(model_path);
+  auto err = module.load_forward();
+  if (err != Error::Ok) {
+    printf("FAIL: could not load forward method (error %d)\n", (int)err);
+    return false;
+  }
+  printf("Model loaded: %s\n", model_path.c_str());
+
+  std::vector<float> golden = load_golden(golden_path, numel);
+  if (golden.empty()) {
+    printf("FAIL: could not load golden %s\n", golden_path.c_str());
+    return false;
+  }
+
+  // ((i % 13) - 6) / 16: exact in fp32, matches test_prepack.py::_inputs.
+  std::vector<float> x_data(numel);
+  for (int i = 0; i < numel; i++) {
+    x_data[i] = static_cast<float>((i % 13) - 6) / 16.0f;
+  }
+  auto x = make_tensor_ptr({n, n}, std::move(x_data)); // x_data is not reused
+
+  auto result = module.forward({EValue(x)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+  const auto& out_tensor = outputs[0].toTensor();
+  if (out_tensor.numel() != numel) {
+    printf(
+        "FAIL: output numel %zu != expected %d\n",
+        (size_t)out_tensor.numel(),
+        numel);
+    return false;
+  }
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  float max_abs_err = 0.0f, max_rel_err = 0.0f;
+  // Per-element abs-OR-rel (quant_within_tol): a global rel gate spuriously
+  // fails near-zero outputs where rel error explodes.
+  const bool within = quant_within_tol(
+      out_data, golden.data(), numel, 1e-3f, 1e-3f, &max_abs_err, &max_rel_err);
+  printf(
+      "Max abs error: %e   Max rel error: %e (checked %d elements)\n",
+      max_abs_err,
+      max_rel_err,
+      numel);
+  if (!within) {
+    printf("FAIL: prepack exceeds tolerance 1e-3\n");
+    return false;
+  }
+  printf("PASS: prepack test\n");
+  return true;
+}
+
 // Reconstruct _ramp_input bit-for-bit, run the op, compare to the fp64 golden.
 static bool test_q4gsw_config(
     const Q4gswConfig& cfg,
@@ -1318,12 +1490,11 @@ static bool test_symint_roundtrip(const std::string& blob_path) {
     return false;
   }
   const auto& in_ids = graph.input_ids();
-  std::vector<std::pair<const void*, size_t>> fake_inputs(
-      in_ids.size(), {nullptr, 0});
+  std::vector<InputData> fake_inputs(in_ids.size());
   int64_t fake_pos = 5;
   for (size_t i = 0; i < in_ids.size(); i++) {
     if (in_ids[i] == srcs[0].input_tensor_id) {
-      fake_inputs[i] = {&fake_pos, sizeof(int64_t)};
+      fake_inputs[i] = {&fake_pos, sizeof(int64_t), true};
     }
   }
   graph.update_symints_from_inputs(fake_inputs);
@@ -1440,19 +1611,6 @@ static bool test_resize_hook(const std::string& blob_path) {
 }
 
 int main(int argc, char** argv) {
-  std::string model_path = "webgpu_add_test.pte";
-  if (argc > 1) {
-    model_path = argv[1];
-  }
-  if (const char* env = std::getenv("WEBGPU_TEST_MODEL")) {
-    model_path = env;
-  }
-
-  std::string chained_model_path;
-  if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) {
-    chained_model_path = env;
-  }
-
   std::string update_cache_model_path;
   if (const char* env = std::getenv("WEBGPU_TEST_UPDATE_CACHE_MODEL")) {
     update_cache_model_path = env;
@@ -1467,6 +1625,86 @@ int main(int argc, char** argv) {
     }
   }
 
+  // embedding_q4gsw on-GPU configs: small + llama1b (env-gated,
+  // run-if-present).
+  struct EmbConfig {
+    const char* name;
+    const char* model_env;
+    const char* indices_env;
+    const char* golden_env;
+    int num_indices;
+    int embed;
+  };
+  const EmbConfig emb_configs[] = {
+      {"small",
+       "WEBGPU_TEST_EMBEDDING_Q4GSW_MODEL",
+       "WEBGPU_TEST_EMBEDDING_Q4GSW_INDICES",
+       "WEBGPU_TEST_EMBEDDING_Q4GSW_GOLDEN",
+       4,
+       64},
+      {"llama1b",
+       "WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_MODEL",
+       "WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_INDICES",
+       "WEBGPU_TEST_EMBEDDING_Q4GSW_LLAMA1B_GOLDEN",
+       4,
+       2048},
+  };
+
+  // apply_rotary_emb on-GPU configs: multi + decode (env-gated,
+  // run-if-present).
+  struct RopeConfig {
+    const char* name;
+    const char* model_env;
+    const char* xq_env;
+    const char* xk_env;
+    int S;
+    int NH;
+    int NKV;
+    int HD;
+  };
+  const RopeConfig rope_configs[] = {
+      {"multi",
+       "WEBGPU_TEST_ROPE_MODEL",
+       "WEBGPU_TEST_ROPE_XQ_GOLDEN",
+       "WEBGPU_TEST_ROPE_XK_GOLDEN",
+       5,
+       8,
+       2,
+       64},
+      {"decode",
+       "WEBGPU_TEST_ROPE_DECODE_MODEL",
+       "WEBGPU_TEST_ROPE_DECODE_XQ_GOLDEN",
+       "WEBGPU_TEST_ROPE_DECODE_XK_GOLDEN",
+       1,
+       32,
+       8,
+       64},
+  };
+
+  std::string prepack_model_path, prepack_golden_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_MODEL")) {
+    prepack_model_path = env;
+  }
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_GOLDEN")) {
+    prepack_golden_path = env;
+  }
+
+  std::string prepack2_model_path, prepack2_golden_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_MODEL")) {
+    prepack2_model_path = env;
+  }
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_GOLDEN")) {
+    prepack2_golden_path = env;
+  }
+
+  std::string prepack_tied_model_path, prepack_tied_golden_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_TIED_MODEL")) {
+    prepack_tied_model_path = env;
+  }
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_TIED_GOLDEN")) {
+    prepack_tied_golden_path = env;
+  }
+
   // SDPA sweep: configs self-discover their sdpa_<name>.pte/.golden.bin under
   // this directory (default "" = the embedded-file root / cwd). Set
   // WEBGPU_TEST_SDPA_DIR to point at the exported .pte directory (e.g. /tmp/).
@@ -1494,12 +1732,6 @@ int main(int argc, char** argv) {
   ok = test_query_pool_overrun_throws() && ok;
   ok = test_query_pool_roundtrip(ctx) && ok;
 #endif // WGPU_BACKEND_ENABLE_PROFILING
-  ok = test_single_add(model_path) && ok;
-
-  if (!chained_model_path.empty()) {
-    ok = test_chained_add(chained_model_path) && ok;
-  }
-
   if (!update_cache_model_path.empty()) {
     ok = test_update_cache(update_cache_model_path) && ok;
   }
@@ -1520,6 +1752,42 @@ int main(int argc, char** argv) {
     ok = false;
   }
 
+  for (const auto& c : emb_configs) {
+    const char* m = std::getenv(c.model_env);
+    const char* ip = std::getenv(c.indices_env);
+    const char* g = std::getenv(c.golden_env);
+    if (m && ip && g && *m && *ip && *g) {
+      ok = test_embedding_q4gsw(m, ip, g, c.num_indices, c.embed, c.name) && ok;
+    }
+  }
+
+  for (const auto& c : rope_configs) {
+    const char* m = std::getenv(c.model_env);
+    const char* xq = std::getenv(c.xq_env);
+    const char* xk = std::getenv(c.xk_env);
+    if (m && xq && xk && *m && *xq && *xk) {
+      ok = test_rope(m, xq, xk, c.S, c.NH, c.NKV, c.HD, c.name) && ok;
+    }
+  }
+
+  if (!prepack_model_path.empty() && !prepack_golden_path.empty()) {
+    ok = test_prepack(prepack_model_path, prepack_golden_path) && ok;
+  }
+
+  if (!prepack2_model_path.empty() && !prepack2_golden_path.empty()) {
+    ok = test_prepack(
+             prepack2_model_path, prepack2_golden_path, "x + w1 + w2") &&
+        ok;
+  }
+
+  if (!prepack_tied_model_path.empty() && !prepack_tied_golden_path.empty()) {
+    ok = test_prepack(
+             prepack_tied_model_path,
+             prepack_tied_golden_path,
+             "x + w + w (tied weights, shared key)") &&
+        ok;
+  }
+
   bool sdpa_ran = false;
   bool sdpa_ok = test_sdpa_sweep(sdpa_dir, &sdpa_ran);
   if (sdpa_ran) {
diff --git a/codegen/api/et_cpp.py b/codegen/api/et_cpp.py
index 88f1eb83fe0..a144128368c 100644
--- a/codegen/api/et_cpp.py
+++ b/codegen/api/et_cpp.py
@@ -40,7 +40,6 @@
     tensorT,
 )
 
-
 if TYPE_CHECKING:
     from collections.abc import Sequence
 
@@ -278,7 +277,7 @@ def default_expr(d: str, t: Type) -> str:
 
     if isinstance(t, OptionalType):
         if d == "None":
-            return "torch::executor::nullopt"
+            return "std::nullopt"
 
         return default_expr(d, t.elem)
 
diff --git a/codegen/api/types/types.py b/codegen/api/types/types.py
index 712d7e5e341..dd80daebb33 100644
--- a/codegen/api/types/types.py
+++ b/codegen/api/types/types.py
@@ -16,7 +16,6 @@
 )
 from torchgen.model import BaseTy
 
-
 halfT = BaseCppType("torch::executor", "Half")
 bfloat16T = BaseCppType("torch::executor", "BFloat16")
 stringT = BaseCppType("torch::executor", "string_view")
@@ -59,7 +58,7 @@ class OptionalCType(CType):
 
     def cpp_type(self, *, strip_ref: bool = False) -> str:
         # Do not pass `strip_ref` recursively.
-        return f"torch::executor::optional<{self.elem.cpp_type()}>"
+        return f"std::optional<{self.elem.cpp_type()}>"
 
     def remove_const_ref(self) -> CType:
         return OptionalCType(self.elem.remove_const_ref())
diff --git a/devtools/bundled_program/schema/README.md b/devtools/bundled_program/schema/README.md
index 096ab10fb57..c161958f189 100644
--- a/devtools/bundled_program/schema/README.md
+++ b/devtools/bundled_program/schema/README.md
@@ -4,3 +4,13 @@ and other useful info together for verifying the correctness of ExecuTorch progr
 
 ## Rules to ensure forward/backward compatibility
 Please check the rules in [here](../../../schema/README.md) for more info.
+
+
+## Regenerating generated code
+
+Schema changes require regenerating the Python bindings in
+`devtools/bundled_program/serialize/generated` and committing the updated files. From the repo root:
+
+```sh
+python devtools/bundled_program/serialize/generate_bundled_program.py
+```
diff --git a/devtools/bundled_program/serialize/BUCK b/devtools/bundled_program/serialize/BUCK
index ae920d1e4c2..89a8122503c 100644
--- a/devtools/bundled_program/serialize/BUCK
+++ b/devtools/bundled_program/serialize/BUCK
@@ -9,7 +9,7 @@ fbcode_target(_kind = runtime.python_library,
     name = "lib",
     srcs = [
         "__init__.py",
-    ],
+    ] + glob(["generated/**/*.py"]),
     resources = {
         "//executorch/devtools/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs",
         "//executorch/devtools/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs",
@@ -19,6 +19,7 @@ fbcode_target(_kind = runtime.python_library,
     # Please ask before changing this.
     visibility = ["PUBLIC"],
     deps = [
+        "fbsource//third-party/pypi/flatbuffers:flatbuffers",
         "fbsource//third-party/pypi/setuptools:setuptools",
         "//executorch/devtools/bundled_program/schema:bundled_program_schema_py",
         "//executorch/exir/_serialize:lib",
diff --git a/devtools/bundled_program/serialize/__init__.py b/devtools/bundled_program/serialize/__init__.py
index ceba7670910..50c6b5768ce 100644
--- a/devtools/bundled_program/serialize/__init__.py
+++ b/devtools/bundled_program/serialize/__init__.py
@@ -1,6 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -9,23 +9,62 @@
 
 # TODO(T138924864): Refactor to unify the serialization for bundled program and executorch program.
 
+import functools
 import importlib.resources as _resources
 import json
 import os
+import re
 import tempfile
+from typing import Any
 
 import executorch.devtools.bundled_program.schema as bp_schema
 
 import executorch.devtools.bundled_program.serialize as serialization_package
+
+import flatbuffers  # pyre-ignore[21]
 from executorch.devtools.bundled_program.core import BundledProgram
+from executorch.devtools.bundled_program.serialize.generated.bundled_program_flatbuffer import (
+    Bool as _Bool,
+    BundledMethodTestCase as _BundledMethodTestCase,
+    BundledMethodTestSuite as _BundledMethodTestSuite,
+    BundledProgram as _BundledProgram,
+    Double as _Double,
+    Int as _Int,
+    Tensor as _Tensor,
+    Value as _Value,
+    ValueUnion as _ValueUnion,
+)
 from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass
 from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile
+from executorch.exir._serialize._flatbuffer_program import (
+    _coerce_bytes,
+    _create_aligned_byte_vector,
+)
 
 # The prefix of schema files used for bundled program
 BUNDLED_PROGRAM_SCHEMA_NAME = "bundled_program_schema"
 SCALAR_TYPE_SCHEMA_NAME = "scalar_type"
 
 
+@functools.lru_cache(maxsize=1)
+def _bundled_program_file_identifier() -> bytes:
+    """Return the 4-byte file_identifier declared in the packaged schema.
+
+    Raises ValueError when the declaration is absent or not 4 bytes long.
+    """
+    fbs_name = f"{BUNDLED_PROGRAM_SCHEMA_NAME}.fbs"
+    schema_bytes = _resources.read_binary(serialization_package, fbs_name)
+    found = re.search(rb'file_identifier\s+"([^"]+)"', schema_bytes)
+    if found is None:
+        raise ValueError(f"Missing file_identifier in {fbs_name}")
+    identifier = found.group(1)
+    if len(identifier) == 4:
+        return identifier
+    raise ValueError(
+        f"Invalid file_identifier length {len(identifier)} in {fbs_name}"
+    )
+
+
 def write_schema(d: str, schema_name: str) -> None:
     schema_path = os.path.join(d, "{}.fbs".format(schema_name))
     with open(schema_path, "wb") as schema_file:
@@ -78,6 +117,145 @@ def convert_from_flatbuffer(program_flatbuffer: bytes) -> bytes:
             return output_file.read()
 
 
+def _pack_tensor(self: Any, builder: Any) -> int:
+    if self.sizes is not None:
+        _Tensor.TensorStartSizesVector(builder, len(self.sizes))
+        for i in reversed(range(len(self.sizes))):
+            builder.PrependInt32(self.sizes[i])
+        sizes = builder.EndVector()
+    if self.data is not None:
+        data = _create_aligned_byte_vector(builder, _coerce_bytes(self.data), 16)
+    if self.dimOrder is not None:
+        dim_order = _create_aligned_byte_vector(
+            builder, _coerce_bytes(self.dimOrder), 1
+        )
+
+    _Tensor.TensorStart(builder)
+    _Tensor.TensorAddScalarType(builder, self.scalarType)
+    if self.sizes is not None:
+        _Tensor.TensorAddSizes(builder, sizes)
+    if self.data is not None:
+        _Tensor.TensorAddData(builder, data)
+    if self.dimOrder is not None:
+        _Tensor.TensorAddDimOrder(builder, dim_order)
+    return _Tensor.TensorEnd(builder)
+
+
+def _pack_bundled_program(self: Any, builder: Any) -> int:
+    if self.methodTestSuites is not None:
+        method_test_suites_list = [
+            method_test_suite.Pack(builder)
+            for method_test_suite in self.methodTestSuites
+        ]
+        _BundledProgram.BundledProgramStartMethodTestSuitesVector(
+            builder, len(self.methodTestSuites)
+        )
+        for i in reversed(range(len(self.methodTestSuites))):
+            builder.PrependUOffsetTRelative(method_test_suites_list[i])
+        method_test_suites = builder.EndVector()
+    if self.program is not None:
+        program = _create_aligned_byte_vector(builder, _coerce_bytes(self.program), 32)
+
+    _BundledProgram.BundledProgramStart(builder)
+    _BundledProgram.BundledProgramAddVersion(builder, self.version)
+    if self.methodTestSuites is not None:
+        _BundledProgram.BundledProgramAddMethodTestSuites(builder, method_test_suites)
+    if self.program is not None:
+        _BundledProgram.BundledProgramAddProgram(builder, program)
+    return _BundledProgram.BundledProgramEnd(builder)
+
+
+@functools.lru_cache(maxsize=1)
+def _install_fast_packers() -> None:
+    _Tensor.TensorT.Pack = _pack_tensor
+    _BundledProgram.BundledProgramT.Pack = _pack_bundled_program
+
+
+def _convert_tensor(val: bp_schema.Tensor) -> Any:
+    result = _Tensor.TensorT()
+    result.scalarType = int(val.scalar_type)
+    result.sizes = list(val.sizes)
+    result.data = _coerce_bytes(val.data)
+    result.dimOrder = _coerce_bytes(val.dim_order)
+    return result
+
+
+def _convert_int(val: bp_schema.Int) -> Any:
+    result = _Int.IntT()
+    result.intVal = val.int_val
+    return result
+
+
+def _convert_bool(val: bp_schema.Bool) -> Any:
+    result = _Bool.BoolT()
+    result.boolVal = val.bool_val
+    return result
+
+
+def _convert_double(val: bp_schema.Double) -> Any:
+    result = _Double.DoubleT()
+    result.doubleVal = val.double_val
+    return result
+
+
+def _convert_value_union(val: bp_schema.ValueUnion) -> tuple[int, Any]:
+    if isinstance(val, bp_schema.Tensor):
+        return _ValueUnion.ValueUnion.Tensor, _convert_tensor(val)
+    if isinstance(val, bp_schema.Int):
+        return _ValueUnion.ValueUnion.Int, _convert_int(val)
+    if isinstance(val, bp_schema.Bool):
+        return _ValueUnion.ValueUnion.Bool, _convert_bool(val)
+    if isinstance(val, bp_schema.Double):
+        return _ValueUnion.ValueUnion.Double, _convert_double(val)
+    return _ValueUnion.ValueUnion.NONE, None
+
+
+def _convert_value(val: bp_schema.Value) -> Any:
+    result = _Value.ValueT()
+    result.valType, result.val = _convert_value_union(val.val)
+    return result
+
+
+def _convert_method_test_case(val: bp_schema.BundledMethodTestCase) -> Any:
+    result = _BundledMethodTestCase.BundledMethodTestCaseT()
+    result.inputs = [_convert_value(value) for value in val.inputs]
+    result.expectedOutputs = [_convert_value(value) for value in val.expected_outputs]
+    return result
+
+
+def _convert_method_test_suite(val: bp_schema.BundledMethodTestSuite) -> Any:
+    result = _BundledMethodTestSuite.BundledMethodTestSuiteT()
+    result.methodName = val.method_name
+    result.testCases = [
+        _convert_method_test_case(test_case) for test_case in val.test_cases
+    ]
+    return result
+
+
+def _convert_bundled_program(val: bp_schema.BundledProgram) -> Any:
+    result = _BundledProgram.BundledProgramT()
+    result.version = val.version
+    result.methodTestSuites = [
+        _convert_method_test_suite(suite) for suite in val.method_test_suites
+    ]
+    result.program = _coerce_bytes(val.program)
+    return result
+
+
+def _bundled_program_schema_to_flatbuffer(
+    bundled_program: bp_schema.BundledProgram,
+) -> bytes:
+    _install_fast_packers()
+    bundled_program_t = _convert_bundled_program(bundled_program)
+    builder = flatbuffers.Builder()
+    bundled_program_offset = bundled_program_t.Pack(builder)
+    builder.Finish(
+        bundled_program_offset,
+        file_identifier=_bundled_program_file_identifier(),
+    )
+    return bytes(builder.Output())
+
+
 # from bundled program to flatbuffer
 def serialize_from_bundled_program_to_flatbuffer(
     bundled_program: BundledProgram,
@@ -94,9 +272,7 @@ def serialize_from_bundled_program_to_flatbuffer(
 
     bundled_program_in_schema = bundled_program.serialize_to_schema()
 
-    return convert_to_flatbuffer(
-        serialize_from_bundled_program_to_json(bundled_program_in_schema)
-    )
+    return _bundled_program_schema_to_flatbuffer(bundled_program_in_schema)
 
 
 # From flatbuffer to bundled program in schema.
diff --git a/docs/source/backends/nxp/op-support.csv b/docs/source/backends/nxp/op-support.csv
index 8a250dce88d..fb67f47bf62 100644
--- a/docs/source/backends/nxp/op-support.csv
+++ b/docs/source/backends/nxp/op-support.csv
@@ -13,6 +13,7 @@ aten.constant_pad_nd.default,int8,static int8,"H or W padding only"
 aten.convolution.default,int8,static int8,"1D or 2D convolution, constant weights, groups=1 or groups=channels_count (depthwise)"
 aten.dim_order_ops._clone_dim_order.default,,, "See aten.clone.default"
 aten.div.Tensor,int8,static int8,"divisor - static tensor or scalar value, one dimension must satisfy %8 = 0 or scalar division (all dims = 1)"
+aten.exp.default,int8,static int8,
 aten.hardtanh.default,int8,static int8,"supported ranges: <0,6>, <-1, 1>, <0,1>, <0,inf>"
 aten.leaky_relu.default,int8,static int8,
 aten.log.default,int8,static int8,
diff --git a/examples/arm/executor_runner/arm_memory_allocator.cpp b/examples/arm/executor_runner/arm_memory_allocator.cpp
index de670df29ae..d3337b6005e 100644
--- a/examples/arm/executor_runner/arm_memory_allocator.cpp
+++ b/examples/arm/executor_runner/arm_memory_allocator.cpp
@@ -26,7 +26,7 @@ static void asan_unpoison_buffer(void* base, size_t size) {
 #endif
 
 ArmMemoryAllocator::ArmMemoryAllocator(uint32_t size, uint8_t* base_address)
-    : MemoryAllocator(size, base_address), used_(0) {
+    : MemoryAllocator(size, base_address) {
 #if defined(EXECUTORCH_ENABLE_ADDRESS_SANITIZER)
   asan_poison_buffer(base_address, size);
 #endif
@@ -34,35 +34,16 @@ ArmMemoryAllocator::ArmMemoryAllocator(uint32_t size, uint8_t* base_address)
 
 void* ArmMemoryAllocator::allocate(size_t size, size_t alignment) {
   void* ret = executorch::runtime::MemoryAllocator::allocate(size, alignment);
-  if (ret != nullptr) {
 #if defined(EXECUTORCH_ENABLE_ADDRESS_SANITIZER)
+  if (ret != nullptr) {
     asan_unpoison_buffer(ret, size);
-#endif
-    // Align with the same code as in MemoryAllocator::allocate() to keep
-    // used_ "in sync" As alignment is expected to be power of 2 (checked by
-    // MemoryAllocator::allocate()) we can check it the lower bits
-    // (same as alignment - 1) is zero or not.
-    if ((size & (alignment - 1)) == 0) {
-      // Already aligned.
-      used_ += size;
-    } else {
-      used_ = (used_ | (alignment - 1)) + 1 + size;
-    }
   }
+#endif
   return ret;
 }
 
-size_t ArmMemoryAllocator::used_size() const {
-  return used_;
-}
-
-size_t ArmMemoryAllocator::free_size() const {
-  return executorch::runtime::MemoryAllocator::size() - used_;
-}
-
 void ArmMemoryAllocator::reset() {
   executorch::runtime::MemoryAllocator::reset();
-  used_ = 0;
 #if defined(EXECUTORCH_ENABLE_ADDRESS_SANITIZER)
   asan_poison_buffer(base_address(), size());
 #endif
diff --git a/examples/arm/executor_runner/arm_memory_allocator.h b/examples/arm/executor_runner/arm_memory_allocator.h
index 1d7bbdecb4c..3c82f72c44b 100644
--- a/examples/arm/executor_runner/arm_memory_allocator.h
+++ b/examples/arm/executor_runner/arm_memory_allocator.h
@@ -10,21 +10,14 @@ using executorch::runtime::MemoryAllocator;
 
 #pragma once
 
-// Setup our own allocator that can show some extra stuff like used and free
-// memory info
+// Custom allocator that poisons/unpoisons its buffer for AddressSanitizer. The
+// used and free byte counts are reported by the base MemoryAllocator's
+// used_size() / free_size().
 class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator {
  public:
   ArmMemoryAllocator(uint32_t size, uint8_t* base_address);
 
   void* allocate(size_t size, size_t alignment = kDefaultAlignment) override;
 
-  // Returns the used size of the allocator's memory buffer.
-  size_t used_size() const;
-
-  // Returns the free size of the allocator's memory buffer.
-  size_t free_size() const;
-  void reset();
-
- private:
-  size_t used_;
+  void reset() override;
 };
diff --git a/examples/espressif/README.md b/examples/espressif/README.md
index 025bdf94094..a76e794030c 100644
--- a/examples/espressif/README.md
+++ b/examples/espressif/README.md
@@ -44,8 +44,6 @@ examples/espressif/
 ├── executor_runner/
 │   ├── CMakeLists.txt           # Component/standalone CMake build
 │   ├── esp_executor_runner.cpp  # Main executor runner
-│   ├── esp_memory_allocator.h   # Custom memory allocator
-│   ├── esp_memory_allocator.cpp
 │   ├── esp_perf_monitor.h       # Performance monitoring
 │   ├── esp_perf_monitor.cpp
 │   └── pte_to_header.py         # Convert .pte to C header
diff --git a/examples/espressif/executor_runner/CMakeLists.txt b/examples/espressif/executor_runner/CMakeLists.txt
index a103a1ddc8c..2a26c53d5e0 100644
--- a/examples/espressif/executor_runner/CMakeLists.txt
+++ b/examples/espressif/executor_runner/CMakeLists.txt
@@ -28,7 +28,6 @@ if(ESP_PLATFORM)
     SRCS
     "esp_executor_runner.cpp"
     "esp_pal.cpp"
-    "esp_memory_allocator.cpp"
     "esp_perf_monitor.cpp"
     INCLUDE_DIRS
     "."
@@ -283,7 +282,7 @@ else()
   add_executable(esp_executor_runner)
   target_sources(
     esp_executor_runner PRIVATE esp_executor_runner.cpp esp_pal.cpp
-                                esp_perf_monitor.cpp esp_memory_allocator.cpp
+                                esp_perf_monitor.cpp
   )
 
   target_link_libraries(
diff --git a/examples/espressif/executor_runner/esp_executor_runner.cpp b/examples/espressif/executor_runner/esp_executor_runner.cpp
index 9260e6b88a0..c2f1fa34dde 100644
--- a/examples/espressif/executor_runner/esp_executor_runner.cpp
+++ b/examples/espressif/executor_runner/esp_executor_runner.cpp
@@ -73,7 +73,6 @@
 #include <executorch/runtime/platform/runtime.h>
 
 #include "esp_executor_runner.h"
-#include "esp_memory_allocator.h"
 #include "esp_perf_monitor.h"
 
 #if defined(ESP_PLATFORM)
@@ -478,8 +477,8 @@ struct RunnerContext {
   bool bundle_io = false;
   Box<BufferDataLoader> loader;
   Box<Program> program;
-  Box<EspMemoryAllocator> method_allocator;
-  Box<EspMemoryAllocator> temp_allocator;
+  Box<MemoryAllocator> method_allocator;
+  Box<MemoryAllocator> temp_allocator;
   std::vector<Span<uint8_t>> planned_spans;
   Box<HierarchicalAllocator> planned_memory;
   Box<MemoryManager> memory_manager;
@@ -1020,7 +1019,7 @@ bool et_runner_init(void) {
     return false;
   }
 #endif
-  EspMemoryAllocator file_allocator(
+  MemoryAllocator file_allocator(
       method_allocation_pool_size, method_allocation_pool);
   auto [buffer, buffer_size] =
       load_file_from_fs("/spiffs/model.pte", file_allocator);
@@ -1247,4 +1246,4 @@ size_t et_runner_outputs_size(void) {
   ET_CHECK_MSG(model_ok == true, "Problem running model");
 
   ET_LOG(Info, "Program complete.");
-}
\ No newline at end of file
+}
diff --git a/examples/models/BUCK b/examples/models/BUCK
index a2b6789a95e..ed72a16e05f 100644
--- a/examples/models/BUCK
+++ b/examples/models/BUCK
@@ -33,6 +33,9 @@ fbcode_target(_kind = python_library,
         "//executorch/examples/models/phi_4_mini:phi_4_mini",  # @manual
         "//executorch/examples/models/smollm2:smollm2",  # @manual
         "//executorch/examples/models/smollm3:smollm3",  # @manual
+        "//executorch/examples/models/smolvlm:smolvlm",  # @manual
+        "//executorch/examples/models/whisper:whisper",  # @manual
+        "//executorch/examples/models/yolo26:yolo26",  # @manual
     ],
 )
 
diff --git a/examples/models/__init__.py b/examples/models/__init__.py
index 241a5cc366e..d50554006bd 100644
--- a/examples/models/__init__.py
+++ b/examples/models/__init__.py
@@ -45,6 +45,10 @@ class Model(str, Enum):
     MobileNetV1025 = "mobilenet_v1_025"
     ResNet8 = "resnet8"
     Sdpa = "sdpa"
+    Qwen3 = "qwen3"
+    SmolVLM = "smolvlm"
+    YOLO26 = "yolo26"
+    Whisper = "whisper"
 
     def __str__(self) -> str:
         return self.value
@@ -105,6 +109,10 @@ def __str__(self) -> str:
     ),
     str(Model.ResNet8): ("mlperf_tiny.resnet8", "ResNet8Model"),
     str(Model.Sdpa): ("toy_model", "SdpaModule"),
+    str(Model.Qwen3): ("qwen3", "Qwen3Model"),
+    str(Model.SmolVLM): ("smolvlm", "SmolVLMModel"),
+    str(Model.YOLO26): ("yolo26", "YOLO26Model"),
+    str(Model.Whisper): ("whisper", "WhisperModel"),
 }
 
 __all__ = [
diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md
index ae3bcb24c19..482f64083a0 100644
--- a/examples/models/gemma4_31b/README.md
+++ b/examples/models/gemma4_31b/README.md
@@ -93,14 +93,31 @@ method with dynamic sequence length and host-side sampling.
 
 Writes `model.pte` (and optionally `model.ptd`) into `--output-dir`.
 
-#### TurboQuant KV cache (long context, MLX only)
+#### TurboQuant KV cache (long context, CUDA + MLX)
 
 For long-context inference, add `--turboquant` to swap the full-attention
 layers' KV cache for a TurboQuant TQ4 cache (4-bit codebook + nibble pack).
 This gives ~3.8× cache memory savings on the full-attention layers and lets
-you fit context lengths that wouldn't fit in bf16. Sliding-window layers are unaffected.
+you fit context lengths that wouldn't fit in bf16. Sliding-window layers are
+unaffected. Supported on both the CUDA and MLX backends.
+
+**Long context requires BOTH flags**: `--turboquant` *and* a larger
+`--max-seq-len`. Raising `--max-seq-len` alone keeps a bf16 KV cache, which does
+not fit at long context. On CUDA, `--turboquant` is what enables 128k: Gemma4-31B
+at `--max-seq-len 131072` runs within ~27 GiB at runtime (fits a 32 GB card).
+
+```bash
+# CUDA — 128k context (TQ4 KV)
+python examples/models/gemma4_31b/export.py \
+    --gguf ./gemma-4-31B-it-Q4_K_M.gguf \
+    --output-dir ./gemma4_31b_exports_128k \
+    --max-seq-len 131072 \
+    --backend cuda \
+    --turboquant
+```
 
 ```bash
+# MLX (Apple Silicon)
 python examples/models/gemma4_31b/export.py \
     --prequantized ./gemma4_31b_int4 \
     --output-dir ./gemma4_31b_exports_mlx_tq \
diff --git a/examples/models/gemma4_31b/cuda_source_transformations.py b/examples/models/gemma4_31b/cuda_source_transformations.py
index f9b383cf224..132ddb33f1d 100644
--- a/examples/models/gemma4_31b/cuda_source_transformations.py
+++ b/examples/models/gemma4_31b/cuda_source_transformations.py
@@ -25,11 +25,17 @@
 
 import types
 
+# Importing this module registers ``torch.ops.triton.sdpa`` /
+# ``torch.ops.triton.sdpa_decode_splitk`` (the length-aware bf16 attention ops
+# used by the non-TurboQuant full-attention path below).
+import executorch.backends.cuda.triton.kernels.sdpa  # noqa: F401
+
 # Importing this module registers ``torch.ops.triton.tq4_sdpa``.
 import executorch.backends.cuda.triton.kernels.tq4_sdpa  # noqa: F401
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 from executorch.examples.models.gemma4.text_decoder import apply_rotary_emb
 from executorch.extension.llm.modules.turboquant import TurboQuantKVCache
@@ -46,6 +52,9 @@ def _turboquant_attention_forward(
 
     Mirrors the default forward up to (and including) RoPE; only the
     cache update and SDPA call differ.
+
+    NOTE: ``attn_mask`` is unused here; the mask is reconstructed in the
+    kernel to save data transfer. It is still accepted, as in the default forward.
     """
     B, T, _ = x.shape
 
@@ -77,14 +86,17 @@ def _turboquant_attention_forward(
     # uncompressed K/V is never materialized.
     k_packed, k_norms, v_packed, v_norms = self.kv_cache.update(input_pos, k, v)
 
-    # Number of valid (filled) KV positions = input_pos[0] + T. Bounds tq4_sdpa's
-    # KV loop to the actual context (O(context), not O(max_seq_len)) and enables
-    # the split-K decode path. GPU scalar (no .item()) so it's CUDA-graph-safe.
+    # Number of valid (filled) KV positions = input_pos[0] + T. Passing this to
+    # tq4_sdpa bounds its KV loop to the actual context instead of the full
+    # pre-allocated buffer (max_seq_len for global layers), making attention
+    # O(context) instead of O(max_seq_len). Kept as a GPU scalar (no ``.item()``)
+    # so the bound is captured correctly by the decode CUDA graph. Decode: T=1 ->
+    # input_pos+1; prefill chunk: T -> chunk_end.
+    # NOTE: this call-site argument was dropped during a rebase, which silently
+    # disabled the O(context) bound and forced a full max_seq_len sweep every
+    # step (catastrophic at 128k: ~2.7 tok/s decode vs ~37+ when bounded).
     kv_len = input_pos[0] + input_pos.shape[0]
 
-    # ``scale=self.scaling`` (= 1.0 for Gemma 4) — overrides tq4_sdpa's
-    # default ``1/sqrt(D)`` because Gemma's QK-norm has absorbed the
-    # 1/sqrt(d) factor into trained weights.
     y = torch.ops.triton.tq4_sdpa(
         q,
         k_packed,
@@ -93,8 +105,8 @@ def _turboquant_attention_forward(
         v_norms,
         self.kv_cache.centroids,
         self.kv_cache.rotation,
-        attn_mask,
-        False,  # is_causal — attn_mask already encodes causal masking
+        None,  # reconstruct attention mask in the kernel to save data transfer
+        False,  # is_causal: needs L_q==L_kv; causal comes from mask_is_causal
         self.scaling,
         kv_len,
         True,  # mask_is_causal: Gemma full-attention mask is standard causal
@@ -104,6 +116,178 @@ def _turboquant_attention_forward(
     return self.o_proj(y)
 
 
+def _lenaware_attention_forward(
+    self,
+    x: torch.Tensor,
+    input_pos: torch.Tensor,
+    attn_mask: torch.Tensor,
+) -> torch.Tensor:
+    """Drop-in ``Gemma4Attention.forward`` for full-attention layers on the
+    non-TurboQuant CUDA path that bounds SDPA to the valid context length.
+
+    Identical to the default forward (plain bf16 KV cache) except the final
+    ``F.scaled_dot_product_attention`` is replaced with
+    ``torch.ops.triton.sdpa(..., kv_len=...)``. Passing ``kv_len`` bounds the
+    attention KV loop to the actual filled context instead of the full
+    pre-allocated buffer (``max_seq_len`` for global layers), making decode
+    O(context) instead of O(max_seq_len) — and routes L_q==1 decode through the
+    length-aware split-K flash-decoding kernel. Sliding-window layers are not
+    patched (they already use a bounded ring buffer).
+    """
+    B, T, _ = x.shape
+
+    q = self.q_proj(x).view(B, T, self.n_heads, self.head_dim)
+    raw_k = self.k_proj(x).view(B, T, self.n_kv_heads, self.head_dim)
+    if self.k_eq_v:
+        raw_v = raw_k
+    else:
+        raw_v = self.v_proj(x).view(B, T, self.n_kv_heads, self.head_dim)
+
+    q = self.q_norm(q)
+    k = self.k_norm(raw_k)
+    v = self.v_norm(raw_v)
+
+    # (B, H, T, D) for SDPA / KV cache.
+    q = q.transpose(1, 2)
+    k = k.transpose(1, 2)
+    v = v.transpose(1, 2)
+
+    # RoPE: same code path as default forward.
+    freqs = torch.outer(input_pos.float(), self.inv_freq)
+    emb = torch.cat((freqs, freqs), dim=-1)
+    cos = torch.cos(emb)
+    sin = torch.sin(emb)
+    q, k = apply_rotary_emb(q, k, cos, sin)
+
+    # Update cache and read back the full (pre-allocated) K/V buffers.
+    k, v = self.kv_cache.update(input_pos, k, v)
+
+    # Number of valid (filled) KV positions = input_pos[0] + T. Passing this to
+    # sdpa bounds its KV loop to the actual context instead of the full
+    # pre-allocated buffer (max_seq_len for global layers), making attention
+    # O(context) instead of O(max_seq_len). Kept as a GPU scalar (no ``.item()``)
+    # so the bound is captured correctly by the decode CUDA graph. Decode: T=1 ->
+    # input_pos+1; prefill chunk: T -> chunk_end.
+    kv_len = input_pos[0] + input_pos.shape[0]
+
+    # ``scale=self.scaling`` (= 1.0 for Gemma 4) — Gemma's QK-norm has absorbed
+    # the 1/sqrt(d) factor into trained weights. ``enable_gqa=True`` lets the
+    # kernel handle the head ratio without materializing expanded K/V.
+    y = torch.ops.triton.sdpa(
+        q,
+        k,
+        v,
+        attn_mask,
+        0.0,  # dropout_p
+        False,  # is_causal: attn_mask already encodes causal masking
+        self.scaling,
+        True,  # enable_gqa
+        kv_len,
+    )
+
+    y = y.transpose(1, 2).contiguous().view(B, T, self.n_heads * self.head_dim)
+    return self.o_proj(y)
+
+
+def _fused_mlp_forward(self, x: torch.Tensor) -> torch.Tensor:
+    """Drop-in ``Gemma4MLP.forward`` over a fused gate|up projection.
+
+    Identical math to ``down(gelu(gate(x)) * up(x))``: the single
+    ``gate_up_proj`` emits ``[gate | up]`` concatenated on the last dim,
+    which is then split. One W4A8 matmul (and one activation-quant of ``x``)
+    instead of two.
+    """
+    h = self.gate_up_proj(x)
+    gate = h[..., : self.intermediate_size]
+    up = h[..., self.intermediate_size :]
+    return self.down_proj(F.gelu(gate, approximate="tanh") * up)
+
+
+def _concat_coalesced_int4_along_n(a, b):
+    """Concatenate two ``CudaCoalescedInt4Tensor`` along the output (N) dim.
+
+    qdata is ``[N, K/2]`` and scale/zero_point are ``[N, n_groups]`` in the
+    coalesced layout, so a per-output-row concat on dim 0 is exact: the W4A8
+    dp4a matvec reads each output row's qdata/scale/zero independently, so
+    out[:N_a] reproduces ``a`` and out[N_a:] reproduces ``b`` bit-for-bit.
+    """
+    from executorch.backends.cuda.coalesced_int4_tensor import CudaCoalescedInt4Tensor
+
+    return CudaCoalescedInt4Tensor(
+        torch.cat([a.qdata, b.qdata], dim=0),
+        torch.cat([a.scale, b.scale], dim=0),
+        torch.cat([a.zero_point, b.zero_point], dim=0),
+        a.block_size,
+        torch.Size([a.shape[0] + b.shape[0], a.shape[1]]),
+        None,
+        a.activation_dtype,
+    )
+
+
+def _is_fuseable_int4_pair(gate_w, up_w) -> bool:
+    """True iff gate/up are both coalesced-int4 with matching K + block_size.
+
+    Q4_K MLP weights become ``CudaCoalescedInt4Tensor`` (fuseable); a Q6_K
+    weight becomes ``CudaDp4aPlanarInt6Tensor`` (left alone). ``act_pre_scale``
+    is unused on this path but we require it absent so the concat stays exact.
+    """
+    from executorch.backends.cuda.coalesced_int4_tensor import CudaCoalescedInt4Tensor
+
+    return (
+        isinstance(gate_w, CudaCoalescedInt4Tensor)
+        and isinstance(up_w, CudaCoalescedInt4Tensor)
+        and list(gate_w.block_size) == list(up_w.block_size)
+        and gate_w.shape[1] == up_w.shape[1]
+        and gate_w.act_pre_scale is None
+        and up_w.act_pre_scale is None
+    )
+
+
+def _fuse_gate_up_proj(model: nn.Module) -> None:
+    """Fuse each MLP's ``gate_proj | up_proj`` into one ``gate_up_proj``.
+
+    gate and up share the same input, so the unfused path quantizes ``x`` to
+    int8 twice and launches two W4A8 matvecs per layer. Fusing the weights
+    into one ``[2*inter, hidden]`` tensor halves both. Weight bytes read are
+    unchanged, so the win is launch + activation-quant overhead (decode is
+    launch-bound). Only Q4_K (coalesced-int4) layers are fused; any layer
+    with a non-int4 weight is left as two matmuls (still correct).
+
+    Must run AFTER weights are packed to ``CudaCoalescedInt4Tensor`` (i.e.
+    inside ``_export_cuda``), and is independent of TurboQuant.
+    """
+    n_fused = 0
+    n_skipped = 0
+    for layer in model.layers:
+        mlp = getattr(layer, "mlp", None)
+        if mlp is None or not (hasattr(mlp, "gate_proj") and hasattr(mlp, "up_proj")):
+            continue
+        gate_w = mlp.gate_proj.weight
+        up_w = mlp.up_proj.weight
+        if not _is_fuseable_int4_pair(gate_w, up_w):
+            n_skipped += 1
+            continue
+        inter = up_w.shape[0]
+        hidden = up_w.shape[1]
+        fused_w = _concat_coalesced_int4_along_n(gate_w, up_w)
+
+        # Container built on meta to avoid materializing a dense
+        # [2*inter, hidden] weight before we overwrite it with fused_w.
+        gate_up = nn.Linear(hidden, 2 * inter, bias=False, device="meta")
+        gate_up.weight = nn.Parameter(fused_w, requires_grad=False)
+        mlp.gate_up_proj = gate_up
+        mlp.intermediate_size = inter
+        del mlp.gate_proj
+        del mlp.up_proj
+        mlp.forward = types.MethodType(_fused_mlp_forward, mlp)
+        n_fused += 1
+
+    msg = f"[gemma4_31b cuda] Fused gate+up on {n_fused} MLP layers"
+    if n_skipped:
+        msg += f" ({n_skipped} skipped: non-int4 weights)"
+    print(msg)
+
+
 def cuda_source_transformations(
     model: nn.Module,
     *,
@@ -111,6 +295,11 @@ def cuda_source_transformations(
 ) -> None:
     """Apply CUDA source transformations to a Gemma 4 31B model in place.
 
+    Always fuses each MLP's ``gate_proj|up_proj`` into a single matmul (one
+    activation-quant + one W4A8 matvec per layer instead of two; Q4_K
+    coalesced-int4 layers only — other quant types are left untouched).
+    Optionally also swaps full-attention KV caches for TurboQuant TQ4.
+
     Args:
         model: ``Gemma4_31B`` instance to transform.
         use_turboquant: When True, swap full-attention layers' KV caches
@@ -119,7 +308,25 @@ def cuda_source_transformations(
             ``torch.ops.triton.tq4_sdpa``. Sliding-window layers are
             unaffected.
     """
+    _fuse_gate_up_proj(model)
+
     if not use_turboquant:
+        # Non-TurboQuant path: keep the bf16 KV cache but bound full-attention
+        # SDPA to the valid context length via a runtime kv_len scalar (routes
+        # through torch.ops.triton.sdpa, which dispatches L_q==1 decode to the
+        # length-aware split-K flash-decoding kernel). Sliding-window layers
+        # already use a bounded ring buffer, so they are left untouched.
+        n_bounded = 0
+        for layer in model.layers:
+            attn = layer.self_attn
+            if attn.is_sliding:
+                continue
+            attn.forward = types.MethodType(_lenaware_attention_forward, attn)
+            n_bounded += 1
+        print(
+            f"[gemma4_31b cuda] length-aware SDPA: bounded {n_bounded} "
+            f"full-attention layers to runtime kv_len (O(context) attention)"
+        )
         return
 
     config = model.config
diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py
index 59be23020f2..b2b2264178a 100644
--- a/examples/models/gemma4_31b/export.py
+++ b/examples/models/gemma4_31b/export.py
@@ -171,6 +171,7 @@ def _export_cuda(
     )
     from executorch.exir.backend.compile_spec_schema import CompileSpec
     from executorch.exir.passes import MemoryPlanningPass
+    from executorch.exir.passes.propagate_device_pass import PropagateDeviceConfig
     from torch.export import Dim, export
 
     inductor_config.coordinate_descent_tuning = False
@@ -181,12 +182,11 @@ def _export_cuda(
 
     materialize_runtime_buffers(model, dtype=torch.bfloat16)
 
-    if use_turboquant:
-        from executorch.examples.models.gemma4_31b.cuda_source_transformations import (
-            cuda_source_transformations,
-        )
+    from executorch.examples.models.gemma4_31b.cuda_source_transformations import (
+        cuda_source_transformations,
+    )
 
-        cuda_source_transformations(model, use_turboquant=True)
+    cuda_source_transformations(model, use_turboquant=use_turboquant)
 
     # Int4Tensor weights are used directly — no format conversion.
     # F.linear dispatches to executorch_cuda::int4_plain_mm (CUDA shim).
@@ -270,6 +270,14 @@ def _export_cuda(
                 alloc_graph_input=False,
             ),
             emit_mutable_buffer_names=True,
+            # Keep method inputs/outputs device-resident so the CUDA backend
+            # does not insert boundary H2D/D2H copies: the runner stages inputs
+            # in CUDA memory and reads the sampled token back with a single
+            # small D2H. CUDA-only (no effect on the MLX path).
+            propagate_device_config=PropagateDeviceConfig(
+                skip_h2d_for_method_inputs=True,
+                skip_d2h_for_method_outputs=True,
+            ),
         ),
     )
 
diff --git a/examples/models/gemma4_31b/gguf_loader.py b/examples/models/gemma4_31b/gguf_loader.py
index e95581dc95d..90839ea6f6a 100644
--- a/examples/models/gemma4_31b/gguf_loader.py
+++ b/examples/models/gemma4_31b/gguf_loader.py
@@ -17,9 +17,12 @@
   linear and embedding. ``embed_tokens`` and ``lm_head`` stay tied -- they share
   the one quantized tensor.
 * **CUDA**: Q4_K -> ``Int4Tensor``, Q6_K -> ``CudaDp4aPlanarInt6Tensor`` (a genuine
-  6-bit packed weight, lossless, symmetric); ``lm_head`` keeps the quantized
-  tensor but the token embedding is dequantized to bf16 (the packed tensors can't
-  gather), so they are untied.
+  6-bit packed weight, lossless, symmetric). ``embed_tokens`` and ``lm_head`` are
+  untied: ``lm_head`` keeps a packed (int6/int4) matmul weight, while the token
+  embedding becomes a gatherable ``IntxUnpackedToInt8Tensor`` (int8) -- the truly
+  packed int4/int6 tensors can't gather. For the Q6_K tied weight the decode is
+  done once and shared between the two, avoiding a whole-tensor bf16 dequant and
+  a second decode (see ``_untie_embed_lm_head``).
 
 Usage:
     model, config = load_gguf_model("model.gguf", backend="cuda")
@@ -116,6 +119,55 @@ def _resolve_tied_lm_head(model, lm_head_weight, packers):
         )
 
 
+def _untie_embed_lm_head(model, gtensor, weight, backend):
+    """Untie the GGUF token-embed / lm_head weight, returning ``(embed, lm_head)``.
+
+    GGUF ties ``embed_tokens`` and ``lm_head`` to one quantized weight. The
+    returned ``lm_head`` is packed into ``model.lm_head`` after the streaming loop
+    (``_resolve_tied_lm_head``), or is ``None`` when this function already
+    assigned it.
+
+    * **MLX**: keep both tied on the raw ``ExportableGGUFTensor``.
+    * **CUDA** (Q6_K or Q4_K): untie so ``lm_head`` keeps a packed low-bit matmul
+      weight while the token embedding becomes a gatherable int8
+      ``IntxUnpackedToInt8Tensor`` -- the truly packed int4/int6 tensors can't
+      gather. Instead of dequantizing the whole ~1.4 B-element weight to bf16
+      (2 B/elem), decode it once to int8 (1 B/elem; the decode is lossless so the
+      result is numerically identical), halving the embedding's host + GPU-constant
+      footprint. The token embedding (Q4_K for the Gemma checkpoint) is the single
+      biggest weight, so this is the dominant saving vs the bf16 path. ``lm_head``:
+        - Q6_K -> ``CudaDp4aPlanarInt6Tensor`` from the *same* int8 decode and
+          assigned here (``pack_linear_for_cuda`` would mis-route an int8 tensor to
+          the int8 path), so the post-loop resolve is a no-op.
+        - Q4_K -> kept as the native ``Int4Tensor`` and returned, so
+          ``_resolve_tied_lm_head`` packs it to ``CudaCoalescedInt4Tensor`` (same
+          as a regular Q4_K linear).
+    * **CUDA, other types**: fall back to the bf16 embedding.
+    """
+    if backend == "mlx":
+        return weight, gtensor
+
+    if gtensor.ggml_type in ("q6_k", "q4_k"):
+        intx = gtensor.to_intx_unpacked_to_int8_tensor()
+        if gtensor.ggml_type == "q6_k":
+            import torch.nn as nn
+            from executorch.backends.cuda.dp4a_planar_int6_tensor import (
+                CudaDp4aPlanarInt6Tensor,
+            )
+
+            model.lm_head.weight = nn.Parameter(
+                CudaDp4aPlanarInt6Tensor._from_intx_int8(intx), requires_grad=False
+            )
+            return intx, None
+        # Q4_K: ``weight`` is the native Int4Tensor; let _resolve_tied_lm_head
+        # pack it to CudaCoalescedInt4Tensor. Only the embedding switches to int8.
+        return intx, weight
+
+    from executorch.examples.models.gemma4_31b.quant import dequantize_weight
+
+    return dequantize_weight(weight, torch.bfloat16), weight
+
+
 def load_gguf_model(
     gguf_path: str,
     max_seq_len: int = 4096,
@@ -140,7 +192,7 @@ def load_gguf_model(
         Gemma4_31BConfig,
         materialize_runtime_buffers,
     )
-    from executorch.examples.models.gemma4_31b.quant import dequantize_weight, pack_one
+    from executorch.examples.models.gemma4_31b.quant import pack_one
     from executorch.extension.llm.export.gguf import ExportableGGUFTensor, iter_gguf
 
     if backend == "cuda":
@@ -161,7 +213,7 @@ def load_gguf_model(
     with torch.device("meta"):
         model = Gemma4_31B(config)
 
-    lm_head_weight = None  # weight reused for a tied lm_head
+    lm_head_weight = None  # tied weight resolved into lm_head after the loop
     n_processed = 0
 
     print(f"Streaming GGUF from {gguf_path}...")
@@ -173,11 +225,9 @@ def load_gguf_model(
         if isinstance(value, ExportableGGUFTensor):
             weight = _convert_weight(model, model_key, value, backend)
             if model_key == "embed_tokens.weight":
-                # Tied lm_head reuses the embedding weight: MLX wants the raw
-                # ExportableGGUFTensor (linear pattern), CUDA the quant tensor.
-                lm_head_weight = value if backend == "mlx" else weight
-                if backend == "cuda":
-                    weight = dequantize_weight(weight, torch.bfloat16)
+                weight, lm_head_weight = _untie_embed_lm_head(
+                    model, value, weight, backend
+                )
             value = weight
         elif value.dtype == torch.float32:
             value = value.to(torch.bfloat16)
diff --git a/examples/models/gemma4_31b/main.cpp b/examples/models/gemma4_31b/main.cpp
index 83d1f639e75..3d9970b1610 100644
--- a/examples/models/gemma4_31b/main.cpp
+++ b/examples/models/gemma4_31b/main.cpp
@@ -23,8 +23,11 @@
 #include <executorch/extension/llm/sampler/util.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor.h>
+#include <executorch/extension/tensor/tensor_ptr.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/backend/options.h>
+#include <executorch/runtime/core/portable_type/device.h>
+#include <executorch/runtime/platform/assert.h>
 #include <executorch/runtime/platform/log.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 
@@ -76,25 +79,29 @@ DEFINE_bool(
     cuda_graph,
     false,
     "Enable CUDA graph capture for the decode method. CUDA only.");
-DEFINE_bool(
-    ignore_eos,
-    false,
-    "Do not stop at EOS; always generate exactly max_new_tokens. For "
-    "benchmarking decode throughput at a fixed token count (mirrors "
-    "llama.cpp --ignore-eos).");
 
 namespace llm = ::executorch::extension::llm;
 using ::executorch::extension::from_blob;
+using ::executorch::extension::make_tensor_ptr;
 using ::executorch::extension::Module;
+using ::executorch::extension::TensorPtr;
 using ::executorch::runtime::Error;
 using ::executorch::runtime::EValue;
+#ifdef EXECUTORCH_BUILD_CUDA
+using ::executorch::extension::clone_tensor_ptr_to;
+#endif
 
 using SizesType = executorch::aten::SizesType;
 
-// Read a sampled token ID from a scalar float output (CUDA path).
+// Read a sampled token ID from a scalar int64 output (CUDA path).
+//
+// The model now emits the sampled token as int64 (see sampler.py), matching
+// the decode method's int64 token input so the on-device output buffer can be
+// aliased directly as the next step's input. We still copy the 8-byte scalar
+// back to the host here for EOS detection and detokenization.
 static uint64_t read_token(const executorch::aten::Tensor& output) {
   const void* ptr = output.const_data_ptr();
-  float val = 0.0f;
+  int64_t val = 0;
 
 #ifdef EXECUTORCH_BUILD_CUDA
   cudaPointerAttributes attrs{};
@@ -102,7 +109,7 @@ static uint64_t read_token(const executorch::aten::Tensor& output) {
       attrs.type == cudaMemoryTypeDevice;
   if (on_device) {
     cudaError_t err =
-        cudaMemcpy(&val, ptr, sizeof(float), cudaMemcpyDeviceToHost);
+        cudaMemcpy(&val, ptr, sizeof(int64_t), cudaMemcpyDeviceToHost);
     if (err != cudaSuccess) {
       ET_LOG(
           Error,
@@ -111,13 +118,13 @@ static uint64_t read_token(const executorch::aten::Tensor& output) {
       return 0;
     }
   } else {
-    memcpy(&val, ptr, sizeof(float));
+    memcpy(&val, ptr, sizeof(int64_t));
   }
 #else
-  memcpy(&val, ptr, sizeof(float));
+  memcpy(&val, ptr, sizeof(int64_t));
 #endif
 
-  return static_cast<uint64_t>(llrintf(val));
+  return static_cast<uint64_t>(val);
 }
 
 int main(int argc, char** argv) {
@@ -187,6 +194,8 @@ int main(int argc, char** argv) {
       FLAGS_temperature <= 0.0 ? 1e-6f : static_cast<float>(FLAGS_temperature);
 
 #ifdef EXECUTORCH_BUILD_CUDA
+  const auto cuda_device =
+      executorch::aten::Device(executorch::aten::DeviceType::CUDA, 0);
   if (FLAGS_cuda_graph) {
     executorch::runtime::BackendOptions<2> cuda_opts;
     cuda_opts.set_option("enable_cuda_graph_for_method", "decode");
@@ -223,8 +232,9 @@ int main(int argc, char** argv) {
     ET_LOG(Error, "Failed to load decode method");
     return 1;
   }
-  auto temp_tensor =
-      from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float);
+  auto temp_tensor = clone_tensor_ptr_to(
+      from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float),
+      cuda_device);
 #else
   if (FLAGS_cuda_graph) {
     ET_LOG(Info, "--cuda_graph ignored on non-CUDA build");
@@ -292,6 +302,12 @@ int main(int argc, char** argv) {
   // ---------------------------------------------------------------
   uint64_t cur_token = 0;
   int64_t prefill_pos = 0;
+#ifdef EXECUTORCH_BUILD_CUDA
+  // Alias of the most recent forward's on-device int64 output token. The last
+  // prefill chunk's output seeds the first decode step (no token H2D); each
+  // decode step then re-aliases its own output for the next step.
+  TensorPtr device_out_token;
+#endif
   while (prefill_pos < num_prompt_tokens) {
     int64_t chunk_len =
         std::min(num_prompt_tokens - prefill_pos, max_prefill_chunk);
@@ -310,6 +326,12 @@ int main(int argc, char** argv) {
     auto pos_tensor = from_blob(
         pos_data.data(), {S(chunk_len)}, executorch::aten::ScalarType::Long);
 
+#ifdef EXECUTORCH_BUILD_CUDA
+    // skip_h2d: prefill/decode method inputs must already live in CUDA memory.
+    tokens_tensor = clone_tensor_ptr_to(tokens_tensor, cuda_device);
+    pos_tensor = clone_tensor_ptr_to(pos_tensor, cuda_device);
+#endif
+
     std::vector<EValue> inputs;
     inputs.push_back(EValue(tokens_tensor));
     inputs.push_back(EValue(pos_tensor));
@@ -328,7 +350,11 @@ int main(int argc, char** argv) {
     }
 
 #ifdef EXECUTORCH_BUILD_CUDA
-    cur_token = read_token(result.get()[0].toTensor());
+    const auto& out_tensor = result.get()[0].toTensor();
+    cur_token = read_token(out_tensor);
+    // Keep the sampled token on device: alias the output buffer so it feeds
+    // straight into the next forward as the int64 token input (zero copy).
+    device_out_token = make_tensor_ptr(out_tensor);
 #else
     cur_token = static_cast<uint64_t>(
         llm::logits_to_token(result.get()[0].toTensor(), temp_val));
@@ -360,22 +386,69 @@ int main(int argc, char** argv) {
   // Decode loop
   // ---------------------------------------------------------------
   int64_t pos = num_prompt_tokens;
-  std::vector<int64_t> decode_token_data = {static_cast<int64_t>(cur_token)};
   std::vector<int64_t> decode_pos_data = {pos};
+  auto decode_pos_cpu = from_blob(
+      decode_pos_data.data(), {1}, executorch::aten::ScalarType::Long);
+#ifdef EXECUTORCH_BUILD_CUDA
+  // Fixed device-resident position input slot: the decode method always reads
+  // the position from this same address every step (cuda-graph-safe). Seeded
+  // once here with a one-time H2D; refreshed each step by an on-device D2D.
+  auto decode_pos = clone_tensor_ptr_to(decode_pos_cpu, cuda_device);
+  // Upload the FULL decode position array to device ONCE (a single H2D - the
+  // one-time copy we keep). Each step copies its position from here into the
+  // fixed slot with a device-to-device copy, so there is NO per-round pos H2D.
+  std::vector<int64_t> pos_seq_data(FLAGS_max_new_tokens);
+  for (int32_t i = 0; i < FLAGS_max_new_tokens; i++) {
+    pos_seq_data[i] = num_prompt_tokens + i;
+  }
+  auto pos_seq_dev = clone_tensor_ptr_to(
+      from_blob(
+          pos_seq_data.data(),
+          {S(FLAGS_max_new_tokens)},
+          executorch::aten::ScalarType::Long),
+      cuda_device);
+  auto* pos_seq_dev_ptr =
+      static_cast<int64_t*>(pos_seq_dev->mutable_data_ptr());
+  auto* decode_pos_slot_ptr =
+      static_cast<int64_t*>(decode_pos->mutable_data_ptr());
+#else
+  // Non-CUDA (MLX) path: keep host token/pos buffers; the backend stages them
+  // and the host samples from the returned logits.
+  std::vector<int64_t> decode_token_data = {static_cast<int64_t>(cur_token)};
   auto decode_tokens = from_blob(
       decode_token_data.data(), {1, 1}, executorch::aten::ScalarType::Long);
-  auto decode_pos = from_blob(
-      decode_pos_data.data(), {1}, executorch::aten::ScalarType::Long);
+  auto decode_pos = decode_pos_cpu;
+#endif
 
   uint64_t prev_token = cur_token;
-  bool hit_eos =
-      !FLAGS_ignore_eos && eos_ids.find(cur_token) != eos_ids.end();
+  bool hit_eos = eos_ids.find(cur_token) != eos_ids.end();
   for (int32_t step = 0; step < FLAGS_max_new_tokens && !hit_eos; step++) {
-    decode_token_data[0] = static_cast<int64_t>(cur_token);
+#ifdef EXECUTORCH_BUILD_CUDA
+    // No per-round H2D: copy this step's position from the pre-uploaded device
+    // position array into the fixed position slot with an on-device D2D. With
+    // the token aliased on device and the position staged via D2D, the
+    // per-round HtoD count is zero (independent of decode length).
+    // NOTE(review): CUDA documents D2D cudaMemcpy as NOT host-synchronizing;
+    // ordering vs. the decode kernels relies on default-stream semantics. The
+    // cuda-graph case (captured cudaMemcpyAsync into this slot) is unverified.
+    ET_CHECK_MSG(
+        cudaMemcpy(
+            decode_pos_slot_ptr,
+            pos_seq_dev_ptr + step,
+            sizeof(int64_t),
+            cudaMemcpyDeviceToDevice) == cudaSuccess,
+        "Failed to copy decode position D2D");
+#else
     decode_pos_data[0] = pos;
+    decode_token_data[0] = static_cast<int64_t>(cur_token);
+#endif
 
     std::vector<EValue> inputs;
+#ifdef EXECUTORCH_BUILD_CUDA
+    inputs.push_back(EValue(device_out_token));
+#else
     inputs.push_back(EValue(decode_tokens));
+#endif
     inputs.push_back(EValue(decode_pos));
 
 #ifdef EXECUTORCH_BUILD_CUDA
@@ -392,7 +465,10 @@ int main(int argc, char** argv) {
 
     prev_token = cur_token;
 #ifdef EXECUTORCH_BUILD_CUDA
-    cur_token = read_token(result.get()[0].toTensor());
+    const auto& out_tensor = result.get()[0].toTensor();
+    cur_token = read_token(out_tensor);
+    // Alias this step's on-device output token as the next step's token input.
+    device_out_token = make_tensor_ptr(out_tensor);
 #else
     cur_token = static_cast<uint64_t>(
         llm::logits_to_token(result.get()[0].toTensor(), temp_val));
@@ -405,7 +481,7 @@ int main(int argc, char** argv) {
       fflush(stdout);
     }
 
-    hit_eos = !FLAGS_ignore_eos && eos_ids.find(cur_token) != eos_ids.end();
+    hit_eos = eos_ids.find(cur_token) != eos_ids.end();
   }
   printf("\n");
 
diff --git a/examples/models/gemma4_31b/model.py b/examples/models/gemma4_31b/model.py
index bfaa73a754b..d953541a244 100644
--- a/examples/models/gemma4_31b/model.py
+++ b/examples/models/gemma4_31b/model.py
@@ -484,7 +484,7 @@ def forward(
             temperature: 1-D float tensor for Gumbel-max sampling.
 
         Returns:
-            (B, 1) sampled token IDs as float.
+            (B, 1) sampled token IDs as int64.
         """
         x = self.embed_tokens(tokens) * self.embed_normalizer
 
diff --git a/examples/models/gemma4_31b/sampler.py b/examples/models/gemma4_31b/sampler.py
index 690344fd2e4..2ce428224a2 100644
--- a/examples/models/gemma4_31b/sampler.py
+++ b/examples/models/gemma4_31b/sampler.py
@@ -26,9 +26,12 @@ def sample(
             temperature still works ("near-greedy").
 
     Returns:
-        ``[B, 1]`` float32 token IDs (``argmax(logits/T + gumbel_noise)``).
+        ``[B, 1]`` int64 token IDs (``argmax(logits/T + gumbel_noise)``).
+        Emitting int64 (rather than casting to float) lets the runner alias the
+        on-device output token directly as the next decode step's int64 token
+        input — no D2H/H2D round-trip and no dtype cast.
     """
     logits = logits / temperature.clamp(min=1e-6)
     noise = torch.rand_like(logits)
     gumbel = -torch.log(-torch.log(noise + 1e-20) + 1e-20)
-    return (logits + gumbel).argmax(dim=-1, keepdim=True).float()
+    return (logits + gumbel).argmax(dim=-1, keepdim=True).to(torch.int64)
diff --git a/examples/models/gemma4_31b/tests/test_cuda_pipeline.py b/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
index caf0a44e03b..c346c1d2f82 100644
--- a/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
+++ b/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
@@ -246,13 +246,14 @@ def _load(self, tmp):
 
     def test_load_converts_weights(self):
         """GGUF -> CUDA: Q4_K -> CudaCoalescedInt4Tensor, Q6_K -> CudaDp4aPlanarInt6Tensor,
-        embedding bf16."""
+        embedding int8 (gatherable)."""
         from executorch.backends.cuda.coalesced_int4_tensor import (
             CudaCoalescedInt4Tensor,
         )
         from executorch.backends.cuda.dp4a_planar_int6_tensor import (
             CudaDp4aPlanarInt6Tensor,
         )
+        from torchao.quantization import IntxUnpackedToInt8Tensor
 
         with tempfile.TemporaryDirectory() as tmp:
             model, _ = self._load(tmp)
@@ -263,11 +264,49 @@ def test_load_converts_weights(self):
         self.assertIsInstance(
             model.layers[0].mlp.down_proj.weight.data, CudaDp4aPlanarInt6Tensor
         )
-        # Tied lm_head is repacked to int6 by pack_cuda (it keeps quantization,
-        # unlike the token embedding which is dequantized for the gather).
+        # Tied lm_head keeps a packed int6 matmul weight.
         self.assertIsInstance(model.lm_head.weight.data, CudaDp4aPlanarInt6Tensor)
-        # Token embedding is dequantized to bf16 (Int4/packed-int6 can't gather).
-        self.assertEqual(model.embed_tokens.weight.dtype, torch.bfloat16)
+        # Token embedding is decoded to a gatherable int8 tensor (not bf16): the
+        # Q6_K decode is lossless and shared with lm_head. Keeping it int8 (vs
+        # bf16) avoids a ~5.6 GB fp32 dequant transient and ~1.4 GB resident at
+        # export time.
+        self.assertIsInstance(model.embed_tokens.weight.data, IntxUnpackedToInt8Tensor)
+
+    def test_int8_embedding_matches_bf16(self):
+        """Guard the bf16 -> int8 token-embedding switch.
+
+        The embedding is now loaded as a gatherable int8 ``IntxUnpackedToInt8Tensor``
+        instead of being dequantized to bf16. Its gathered rows must match the bf16
+        dequant of the *source* GGUF token embedding -- i.e. exactly what the old
+        ``dequantize_weight(..., bf16)`` path returned. The GGUF decode is lossless,
+        so they agree to bf16 precision.
+        """
+        from executorch.examples.models.gemma4_31b.gguf_loader import gguf_to_model_key
+        from executorch.extension.llm.export.gguf import ExportableGGUFTensor, iter_gguf
+        from torchao.quantization import IntxUnpackedToInt8Tensor
+
+        with tempfile.TemporaryDirectory() as tmp:
+            path = os.path.join(tmp, "tiny.gguf")
+            build_gguf_checkpoint(path)
+            # Reference = bf16 dequant of the source GGUF token embedding (the
+            # tensor the previous bf16 embedding path materialized).
+            ref_bf16 = None
+            for name, val in iter_gguf(path):
+                if gguf_to_model_key(name) == "embed_tokens.weight":
+                    self.assertIsInstance(val, ExportableGGUFTensor)
+                    ref_bf16 = val.dequantize(torch.bfloat16)
+                    break
+            self.assertIsNotNone(ref_bf16, "token_embd.weight not found in GGUF")
+            model, _ = load_gguf_model(path, backend="cuda", config=GGUF_CONFIG)
+
+        self.assertIsInstance(model.embed_tokens.weight.data, IntxUnpackedToInt8Tensor)
+
+        ids = torch.tensor([0, 1, 7, GGUF_CONFIG.vocab_size - 1])
+        out = model.embed_tokens(ids)  # int8 gather + dequant
+        ref = ref_bf16[ids]
+        self.assertEqual(out.shape, ref.shape)
+        rel_err = (out.float() - ref.float()).abs().mean() / ref.float().abs().mean()
+        self.assertLess(rel_err.item(), 0.02)
 
     def test_generate(self):
         """GGUF -> CUDA -> eager generate produces valid tokens (inference.py)."""
diff --git a/examples/models/parakeet/CMakeLists.txt b/examples/models/parakeet/CMakeLists.txt
index 810f2815abd..a2b798de557 100644
--- a/examples/models/parakeet/CMakeLists.txt
+++ b/examples/models/parakeet/CMakeLists.txt
@@ -109,32 +109,49 @@ if(EXECUTORCH_BUILD_VULKAN)
   executorch_target_link_options_shared_lib(vulkan_backend)
 endif()
 
-add_executable(parakeet_runner main.cpp timestamp_utils.cpp tokenizer_utils.cpp)
-if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
-  target_link_options_gc_sections(parakeet_runner)
-  if(NOT APPLE AND NOT MSVC)
-    target_link_options(parakeet_runner PRIVATE "LINKER:-s")
-  endif()
-endif()
+set(parakeet_shared_sources parakeet_transcriber.cpp timestamp_utils.cpp
+                            tokenizer_utils.cpp
+)
 
-# Copy MLX metallib for runtime if MLX delegate is enabled
-if(TARGET mlxdelegate)
-  executorch_target_copy_mlx_metallib(parakeet_runner)
-endif()
+set(parakeet_common_include_directories
+    ${_common_include_directories} ${EXECUTORCH_ROOT}/third-party/json/include
+)
 
-target_include_directories(
-  parakeet_runner PUBLIC ${_common_include_directories}
+add_executable(parakeet_runner main.cpp ${parakeet_shared_sources})
+add_executable(
+  parakeet_helper parakeet_helper.cpp parakeet_helper_protocol.cpp
+                  ${parakeet_shared_sources}
 )
-target_link_libraries(parakeet_runner PUBLIC ${link_libraries})
-target_compile_options(parakeet_runner PUBLIC ${_common_compile_options})
+
+foreach(parakeet_target parakeet_runner parakeet_helper)
+  if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+    target_link_options_gc_sections(${parakeet_target})
+    if(NOT APPLE AND NOT MSVC)
+      target_link_options(${parakeet_target} PRIVATE "LINKER:-s")
+    endif()
+  endif()
+
+  if(TARGET mlxdelegate)
+    executorch_target_copy_mlx_metallib(${parakeet_target})
+  endif()
+
+  target_include_directories(
+    ${parakeet_target} PUBLIC ${parakeet_common_include_directories}
+  )
+  target_link_libraries(${parakeet_target} PUBLIC ${link_libraries})
+  target_compile_options(${parakeet_target} PUBLIC ${_common_compile_options})
+endforeach()
 
 # On Windows, copy required DLLs to the executable directory
 if(MSVC AND EXECUTORCH_BUILD_CUDA)
-  add_custom_command(
-    TARGET parakeet_runner
-    POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:aoti_cuda_shims>
-            $<TARGET_FILE_DIR:parakeet_runner>
-    COMMENT "Copying aoti_cuda_shims.dll to parakeet_runner directory"
-  )
+  foreach(parakeet_target parakeet_runner parakeet_helper)
+    add_custom_command(
+      TARGET ${parakeet_target}
+      POST_BUILD
+      COMMAND
+        ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:aoti_cuda_shims>
+        $<TARGET_FILE_DIR:${parakeet_target}>
+      COMMENT "Copying aoti_cuda_shims.dll to ${parakeet_target} directory"
+    )
+  endforeach()
 endif()
diff --git a/examples/models/parakeet/CMakePresets.json b/examples/models/parakeet/CMakePresets.json
index 87ace61e315..90a90fbbdf5 100644
--- a/examples/models/parakeet/CMakePresets.json
+++ b/examples/models/parakeet/CMakePresets.json
@@ -89,42 +89,42 @@
             "displayName": "Build Parakeet runner (CPU)",
             "configurePreset": "parakeet-cpu",
             "configuration": "Release",
-            "targets": ["parakeet_runner"]
+            "targets": ["parakeet_runner", "parakeet_helper"]
         },
         {
             "name": "parakeet-cuda",
             "displayName": "Build Parakeet runner (CUDA)",
             "configurePreset": "parakeet-cuda",
             "configuration": "Release",
-            "targets": ["parakeet_runner"]
+            "targets": ["parakeet_runner", "parakeet_helper"]
         },
         {
             "name": "parakeet-cuda-debug",
             "displayName": "Build Parakeet runner (CUDA, Debug)",
             "configurePreset": "parakeet-cuda-debug",
             "configuration": "Debug",
-            "targets": ["parakeet_runner"]
+            "targets": ["parakeet_runner", "parakeet_helper"]
         },
         {
             "name": "parakeet-metal",
             "displayName": "Build Parakeet runner (Metal)",
             "configurePreset": "parakeet-metal",
             "configuration": "Release",
-            "targets": ["parakeet_runner"]
+            "targets": ["parakeet_runner", "parakeet_helper"]
         },
         {
             "name": "parakeet-mlx",
             "displayName": "Build Parakeet runner (MLX)",
             "configurePreset": "parakeet-mlx",
             "configuration": "Release",
-            "targets": ["parakeet_runner"]
+            "targets": ["parakeet_runner", "parakeet_helper"]
         },
         {
             "name": "parakeet-vulkan",
             "displayName": "Build Parakeet runner (Vulkan)",
             "configurePreset": "parakeet-vulkan",
             "configuration": "Release",
-            "targets": ["parakeet_runner"]
+            "targets": ["parakeet_runner", "parakeet_helper"]
         }
     ],
     "workflowPresets": [
diff --git a/examples/models/parakeet/README.md b/examples/models/parakeet/README.md
index 62cec6a9cc4..e2f09f8aa99 100644
--- a/examples/models/parakeet/README.md
+++ b/examples/models/parakeet/README.md
@@ -242,6 +242,11 @@ make parakeet-cuda
 make parakeet-mlx
 ```
 
+Each Parakeet build now produces both:
+
+- `parakeet_runner` for one-shot CLI transcription from an audio file
+- `parakeet_helper` for long-lived host integrations that keep the model warm and stream PCM requests over stdin/stdout
+
 On Windows (PowerShell), use CMake workflow presets directly:
 
 ```powershell
@@ -310,6 +315,26 @@ If your generator is single-config, the runner may be at `.\cmake-out\examples\m
 | `--data_path` | Path to data file (.ptd) for delegate data (required for CUDA/CUDA-Windows) |
 | `--timestamps`     | Timestamp output mode: `none\|token\|word\|segment\|all` (default: `segment`) |
 
+### Persistent Helper
+
+The helper binary uses the same Parakeet transcription stack as `parakeet_runner`,
+but keeps the model loaded across multiple requests so host apps can avoid repeated
+startup and model load overhead.
+
+Example:
+
+```bash
+# Metal
+DYLD_LIBRARY_PATH=/usr/lib ./cmake-out/examples/models/parakeet/parakeet_helper \
+  --model_path examples/models/parakeet/parakeet_metal/model.pte \
+  --tokenizer_path examples/models/parakeet/parakeet_metal/tokenizer.model
+```
+
+The helper accepts framed requests over stdin, validates 16 kHz mono float32 PCM
+payloads, and returns status/result messages over stdout. It is intended for app
+integrations such as the macOS `ExecuWhisper` frontend in the separate
+`executorch-examples` repository.
+
 ### Mobile App
 
 Check out a [demo Android app](https://github.com/meta-pytorch/executorch-examples/tree/main/parakeet/android/ParakeetApp) for Parakeet in the separate `executorch-examples` repository.
diff --git a/examples/models/parakeet/main.cpp b/examples/models/parakeet/main.cpp
index b8a052004e4..410ba6cea62 100644
--- a/examples/models/parakeet/main.cpp
+++ b/examples/models/parakeet/main.cpp
@@ -6,25 +6,14 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <cmath>
-#include <cstdint>
+#include <gflags/gflags.h>
+
 #include <exception>
 #include <iostream>
 #include <string>
-#include <unordered_set>
-#include <vector>
 
-#include <gflags/gflags.h>
+#include "parakeet_transcriber.h"
 
-#include "timestamp_utils.h"
-#include "tokenizer_utils.h"
-#include "types.h"
-
-#include <executorch/extension/asr/runner/transducer_runner.h>
-#include <executorch/extension/llm/runner/wav_loader.h>
-#include <executorch/extension/module/module.h>
-#include <executorch/extension/tensor/tensor_ptr_maker.h>
-#include <executorch/runtime/core/evalue.h>
 #include <executorch/runtime/platform/log.h>
 #ifdef ET_BUILD_METAL
 #include <executorch/backends/apple/metal/runtime/stats.h>
@@ -44,69 +33,17 @@ DEFINE_string(
     timestamps,
     "segment",
     "Timestamp output mode: none|token|word|segment|all");
-
-using ::executorch::extension::from_blob;
-using ::executorch::extension::Module;
-using ::executorch::runtime::Error;
-using ::executorch::runtime::EValue;
-
-using ::parakeet::TextWithOffsets;
-using ::parakeet::TokenWithTextInfo;
-
-namespace {
-// TDT duration values for Parakeet models
-const std::vector<int> DURATIONS = {0, 1, 2, 3, 4};
-
-struct TimestampOutputMode {
-  bool token = false;
-  bool word = false;
-  bool segment = false;
-
-  bool enabled() const {
-    return token || word || segment;
-  }
-};
-
-std::string to_lower_ascii(std::string s) {
-  for (char& ch : s) {
-    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
-  }
-  return s;
-}
-
-TimestampOutputMode parse_timestamp_output_mode(const std::string& raw_arg) {
-  if (raw_arg.empty()) {
-    throw std::invalid_argument(
-        "Invalid --timestamps value (empty). Expected: token, word, segment, all.");
-  }
-  const std::string mode = to_lower_ascii(raw_arg);
-  if (mode == "none") {
-    return {false, false, false};
-  }
-  if (mode == "token") {
-    return {true, false, false};
-  }
-  if (mode == "word") {
-    return {false, true, false};
-  }
-  if (mode == "segment") {
-    return {false, false, true};
-  }
-  if (mode == "all") {
-    return {true, true, true};
-  }
-  throw std::invalid_argument(
-      "Invalid --timestamps value '" + raw_arg +
-      "'. Expected: token, word, segment, all.");
-}
-} // namespace
+DEFINE_bool(
+    runtime_profile,
+    false,
+    "Print a detailed runtime profile for preprocessor, encoder, and decode-loop execution.");
 
 int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
-  TimestampOutputMode timestamp_mode;
+  parakeet::TimestampOutputMode timestamp_mode;
   try {
-    timestamp_mode = parse_timestamp_output_mode(FLAGS_timestamps);
+    timestamp_mode = parakeet::parse_timestamp_output_mode(FLAGS_timestamps);
   } catch (const std::invalid_argument& e) {
     ET_LOG(Error, "%s", e.what());
     return 1;
@@ -117,162 +54,57 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  // --- Build config and runner ---
-  executorch::extension::asr::TransducerConfig config;
-  config.durations = DURATIONS;
-
-  std::optional<std::string> data_path_opt;
-  if (!FLAGS_data_path.empty()) {
-    data_path_opt = FLAGS_data_path;
-  }
-
-  executorch::extension::asr::TransducerRunner runner(
-      FLAGS_model_path, FLAGS_tokenizer_path, config, data_path_opt);
-
-  auto load_err = runner.load();
-  if (load_err != Error::Ok) {
-    ET_LOG(Error, "Failed to load model.");
-    return 1;
-  }
-
-  // --- Load and preprocess audio ---
-  ET_LOG(Info, "Loading audio from: %s", FLAGS_audio_path.c_str());
-  std::vector<float> audio_data =
-      ::executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path);
-  ET_LOG(Info, "Loaded %zu audio samples", audio_data.size());
-
-  auto audio_tensor = from_blob(
-      audio_data.data(),
-      {static_cast<::executorch::aten::SizesType>(audio_data.size())},
-      ::executorch::aten::ScalarType::Float);
-
-  ET_LOG(Info, "Running preprocessor...");
-  auto preprocess_result = runner.preprocess(audio_tensor);
-  if (!preprocess_result.ok()) {
-    ET_LOG(Error, "Preprocessing failed.");
-    return 1;
-  }
-  auto preprocess_out = preprocess_result.get();
-
-  // --- Transcribe ---
-  ET_LOG(Info, "Running TDT greedy decode...");
-  auto result = runner.transcribe(
-      preprocess_out.features,
-      [](const std::string& piece) { std::cout << piece << std::flush; },
-      preprocess_out.length);
-
-  if (!result.ok()) {
-    ET_LOG(Error, "Transcription failed.");
-    return 1;
-  }
-
-  auto& decoded_tokens = result.get();
-  ET_LOG(Info, "Decoded %zu tokens", decoded_tokens.size());
-
-  // Use the runner's tokenizer for text decoding and timestamps
-  const auto* tokenizer = runner.tokenizer();
-  if (!tokenizer || !tokenizer->is_loaded()) {
-    ET_LOG(Error, "Tokenizer not available.");
-    return 1;
-  }
-
-  // Print full transcribed text
-  std::string text = parakeet::tokenizer_utils::decode_token_sequence(
-      decoded_tokens, *tokenizer);
-  std::cout << "\nTranscribed text: " << text << std::endl;
-
-#ifdef ET_BUILD_METAL
-  executorch::backends::metal::print_metal_backend_stats();
-#endif // ET_BUILD_METAL
-
-  if (!timestamp_mode.enabled()) {
-    return 0;
-  }
-
-  // --- Timestamps ---
-  // Query timestamp-related metadata from the model.
-  // These are Parakeet-specific constants, not part of TransducerRunner.
-  std::unique_ptr<Module> meta_module;
-  if (data_path_opt) {
-    meta_module = std::make_unique<Module>(
-        FLAGS_model_path, *data_path_opt, Module::LoadMode::Mmap);
-  } else {
-    meta_module =
-        std::make_unique<Module>(FLAGS_model_path, Module::LoadMode::Mmap);
-  }
-  auto meta_load_err = meta_module->load();
-  if (meta_load_err != Error::Ok) {
-    ET_LOG(Error, "Failed to load model for timestamp metadata.");
-    return 1;
-  }
-
-  std::vector<::executorch::runtime::EValue> empty_inputs;
-  auto window_stride_result =
-      meta_module->execute("window_stride", empty_inputs);
-  auto encoder_subsampling_factor_result =
-      meta_module->execute("encoder_subsampling_factor", empty_inputs);
-
-  if (!window_stride_result.ok() || !encoder_subsampling_factor_result.ok()) {
-    ET_LOG(
-        Error,
-        "Failed to query timestamp metadata (window_stride, encoder_subsampling_factor).");
-    return 1;
-  }
-
-  double window_stride = window_stride_result.get()[0].toDouble();
-  int64_t encoder_subsampling_factor =
-      encoder_subsampling_factor_result.get()[0].toInt();
-  meta_module.reset();
-
-  ET_LOG(Info, "Computing timestamps...");
-  std::unordered_set<std::string> supported_punctuation =
-      parakeet::tokenizer_utils::derive_supported_punctuation(*tokenizer);
-
-  std::vector<TokenWithTextInfo> tokens_with_text_info;
   try {
-    tokens_with_text_info =
-        parakeet::timestamp_utils::get_tokens_with_text_info(
-            decoded_tokens, *tokenizer, supported_punctuation);
-  } catch (const std::exception& e) {
-    ET_LOG(Error, "Failed to get tokens with text info: %s", e.what());
-    return 1;
-  }
-  const auto word_offsets = parakeet::timestamp_utils::get_words_offsets(
-      tokens_with_text_info, *tokenizer, supported_punctuation);
-  const auto segment_offsets =
-      parakeet::timestamp_utils::get_segment_offsets(word_offsets);
+    parakeet::ParakeetTranscriber transcriber(
+        FLAGS_model_path, FLAGS_tokenizer_path, FLAGS_data_path);
+    const auto result = transcriber.transcribe_wav_path(
+        FLAGS_audio_path,
+        parakeet::TranscribeConfig{timestamp_mode, FLAGS_runtime_profile});
+
+    std::cout << "Transcribed text: " << result.text << std::endl;
+    if (!result.stats_json.empty()) {
+      std::cout << "PyTorchObserver " << result.stats_json << std::endl;
+    }
+    if (result.runtime_profile_report.has_value()) {
+      std::cout << *result.runtime_profile_report;
+    }
 
-  const double frame_to_seconds =
-      window_stride * static_cast<double>(encoder_subsampling_factor);
+#ifdef ET_BUILD_METAL
+    executorch::backends::metal::print_metal_backend_stats();
+#endif
 
-  if (timestamp_mode.segment) {
-    std::cout << "\nSegment timestamps:" << std::endl;
-    for (const auto& segment : segment_offsets) {
-      const double start = segment.start_offset * frame_to_seconds;
-      const double end = segment.end_offset * frame_to_seconds;
-      std::cout << start << "s - " << end << "s : " << segment.text
-                << std::endl;
+    if (timestamp_mode.segment) {
+      std::cout << "\nSegment timestamps:" << std::endl;
+      for (const auto& segment : result.segment_offsets) {
+        const double start = segment.start_offset * result.frame_to_seconds;
+        const double end = segment.end_offset * result.frame_to_seconds;
+        std::cout << start << "s - " << end << "s : " << segment.text
+                  << std::endl;
+      }
     }
-  }
 
-  if (timestamp_mode.word) {
-    std::cout << "\nWord timestamps:" << std::endl;
-    for (const auto& word : word_offsets) {
-      const double start = word.start_offset * frame_to_seconds;
-      const double end = word.end_offset * frame_to_seconds;
-      std::cout << start << "s - " << end << "s : " << word.text << std::endl;
+    if (timestamp_mode.word) {
+      std::cout << "\nWord timestamps:" << std::endl;
+      for (const auto& word : result.word_offsets) {
+        const double start = word.start_offset * result.frame_to_seconds;
+        const double end = word.end_offset * result.frame_to_seconds;
+        std::cout << start << "s - " << end << "s : " << word.text << std::endl;
+      }
     }
-  }
 
-  if (timestamp_mode.token) {
-    std::cout << "\nToken timestamps:" << std::endl;
-    for (const auto& token : tokens_with_text_info) {
-      const double start = token.start_offset * frame_to_seconds;
-      const double end = token.end_offset * frame_to_seconds;
-      std::cout << start << "s - " << end << "s : " << token.decoded_text
-                << std::endl;
+    if (timestamp_mode.token) {
+      std::cout << "\nToken timestamps:" << std::endl;
+      for (const auto& token : result.token_offsets) {
+        const double start = token.start_offset * result.frame_to_seconds;
+        const double end = token.end_offset * result.frame_to_seconds;
+        std::cout << start << "s - " << end << "s : " << token.decoded_text
+                  << std::endl;
+      }
     }
-  }
 
-  return 0;
+    return 0;
+  } catch (const std::exception& e) {
+    ET_LOG(Error, "%s", e.what());
+    return 1;
+  }
 }
diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt
index e1b54d644b2..726657a3779 100644
--- a/examples/models/qwen3_5_moe/CMakeLists.txt
+++ b/examples/models/qwen3_5_moe/CMakeLists.txt
@@ -54,9 +54,14 @@ elseif(EXECUTORCH_BUILD_CUDA)
   list(APPEND link_libraries aoti_cuda_backend)
   executorch_target_link_options_shared_lib(aoti_cuda_backend)
   add_compile_definitions(EXECUTORCH_BUILD_CUDA)
+elseif(TARGET mlxdelegate)
+  list(APPEND link_libraries mlxdelegate mlx)
+  executorch_target_link_options_shared_lib(mlxdelegate)
+  add_compile_definitions(EXECUTORCH_BUILD_MLX)
 else()
   message(
-    FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_METAL=ON"
+    FATAL_ERROR
+      "Set EXECUTORCH_BUILD_CUDA=ON, EXECUTORCH_BUILD_METAL=ON, or EXECUTORCH_BUILD_MLX=ON"
   )
 endif()
 
@@ -69,9 +74,21 @@ target_include_directories(
 )
 target_link_libraries(qwen3_5_moe_runner PUBLIC ${link_libraries})
 
+add_executable(qwen3_5_moe_worker qwen35_moe_worker.cpp qwen35_moe_engine.cpp)
+target_include_directories(
+  qwen3_5_moe_worker PUBLIC ${_common_include_directories} ${_json_include}
+)
+target_link_libraries(qwen3_5_moe_worker PUBLIC ${link_libraries})
+
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(qwen3_5_moe_runner)
   target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s")
+  target_link_options_gc_sections(qwen3_5_moe_worker)
+  target_link_options(qwen3_5_moe_worker PRIVATE "LINKER:-s")
+endif()
+
+if(TARGET mlxdelegate)
+  executorch_target_copy_mlx_metallib(qwen3_5_moe_runner)
 endif()
 
 if(EXECUTORCH_BUILD_CUDA)
diff --git a/examples/models/qwen3_5_moe/CMakePresets.json b/examples/models/qwen3_5_moe/CMakePresets.json
index 0d6de7f60eb..276c2116148 100644
--- a/examples/models/qwen3_5_moe/CMakePresets.json
+++ b/examples/models/qwen3_5_moe/CMakePresets.json
@@ -36,19 +36,42 @@
                 "type": "equals",
                 "rhs": "Darwin"
             }
+        },
+        {
+            "name": "qwen3-5-moe-mlx",
+            "displayName": "Qwen3.5 MoE runner (MLX)",
+            "inherits": ["qwen3-5-moe-base"],
+            "cacheVariables": {
+                "EXECUTORCH_BUILD_MLX": "ON"
+            },
+            "condition": {
+                "type": "equals",
+                "lhs": "${hostSystemName}",
+                "rhs": "Darwin"
+            }
         }
     ],
     "buildPresets": [
         {
             "name": "qwen3-5-moe-cuda",
-            "displayName": "Build Qwen3.5 MoE runner + no-bleed test (CUDA)",
+            "displayName": "Build Qwen3.5 MoE runner, worker, and no-bleed test (CUDA)",
             "configurePreset": "qwen3-5-moe-cuda",
-            "targets": ["qwen3_5_moe_runner", "test_qwen35_moe_nobleed"]
+            "targets": [
+                "qwen3_5_moe_runner",
+                "qwen3_5_moe_worker",
+                "test_qwen35_moe_nobleed"
+            ]
         },
         {
             "name": "qwen3-5-moe-metal",
-            "displayName": "Build Qwen3.5 MoE runner (Metal)",
+            "displayName": "Build Qwen3.5 MoE runner and worker (Metal)",
             "configurePreset": "qwen3-5-moe-metal",
+            "targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"]
+        },
+        {
+            "name": "qwen3-5-moe-mlx",
+            "displayName": "Build Qwen3.5 MoE runner (MLX)",
+            "configurePreset": "qwen3-5-moe-mlx",
             "targets": ["qwen3_5_moe_runner"]
         }
     ],
@@ -80,6 +103,20 @@
                     "name": "qwen3-5-moe-metal"
                 }
             ]
+        },
+        {
+            "name": "qwen3-5-moe-mlx",
+            "displayName": "Configure and build Qwen3.5 MoE runner (MLX)",
+            "steps": [
+                {
+                    "type": "configure",
+                    "name": "qwen3-5-moe-mlx"
+                },
+                {
+                    "type": "build",
+                    "name": "qwen3-5-moe-mlx"
+                }
+            ]
         }
     ]
 }
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
index e3f13cc77d6..c275641bfd7 100644
--- a/examples/models/qwen3_5_moe/README.md
+++ b/examples/models/qwen3_5_moe/README.md
@@ -147,6 +147,56 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
 `--cuda_graph` is intentionally single-session only. CUDA graph replay captures
 device pointers, so it is not combined with per-session mutable-state rebinding.
 
+## OpenAI-compatible serving
+
+The CUDA build also produces `qwen3_5_moe_worker`, a C++ model-execution worker
+used by the generic `examples/llm_server` control plane. The Qwen launcher wires
+in the model's Hugging Face chat template and Qwen XML tool-call parser:
+
+```bash
+python -m executorch.examples.models.qwen3_5_moe.serve \
+    --model-path qwen35_moe_exports/model.pte \
+    --data-path qwen35_moe_exports/aoti_cuda_blob.ptd \
+    --tokenizer-path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
+    --hf-tokenizer ~/models/Qwen3.5-35B-A3B \
+    --model-id qwen3.5-moe \
+    --max-context 4096 \
+    --max-sessions 4 \
+    --no-think
+```
+
+`--max-sessions` controls how many isolated sessions the worker can host while
+loading the weights only once. One slot is reserved for anonymous requests;
+clients should send a stable `session_id` (or session-affinity header) to get
+per-conversation isolation and warm append-only resume.
+
+### Use from pi
+
+Point pi at the Qwen server via `~/.pi/agent/models.json`:
+
+```json
+{
+  "providers": {
+    "executorch": {
+      "baseUrl": "http://127.0.0.1:8000/v1",
+      "api": "openai-completions",
+      "apiKey": "x",
+      "models": [
+        {
+          "id": "qwen3.5-moe",
+          "compat": { "sendSessionAffinityHeaders": true }
+        }
+      ]
+    }
+  }
+}
+```
+
+The model id must match `--model-id`. `sendSessionAffinityHeaders` lets pi route
+each conversation or subagent to a stable server session; without it, requests
+use the anonymous scratch session and do not get per-conversation isolation or
+warm resume.
+
 ### CUDA no-bleed test
 
 The CUDA build also produces `test_qwen35_moe_nobleed`, which validates that two
@@ -211,7 +261,38 @@ python export.py \
 | `--qembedding` | (none) | Embedding quantization: `8w` |
 | `--tiny-test` | off | Build tiny model with random weights for CI testing |
 
-### Run (MLX)
+### Build (MLX)
+
+Like the CUDA/Metal builds, the `make` target builds ExecuTorch core with the
+MLX backend and the runner binary. Requires Apple Silicon (Darwin).
+
+```bash
+make qwen3_5_moe-mlx
+```
+
+This builds ExecuTorch with MLX support, then the runner binary at
+`cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner` (with `mlx.metallib`
+copied next to it). Unlike CUDA, the MLX `.pte` is self-contained — no `.ptd`
+data file is produced or needed.
+
+### Run (MLX, C++ runner)
+
+The C++ runner needs only the MLX `.pte` and a local HuggingFace
+`tokenizer.json`; no `--data_path` is required:
+
+```bash
+cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
+    --model_path ./qwen35_moe_mlx/model.pte \
+    --tokenizer_path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
+    --prompt "What is the capital of France?" \
+    --max_new_tokens 50
+```
+
+The MLX export emits a single `forward` method with a dynamic sequence length.
+The runner loads it and calls it for both prefill and decode, sampling on the
+host like the Python runner. See the [Run](#run) section above for the full flag list.
+
+### Run (MLX, Python)
 
 ```bash
 python -m executorch.examples.models.qwen3_5_moe.run \
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
index d7e7d9ca293..566d61e6cfc 100644
--- a/examples/models/qwen3_5_moe/export.py
+++ b/examples/models/qwen3_5_moe/export.py
@@ -768,10 +768,16 @@ def _export_mlx(model, config, args):
     gc.collect()
 
     print("Lowering to ExecuTorch with MLX backend...")
+    # Largest prefill chunk the runner may submit in one forward call. The MLX
+    # runner chunks long prompts to cap peak memory; bound it by the compiled
+    # dynamic max (max_seq_len - 1) so a chunk can never exceed what `forward`
+    # was compiled for.
+    max_prefill_chunk = min(1024, config.max_seq_len - 1)
     metadata = {
         "get_max_seq_len": config.max_seq_len,
         "get_vocab_size": config.vocab_size,
         "get_n_layers": config.num_hidden_layers,
+        "get_max_prefill_chunk": max_prefill_chunk,
         "use_kv_cache": True,
         "use_sdpa_with_kv_cache": False,
         "enable_dynamic_shape": True,
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
index 3c5b2eec439..713f6211330 100644
--- a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
+++ b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
@@ -19,6 +19,8 @@
 #include <cmath>
 #include <cstring>
 
+#include <algorithm>
+
 #ifdef EXECUTORCH_BUILD_CUDA
 #include <cuda_runtime.h>
 #include <executorch/backends/cuda/runtime/cuda_mutable_state.h>
@@ -39,6 +41,22 @@ using SizesType = executorch::aten::SizesType;
 
 namespace {
 
+#ifdef EXECUTORCH_BUILD_MLX
+// The MLX export emits a single dynamic-seq `forward` method that handles both
+// prefill (T>=2) and decode (T=1). Mirror gemma4_31b's MLX runner, which loads
+// and calls `forward` for both phases.
+constexpr const char* kPrefillMethod = "forward";
+constexpr const char* kDecodeMethod = "forward";
+#else
+// CUDA/Metal exports emit two separate methods.
+constexpr const char* kPrefillMethod = "prefill";
+constexpr const char* kDecodeMethod = "decode";
+#endif
+
+// Constant method exported by the MLX .pte giving the largest prefill chunk the
+// `forward` method was compiled for. Read into the metadata map in create().
+constexpr const char* kMaxPrefillChunk = "get_max_prefill_chunk";
+
 Result<uint64_t> read_sampled_token(
     const executorch::aten::Tensor& output,
     float temperature) {
@@ -98,8 +116,10 @@ Result<std::unique_ptr<Module>> build_qwen_module(
   }
 #endif
 
-  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("prefill"));
-  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("decode"));
+  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kPrefillMethod));
+  if (std::string(kDecodeMethod) != std::string(kPrefillMethod)) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kDecodeMethod));
+  }
   return module;
 }
 
@@ -240,34 +260,63 @@ class Qwen35MoESession : public LLMSession {
     }
 
     stop_.store(false, std::memory_order_relaxed);
-    std::vector<int64_t> token_data(tokens.begin(), tokens.end());
-    std::vector<int64_t> pos_data(T);
-    for (int64_t i = 0; i < T; ++i) {
-      pos_data[i] = pos_ + i;
+
+    // On MLX, run prefill in fixed-size chunks (caps peak memory and the
+    // compiled prefill shape). Other backends prefill the whole prompt in one
+    // pass. Only the final chunk's sampled token is kept; the recurrence/KV
+    // state from earlier chunks persists via pos_ advancement.
+#ifdef EXECUTORCH_BUILD_MLX
+    // Chunk size: default to the compiled max (kMaxSeqLen - 1), overridden by
+    // the exported get_max_prefill_chunk constant when present (mirrors
+    // gemma4_31b). Falls back to T (single pass) if no metadata is available at
+    // all.
+    int64_t chunk_size = T;
+    if (auto it = metadata_.find(kMaxSeqLen);
+        it != metadata_.end() && it->second > 1) {
+      chunk_size = it->second - 1;
     }
-    auto tokens_tensor = from_blob(
-        token_data.data(),
-        {1, static_cast<SizesType>(T)},
-        executorch::aten::ScalarType::Long);
-    auto pos_tensor = from_blob(
-        pos_data.data(),
-        {static_cast<SizesType>(T)},
-        executorch::aten::ScalarType::Long);
-
-    const char* method = (T >= 2) ? "prefill" : "decode";
-    std::vector<EValue> inputs;
-    inputs.push_back(tokens_tensor);
-    inputs.push_back(pos_tensor);
+    if (auto it = metadata_.find(kMaxPrefillChunk);
+        it != metadata_.end() && it->second > 0) {
+      chunk_size = it->second;
+    }
+#else
+    const int64_t chunk_size = T;
+#endif
+
+    uint64_t sampled_token = 0;
+    for (int64_t off = 0; off < T; off += chunk_size) {
+      const int64_t len = std::min(chunk_size, T - off);
+      std::vector<int64_t> token_data(
+          tokens.begin() + off, tokens.begin() + off + len);
+      std::vector<int64_t> pos_data(len);
+      for (int64_t i = 0; i < len; ++i) {
+        pos_data[i] = pos_ + i;
+      }
+      auto tokens_tensor = from_blob(
+          token_data.data(),
+          {1, static_cast<SizesType>(len)},
+          executorch::aten::ScalarType::Long);
+      auto pos_tensor = from_blob(
+          pos_data.data(),
+          {static_cast<SizesType>(len)},
+          executorch::aten::ScalarType::Long);
+
+      const char* method = (len >= 2) ? kPrefillMethod : kDecodeMethod;
+      std::vector<EValue> inputs;
+      inputs.push_back(tokens_tensor);
+      inputs.push_back(pos_tensor);
 #ifdef EXECUTORCH_BUILD_CUDA
-    set_temp(first_token_temp);
-    inputs.push_back(EValue(temp_tensor_));
+      set_temp(first_token_temp);
+      inputs.push_back(EValue(temp_tensor_));
 #endif
-    auto sampled =
-        run_locked(method, inputs, first_token_temp, /*sync_after=*/true);
-    ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
-    pending_ = sampled.get();
+      auto sampled =
+          run_locked(method, inputs, first_token_temp, /*sync_after=*/true);
+      ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
+      sampled_token = sampled.get();
+      pos_ += len;
+    }
+    pending_ = sampled_token;
     prev_decode_token_.reset();
-    pos_ += T;
     return Error::Ok;
   }
 
@@ -334,7 +383,7 @@ class Qwen35MoESession : public LLMSession {
     inputs.push_back(EValue(temp_tensor_));
 #endif
     auto sampled =
-        run_locked("decode", inputs, temperature_, /*sync_after=*/false);
+        run_locked(kDecodeMethod, inputs, temperature_, /*sync_after=*/false);
     ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
     pending_ = sampled.get();
     prev_decode_token_ = token;
@@ -457,6 +506,14 @@ Result<std::unique_ptr<Qwen35MoEEngine>> Qwen35MoEEngine::create(
     ET_LOG(Error, "Qwen35MoEEngine: failed to read metadata");
     return metadata_result.error();
   }
+#ifdef EXECUTORCH_BUILD_MLX
+  // Surface the compiled max prefill chunk (a constant method get_llm_metadata
+  // doesn't harvest) into the metadata map so the session can chunk long
+  // prompts within the shape `forward` was compiled for.
+  if (auto mpc = meta_module->get(kMaxPrefillChunk); mpc.ok()) {
+    metadata_result.get()[kMaxPrefillChunk] = mpc->toScalar().to<int64_t>();
+  }
+#endif
   auto eos_ids = get_eos_ids(tokenizer.get(), meta_module.get());
   // This export's metadata doesn't carry the chat-turn EOS (config.json has no
   // eos_token_id and the .pte exports no get_eos_ids method), so get_eos_ids()
diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md
index 8e0dc70bbb5..4a5d1fd023d 100644
--- a/examples/qualcomm/oss_scripts/llama/README.md
+++ b/examples/qualcomm/oss_scripts/llama/README.md
@@ -130,12 +130,12 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL
 Default example using hybrid mode.
 ```bash
 python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_tasks wikitext --calib_limit 1
 ```
 
 #### Codegen2
 Default example using kv mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model codegen2_1b --model_mode kv --max_seq_len 1024 --prompt "def hello_world():"
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model codegen2_1b --model_mode kv --max_seq_len 1024 --prompt "def hello_world():" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/text.json
 ```
 
 #### Gemma 2B
@@ -210,7 +210,17 @@ Default example using hybrid mode.
 python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm3-3b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_tasks wikitext --calib_limit 1
 ```
 
-## Multimodal Support
+#### Using custom calibration samples for LLMs
+
+Instead of `--calib_tasks`, you can supply your own conversation JSON files via `--calib_samples`. The samples are fed into the quantization calibration pass to collect activation observer statistics — they do not affect the inference prompt. This is useful when you want to calibrate on domain-specific or instruct-format data rather than a generic lm_eval task.
+
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm2_135m --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/text.json
+```
+
+You can also provide both `--calib_tasks` and `--calib_samples` at the same time; the pipeline concatenates both data sources for calibration.
+
+## Multimodal Support
 
 ### Overview
 
@@ -268,7 +278,7 @@ pip install soundfile
 
 Default example using hybrid mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model granite_speech_3_3-2b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "can you transcribe the speech into a written format?" --audio_path "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true"
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model granite_speech_3_3-2b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "can you transcribe the speech into a written format?" --audio_path "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/audio.json
 ```
 
 ### Specifying Custom Audio
@@ -281,9 +291,6 @@ You can specify a custom audio file for ALM models using the `--audio_path` flag
 - **Local file paths**: Absolute or relative paths to `.wav` files on your system
   - Example: `"/path/to/your/audio.wav"`
 
-**Default behavior:**
-If `--audio_path` is not specified, the system will automatically use the default audio file defined in the model's configuration file (`encoder/encoder_config.py`).
-
 #### Audio Preprocessing
 
 The audio encoder configuration is defined in `encoder/encoder_config.py`:
@@ -294,7 +301,6 @@ The audio encoder configuration is defined in `encoder/encoder_config.py`:
 class GraniteSpeechEncoder(AudioModalityConfig):
     encoder_class = GraniteSpeechCTCEncoderWrapper
     audio_seq_len = 171
-    audio_url = "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true"  # Default audio (content: "After his nap, ...")
     quant_recipe = GraniteSpeechEncoderQuantRecipe
 ```
 
@@ -351,13 +357,13 @@ Vision-Language Models (VLMs) combine computer vision and natural language proce
 #### SmolVLM 500M
 Default example using hybrid mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode hybrid --prefill_ar_len 16 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode hybrid --prefill_ar_len 16 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json
 ```
 
 #### InternVL 1B
 Default example using hybrid mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model internvl3_1b --model_mode hybrid --prefill_ar_len 32 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "http://images.cocodataset.org/val2017/000000039769.jpg"
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model internvl3_1b --model_mode hybrid --prefill_ar_len 32 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json
 ```
 
 ### Specifying Custom Image
@@ -370,9 +376,6 @@ Take a example image of Statue-of-Liberty in New York Bay
 - **Local file paths**: Absolute or relative paths to image files on your system
   - Example: [`./examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png`](assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png)
 
-**Default behavior:**
-If `--image_path` is not specified, the system will automatically use the default image URL defined in the model's configuration file (`encoder/encoder_config.py`).
-
 #### Image Preprocessing
 
 Each VLM model has specific preprocessing requirements defined in its configuration:
@@ -385,7 +388,6 @@ class SmolVLMEncoder(VisionModalityConfig):
     img_seq_len = 64
     img_resized_h = 512
     img_resized_w = 512
-    img_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"  # Default image
     quant_recipe = SmolVLMEncoderQuantRecipe
 ```
 
@@ -427,7 +429,7 @@ PROMPT2="Answer the question: What's the main object in first image?"
 PROMPT3="<image>Caption this image."
 
 # Execute the multi-turn conversation
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL"
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json
 ```
 
 **How it works:**
@@ -453,16 +455,19 @@ The VLM inference pipeline consists of:
    - Special tokens (e.g., `<image>`, `<|fake_token_around_image|>`, `<fake_token_around_image>`) mark modality boundaries (see [tokenizer.py](tokenizer.py))
 
    ```python
-   # Special tokens for Vision-Language Model
-   VLM_SPECIAL_TOKENS = {
-       "smolvlm_500m_instruct": {
-           "image_token": "<image>",
-           "global_img": "<global-img>",
-           "fake_wrap_start": "<fake_token_around_image>",
-           "fake_wrap_end": "<fake_token_around_image>",
-       },
-       ...
-   }
+   # Token fields on each encoder config subclass (encoder/encoder_config.py)
+   @dataclass(init=False, frozen=True)
+   class SmolVLMEncoder(VisionModalityConfig):
+       img_token = "<image>"
+       fake_wrap_start = "<fake_token_around_image>"
+       fake_wrap_end = "<fake_token_around_image>"
+       global_img_token = "<global-img>"
+
+   @dataclass(init=False, frozen=True)
+   class InternVL3Encoder(VisionModalityConfig):
+       img_token = "<IMG_CONTEXT>"
+       fake_wrap_start = "<img>"
+       fake_wrap_end = "</img>"
    ```
    - Final fused sequence: `[batch, img_seq_len + text_seq_len, hidden_dim]`
 
@@ -545,16 +550,13 @@ From the example script above, 1 wikitext sample is used to evaluate all 3 phase
 Example:
 ```bash
 # 1st run to compile with --calib_limit 1
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --calib_tasks wikitext --calib_limit 1 --compile_only
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --calib_tasks wikitext --calib_limit 1 -a ${FOLDER_TO_PRE_GEN_PTE} --compile_only
 ```
 ```bash
 # 2nd run to perform QNN device execution with --eval_limit 3
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --eval_tasks wikitext --eval_limit 3 --pre_gen_pte ${PATH_TO_ARTIFACT_IN_1ST_RUN} --quant_attrs_path ${PATH_TO_ARTIFACT_IN_1ST_RUN}/kv_llama_qnn_quant_attrs.json
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --eval_tasks wikitext --eval_limit 3 --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE}
 ```
 
-#### Tasks quantization calibration
-If `--calib_tasks ${TASK}` is not provided, the program will use `--prompt ${PROMPT}` as the dataset for quantization calibration.
-`--calib_tasks` and `--eval_tasks` are independent flags. `--calib_tasks` controls which tasks are used for quantization calibration, while `--eval_tasks` controls which tasks are used for perplexity evaluation. They can be set to different tasks or limits as needed.
 
 #### SQNR Evalution
 To evaluate QNN's output logits against the golden logits from `nn.Module`, users can provide the flag `--sqnr_eval`. Please note that SQNR evaluation will only compare the logits of the user's prompt and will not compare the new tokens generated by the model.
@@ -563,6 +565,52 @@ Example:
 python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods sqnr_eval
 ```
 
+
+
+#### Quantization
+
+The calibration data is independent from the runtime evaluation set, and only affects quantization quality, not the inference output.
+
+Calibration data is required for compilation. There are two ways to supply it:
+
+1. **`--calib_tasks`** — calibrate on one or more lm_eval tasks (tune with `--calib_limit` and `--calib_num_fewshot`). LLM-only.
+2. **`--calib_samples`** — calibrate on custom conversation samples provided as JSON files (see format below). Required for multimodal models (VLM/ALM).
+
+For LLMs, provide at least one of the two; for multimodal models, `--calib_samples` is mandatory.
+
+Calibration flags are separate from the runtime-evaluation flags (`--eval_tasks`, `--eval_limit`), so calibration and evaluation can target different tasks or limits as needed. The calibration flags are:
+
+| Purpose | Flags |
+|---|---|
+| Calibration data (lm_eval tasks) | `--calib_tasks`, `--calib_limit`, `--calib_num_fewshot` |
+| Calibration data (custom samples) | `--calib_samples` (JSON files, HuggingFace message format) |
+
+##### Custom calibration samples (`--calib_samples`)
+
+`--calib_samples` accepts one or more JSON files. Each file is a flat list of sample objects. Each sample has a `messages` field following the HuggingFace chat template, and an optional `files` field for media inputs (local paths or URLs):
+
+```json
+[
+  {
+    "files": ["path/or/url/to/files"],
+    "messages": [
+      {"role": "user",    "content": "..." },
+      {"role": "assistant", "content": "..."}
+    ]
+  }
+]
+```
+
+`files` is only required for multimodal models (VLM: image paths/URLs, ALM: audio paths/URLs). For LLM-only models, `files` can be omitted. `content` can be a plain string or a list of HuggingFace content blocks (e.g. `[{"type": "image"}, {"type": "text", "text": "..."}]` for vision inputs).
+
+Ready-to-use examples for each model type are provided under `assets/samples/`:
+
+| Model type | Example file |
+|---|---|
+| LLM | [assets/samples/text.json](assets/samples/text.json) |
+| ALM (audio) | [assets/samples/audio.json](assets/samples/audio.json) |
+| VLM (vision) | [assets/samples/vision.json](assets/samples/vision.json) |
+
 #### Quantization Guidance
 
 To automatically identify sensitive layers and generate a mixed-precision recipe suggestion, add the `--quant_recipe_suggestion` flag. During calibration, the analyzer compares FP32 and QDQ intermediate outputs layer-by-layer using SQNR, then writes two files to the working directory:
diff --git a/examples/qualcomm/oss_scripts/llama/TARGETS b/examples/qualcomm/oss_scripts/llama/TARGETS
index 30b86eabb01..c00525d6fe7 100644
--- a/examples/qualcomm/oss_scripts/llama/TARGETS
+++ b/examples/qualcomm/oss_scripts/llama/TARGETS
@@ -23,17 +23,6 @@ runtime.python_library(
     ],
 )
 
-runtime.python_library(
-    name = "decoder_utils",
-    srcs = [
-        "decoder_utils.py",
-    ],
-    deps = [
-        "//caffe2:torch",
-        "//executorch/examples/models/llama:eval_library",
-    ],
-)
-
 runtime.python_library(
     name = "masking_utils",
     srcs = [
@@ -81,19 +70,112 @@ runtime.python_library(
     srcs = [
         "tokenizer.py",
     ],
+    deps = [
+        ":decoder_constants",
+        ":static_llama",
+        "//caffe2:torch",
+        "fbsource//third-party/pypi/transformers:transformers",
+    ],
+)
+
+runtime.python_library(
+    name = "utils",
+    srcs = [
+        "utils.py",
+    ],
     deps = [
         "//caffe2:torch",
+        "//executorch/exir:lib",
+    ],
+)
+
+runtime.python_library(
+    name = "inference",
+    srcs = [
+        "inference/__init__.py",
+        "inference/decoder.py",
+        "inference/encoder.py",
+        "inference/model.py",
+    ],
+    deps = [
+        ":masking_utils",
+        "//caffe2:torch",
     ],
 )
 
 runtime.python_library(
     name = "dataset",
     srcs = [
-        "dataset.py",
+        "dataset/__init__.py",
+        "dataset/builders.py",
+        "dataset/collators.py",
+        "dataset/config.py",
+        "dataset/datasets.py",
+        "dataset/loaders.py",
+        "dataset/preprocessors.py",
+        "dataset/schema.py",
+    ],
+    deps = [
+        ":decoder_constants",
+        ":encoder",
+        ":masking_utils",
+        ":tokenizer",
+        "//caffe2:torch",
+        "//executorch/examples/models/llama:eval_library",
+        "fbsource//third-party/pypi/lm-eval:lm-eval",
+        "fbsource//third-party/pypi/transformers:transformers",
+    ],
+)
+
+runtime.python_library(
+    name = "quantize",
+    srcs = [
+        "quantize/__init__.py",
+        "quantize/ptq.py",
+        "quantize/strategy.py",
+    ],
+    deps = [
+        ":decoder_constants",
+        ":inference",
+        ":utils",
+        "//caffe2:torch",
+        "//executorch/backends/qualcomm/_passes:passes",
+    ],
+)
+
+runtime.python_library(
+    name = "mix_precision_analyzer",
+    srcs = [
+        "mix_precision_analyzer.py",
+    ],
+    deps = [
+        ":inference",
+        "//caffe2:torch",
+        "//executorch/backends/qualcomm/quantizer:quantizer",
+        "//executorch/devtools:lib",
+        "//executorch/exir:lib",
+        "//pytorch/ao:torchao",
+    ],
+)
+
+runtime.python_library(
+    name = "evaluator",
+    srcs = [
+        "evaluator/__init__.py",
+        "evaluator/device_evaluator.py",
+        "evaluator/lm_eval_adapter.py",
     ],
     deps = [
+        ":dataset",
+        ":decoder_constants",
+        ":inference",
         ":tokenizer",
+        ":utils",
         "//caffe2:torch",
+        "//executorch/backends/qualcomm:export_utils",
+        "//executorch/examples/models/llama:eval_library",
+        "//pytorch/ao:torchao",
+        "fbsource//third-party/pypi/lm-eval:lm-eval",
     ],
 )
 
@@ -106,10 +188,16 @@ runtime.python_library(
         "wrappers/llm_wrappers.py",
     ],
     deps = [
+        ":dataset",
         ":decoder_constants",
         ":encoder",
+        ":evaluator",
+        ":inference",
+        ":mix_precision_analyzer",
+        ":quantize",
         ":static_llama",
         ":static_llm_quant_recipe",
+        ":tokenizer",
         "//caffe2:torch",
         "//executorch/backends/qualcomm:export_utils",
         "//executorch/backends/qualcomm/_passes:passes",
@@ -129,10 +217,11 @@ runtime.python_library(
     deps = [
         ":dataset",
         ":decoder_constants",
-        ":decoder_utils",
         ":encoder",
+        ":evaluator",
         ":masking_utils",
         ":static_llm_quant_recipe",
+        ":tokenizer",
         ":wrappers",
         "//executorch/examples/models/llama:source_transformation",
         "//caffe2:torch",
@@ -180,22 +269,6 @@ python_binary(
     ],
 )
 
-python_binary(
-    name = "eval_llama_qnn",
-    srcs = ["eval_llama_qnn.py"],
-    main_function = "executorch.examples.qualcomm.oss_scripts.llama.eval_llama_qnn.main",
-    preload_deps = [
-        "//executorch/extension/llm/custom_ops:model_sharding_py",
-    ],
-    deps = [
-        ":llama_lib",
-        "//executorch/examples/models/llama:eval_library",
-        "//executorch/examples/qualcomm/oss_scripts/llama:range_setting_pt2e",
-        "fbsource//third-party/pypi/lm-eval:lm-eval",
-    ],
-    keep_gpu_sections = True,
-)
-
 runtime.command_alias(
     name = "llama_qnn",
     env = {
diff --git a/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py b/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py
index 9ed44f6f3e0..b53f4bda689 100644
--- a/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py
+++ b/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py
@@ -49,12 +49,10 @@ class AudioModalityConfig(MultiModalityConfig):
 
     Attributes:
         audio_seq_len: Number of audio tokens in the sequence.
-        audio_url: Default audio URL for validation and calibration.
     """
 
     audio_seq_len: int
     n_bins: int
-    audio_url: str
 
     def create_encoder(self, config):
         return self.encoder_class(config, n_bins=self.n_bins)
@@ -71,13 +69,11 @@ class VisionModalityConfig(MultiModalityConfig):
         img_seq_len: Number of image tokens/patches in the sequence.
         img_resized_h: Target height for image resizing (pixels).
         img_resized_w: Target width for image resizing (pixels).
-        img_url: Default image URL for validation and calibration.
     """
 
     img_seq_len: int
     img_resized_h: int
     img_resized_w: int
-    img_url: str
 
     def create_encoder(self, config):
         return self.encoder_class(
@@ -94,7 +90,6 @@ class GraniteSpeechEncoder(AudioModalityConfig):
     encoder_class = GraniteSpeechCTCEncoderWrapper
     audio_seq_len = 171
     n_bins = 844
-    audio_url = "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true"
     quant_recipe = GraniteSpeechEncoderQuantRecipe
     num_sharding = 8
 
@@ -109,7 +104,6 @@ class SmolVLMEncoder(VisionModalityConfig):
     img_seq_len = 64
     img_resized_h = 512
     img_resized_w = 512
-    img_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
     quant_recipe = SmolVLMEncoderQuantRecipe
 
 
@@ -123,5 +117,4 @@ class InternVL3Encoder(VisionModalityConfig):
     img_seq_len = 256
     img_resized_h = 448
     img_resized_w = 448
-    img_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
     quant_recipe = InternVL3EncoderQuantRecipe
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index ea09451a697..d3d4a475288 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -12,7 +12,7 @@
 import os
 import sys
 from multiprocessing.connection import Client
-from typing import Dict
+from typing import Dict, List
 
 import torch
 from executorch.backends.qualcomm.export_utils import (
@@ -30,7 +30,11 @@
     LLMModelConfig,
     SUPPORTED_LLM_MODELS,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.dataset import DatasetBuilder
+from executorch.examples.qualcomm.oss_scripts.llama.dataset import (
+    DataConfig,
+    DatasetBuilder,
+    MessageSample,
+)
 from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
     ATTENTION_SINK_EVICTOR,
     AUDIO_ENCODER,
@@ -46,7 +50,7 @@
     TOK_EMBEDDING_GRAPH_NAMES,
     VISION_ENCODER,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.decoder_runtime_evaluator import (
+from executorch.examples.qualcomm.oss_scripts.llama.evaluator.device_evaluator import (
     DefaultEval,
     SqnrEval,
     TaskEval,
@@ -96,10 +100,9 @@ def compile(
     args,
     decoder_model_config: LLMModelConfig,
     pte_filenames: Dict[str, str],
-    tokenizer,
-    calibration_data,
+    tokenizer_wrapper,
     is_multimodal,
-):
+) -> Dict[str, List]:
     os.makedirs(args.artifact, exist_ok=True)
     multi_modal_mgr = MultiModalManager(control_args=args, config=decoder_model_config)
 
@@ -185,9 +188,8 @@ def compile(
 
     # perform ptq
     multi_modal_mgr.quantize(
-        calibration_data=calibration_data,
+        tokenizer_wrapper=tokenizer_wrapper,
         skip_quantize=skip_quantize,
-        tokenizer=tokenizer,
         backend=get_backend_type(args.backend),
         soc_model=args.soc_model,
     )
@@ -204,15 +206,14 @@ def inference(
     args,
     decoder_model_config: LLMModelConfig,
     runtime_tokenizer_path,
-    tokenizer,
-    chat_template,
+    tokenizer_wrapper: TokenizerWrapper,
     text_decoder_pte_path: str,
     encoder_pte_paths: Dict[str, str],
     tok_embedding_pte_path: str,
     attention_sink_evictor_pte_path: str,
-    calibration_data,
     is_multimodal,
 ):
+    tokenizer = tokenizer_wrapper.tokenizer
 
     assert args.model_mode in EVAL_MODE, f"Unknown model_mode: {args.model_mode}."
 
@@ -250,15 +251,35 @@ def inference(
                 {modality: encoder_pte_path},
             )
 
+    multi_modal_mgr = MultiModalManager(control_args=args, config=decoder_model_config)
+    audio_encoder = multi_modal_mgr.audio_encoder.model
+    vision_encoder = multi_modal_mgr.vision_encoder.model
+    tok_embedding = multi_modal_mgr.text_decoder.calibration_prefill.tok_embedding
+    source_model = multi_modal_mgr.text_decoder.calibration_prefill.decoder
+    audio_token_id = multi_modal_mgr.text_decoder.calibration_prefill.meta.get(
+        "audio_token_id", None
+    )
+    image_token_id = multi_modal_mgr.text_decoder.calibration_prefill.meta.get(
+        "image_token_id", None
+    )
+    dataset_builder = DatasetBuilder(
+        DataConfig.from_args(args),
+        decoder_model_config,
+        tokenizer_wrapper,
+        attn_mask=source_model.get_example_inputs()[1],
+    )
     if PROMPT_EVAL in args.eval_methods:
         prompt_evaluator = DefaultEval(
             args=args,
+            decoder_model_config=decoder_model_config,
             pte_paths=pte_paths,
             runtime_tokenizer_path=runtime_tokenizer_path,
             is_multimodal=is_multimodal,
-            modality_inputs=calibration_data,
+            dataset_builder=dataset_builder,
+        )
+        output_prompt = prompt_evaluator.run(
+            prompt=args.prompt, audio_paths=args.audio_path, image_paths=args.image_path
         )
-        output_prompt = prompt_evaluator.run(prompt=args.prompt)
         eval_results.update(
             {
                 "inference_speed": prompt_evaluator.inference_speed,
@@ -270,31 +291,31 @@ def inference(
 
     if SQNR_EVAL in args.eval_methods:
         assert not is_multimodal, "Modality Model does not support SQNR_EVAL."
-        tokenizer_wrapper = TokenizerWrapper(
-            args,
-            decoder_model_config,
-        )
-        prompt = (
-            tokenizer_wrapper.apply_prompt_template(
-                chat_template, args.prompt[0], args.system_prompt
-            )
-            if chat_template is not None
-            else args.prompt[0]
+        runtime_message = tokenizer_wrapper.prepare_messages(args.prompt)[0]
+        message = MessageSample(
+            files=runtime_message["files_path"],
+            messages=tokenizer_wrapper.make_chat_template(
+                runtime_message["text"], args.system_prompt
+            ),
         )
-        multi_modal_mgr = MultiModalManager(
-            control_args=args, config=decoder_model_config
-        )
-        source_model = multi_modal_mgr.text_decoder.decode.decoder
         sqnr_evaluator = SqnrEval(
             source_model=source_model,
             get_example_inputs=source_model.get_example_inputs,
             args=args,
             pte_paths=pte_paths,
-            tokenizer=tokenizer,
+            tokenizer_wrapper=tokenizer_wrapper,
+            decoder_model_config=decoder_model_config,
             runtime_tokenizer_path=runtime_tokenizer_path,
             is_multimodal=is_multimodal,
+            dataset_builder=dataset_builder,
+            encoder=audio_encoder or vision_encoder,
+            tok_embedding=tok_embedding,
+            audio_token_id=audio_token_id,
+            image_token_id=image_token_id,
+        )
+        sqnr, golden_logits, _ = sqnr_evaluator.run(
+            message, audio_paths=args.audio_path, image_paths=args.image_path
         )
-        sqnr, golden_logits, _ = sqnr_evaluator.run(prompt=prompt)
         logging.info(f"SQNR Eval Score between FP32 nn.Module and QNN: {sqnr}")
         eval_results.update(
             {
@@ -315,11 +336,19 @@ def inference(
                 get_example_inputs=source_model.get_example_inputs,
                 args=args,
                 pte_paths=pte_paths,
-                tokenizer=tokenizer,
+                tokenizer_wrapper=tokenizer_wrapper,
+                decoder_model_config=decoder_model_config,
                 runtime_tokenizer_path=runtime_tokenizer_path,
                 is_multimodal=is_multimodal,
+                dataset_builder=dataset_builder,
+                encoder=audio_encoder or vision_encoder,
+                tok_embedding=tok_embedding,
+                audio_token_id=audio_token_id,
+                image_token_id=image_token_id,
+            )
+            qdq_sqnr, cpu_qdq_logits, _ = qdq_sqnr_evaluator.run(
+                message, audio_paths=args.audio_path, image_paths=args.image_path
             )
-            qdq_sqnr, cpu_qdq_logits, _ = qdq_sqnr_evaluator.run(prompt=prompt)
             eval_results["qdq_sqnr"] = qdq_sqnr
             logging.info(f"SQNR Eval Score between CPU QDQ and QNN: {qdq_sqnr}")
             logging.info(
@@ -335,6 +364,7 @@ def inference(
         # Generate the eval wrapper
         ppl_evaluator = TaskEval(
             args=args,
+            decoder_model_config=decoder_model_config,
             pte_paths=pte_paths,
             tokenizer=tokenizer,
             runtime_tokenizer_path=runtime_tokenizer_path,
@@ -410,7 +440,7 @@ def _build_parser():
 
     parser.add_argument(
         "--prompt",
-        help="User prompts for Llama. When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.",
+        help="User prompts used during runtime inference only (not compilation or calibration). When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.",
         required=True,
         type=str,
         nargs="+",
@@ -506,7 +536,7 @@ def _build_parser():
 
     parser.add_argument(
         "--audio_path",
-        help="Path to the audio file for multimodal language models (MLLM). If not specified, the default audio from encoder/encoder_config.py will be used. The audio should be preprocessed and saved in raw binary format.",
+        help="Path to the audio file used during runtime inference only (not compilation or calibration). For multimodal language models (MLLM). If not specified, the default audio from encoder/encoder_config.py will be used. The audio should be preprocessed and saved in raw binary format.",
         default=[],
         type=str,
         nargs="+",
@@ -514,7 +544,7 @@ def _build_parser():
 
     parser.add_argument(
         "--image_path",
-        help="Path to the image file for multimodal language models (MLLM). If not specified, the default image from encoder/encoder_config.py will be used. The image should be preprocessed and saved in raw binary format.",
+        help="Path to the image file used during runtime inference only (not compilation or calibration). For multimodal language models (MLLM). If not specified, the default image from encoder/encoder_config.py will be used. The image should be preprocessed and saved in raw binary format.",
         default=[],
         type=str,
         nargs="+",
@@ -528,7 +558,7 @@ def _build_parser():
         help="Choose eval methods(default: prompt_eval). Users can provide more than 1 eval methods. For example: --eval_methods tasks_eval sqnr_eval."
         "Following eval methods are supported:"
         "1) prompt_eval: Model will generate the output response based on the provided prompt through the flag --prompt."
-        "2) tasks_eval: This will eval the tasks provided through the flag --tasks."
+        "2) tasks_eval: This will eval the tasks provided through the flag --eval_tasks."
         "3) sqnr_eval: This will eval the sqnr between between QNN's output logit V.S. Static Llama nn.Module's output logit. Eval is based on the provided prompt through the --prompt flag. Please note that sqnr will only eval the prompt's logit but not the new generated token's logit.",
     )
 
@@ -546,6 +576,7 @@ def _build_parser():
         default=1,
         help="number of samples to evalulate. If not set, evaluate all samples",
     )
+
     parser.add_argument(
         "--eval_num_fewshot",
         type=int,
@@ -577,6 +608,19 @@ def _build_parser():
         help="Number of examples to calibrate in few-shot context",
     )
 
+    parser.add_argument(
+        "--calib_samples",
+        nargs="+",
+        type=str,
+        default=None,
+        help="One or more paths to calibration sample JSON files. Only JSON format is supported. "
+        "Each file must be a flat list of sample objects: "
+        '[{"files": ["path_or_url", ...], "messages": [{"role": "user"|"assistant", "content": "..." | [...]}]}]. '
+        '"files" is optional and only required for multimodal models (VLM: image paths/URLs, ALM: audio paths/URLs). '
+        '"messages" follows the HuggingFace chat template; "content" can be a plain string or a list of content blocks. '
+        "Multiple files are merged.",
+    )
+
     parser.add_argument(
         "-F",
         "--use_fp16",
@@ -587,31 +631,16 @@ def _build_parser():
 
     parser.add_argument("-v", "--verbose", action="store_true")
 
-    parser.add_argument(
-        "--calibration_num_threads",
-        type=int,
-        default=0,
-        help="Thread count for calibration forward passes. 0 = auto-tune (default).",
-    )
-
     parser.add_argument(
         "--quant_recipe_suggestion",
         action="store_true",
         help="Enable automatic quant recipe suggestion in PTQ",
     )
 
-    parser.add_argument(
-        "--skip_user_prompt_calibration",
-        action="store_true",
-        help="Skip using user prompt for calibration. Useful when only dataset-based calibration is desired.",
-    )
-
     return parser
 
 
 def export_llama(args) -> None:
-    if args.calibration_num_threads < 0:
-        raise ValueError("--calibration_num_threads must be >= 0")
     if args.compile_only and args.pre_gen_pte:
         raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true")
     if (TASKS_EVAL or SQNR_EVAL) in args.eval_methods and args.model_mode not in {
@@ -622,6 +651,12 @@ def export_llama(args) -> None:
             "Eval device perplexity is only supported for KV mode. Hybrid mode will only use KV mode when evaluating tasks/sqnr."
         )
     if TASKS_EVAL in args.eval_methods and args.eval_tasks is None:
+        if args.calib_tasks is None:
+            logging.warning(
+                "tasks_eval was requested but --eval_tasks and --calib_tasks are "
+                "both unset. --eval_tasks selects the tasks to evaluate; "
+                "--calib_tasks selects task data for quantization calibration."
+            )
         raise RuntimeError("Please provide --eval_tasks to eval perplexity")
     assert (
         args.decoder_model in SUPPORTED_LLM_MODELS
@@ -674,17 +709,9 @@ def export_llama(args) -> None:
         args,
         decoder_model_config,
     )
-    runtime_tokenizer_path, tokenizer, chat_template = (
-        tokenizer_wrapper.get_runtime_tokenizer(
-            args.tokenizer_model, args.tokenizer_bin
-        )
-    )
+    runtime_tokenizer_path = tokenizer_wrapper.runtime_tokenizer_path
 
     # Prepare dataset
-    dataset_builder = DatasetBuilder(args, decoder_model_config, tokenizer_wrapper)
-    calibration_data = dataset_builder.prepare_calibration_dataset(
-        args.prompt, chat_template
-    )
     text_decoder_pte_path = f"{args.artifact}/{pte_filenames[TEXT_DECODER]}.pte"
     attention_sink_evictor_pte_path = f"{args.artifact}/{ATTENTION_SINK_EVICTOR}.pte"
     tok_embedding_pte_path = f"{args.artifact}/{pte_filenames[TOK_EMBEDDING]}.pte"
@@ -701,13 +728,26 @@ def export_llama(args) -> None:
             hasattr(decoder_model_config, AUDIO_ENCODER),
         ]
     )
-    # TODO: Implement attention sink support for multimodal models (vision/audio).
-    assert (
-        not is_multimodal or args.use_attention_sink is None
-    ), "Multimodal models currently do not support attention sink feature."
-    assert (
-        not is_multimodal or not args.skip_user_prompt_calibration
-    ), "--skip_user_prompt_calibration is not supported for multimodal models (VLM/ALM) as they do not support task-based calibration yet."
+    if is_multimodal:
+        # TODO: Implement attention sink support for multimodal models (vision/audio).
+        if args.use_attention_sink is not None:
+            raise ValueError(
+                "Multimodal models currently do not support attention sink feature."
+            )
+        if args.eval_tasks is not None:
+            raise ValueError("Multimodal models do not support --eval_tasks.")
+
+    if not args.pre_gen_pte:
+        if is_multimodal and args.calib_samples is None:
+            raise ValueError(
+                "For MLLMs calibration data is required for compilation. "
+                "Provide --calib_samples with a vision/audio JSON file."
+            )
+        if not is_multimodal and not any((args.calib_tasks, args.calib_samples)):
+            raise ValueError(
+                "For LLMs calibration data is required for compilation. "
+                "Provide --calib_tasks or --calib_samples."
+            )
 
     if args.pre_gen_pte:
         text_decoder_pte_path = f"{args.pre_gen_pte}/{pte_filenames[TEXT_DECODER]}.pte"
@@ -735,13 +775,11 @@ def export_llama(args) -> None:
             args,
             decoder_model_config,
             runtime_tokenizer_path,
-            tokenizer,
-            chat_template,
+            tokenizer_wrapper,
             text_decoder_pte_path,
             encoder_pte_paths,
             tok_embedding_pte_path,
             attention_sink_evictor_pte_path,
-            calibration_data,
             is_multimodal,
         )
         print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}")
@@ -751,8 +789,7 @@ def export_llama(args) -> None:
         args,
         decoder_model_config,
         pte_filenames,
-        tokenizer,
-        calibration_data,
+        tokenizer_wrapper,
         is_multimodal,
     )
     if args.use_attention_sink:
@@ -797,13 +834,11 @@ def export_llama(args) -> None:
         args,
         decoder_model_config,
         runtime_tokenizer_path,
-        tokenizer,
-        chat_template,
+        tokenizer_wrapper,
         text_decoder_pte_path,
         encoder_pte_paths,
         tok_embedding_pte_path,
         attention_sink_evictor_pte_path,
-        calibration_data,
         is_multimodal,
     )
 
diff --git a/examples/qualcomm/oss_scripts/llama/masking_utils.py b/examples/qualcomm/oss_scripts/llama/masking_utils.py
index 7725b7589e1..a09cdf1240f 100644
--- a/examples/qualcomm/oss_scripts/llama/masking_utils.py
+++ b/examples/qualcomm/oss_scripts/llama/masking_utils.py
@@ -5,10 +5,12 @@
 # LICENSE file in the root directory of this source tree.
 
 from abc import ABC, abstractmethod
-from typing import List, Union
+from typing import List, Tuple, Union
 
 import torch
 
+PADDING_MASK_VALUE = -255.0
+
 
 def create_causal_attn_mask(max_batch_size: int, ar_len: int, max_context_len: int):
     """
@@ -21,14 +23,14 @@ def create_causal_attn_mask(max_batch_size: int, ar_len: int, max_context_len: i
 
     ● = activate (can attend), ○ = inactivate (masked)
     """
-    mask = torch.full((ar_len, ar_len), -255.0)
+    mask = torch.full((ar_len, ar_len), PADDING_MASK_VALUE)
     mask_cond = torch.arange(ar_len)
     mask.masked_fill_(mask_cond.view(1, ar_len) <= mask_cond.view(ar_len, 1), 0)
 
     if max_context_len != ar_len:
         mask = torch.cat(
             [
-                torch.ones(ar_len, max_context_len - ar_len) * -255.0,
+                torch.ones(ar_len, max_context_len - ar_len) * PADDING_MASK_VALUE,
                 mask,
             ],
             dim=-1,
@@ -50,7 +52,7 @@ def create_sliding_window_attn_mask(
 
     ● = activate (can attend), ○ = inactivate (masked)
     """
-    mask = torch.full((ar_len, ar_len), -255.0)
+    mask = torch.full((ar_len, ar_len), PADDING_MASK_VALUE)
     mask_cond = torch.arange(ar_len)
     mask.masked_fill_(
         (mask_cond.view(1, ar_len) <= mask_cond.view(ar_len, 1))
@@ -61,7 +63,7 @@ def create_sliding_window_attn_mask(
     if max_context_len != ar_len:
         mask = torch.cat(
             [
-                torch.ones(ar_len, max_context_len - ar_len) * -255.0,
+                torch.ones(ar_len, max_context_len - ar_len) * PADDING_MASK_VALUE,
                 mask,
             ],
             dim=-1,
@@ -96,7 +98,6 @@ def mask(self) -> torch.Tensor:
     def smart_mask_init(self, pos):
         """
         Initialize the attention mask by smart mask initialization method after model forward.
-
         Args:
             pos (int): Current position in the sequence.
         """
@@ -114,6 +115,17 @@ def smart_mask_update(self, pos, n_updates, lade_pos_offset):
         """
         pass
 
+    def _extra_init_kwargs(self) -> dict:
+        return dict()  # base mask: no extra kwargs for from_input_ids
+
+    def _mask_padding_positions(
+        self, input_ids: List[List[int]], max_seq_length: int
+    ) -> None:
+        """Fill query rows at index >= len(seq) with PADDING_MASK_VALUE, in place."""
+        seq_lens = torch.tensor([len(ids) for ids in input_ids]).unsqueeze(1)
+        is_pad = torch.arange(max_seq_length).unsqueeze(0) >= seq_lens
+        self.mask.masked_fill_(is_pad.unsqueeze(-1), PADDING_MASK_VALUE)
+
 
 class CausalAttentionMask(BaseAttentionMask):
     def __init__(self, max_batch_size: int, ar_len: int, max_context_len: int):
@@ -134,28 +146,22 @@ def smart_mask_init(self, pos):
     def smart_mask_update(self, pos, n_updates, _):
         """
         Smart Mask mechanism for attention mask updating
-
         Initial mask(5x15) layout (before any updates):
             Each row represents a query token in the autoregressive context.
             ● = activate (can attend), ○ = inactivate (masked)
-
             0 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ○ ○ ○ ○
             1 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ○ ○ ○
             2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○
             3 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ● ○
             4 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ● ●
-
         After 1st update (e.g., pos=0, n_updates=5, sliding_window=3):
             Newly added tokens are unmasked (set to 0).
-
             0 ● ● ● ● ● ○ ○ ○ ○ ○ ● ○ ○ ○ ○
             1 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ○ ○ ○
             2 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ● ○ ○
             3 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ● ● ○
             4 ● ● ● ● ● ○ ○ ○ ○ ○ ● ● ● ● ●
-
         After 2nd update (e.g., pos=5, n_updates=5):
-
             0 ● ● ● ● ● ● ● ● ● ● ● ○ ○ ○ ○
             1 ● ● ● ● ● ● ● ● ● ● ● ● ○ ○ ○
             2 ● ● ● ● ● ● ● ● ● ● ● ● ● ○ ○
@@ -166,6 +172,16 @@ def smart_mask_update(self, pos, n_updates, _):
         end_pos = pos + n_updates
         self.mask[:, :, start_pos:end_pos] = 0
 
+    @classmethod
+    def from_input_ids(
+        cls, input_ids: List[List[int]], max_seq_length: int, **kwargs
+    ) -> "CausalAttentionMask":
+        """Create a square causal mask per sequence, with padded rows masked."""
+        causal = cls(len(input_ids), max_seq_length, max_seq_length)
+        causal._mask = causal._mask.clone()  # own copy; fill below is in-place
+        causal._mask_padding_positions(input_ids, max_seq_length)
+        return causal
+
 
 class SlidingWindowAttentionMask(BaseAttentionMask):
     def __init__(
@@ -194,31 +210,24 @@ def smart_mask_init(self, pos):
     def smart_mask_update(self, pos, n_updates, lade_pos_offset):
         """
         Smart Mask mechanism for attention mask updating
-
         Initial mask(5x15) layout (before any updates):
             Each row represents a query token in the autoregressive context.
             ● = activate (can attend), ○ = inactivate (masked)
-
             0 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ○ ○ ○ ○
             1 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ○ ○ ○
             2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○
             3 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○
             4 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ●
-
         After 1st update (e.g., pos=0, n_updates=5, sliding_window=3):
             Newly added tokens are unmasked (set to 0).
             Earlier tokens lose access to older cache due to sliding window limits.
-
             0 ○ ○ ○ ● ● ○ ○ ○ ○ ○ ● ○ ○ ○ ○
             1 ○ ○ ○ ○ ● ○ ○ ○ ○ ○ ● ● ○ ○ ○
             2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○
             3 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○
             4 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ●
-
-
         After 2nd update (e.g., pos=5, n_updates=5, sliding_window=3):
             Sliding window shifts again, masking older positions and activate new position.
-
             0 ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ ○ ○
             1 ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○ ○
             2 ○ ○ ○ ○ ○ ○ ○ ○ ○ ○ ● ● ● ○ ○
@@ -240,7 +249,24 @@ def smart_mask_update(self, pos, n_updates, lade_pos_offset):
             if end_pos > available_cache_len:
                 # Mask tokens that are no longer within the sliding window
                 # TODO: [Optional]: it can be optimized by computing the exact start index
-                self.mask[:, i, : end_pos - available_cache_len] = -255.0
+                self.mask[:, i, : end_pos - available_cache_len] = PADDING_MASK_VALUE
+
+    def _extra_init_kwargs(self) -> dict:
+        return dict(sliding_window=self.sliding_window)  # kwargs for from_input_ids
+
+    @classmethod
+    def from_input_ids(
+        cls,
+        input_ids: List[List[int]],
+        max_seq_length: int,
+        sliding_window: int,
+        **kwargs,
+    ) -> "SlidingWindowAttentionMask":
+        """Create a square sliding-window mask per sequence, padded rows masked."""
+        windowed = cls(len(input_ids), max_seq_length, max_seq_length, sliding_window)
+        windowed._mask = windowed._mask.clone()  # own copy; fill below is in-place
+        windowed._mask_padding_positions(input_ids, max_seq_length)
+        return windowed
 
 
 class AttentionMask:
@@ -257,3 +283,28 @@ def smart_mask_update(self, pos, n_updates, lade_pos_offset=None):
 
     def __iter__(self):
         return iter([mask.mask for mask in self.masks])
+
+    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, ...]:
+        return tuple(each.mask[idx] for each in self.masks)  # idx-th entry per mask
+
+    @classmethod
+    def from_input_ids(
+        cls,
+        template: "AttentionMask",
+        input_ids: List[List[int]],
+        max_seq_length: int,
+    ) -> "AttentionMask":
+        """
+        Rebuild every mask in ``template.masks`` for the given ``input_ids``.
+
+        Each entry is produced by its own class's ``from_input_ids``, called with
+        ``input_ids``, ``max_seq_length`` and that entry's ``_extra_init_kwargs()``;
+        the resulting list is wrapped in a new ``cls`` instance.
+        """
+        # One rebuilt mask per template entry, in the same order.
+        rebuilt = []
+        for src in template.masks:
+            extra = src._extra_init_kwargs()
+            builder = type(src).from_input_ids
+            rebuilt.append(builder(input_ids, max_seq_length, **extra))
+        return cls(rebuilt)
diff --git a/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py b/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py
index 02f19a0b676..b16a5e2a252 100644
--- a/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py
+++ b/examples/qualcomm/oss_scripts/llama/mix_precision_analyzer.py
@@ -26,7 +26,9 @@
 from executorch.devtools.inspector._intermediate_output_capturer import (
     IntermediateOutputCapturer,
 )
+from executorch.examples.qualcomm.oss_scripts.llama.inference import DecoderInference
 from executorch.exir.debug_handle_utils import DEBUG_HANDLE_KEY
+from torch.utils.data import DataLoader
 from torchao.quantization.pt2e import MinMaxObserver
 from torchao.quantization.utils import compute_error
 
@@ -74,45 +76,49 @@ def __init__(
             torch.ops.quantized_decomposed.dequantize_per_tensor.default,
         }
 
-    def analyze(self, samples: List[Tuple], num_sharding: int = 5) -> "SqnrReport":
+    def analyze(
+        self,
+        decoder_inference: DecoderInference,
+        text_dataloader: DataLoader,
+        num_sharding: int = 5,
+    ) -> "SqnrReport":
         """
-        Evaluates both the fp32 and QDQ graphs using the provided input_samples
+        Evaluates both the fp32 and QDQ graphs using batches from text_dataloader
         and computes the per-node Signal-to-Quantization-Noise Ratio (SQNR).
 
         Args:
-            input_samples: A list of tuples containing tensors corresponding to the model's inputs.
-            num_sharding: Number of contiguous layer groups to bucket the model into for SQNR
-                aggregation. Rather than flagging individual layers, layers are grouped into
-                ``num_sharding`` consecutive ranges (e.g. layers 0-7, 8-15, …) and the SQNR
-                is averaged within each group. Because upgrading isolated layers is usually ineffective: quantization error from surrounding
-                low-precision layers accumulates and dominates downstream behavior.
+            decoder_inference: Provides get_inputs() to assemble each
+                batch into the compiled model's input signature.
+            text_dataloader: DataLoader for text-only calibration batches.
+            num_sharding: Number of contiguous layer groups to bucket the model
+                into for SQNR aggregation.
 
         Returns:
             An ``SqnrReport`` object containing the aggregated analysis results.
         """
-        input_samples = [sample for sample in samples if sample is not None]
-
-        if not input_samples:
-            logging.warning("No input samples provided for analysis.")
-            return SqnrReport(
-                self.model_name, defaultdict(list), [], self.analysis_recipe
-            )
-
         self._assign_debug_handles(self.fp32_gm)
         self._assign_debug_handles(self.qdq_gm)
 
-        num_samples = len(input_samples)
-        logging.info(f"num samples: {num_samples}")
-
-        # Accumulate SQNR per module path across all input samples
+        num_samples = 0
         path_sqnr_sum = defaultdict(float)
-        for sample in input_samples:
+        for text_batch in text_dataloader:
+            input_ids = text_batch["input_ids"]
+            attn_mask = text_batch["attention_mask"]
+            sample = tuple(decoder_inference.get_inputs(input_ids, attn_mask))
             fp_outputs = self._capture(self.fp32_gm, sample)
             qdq_outputs = self._capture(self.qdq_gm, sample)
             for path, sqnr in self._match_and_score(fp_outputs, qdq_outputs).items():
                 path_sqnr_sum[path] += sqnr
+            num_samples += 1
+
+        if num_samples == 0:
+            logging.warning("No input samples provided for analysis.")
+            return SqnrReport(
+                self.model_name, defaultdict(list), [], self.analysis_recipe
+            )
+
+        logging.info(f"num samples: {num_samples}")
 
-        # Average the SQNRs and group them by normalized layer ranges
         report = defaultdict(list)
         for path, total_sqnr in path_sqnr_sum.items():
             group = self._normalize_group_name(
diff --git a/examples/qualcomm/oss_scripts/llama/tokenizer.py b/examples/qualcomm/oss_scripts/llama/tokenizer.py
index 2894777f776..954b73384fa 100644
--- a/examples/qualcomm/oss_scripts/llama/tokenizer.py
+++ b/examples/qualcomm/oss_scripts/llama/tokenizer.py
@@ -8,19 +8,21 @@
 import json
 import logging
 import re
-import warnings
-from typing import Callable, List
+from typing import Dict, List
 
 from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig
 from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
     AUDIO_ENCODER,
     VISION_ENCODER,
 )
+from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import ModelArgs
 from pytorch_tokenizers import get_tokenizer, TiktokenTokenizer
 from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer
 
 from transformers import AutoTokenizer
 
+
+# Generic special tokens for multimodality, used for runtime identification.
 IMG_TOKEN = "<image>"
 AUDIO_TOKEN = "<audio>"
 
@@ -66,7 +68,32 @@ def __init__(self, control_args: argparse.Namespace, config: LLMModelConfig):
         self.control_args = control_args
         self.config = config
         self.repo_id = config.repo_id
-        self.apply_chat_template = config.instruct_model
+        self._instruct_model = config.instruct_model
+
+        self.tokenizer = None
+        self.chat_template = None
+
+        params_path = (
+            config.params_path if control_args.params is None else control_args.params
+        )
+        with open(params_path) as f:
+            model_args = ModelArgs(**json.load(f))
+        self.vocab_size = model_args.vocab_size
+
+        self.runtime_tokenizer_path = self._init_tokenizer(
+            control_args.tokenizer_model, control_args.tokenizer_bin
+        )
+
+    def _init_tokenizer(self, tokenizer_model, tokenizer_bin) -> str:
+        """Set self.tokenizer (and, for HF models, self.chat_template); return path."""
+        if self.decoder_model in {"stories110m", "stories260k"}:
+            loaded = self._from_tokenizer_model_and_bin(tokenizer_model, tokenizer_bin)
+        elif "llama3_2" in self.decoder_model:
+            loaded = self._from_tokenizer_model(tokenizer_model)
+        else:
+            *loaded, self.chat_template = self._from_hf()
+        runtime_path, self.tokenizer = loaded
+        return runtime_path
 
     def _from_tokenizer_model_and_bin(self, tokenizer_model, tokenizer_bin):
         tokenizer = get_tokenizer(tokenizer_model)
@@ -89,7 +116,7 @@ def _from_hf(self):
         tokenizer = AutoTokenizer.from_pretrained(self.repo_id)
         chat_template = (
             tokenizer.apply_chat_template
-            if hasattr(tokenizer, "apply_chat_template") and self.apply_chat_template
+            if hasattr(tokenizer, "apply_chat_template") and self._instruct_model
             else None
         )
         tokenizer_artifacts = tokenizer.save_pretrained(self.artifact)
@@ -112,23 +139,6 @@ def _from_hf(self):
 
         return runtime_tokenizer_path, tokenizer, chat_template
 
-    def get_runtime_tokenizer(self, tokenizer_model, tokenizer_bin):
-        tokenizer = None
-        runtime_tokenizer_path = ""
-        chat_template = None
-        if self.decoder_model in {"stories110m", "stories260k"}:
-            runtime_tokenizer_path, tokenizer = self._from_tokenizer_model_and_bin(
-                tokenizer_model, tokenizer_bin
-            )
-        elif "llama3_2" in self.decoder_model:
-            runtime_tokenizer_path, tokenizer = self._from_tokenizer_model(
-                tokenizer_model
-            )
-        else:
-            runtime_tokenizer_path, tokenizer, chat_template = self._from_hf()
-
-        return runtime_tokenizer_path, tokenizer, chat_template
-
     def prepare_messages(self, prompts: List[str]):  # noqa: C901
         """
         Validate and normalize a multi-turn prompt sequence, then prepare it into
@@ -184,14 +194,9 @@ def prepare_messages(self, prompts: List[str]):  # noqa: C901
 
         audio_paths = self.control_args.audio_path
         if hasattr(self.config, AUDIO_ENCODER):
-            # Load audio from user-specified path (URL or local file)
-            # fall back to the default audio URL if no audio is provided.
             if not audio_paths:
-                audio_paths = [getattr(self.config, AUDIO_ENCODER).audio_url]
-                warnings.warn(
-                    f"No audio path/URL provided, using default audio URL from huggingface: {audio_paths}",
-                    UserWarning,
-                    stacklevel=1,
+                raise ValueError(
+                    "No audio path/URL provided. Please specify --audio_path."
                 )
             num_audios = len(audio_paths)
             total_audio_tokens = sum(prompt.count(AUDIO_TOKEN) for prompt in prompts)
@@ -200,24 +205,17 @@ def prepare_messages(self, prompts: List[str]):  # noqa: C901
             elif total_audio_tokens != num_audios:
                 raise ValueError(
                     f"Number of <audio> tokens ({total_audio_tokens}) does not match "
-                    f"number of audios ({num_audios}). Please check your prompts and audio paths."
-                    "Please check your prompts and audio paths.\n\n"
+                    f"number of audios ({num_audios}). Please check your prompts and audio paths.\n\n"
                     f"=== Prompt ===\n{prompts}\n"
                     f"=== Audio paths ===\n{audio_paths}"
                 )
 
         image_paths = self.control_args.image_path
         if hasattr(self.config, VISION_ENCODER):
-            # Load image from user-specified path (URL or local file)
-            # fall back to the default image URL if no image is provided.
             if not image_paths:
-                image_paths = [getattr(self.config, VISION_ENCODER).img_url]
-                warnings.warn(
-                    f"No image path/URL provided, using default image URL: {image_paths}",
-                    UserWarning,
-                    stacklevel=1,
+                raise ValueError(
+                    "No image path/URL provided. Please specify --image_path."
                 )
-
             num_images = len(image_paths)
             total_image_tokens = sum(prompt.count(IMG_TOKEN) for prompt in prompts)
 
@@ -226,8 +224,7 @@ def prepare_messages(self, prompts: List[str]):  # noqa: C901
             elif total_image_tokens != num_images:
                 raise ValueError(
                     f"Number of <image> tokens ({total_image_tokens}) does not match "
-                    f"number of images ({num_images}). Please check your prompts and image paths."
-                    "Please check your prompts and image paths.\n\n"
+                    f"number of images ({num_images}). Please check your prompts and image paths.\n\n"
                     f"=== Prompt ===\n{prompts}\n"
                     f"=== Image paths ===\n{image_paths}"
                 )
@@ -334,26 +331,34 @@ def _split_prompt(self, prompt: str):
         pattern = f"({'|'.join(map(re.escape, split_tokens))})"
         return [part for part in re.split(pattern, prompt) if part]
 
-    def apply_prompt_template(
+    def make_chat_template(
         self,
-        chat_template: Callable,
         prompt: str,
         system_prompt: str = None,
-    ) -> str:
-        """
-        Apply chat template to format the prompt for different modalities.
+        assistant_text: str = None,
+    ) -> List[Dict]:
+        """Build a HuggingFace-format message list for runtime evaluation.
+
+        Converts a raw prompt into the structured message format expected by
+        ``apply_chat_template``.
 
         Args:
-            chat_template: The chat template function from tokenizer
-            prompt: Input text prompt
-            system_prompt: Optional system prompt
+            prompt: Raw user prompt, may contain ``<image>`` or ``<audio>`` tokens.
+            system_prompt: Optional system message, placed before the user message.
+            assistant_text: Optional assistant turn; disables generation prompt when set.
 
         Returns:
-            Formatted prompt string
+            HuggingFace-format message list
         """
 
         messages = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
         message = {"role": "user", "content": prompt}
+        if self.chat_template is None:
+            messages.append(message)
+            return messages
+
         if self.decoder_model in VLM_SPECIAL_TOKENS:
             contents = self._split_prompt(prompt)
             message["content"] = []
@@ -367,32 +372,43 @@ def apply_prompt_template(
                         {"type": "text", "text": content},
                     )
         elif self.decoder_model in ALM_SPECIAL_TOKENS:
+            specials = ALM_SPECIAL_TOKENS[self.decoder_model]
+
             contents = self._split_prompt(prompt)
             message["content"] = ""
             for content in contents:
                 if content == AUDIO_TOKEN:
-                    message["content"] += ALM_SPECIAL_TOKENS[self.decoder_model][
-                        AUDIO_TOKEN
-                    ]
+                    message["content"] += specials[AUDIO_TOKEN]
                 else:
                     message["content"] += content
 
         messages.append(message)
-        if system_prompt:
-            messages.append({"role": "system", "content": system_prompt})
+        if assistant_text is not None:
+            messages.append({"role": "assistant", "content": assistant_text})
+
+        return messages
+
+    def apply_chat_template(
+        self,
+        messages: List[Dict],
+    ) -> str:
+        """Format a message list into a prompt string.
+
+        Intended for calibration dataset formatting where the input is already
+        a HuggingFace-format message list (e.g. loaded from --calib_samples JSON).
+
+        If chat_template is not set (non-instruct or non-HF models), falls back
+        to concatenating each message's 'content' field directly.
+        """
+        if self.chat_template is None:
+            return "".join(m["content"] for m in messages)
 
-        template_prompt = chat_template(
-            messages, tokenize=False, add_generation_prompt=True
+        template_prompt = self.chat_template(
+            messages, tokenize=False, add_generation_prompt=False
         )
 
-        # edge cases handling:
         # Gemma may produce unexpected output if the prompt contains an extra <bos> token.
-        # This can happen after applying a prompt template, which might inject <bos> unintentionally.
-        # To prevent decoding issues, we explicitly remove <bos> token
-        if chat_template and self.decoder_model in {
-            "gemma-2b",
-            "gemma3-1b",
-        }:
+        if self.decoder_model in {"gemma-2b", "gemma3-1b"}:
             template_prompt = template_prompt.replace("<bos>", "")
 
         return template_prompt
diff --git a/examples/qualcomm/oss_scripts/llama/wrappers/base_component.py b/examples/qualcomm/oss_scripts/llama/wrappers/base_component.py
index 0026354d5d3..149a376e918 100644
--- a/examples/qualcomm/oss_scripts/llama/wrappers/base_component.py
+++ b/examples/qualcomm/oss_scripts/llama/wrappers/base_component.py
@@ -14,9 +14,8 @@
 from dataclasses import dataclass
 from enum import Enum
 from functools import wraps
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional
 
-import torch
 from executorch.backends.qualcomm.serialization.qc_schema import (
     QnnExecuTorchBackendType,
 )
@@ -34,12 +33,17 @@
     StaticLLMQuantRecipe,
 )
 from executorch.exir.backend.compile_spec_schema import CompileSpec
+from torch.utils.data import DataLoader
 from transformers import AutoConfig
 
 
 class Mode(Enum):
+    # AR-N graph compiled and deployed for runtime.
     PREFILL = 1
+    # AR-1 graph compiled and deployed for runtime.
     DECODE = 2
+    # Full AR sequence mode; used for quantization, never deployed.
+    # After convert_pt2e, its scale/zp are propagated to DECODE and PREFILL via _encoding_override.
     CALIBRATE = 3
 
 
@@ -103,6 +107,7 @@ def process_model_args(
     else:
         raise ValueError(f"Unsupported mode: {mode}")
 
+    # TODO: support multi-batch (max_batch_size > 1) for Mode.CALIBRATE.
     model_args.max_batch_size = 1
     model_args.max_seq_len = control_args.max_seq_len
     model_args.max_context_len = control_args.max_context_len
@@ -162,9 +167,9 @@ def process(self, request: Any):
 class Request:
     @dataclass
     class CalibrationData:
-        datasets: List[Tuple[torch.Tensor]] = None
-        intermediate_outputs: List[Tuple[torch.Tensor]] = None
-        qdq_intermediate_outputs: List[Tuple[torch.Tensor]] = None
+        datasets: Optional[DataLoader] = None
+        intermediate_outputs: Optional[DataLoader] = None
+        qdq_intermediate_outputs: Optional[DataLoader] = None
 
     @dataclass
     class Data:
diff --git a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
index 720ddb97800..9bab682eac8 100644
--- a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
+++ b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
@@ -9,13 +9,11 @@
 import inspect
 import json
 import logging
-import os
 import re
-import time
 import types
 
 from functools import partial
-from typing import Any, Dict, List
+from typing import Dict, List
 
 import torch
 
@@ -48,6 +46,11 @@
     LLM_VARIANT_ARCHS,
     LLMModelConfig,
 )
+from executorch.examples.qualcomm.oss_scripts.llama.dataset import (
+    DataConfig,
+    DatasetBuilder,
+    ModalityEncoderDataset,
+)
 from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
     AUDIO_ENCODER,
     DECODE_QDQ_FILENAME,
@@ -58,16 +61,19 @@
     TOK_EMBEDDING_GRAPH_NAMES,
     VISION_ENCODER,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.decoder_utils import (
-    _modality_inputs_merger,
-    graph_module_inference,
-)
 from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import (
     GraniteSpeechEncoder,
 )
 from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_quant_recipe import (
     EncoderQuantRecipe,
 )
+from executorch.examples.qualcomm.oss_scripts.llama.evaluator.lm_eval_adapter import (
+    run_lm_eval,
+)
+from executorch.examples.qualcomm.oss_scripts.llama.inference import (
+    DecoderInference,
+    EncoderInference,
+)
 from executorch.examples.qualcomm.oss_scripts.llama.mix_precision_analyzer import (
     PerLayerSqnrAnalyzer,
     save_suggest_recipes,
@@ -79,9 +85,11 @@
     LlamaModel,
     ModelArgs,
 )
+from executorch.examples.qualcomm.oss_scripts.llama.quantize import PTQStrategy
 from executorch.examples.qualcomm.oss_scripts.llama.static_llm_quant_recipe import (
     StaticLLMQuantRecipe,
 )
+from executorch.examples.qualcomm.oss_scripts.llama.tokenizer import TokenizerWrapper
 from executorch.examples.qualcomm.oss_scripts.llama.wrappers.base_component import (
     Component,
     get_model_specific_kwargs,
@@ -97,16 +105,11 @@
 from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass
 from executorch.extension.llm.custom_ops import model_sharding
 from executorch.extension.llm.export.builder import DType
+from torch.utils.data import DataLoader
 from torchao.prototype.spinquant import apply_spinquant
 from torchao.quantization.pt2e import MinMaxObserver
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
-from transformers import (
-    AutoModel,
-    AutoModelForCausalLM,
-    AutoModelForImageTextToText,
-    AutoModelForSpeechSeq2Seq,
-    AutoModelForVision2Seq,
-)
+from transformers import AutoModel, AutoModelForSpeechSeq2Seq
 
 
 def is_node_src_start_with_name(node: torch.fx.Node, kv_cache_prefix: str) -> bool:
@@ -201,6 +204,19 @@ def __init__(
             self.dep_table[SplitGraph] = [FoldQDQ]
             self.dep_table[TagQuantIO] = [SplitGraph]
 
+        self._decoder_inference = (
+            DecoderInference(
+                get_example_inputs=self.get_example_inputs,
+                audio_token_id=self.meta.get("audio_token_id", None),
+                image_token_id=self.meta.get("image_token_id", None),
+                max_context_len=self.meta["get_max_context_len"],
+                max_batch_size=self.meta["get_max_batch_size"],
+                use_i64_token=self.control_args.embedding_quantize is not None,
+            )
+            if self.decoder is not None
+            else None
+        )
+
     def _prepare_model(self):  # noqa: C901
         if (instance := self._get_model_instance()) is None:
             return None, None
@@ -399,6 +415,11 @@ def _get_model_instance(self) -> LlamaModel:
 
         return tok_embedding, decoder
 
+    @property
+    def attn_mask(self):
+        """Attention mask for this decoder graph, used as a schema for dataset construction."""
+        return self.example_input[1]
+
     def _save_logits_quant_attrs(self):
         for node in self.decoder.graph.nodes:
             if node.op == "output":
@@ -482,7 +503,7 @@ def _quant_recipe_suggestion(
         self,
         fp32_gm: torch.fx.GraphModule,
         qdq_gm: torch.fx.GraphModule,
-        input_sample: tuple,
+        text_dataloader: DataLoader,
         recipe: StaticLLMQuantRecipe,
     ):
         """
@@ -503,148 +524,16 @@ def _quant_recipe_suggestion(
             fp32_gm=fp32_gm,
             qdq_gm=qdq_gm,
             analysis_recipe=recipe,
-        ).analyze(input_sample)
+        ).analyze(
+            self._decoder_inference,
+            text_dataloader,
+        )
         report.save_analysis_summary()
         suggest_recipe_overrides = report.suggest_recipe_overrides()
         save_suggest_recipes(report, suggest_recipe_overrides)
 
-    def _auto_tune_calibration_threads(self):
-        """Find the optimal thread count for calibration via quick microbenchmark.
-
-        AR1 decode calibration is SGEMV-dominated (memory-bandwidth-bound).
-        The default thread count (os.cpu_count()) is typically far too high,
-        causing massive OpenMP sync overhead. This runs a few forward passes
-        at candidate thread counts and picks the fastest.
-        """
-        # Use sched_getaffinity when available — it respects cgroup/taskset
-        # constraints (e.g. containers), unlike os.cpu_count() which returns
-        # the host total regardless of pinning.
-        available = (
-            len(os.sched_getaffinity(0))
-            if hasattr(os, "sched_getaffinity")
-            else (os.cpu_count() or 1)
-        )
-        baseline = min(torch.get_num_threads(), available)
-        # Sample fractions of the thread ceiling from low through the
-        # bandwidth-saturation knee up to the current default.
-        fractions = (1 / 8, 1 / 4, 3 / 8, 1 / 2, 2 / 3, 3 / 4, 1.0)
-        candidates = sorted(
-            {1, baseline} | {max(1, round(baseline * f)) for f in fractions}
-        )
-        original = torch.get_num_threads()
-        best_threads, best_time = original, float("inf")
-        try:
-            for n_threads in candidates:
-                torch.set_num_threads(n_threads)
-                try:
-                    with torch.no_grad():
-                        self.decoder(*self.export_input)  # warmup
-                        t0 = time.perf_counter()
-                        for _ in range(3):
-                            self.decoder(*self.export_input)
-                        elapsed = time.perf_counter() - t0
-                    if elapsed < best_time:
-                        best_threads, best_time = n_threads, elapsed
-                except Exception:
-                    logging.debug("Auto-tune: threads=%d failed, skipping", n_threads)
-                    continue
-        finally:
-            torch.set_num_threads(original)
-        if best_time == float("inf"):
-            logging.warning(
-                "Auto-tune: all candidates %s failed, falling back to %d threads",
-                candidates,
-                baseline,
-            )
-            return baseline
-        logging.info(
-            "Auto-tune calibration threads: tested %s, best=%d (%.1fms/fwd)",
-            candidates,
-            best_threads,
-            best_time / 3 * 1000,
-        )
-        return best_threads
-
-    def _calibrate(
-        self,
-        model,
-        tokenizer,
-        event,
-        user_calibration_data,
-        tok_embedding=None,
-        intermediate_outputs=None,
-        collect_input_samples=False,
-    ):
-        """
-        Calibrate the model using either task-based evaluation or prompt-based inference.
-
-        This method performs Post-Training Quantization (PTQ) calibration by running inference
-        on the model with either:
-        1. Task-based datasets by lm_eval for text-only models in perplexity evaluation
-        2. User-provided prompts for both text-only and multimodal models
-
-        Args:
-            model: The decoder model to calibrate (GraphModule after prepare_pt2e)
-            tokenizer: Tokenizer for encoding text inputs
-            event: Event name for logging (e.g., "prepare_pt2e", "convert_pt2e")
-            tok_embedding: Optional text embedding module (required only for multimodal models)
-            intermediate_outputs: Optional pre-computed embeddings from vision/audio encoder
-                                 (required only for multimodal models)
-        """
-        # Determine if this is a multimodal model
-        is_multimodal = tok_embedding is not None
-
-        # Determine if task-based calibration is requested
-        has_task_calibration = self.control_args.calib_tasks is not None
-
-        # Task-based calibration: Only for text-only LLMs
-        # Multimodal models (VLMs) cannot use task-based evaluation currently.
-        input_samples = []
-        if has_task_calibration and not is_multimodal:
-            result = graph_module_inference(
-                use_kv_cache=self.meta["get_use_kv_cache"],
-                get_example_inputs=self.get_example_inputs,
-                module=model,
-                tokenizer=tokenizer,
-                ar_len=self.meta["get_ar_len"],
-                max_seq_len=self.meta["get_max_context_len"],
-                tasks=self.control_args.calib_tasks,
-                tasks_limit=self.control_args.calib_limit,
-                num_fewshot=self.control_args.calib_num_fewshot,
-                use_i64_token=self.control_args.embedding_quantize is not None,
-                event_name=f"{event}_tasks",
-                seq_mse_candidates=self.config.seq_mse_candidates,
-                collect_input_samples=collect_input_samples,
-            )
-            if result.input_samples:
-                input_samples.extend(result.input_samples)
-
-        # the user's prompt helps calibrate the special tokens.
-        if user_calibration_data:
-            for turn in zip(intermediate_outputs, user_calibration_data):
-                hidden_states, prompt = turn
-                result = graph_module_inference(
-                    use_kv_cache=self.meta["get_use_kv_cache"],
-                    get_example_inputs=self.get_example_inputs,
-                    hidden_states=hidden_states,  # hidden_states for multimodal
-                    module=model,
-                    tok_embedding=tok_embedding,
-                    audio_token_id=self.meta.get("audio_token_id", None),
-                    image_token_id=self.meta.get("image_token_id", None),
-                    tokenizer=tokenizer,
-                    ar_len=self.meta["get_ar_len"],
-                    max_seq_len=self.meta["get_max_context_len"],
-                    prompt=torch.Tensor(prompt).to(torch.long),
-                    use_i64_token=self.control_args.embedding_quantize is not None,
-                    event_name=f"{event}_prompt",
-                    collect_input_samples=collect_input_samples,
-                )
-                if result.input_samples:
-                    input_samples.extend(result.input_samples)
-        return input_samples
-
     @log_info
-    def quantize(self, request: Request, calibration_tokens=None):  # noqa: C901
+    def quantize(self, request: Request):  # noqa: C901
         if self.quant_recipe is None:
             return
 
@@ -690,6 +579,7 @@ def quantize(self, request: Request, calibration_tokens=None):  # noqa: C901
         )
 
         with torch.no_grad():
+            graph_module = None
             self.decoder = torch.export.export(
                 self.decoder, self.export_input, strict=True
             ).module()
@@ -705,6 +595,23 @@ def quantize(self, request: Request, calibration_tokens=None):  # noqa: C901
                     strict=True,
                 ).module()
 
+            if (
+                self.control_args.verbose
+                and self.mode == Mode.CALIBRATE
+                and not self.apply_embedding
+            ):
+                run_lm_eval(
+                    module=self.decoder,
+                    get_example_inputs=self.get_example_inputs,
+                    tokenizer=data.tokenizer,
+                    max_seq_length=self.meta["get_max_context_len"],
+                    tasks=self.control_args.eval_tasks,
+                    use_i64_token=self.control_args.embedding_quantize is not None,
+                    num_fewshot=self.control_args.eval_num_fewshot,
+                    limit=self.control_args.eval_limit,
+                    event_name="export_tasks",
+                )
+
             self.decoder = prepare_pt2e(self.decoder, quantizer)
             if self.apply_embedding:
                 self.tok_embedding = prepare_pt2e(
@@ -712,33 +619,22 @@ def quantize(self, request: Request, calibration_tokens=None):  # noqa: C901
                 )
 
             if self.mode == Mode.CALIBRATE:
-                audio_turns = request.method_data[
-                    AUDIO_ENCODER
-                ].calibration_data.intermediate_outputs
-                vision_turns = request.method_data[
-                    VISION_ENCODER
-                ].calibration_data.intermediate_outputs
-                if audio_turns is None:
-                    audio_turns = [
-                        [] for _ in range(len(data.calibration_data.datasets))
-                    ]
-                if vision_turns is None:
-                    vision_turns = [
-                        [] for _ in range(len(data.calibration_data.datasets))
-                    ]
-                intermediate_outputs = [
-                    [*audio_turn, *vision_turn]
-                    for audio_turn, vision_turn in zip(audio_turns, vision_turns)
-                ]
-                input_samples = self._calibrate(
-                    model=self.decoder,
-                    tokenizer=data.tokenizer,
-                    event="prepare_pt2e",
-                    user_calibration_data=calibration_tokens,
+                calibration_dataloaders = {
+                    AUDIO_ENCODER: request.method_data[
+                        AUDIO_ENCODER
+                    ].calibration_data.intermediate_outputs,
+                    VISION_ENCODER: request.method_data[
+                        VISION_ENCODER
+                    ].calibration_data.intermediate_outputs,
+                    TEXT_DECODER: data.calibration_data.datasets,
+                }
+                PTQStrategy(
+                    inference=self._decoder_inference,
+                    module=self.decoder,
+                    seq_mse_candidates=self.config.seq_mse_candidates,
                     tok_embedding=self.tok_embedding,
-                    intermediate_outputs=intermediate_outputs,
-                    collect_input_samples=self.control_args.quant_recipe_suggestion,
-                )
+                ).quantize(calib_loader=calibration_dataloaders)
+                logging.info("Calibration complete for prepare_pt2e")
             else:
                 # one dummy inference to remove affine observer
                 # error happened in convert_pt2e
@@ -753,39 +649,32 @@ def quantize(self, request: Request, calibration_tokens=None):  # noqa: C901
                 self._quant_recipe_suggestion(
                     graph_module,
                     self.decoder,
-                    input_samples,
+                    calibration_dataloaders[TEXT_DECODER],
                     self.quant_recipe.recipe,
                 )
 
+            # FP32 model used for quant-recipe-suggestion reference; release after use.
+            del graph_module
+            gc.collect()
+
             if self.apply_embedding:
                 self.tok_embedding = convert_pt2e(self.tok_embedding)
 
-            if self.control_args.verbose and self.mode == Mode.CALIBRATE:
-                audio_turns = request.method_data[
-                    AUDIO_ENCODER
-                ].calibration_data.qdq_intermediate_outputs
-                vision_turns = request.method_data[
-                    VISION_ENCODER
-                ].calibration_data.qdq_intermediate_outputs
-                if audio_turns is None:
-                    audio_turns = [
-                        [] for _ in range(len(data.calibration_data.datasets))
-                    ]
-                if vision_turns is None:
-                    vision_turns = [
-                        [] for _ in range(len(data.calibration_data.datasets))
-                    ]
-                qdq_intermediate_outputs = [
-                    [*audio_turn, *vision_turn]
-                    for audio_turn, vision_turn in zip(audio_turns, vision_turns)
-                ]
-                self._calibrate(
-                    model=self.decoder,
+            if (
+                self.control_args.verbose
+                and self.mode == Mode.CALIBRATE
+                and not self.apply_embedding
+            ):
+                run_lm_eval(
+                    module=self.decoder,
+                    get_example_inputs=self.get_example_inputs,
                     tokenizer=data.tokenizer,
-                    event="convert_pt2e",
-                    user_calibration_data=calibration_tokens,
-                    tok_embedding=self.tok_embedding,
-                    intermediate_outputs=qdq_intermediate_outputs,
+                    max_seq_length=self.meta["get_max_context_len"],
+                    tasks=self.control_args.eval_tasks,
+                    use_i64_token=self.control_args.embedding_quantize is not None,
+                    num_fewshot=self.control_args.eval_num_fewshot,
+                    limit=self.control_args.eval_limit,
+                    event_name="convert_pt2e_tasks",
                 )
 
         # setup quantized IO
@@ -822,8 +711,13 @@ def __init__(
             Mode.PREFILL,
             apply_embedding=apply_embedding,
         )
-        self.calibration_prefill = TextDecoder(  # for quantization only
-            control_args, config, Mode.CALIBRATE, apply_embedding=apply_embedding
+        # Full AR sequence with KV cache; used only for quantization.
+        # Scales/zp collected here are propagated to decode and prefill graphs via _encoding_override.
+        self.calibration_prefill = TextDecoder(
+            control_args,
+            config,
+            Mode.CALIBRATE,
+            apply_embedding=apply_embedding,
         )
 
         self.control_args = control_args
@@ -980,149 +874,10 @@ def parameter_override(quantized_node, unquantized_node):
 
         unquantized_model.recompile()
 
-    def _generate_tokens_from_hf(self, model: AutoModel, data, intermediate_outputs):
-        from pytorch_tokenizers.tiktoken import TiktokenTokenizer
-
-        tok_embedding = self.decode.tok_embedding
-        audio_token_id = self.decode.meta.get("audio_token_id")
-        image_token_id = self.decode.meta.get("image_token_id")
-        use_i64_token = self.decode.control_args.embedding_quantize is not None
-        max_seq_len = self.decode.meta["get_max_context_len"]
-        tokenizer = data.tokenizer
-        is_multimodal = all(
-            [
-                tok_embedding,
-                audio_token_id or image_token_id,
-            ]
-        )
-
-        calibration_tokens = []
-        for hidden_states, prompt in zip(
-            intermediate_outputs, data.calibration_data.datasets
-        ):
-            if isinstance(tokenizer, TiktokenTokenizer):
-                token_ids = tokenizer.encode(
-                    prompt, bos=True, eos=False, allowed_special="all"
-                )
-            else:
-                token_ids = tokenizer.encode(prompt, bos=True, eos=False)
-            input_ids = torch.tensor([token_ids], dtype=torch.int64)
-
-            with torch.no_grad():
-                if is_multimodal and hidden_states:
-                    token_dtype = torch.int64 if use_i64_token else torch.int32
-                    text_embeds = tok_embedding(input_ids.to(token_dtype))
-                    merged_embeds = _modality_inputs_merger(
-                        input_ids,
-                        text_embeds,
-                        torch.cat(hidden_states, dim=1),
-                        audio_token_id or image_token_id,
-                    )
-                    generated_ids = model.generate(
-                        inputs_embeds=merged_embeds,
-                        max_new_tokens=max_seq_len - len(token_ids),
-                        eos_token_id=tokenizer.eos_id,
-                        do_sample=False,
-                    )
-                    full_tokens = token_ids + generated_ids[0].tolist()
-                else:
-                    output_ids = model.generate(
-                        input_ids=input_ids,
-                        max_new_tokens=max_seq_len - len(token_ids),
-                        eos_token_id=tokenizer.eos_id,
-                        do_sample=False,
-                    )
-                    full_tokens = output_ids[0].tolist()
-
-            calibration_tokens.append(full_tokens)
-
-        return calibration_tokens
-
-    def _generate_calibration_tokens(self, request: Request):
-        data = request.method_data[TEXT_DECODER]
-        audio_turns = request.method_data[
-            AUDIO_ENCODER
-        ].calibration_data.intermediate_outputs
-        vision_turns = request.method_data[
-            VISION_ENCODER
-        ].calibration_data.intermediate_outputs
-        if audio_turns is None:
-            audio_turns = [[] for _ in range(len(data.calibration_data.datasets))]
-        if vision_turns is None:
-            vision_turns = [[] for _ in range(len(data.calibration_data.datasets))]
-        intermediate_outputs = [
-            [*audio_turn, *vision_turn]
-            for audio_turn, vision_turn in zip(audio_turns, vision_turns)
-        ]
-
-        if self.config.repo_id:
-            if self.control_args.decoder_model == "smolvlm_500m_instruct":
-                hf_model = AutoModelForVision2Seq.from_pretrained(
-                    self.config.repo_id, torch_dtype=torch.float32
-                )
-
-            elif self.control_args.decoder_model == "internvl3_1b":
-                hf_model = AutoModelForImageTextToText.from_pretrained(
-                    self.config.repo_id, torch_dtype=torch.float32
-                )
-
-            elif self.control_args.decoder_model == "granite_speech_3_3-2b":
-                hf_model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                    self.config.repo_id, torch_dtype=torch.float32
-                )
-            else:
-                hf_model = AutoModelForCausalLM.from_pretrained(
-                    self.config.repo_id,
-                )
-            calibration_tokens = self._generate_tokens_from_hf(
-                model=hf_model,
-                data=data,
-                intermediate_outputs=intermediate_outputs,
-            )
-        else:
-            # Auto-tune thread count for the without-cache calibration pass.
-            calib_threads = getattr(self.control_args, "calibration_num_threads", 0)
-            if calib_threads <= 0:
-                calib_threads = self.decode._auto_tune_calibration_threads()
-            original_threads = torch.get_num_threads()
-            torch.set_num_threads(calib_threads)
-            try:
-                calibration_tokens = []
-                for hidden_states, prompt in zip(
-                    intermediate_outputs, data.calibration_data.datasets
-                ):
-                    result = graph_module_inference(
-                        use_kv_cache=self.decode.meta["get_use_kv_cache"],
-                        get_example_inputs=self.decode.get_example_inputs,
-                        hidden_states=hidden_states,
-                        module=self.decode.decoder,
-                        tok_embedding=self.decode.tok_embedding,
-                        image_token_id=self.decode.meta.get("image_token_id", None),
-                        tokenizer=data.tokenizer,
-                        ar_len=self.decode.meta["get_ar_len"],
-                        max_seq_len=self.decode.meta["get_max_context_len"],
-                        prompt=prompt,
-                        use_i64_token=self.decode.control_args.embedding_quantize
-                        is not None,
-                        event_name="generated_user_prompt",
-                    )
-                    calibration_tokens.append(result.token_list)
-            finally:
-                torch.set_num_threads(original_threads)
-
-        return calibration_tokens
-
     def quantize(self, request: Request):
         if request.method_data[TEXT_DECODER].skip_quantize:
             return
-
-        if self.control_args.skip_user_prompt_calibration:
-            calibration_tokens = None
-        else:
-            calibration_tokens = self._generate_calibration_tokens(request)
-        self.calibration_prefill.quantize(
-            request, calibration_tokens=calibration_tokens
-        )
+        self.calibration_prefill.quantize(request)
 
     @log_info
     def compile(self, request: Request):  # noqa: C901
@@ -1157,12 +912,21 @@ def compile(self, request: Request):  # noqa: C901
 
             # Saving Decode QDQ Model EP for SQNR evaluation
             qdq_ep = torch.export.export(
-                self.decode.decoder, self.decode.export_input, strict=True
+                self.calibration_prefill.decoder,
+                self.calibration_prefill.export_input,
+                strict=True,
             )
             qdq_ep_path = f"{self.decode.control_args.artifact}/{DECODE_QDQ_FILENAME}"
             torch.export.save(qdq_ep, qdq_ep_path)
             logging.info(f"QDQ EP saved to {qdq_ep_path}")
 
+            if self.apply_embedding:
+                self._encoding_override(
+                    quantized_model=self.calibration_prefill.tok_embedding,
+                    unquantized_model=self.decode.tok_embedding,
+                    override_kv_cache=False,
+                )
+
             # For hybrid mode, override encoding of prefill model.
             if (
                 self.prefill.decoder is not None
@@ -1328,6 +1092,8 @@ def __init__(
             # metadata
             self.config = config
 
+            self._encoder_inference = EncoderInference()
+
         self.pass_manager_cls = get_qnn_pass_manager_cls()
         self.passes_job = self.pass_manager_cls.get_capture_program_passes()
         self.dep_table = (
@@ -1379,7 +1145,7 @@ def compile(self, request: Request):
             self.dep_table[TagQuantIO] = [SplitGraph]
 
             if not request_data.skip_quantize:
-                fixed_point_type = {"io_type": torch.uint16}
+                fixed_point_type = {"io_type": torch.float32}
 
                 # setup quantized IO
                 self.passes_job[TagQuantIO][QCOM_PASS_ACTIVATE_KEY] = True
@@ -1412,10 +1178,17 @@ def compile(self, request: Request):
 
     def _calibrate(self, model, calibration_datasets):
         outputs = []
-        for turn in calibration_datasets:
-            outputs_each_turn = [model(*data) for data in turn]
-            outputs.append(outputs_each_turn)
-        return outputs
+        for batch in calibration_datasets:
+            outputs_each_batch = [
+                self._encoder_inference.predict_step(model, data)
+                for data in batch["inputs"]
+            ]
+            outputs.append(outputs_each_batch)
+        return DataLoader(
+            ModalityEncoderDataset(outputs),
+            batch_size=1,
+            shuffle=False,
+        )
 
     def quantize(self, request: Request):
         if self.model is None:
@@ -1459,6 +1232,8 @@ def quantize(self, request: Request):
 
 class MultiModalManager(Component):
     def __init__(self, control_args: argparse.Namespace, config: LLMModelConfig):
+        self.control_args = control_args
+        self.config = config
         self.audio_encoder = Modality(
             control_args,
             config,
@@ -1517,12 +1292,20 @@ def compile(
     @log_info
     def quantize(
         self,
-        calibration_data: Dict[str, List[Any]],
+        tokenizer_wrapper: TokenizerWrapper,
         skip_quantize: Dict[str, bool],
-        tokenizer,
         backend,
         soc_model,
     ):
+        data_config = DataConfig.from_args(self.control_args)
+        dataset_builder = DatasetBuilder(
+            data_config=data_config,
+            llm_config=self.config,
+            tokenizer_wrapper=tokenizer_wrapper,
+            attn_mask=self.text_decoder.calibration_prefill.attn_mask,
+        )
+        calibration_data = dataset_builder.build_calib_dataloaders()
+
         quantize_request = Request(
             inspect.currentframe().f_code.co_name,
             {
@@ -1531,7 +1314,7 @@ def quantize(
                         datasets=calibration_data[m]
                     ),
                     skip_quantize=skip_quantize.get(m, False),
-                    tokenizer=tokenizer,
+                    tokenizer=tokenizer_wrapper.tokenizer,
                     backend=backend,
                     soc_model=soc_model,
                 )
diff --git a/examples/qualcomm/oss_scripts/llm_utils/decoder_model_wrapper.py b/examples/qualcomm/oss_scripts/llm_utils/decoder_model_wrapper.py
index f59dc548c44..8dc334baf28 100644
--- a/examples/qualcomm/oss_scripts/llm_utils/decoder_model_wrapper.py
+++ b/examples/qualcomm/oss_scripts/llm_utils/decoder_model_wrapper.py
@@ -38,7 +38,10 @@ def save_config_to_constant_methods(
         # Check for cache_config and its attributes
         cache_config = getattr(generation_config, "cache_config", None)
         if cache_config is not None:
-            max_seq_len = getattr(cache_config, "max_cache_len", None)
+            if isinstance(cache_config, dict):
+                max_seq_len = cache_config.get("max_cache_len", None)
+            else:
+                max_seq_len = getattr(cache_config, "max_cache_len", None)
             if max_seq_len is not None:
                 metadata["get_max_seq_len"] = max_seq_len
 
@@ -115,7 +118,7 @@ def _qnn_attention_mask(
 
     # Simplest and most efficient way to obtain a causal mask
     causal_mask = kv_arange <= reshaped_cache_position
-    atten_mask = torch.full((causal_mask.shape[0], kv_length), torch.tensor(-65504.0))
+    atten_mask = torch.full((causal_mask.shape[0], kv_length), -65504.0)
     atten_mask = atten_mask.masked_fill(causal_mask, 0)
     atten_mask = atten_mask[None, None, :, :].expand(batch_size, -1, -1, -1)
 
@@ -133,7 +136,7 @@ def __init__(self, model):
         logging.info(f"Metadata to be recorded in PTE: {self._metadata}")
         self.exportable_module = TorchExportableModuleForDecoderOnlyLM(
             self.model,
-            max_batch_size=1,
+            batch_size=1,
             max_cache_len=self._metadata.get("get_max_seq_len"),
         )
         self._register_attention_mask_for_4_53(self.exportable_module)
@@ -154,7 +157,9 @@ def get_example_inputs(self):
         return (example_input_ids, example_cache_position)
 
     def forward(self, input_ids: torch.Tensor, cache_position: torch.Tensor):
-        return self.exportable_module(input_ids, cache_position)
+        return self.exportable_module(
+            input_ids=input_ids, cache_position=cache_position
+        )
 
     def get_metadata(self):
         return self._metadata
diff --git a/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py b/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py
index 89277bcaac8..f9d1b8993a3 100644
--- a/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py
+++ b/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py
@@ -171,16 +171,6 @@ def pt2e_calibrate(
         calibration_data,
         tokenizer_path,
     ):
-        try:
-            from executorch.examples.qualcomm.oss_scripts.llm_utils.eval_decoder_model_qnn import (
-                GraphModuleCalibrationWrapper,
-            )
-            from lm_eval.evaluator import simple_evaluate
-        except ImportError:
-            raise ImportError(
-                "Please install the llm eval dependency via examples/models/llama/install_requirements.sh"
-            )
-
         tokenizer = get_tokenizer(tokenizer_path)
         logging.info(
             f"Calibrating with tasks: {calibration_tasks}, limit: {calibration_limit}, calibration_data: {calibration_data}, tokenizer_path: {tokenizer_path}, seq_length: {self.config.max_seq_len}"
@@ -211,6 +201,17 @@ def calibrate_template(
             max_len=calibration_seq_length,
         )
         if calibration_tasks is not None and calibration_limit is not None:
+            # Import lazily so lm_eval is only needed when the user actually uses it.
+            try:
+                from executorch.examples.qualcomm.oss_scripts.llm_utils.eval_decoder_model_qnn import (
+                    GraphModuleCalibrationWrapper,
+                )
+                from lm_eval.evaluator import simple_evaluate
+            except ImportError:
+                raise ImportError(
+                    "Please install the llm eval dependency via examples/models/llama/install_requirements.sh"
+                )
+
             eval_wrapper = GraphModuleCalibrationWrapper(
                 model=self.graph_module,
                 tokenizer=tokenizer,
diff --git a/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py b/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py
index 70641af8fb7..7876a5b54b3 100644
--- a/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py
+++ b/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py
@@ -14,7 +14,6 @@
 
 import torch
 from executorch.backends.qualcomm.export_utils import (
-    get_backend_type,
     QnnConfig,
     setup_common_args_and_variables,
     SimpleADB,
@@ -75,7 +74,7 @@ def compile(args: argparse.Namespace, qnn_config: QnnConfig):  # noqa: C901
             args.calibration_limit,
             args.prompt,
             tokenizer_json_path,
-            get_backend_type(qnn_config.backend),
+            qnn_config.backend,
             qnn_config.soc_model,
         )
 
@@ -158,7 +157,7 @@ def post_process():
             runner="examples/models/llama/llama_main",
         )
         # No pregen inputs, input_list is not required
-        adb.push(inputs=[], input_list="", files=[tokenizer_json_path])
+        adb.push(inputs=[], files=[tokenizer_json_path])
         adb.execute(custom_runner_cmd=runner_cmd)
 
         adb.pull(host_output_path=args.artifact, callback=post_process)
diff --git a/exir/backend/utils.py b/exir/backend/utils.py
index f63f20717d1..9bdba810138 100644
--- a/exir/backend/utils.py
+++ b/exir/backend/utils.py
@@ -390,9 +390,10 @@ def tag_constant_data(edge_program: ExportedProgram) -> None:
                         "If the data is too large and it's not preferred to copy, please tag the "
                         "constant node like node.['no_copy'] = True and they won't be copied."
                     )
-                # tag the data node with the same tag as the last user
+                # Pick a deterministic consumer tag so a constant shared across
+                # partitions is assigned reproducibly across runs.
                 if len(user_tags) > 0:
-                    node.meta["delegation_tag"] = user_tags.pop()
+                    node.meta["delegation_tag"] = min(user_tags)
 
 
 def tag_mutated_buffer(edge_program: ExportedProgram) -> None:
@@ -429,9 +430,10 @@ def tag_mutated_buffer(edge_program: ExportedProgram) -> None:
                     "If the data is too large and it's not preferred to copy, please tag the "
                     "constant node like node.['no_copy'] = True and they won't be copied."
                 )
-            # tag the data node with the same tag as the last user
+            # Pick a deterministic consumer tag so a buffer shared across
+            # partitions is assigned reproducibly across runs.
             if len(user_tags) > 0:
-                node.meta["delegation_tag"] = user_tags.pop()
+                node.meta["delegation_tag"] = min(user_tags)
 
 
 def is_shape_dynamic(node: torch.fx.Node) -> bool:
diff --git a/exir/pass_base.py b/exir/pass_base.py
index 910adf64de9..c657ac53a91 100644
--- a/exir/pass_base.py
+++ b/exir/pass_base.py
@@ -97,6 +97,52 @@ def _unstack_pytree(xs) -> List[PyTree]:  # pyre-ignore
     return pytrees
 
 
+@dataclass(frozen=True)
+class _SymbolicTensorSnapshot:
+    shape: Tuple[Optional[str], ...]
+
+
+def _symbolic_scalar_snapshot(
+    value: Argument,
+) -> Optional[Tuple[str, str]]:
+    if isinstance(value, torch.SymInt):
+        return ("SymInt", str(value))
+    if isinstance(value, torch.SymFloat):
+        return ("SymFloat", str(value))
+    if isinstance(value, torch.SymBool):
+        return ("SymBool", str(value))
+    return None
+
+
+def _leaf_symbolic_snapshot(value: Argument) -> Any:
+    scalar_snapshot = _symbolic_scalar_snapshot(value)
+    if scalar_snapshot is not None:
+        return scalar_snapshot
+
+    if isinstance(value, FakeTensor):
+        dims = []
+        has_symbolic_dim = False
+        for dim in value.shape:
+            dim_snapshot = _symbolic_scalar_snapshot(dim)
+            if dim_snapshot is None:
+                dims.append(None)
+            else:
+                has_symbolic_dim = True
+                dims.append(dim_snapshot[1])
+        if has_symbolic_dim:
+            return _SymbolicTensorSnapshot(tuple(dims))
+
+    return None
+
+
+def _extract_symbolic_snapshot(value: Argument) -> Any:
+    snapshot = pytree.tree_map(_leaf_symbolic_snapshot, value)
+    leaves = pytree.tree_leaves(snapshot)
+    if any(leaf is not None for leaf in leaves):
+        return snapshot
+    return None
+
+
 class NodeMetadata:
     def __init__(self, data: Dict[str, Any]) -> None:
         self.data: Dict[str, Any] = data.copy()
@@ -480,6 +526,50 @@ def __init__(self) -> None:
         self._initialized = True
         self.node_debug_str: Optional[str] = None
 
+    def should_preserve_symbolic_input_metadata(self) -> bool:
+        """Returns whether replay should validate symbolic input preservation.
+
+        Override to ``False`` for passes that intentionally change symbolic
+        input metadata during replay.
+        """
+        return True
+
+    def _capture_symbolic_input_snapshots(
+        self, graph_module: fx.GraphModule
+    ) -> List[Any]:
+        return [
+            _extract_symbolic_snapshot(node.meta.get("val"))
+            for node in graph_module.graph.nodes
+            if node.op == "placeholder"
+        ]
+
+    def _validate_symbolic_input_snapshots(
+        self,
+        graph_module: fx.GraphModule,
+        new_graph_module: fx.GraphModule,
+    ) -> None:
+        if not self.should_preserve_symbolic_input_metadata():
+            return
+
+        symbolic_inputs = self._capture_symbolic_input_snapshots(graph_module)
+        if all(snapshot is None for snapshot in symbolic_inputs):
+            return
+
+        new_symbolic_inputs = self._capture_symbolic_input_snapshots(new_graph_module)
+        for input_index, snapshot in enumerate(symbolic_inputs):
+            if snapshot is None:
+                continue
+            if input_index >= len(new_symbolic_inputs):
+                raise ExportPassBaseError(
+                    f"Input at position {input_index} did not preserve symbolic metadata across pass replay."
+                )
+
+            current_snapshot = new_symbolic_inputs[input_index]
+            if current_snapshot != snapshot:
+                raise ExportPassBaseError(
+                    f"Input at position {input_index} did not preserve symbolic metadata across pass replay."
+                )
+
     def _fx(
         self,
         kind: str,
@@ -691,6 +781,7 @@ def call_submodule(
             interpreter.run(*inputs_data)
 
         new_graph_module = torch.fx.GraphModule(self.tracer.root, self.tracer.graph)
+        self._validate_symbolic_input_snapshots(graph_module, new_graph_module)
 
         # Preserve GraphModule-level metadata from the input module.
         new_graph_module.meta = graph_module.meta.copy()
diff --git a/exir/tensor.py b/exir/tensor.py
index fa1287fbd85..a4e480ffce0 100644
--- a/exir/tensor.py
+++ b/exir/tensor.py
@@ -10,8 +10,6 @@
 # pyre-ignore-all-errors[16]
 from __future__ import annotations
 
-import copy
-
 import math
 import typing
 from typing import Dict, List, NamedTuple, Optional, Tuple, Union
@@ -112,7 +110,7 @@ def stride_from_dim_order(sizes: List[int], dim_order: List[int]) -> List[int]:
     """
     if len(sizes) == 0:
         return []
-    strides = copy.deepcopy(sizes)
+    strides = list(sizes)
     ndim = len(sizes)
     strides[dim_order[ndim - 1]] = 1
     for i in range(ndim - 2, -1, -1):
diff --git a/exir/tests/test_pass_infra.py b/exir/tests/test_pass_infra.py
index 7df6b76b93a..59406b13f8f 100644
--- a/exir/tests/test_pass_infra.py
+++ b/exir/tests/test_pass_infra.py
@@ -15,7 +15,9 @@
 from executorch.exir.pass_base import (
     ExportedProgramPassBase,
     ExportedProgramPassResult,
+    ExportPass,
     ExportPassBaseError,
+    NodeMetadata,
     ProxyValue,
 )
 from executorch.exir.pass_manager import ExportedProgramPassManager, PassManager
@@ -449,3 +451,109 @@ def f(x: torch.Tensor) -> torch.Tensor:
 
         with self.assertRaisesRegex(Exception, "call_method"):
             pm(exported_program)
+
+
+class TestPassBaseSymbolicInputs(unittest.TestCase):
+    class SymSizeModule(torch.nn.Module):
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            return x.view(x.size(0), -1)
+
+    @staticmethod
+    def _find_input_node(gm: torch.fx.GraphModule) -> torch.fx.Node:
+        for node in gm.graph.nodes:
+            if node.op == "placeholder" and "val" in node.meta:
+                return node
+        raise AssertionError("Expected to find an input placeholder")
+
+    @staticmethod
+    def _symbolic_input_shape(node: torch.fx.Node) -> tuple[str | None, ...]:
+        value = node.meta["val"]
+        assert isinstance(value, torch.Tensor)
+        return tuple(
+            str(dim) if isinstance(dim, torch.SymInt) else None for dim in value.shape
+        )
+
+    def _export_dynamic_graph_module(self) -> torch.fx.GraphModule:
+        exported = export(
+            self.SymSizeModule(),
+            (torch.randn(2, 3),),
+            dynamic_shapes=({0: Dim("batch", min=1, max=8)},),
+            strict=True,
+        )
+        return to_edge(exported).exported_program().graph_module
+
+    def test_export_pass_preserves_symbolic_input_metadata(self) -> None:
+        graph_module = self._export_dynamic_graph_module()
+        original_input = self._find_input_node(graph_module)
+        original_snapshot = self._symbolic_input_shape(original_input)
+        self.assertTrue(any(dim is not None for dim in original_snapshot))
+
+        new_graph_module = ExportPass()(graph_module).graph_module
+        new_input = self._find_input_node(new_graph_module)
+
+        self.assertEqual(self._symbolic_input_shape(new_input), original_snapshot)
+
+    def test_export_pass_matches_symbolic_inputs_by_position(self) -> None:
+        class RenamePlaceholderPass(ExportPass):
+            def placeholder(
+                self,
+                name: str,
+                arg: torch.Tensor,
+                meta: NodeMetadata,
+            ) -> ProxyValue:
+                return super().placeholder(f"renamed_{name}", arg, meta)
+
+        new_graph_module = RenamePlaceholderPass()(
+            self._export_dynamic_graph_module()
+        ).graph_module
+        new_input = self._find_input_node(new_graph_module)
+
+        self.assertEqual(new_input.name, "renamed_x")
+        self.assertTrue(
+            any(dim is not None for dim in self._symbolic_input_shape(new_input))
+        )
+
+    def test_export_pass_rejects_collapsed_symbolic_input_metadata(self) -> None:
+        class CollapseSymbolicInputPass(ExportPass):
+            def placeholder(
+                self,
+                name: str,
+                arg: torch.Tensor,
+                meta: NodeMetadata,
+            ) -> ProxyValue:
+                proxy = super().placeholder(name, arg, meta)
+                if any(isinstance(dim, torch.SymInt) for dim in arg.shape):
+                    proxy.node.meta["val"] = torch.empty(2, 3, device="meta")
+                return proxy
+
+        with self.assertRaisesRegex(
+            ExportPassBaseError,
+            "Input at position 0 did not preserve symbolic metadata",
+        ):
+            CollapseSymbolicInputPass()(self._export_dynamic_graph_module())
+
+    def test_export_pass_can_disable_symbolic_input_validation(self) -> None:
+        class CollapseSymbolicInputPass(ExportPass):
+            def should_preserve_symbolic_input_metadata(self) -> bool:
+                return False
+
+            def placeholder(
+                self,
+                name: str,
+                arg: torch.Tensor,
+                meta: NodeMetadata,
+            ) -> ProxyValue:
+                proxy = super().placeholder(name, arg, meta)
+                if any(isinstance(dim, torch.SymInt) for dim in arg.shape):
+                    proxy.node.meta["val"] = torch.empty(2, 3, device="meta")
+                return proxy
+
+        graph_module = self._export_dynamic_graph_module()
+        original_snapshot = self._symbolic_input_shape(
+            self._find_input_node(graph_module)
+        )
+
+        new_graph_module = CollapseSymbolicInputPass()(graph_module).graph_module
+        new_input = self._find_input_node(new_graph_module)
+
+        self.assertNotEqual(self._symbolic_input_shape(new_input), original_snapshot)
diff --git a/exir/tests/test_tensor.py b/exir/tests/test_tensor.py
index 25bf2ea451e..6435ca98a13 100644
--- a/exir/tests/test_tensor.py
+++ b/exir/tests/test_tensor.py
@@ -388,6 +388,26 @@ def test_strides_from_dim_order(self) -> None:
         strides = stride_from_dim_order(sizes, dim_order)
         self.assertEqual(expected_strides, strides)
 
+    def test_strides_from_dim_order_with_symbolic_sizes(self) -> None:
+        class ViewModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return x.view(x.shape[0], -1)
+
+        exported_program = torch.export.export(
+            ViewModule(),
+            (torch.randn(2, 3, 4),),
+            dynamic_shapes={"x": {0: torch.export.Dim("batch", min=1, max=8)}},
+        )
+        placeholder = next(
+            node
+            for node in exported_program.graph_module.graph.nodes
+            if node.op == "placeholder"
+        )
+        sizes = list(placeholder.meta["val"].shape)
+
+        self.assertIsInstance(sizes[0], torch.SymInt)
+        self.assertEqual([12, 4, 1], stride_from_dim_order(sizes, [0, 1, 2]))
+
     def test_num_bytes_from_shape_and_dtype(self) -> None:
         shape = (2, 3, 4)
         self.assertEqual(24, num_bytes_from_shape_and_dtype(shape, torch.int8))
diff --git a/extension/aten_util/make_aten_functor_from_et_functor.h b/extension/aten_util/make_aten_functor_from_et_functor.h
index 8e1c2bf0143..c7a823c26df 100644
--- a/extension/aten_util/make_aten_functor_from_et_functor.h
+++ b/extension/aten_util/make_aten_functor_from_et_functor.h
@@ -67,12 +67,12 @@ struct type_map<torch::executor::Tensor> final {
 
 // Optional.
 template <class T>
-struct type_map<torch::executor::optional<T>> final {
+struct type_map<std::optional<T>> final {
   using type = std::optional<typename type_map<T>::type>;
 };
 
 template <class T>
-struct type_map<torch::executor::optional<T>&> final {
+struct type_map<std::optional<T>&> final {
   using type = std::optional<typename type_map<T>::type>&;
 };
 
@@ -177,7 +177,7 @@ struct type_convert<
                 typename remove_const_ref<AOptional>::type::value_type>> &&
         std::is_same_v<
             typename remove_const_ref<EOptional>::type,
-            torch::executor::optional<
+            std::optional<
                 typename remove_const_ref<EOptional>::type::value_type>>>>
     final {
  public:
diff --git a/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp b/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp
index b76596b9963..3abc84897ce 100644
--- a/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp
+++ b/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp
@@ -32,8 +32,8 @@ Tensor& add_1_out(const Tensor& a, Tensor& out) {
 }
 
 Tensor& add_optional_scalar_out(
-    torch::executor::optional<int64_t> s1,
-    torch::executor::optional<int64_t> s2,
+    std::optional<int64_t> s1,
+    std::optional<int64_t> s2,
     Tensor& out) {
   if (s1.has_value()) {
     out.mutable_data_ptr<int64_t>()[0] += s1.value();
@@ -45,8 +45,8 @@ Tensor& add_optional_scalar_out(
 }
 
 Tensor& add_optional_tensor_out(
-    torch::executor::optional<torch::executor::Tensor> s1,
-    torch::executor::optional<torch::executor::Tensor> s2,
+    std::optional<torch::executor::Tensor> s1,
+    std::optional<torch::executor::Tensor> s2,
     Tensor& out) {
   if (s1.has_value()) {
     out.mutable_data_ptr<int64_t>()[0] +=
@@ -78,8 +78,7 @@ Tensor& sum_arrayref_tensor_out(
 }
 
 Tensor& sum_arrayref_optional_tensor_out(
-    torch::executor::ArrayRef<
-        torch::executor::optional<torch::executor::Tensor>> a,
+    torch::executor::ArrayRef<std::optional<torch::executor::Tensor>> a,
     Tensor& out) {
   for (int i = 0; i < a.size(); i++) {
     if (a[i].has_value()) {
@@ -169,20 +168,19 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestTypeMap_Tuple_TensorRef3x) {
 TEST_F(MakeATenFunctorFromETFunctorTest, TestTypeMap_Optionals) {
   // Scalar.
   EXPECT_TRUE((std::is_same<
-               type_map<torch::executor::optional<int64_t>>::type,
+               type_map<std::optional<int64_t>>::type,
                std::optional<int64_t>>::value));
   // Tensor.
+  EXPECT_TRUE((std::is_same<
+               type_map<std::optional<torch::executor::Tensor>>::type,
+               std::optional<at::Tensor>>::value));
+  // ArrayRef.
   EXPECT_TRUE(
       (std::is_same<
-          type_map<torch::executor::optional<torch::executor::Tensor>>::type,
-          std::optional<at::Tensor>>::value));
-  // ArrayRef.
-  EXPECT_TRUE((std::is_same<
-               type_map<torch::executor::optional<
-                   torch::executor::ArrayRef<int64_t>>>::type,
-               std::optional<c10::ArrayRef<int64_t>>>::value));
+          type_map<std::optional<torch::executor::ArrayRef<int64_t>>>::type,
+          std::optional<c10::ArrayRef<int64_t>>>::value));
   EXPECT_TRUE((std::is_same<
-               type_map<torch::executor::optional<
+               type_map<std::optional<
                    torch::executor::ArrayRef<torch::executor::Tensor>>>::type,
                std::optional<c10::ArrayRef<at::Tensor>>>::value));
 }
@@ -198,13 +196,13 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestTypeMap_ArrayRef) {
           type_map<torch::executor::ArrayRef<torch::executor::Tensor>>::type,
           c10::ArrayRef<at::Tensor>>::value));
   // Optionals.
+  EXPECT_TRUE(
+      (std::is_same<
+          type_map<torch::executor::ArrayRef<std::optional<int64_t>>>::type,
+          c10::ArrayRef<std::optional<int64_t>>>::value));
   EXPECT_TRUE((std::is_same<
                type_map<torch::executor::ArrayRef<
-                   torch::executor::optional<int64_t>>>::type,
-               c10::ArrayRef<std::optional<int64_t>>>::value));
-  EXPECT_TRUE((std::is_same<
-               type_map<torch::executor::ArrayRef<
-                   torch::executor::optional<torch::executor::Tensor>>>::type,
+                   std::optional<torch::executor::Tensor>>>::type,
                c10::ArrayRef<std::optional<at::Tensor>>>::value));
 }
 
@@ -253,17 +251,16 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_OptionalScalar) {
   // Convert optional at to et.
   auto optional_at_in = std::optional<int64_t>();
   auto optional_et =
-      type_convert<std::optional<int64_t>, torch::executor::optional<int64_t>>(
+      type_convert<std::optional<int64_t>, std::optional<int64_t>>(
           optional_at_in)
           .call();
   EXPECT_TRUE(
-      (std::is_same<decltype(optional_et), torch::executor::optional<int64_t>>::
-           value));
+      (std::is_same<decltype(optional_et), std::optional<int64_t>>::value));
 
   // Convert optional et to at.
-  auto optional_et_in = torch::executor::optional<int64_t>();
+  auto optional_et_in = std::optional<int64_t>();
   auto optional_at_out =
-      type_convert<torch::executor::optional<int64_t>, std::optional<int64_t>>(
+      type_convert<std::optional<int64_t>, std::optional<int64_t>>(
           optional_et_in)
           .call();
   EXPECT_TRUE(
@@ -273,20 +270,19 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_OptionalScalar) {
 TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_OptionalTensor) {
   // Convert optional at to et.
   auto optional_at_in = std::optional<at::Tensor>();
-  auto optional_et =
-      type_convert<
-          std::optional<at::Tensor>,
-          torch::executor::optional<torch::executor::Tensor>>(optional_at_in)
-          .call();
+  auto optional_et = type_convert<
+                         std::optional<at::Tensor>,
+                         std::optional<torch::executor::Tensor>>(optional_at_in)
+                         .call();
   EXPECT_TRUE((std::is_same<
                decltype(optional_et),
-               torch::executor::optional<torch::executor::Tensor>>::value));
+               std::optional<torch::executor::Tensor>>::value));
 
   // Convert optional et to at.
   torch::executor::testing::TensorFactory<ScalarType::Int> tf;
-  auto et_in = torch::executor::optional<torch::executor::Tensor>(tf.ones({3}));
+  auto et_in = std::optional<torch::executor::Tensor>(tf.ones({3}));
   auto optional_at_out = type_convert<
-                             torch::executor::optional<torch::executor::Tensor>,
+                             std::optional<torch::executor::Tensor>,
                              std::optional<at::Tensor>>(optional_et)
                              .call();
   EXPECT_TRUE(
@@ -519,9 +515,8 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
   const std::optional<int64_t> const_optional_at_in =
       std::optional<int64_t>(42);
   auto const_optional_et =
-      type_convert<
-          const std::optional<int64_t>,
-          torch::executor::optional<int64_t>>(const_optional_at_in)
+      type_convert<const std::optional<int64_t>, std::optional<int64_t>>(
+          const_optional_at_in)
           .call();
   EXPECT_TRUE(const_optional_et.has_value());
   EXPECT_EQ(const_optional_et.value(), 42);
@@ -529,7 +524,7 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
   // Test optional scalar reference conversion
   std::optional<int64_t> optional_at_ref_in = std::optional<int64_t>(24);
   auto optional_et_from_ref =
-      type_convert<std::optional<int64_t>&, torch::executor::optional<int64_t>>(
+      type_convert<std::optional<int64_t>&, std::optional<int64_t>>(
           optional_at_ref_in)
           .call();
   EXPECT_TRUE(optional_et_from_ref.has_value());
@@ -539,9 +534,8 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
   const std::optional<int64_t> const_optional_at_ref_in =
       std::optional<int64_t>(84);
   auto const_optional_et_from_ref =
-      type_convert<
-          const std::optional<int64_t>&,
-          torch::executor::optional<int64_t>>(const_optional_at_ref_in)
+      type_convert<const std::optional<int64_t>&, std::optional<int64_t>>(
+          const_optional_at_ref_in)
           .call();
   EXPECT_TRUE(const_optional_et_from_ref.has_value());
   EXPECT_EQ(const_optional_et_from_ref.value(), 84);
@@ -551,8 +545,7 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
       std::optional<at::Tensor>(torch::tensor({5}));
   auto const_optional_tensor_converter = type_convert<
       const std::optional<at::Tensor>,
-      torch::executor::optional<torch::executor::Tensor>>(
-      const_optional_tensor_at_in);
+      std::optional<torch::executor::Tensor>>(const_optional_tensor_at_in);
   auto const_optional_tensor_et = const_optional_tensor_converter.call();
   EXPECT_TRUE(const_optional_tensor_et.has_value());
   EXPECT_EQ(const_optional_tensor_et.value().const_data_ptr<int64_t>()[0], 5);
@@ -562,8 +555,7 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
       std::optional<at::Tensor>(torch::tensor({7}));
   auto optional_tensor_converter_from_ref = type_convert<
       std::optional<at::Tensor>&,
-      torch::executor::optional<torch::executor::Tensor>>(
-      optional_tensor_at_ref_in);
+      std::optional<torch::executor::Tensor>>(optional_tensor_at_ref_in);
   auto optional_tensor_et_from_ref = optional_tensor_converter_from_ref.call();
   EXPECT_TRUE(optional_tensor_et_from_ref.has_value());
   EXPECT_EQ(
@@ -574,8 +566,7 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
       std::optional<at::Tensor>(torch::tensor({9}));
   auto const_optional_tensor_converter_from_ref = type_convert<
       const std::optional<at::Tensor>&,
-      torch::executor::optional<torch::executor::Tensor>>(
-      const_optional_tensor_at_ref_in);
+      std::optional<torch::executor::Tensor>>(const_optional_tensor_at_ref_in);
   auto const_optional_tensor_et_from_ref =
       const_optional_tensor_converter_from_ref.call();
   EXPECT_TRUE(const_optional_tensor_et_from_ref.has_value());
@@ -586,9 +577,8 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
   // Test empty const optional conversions
   const std::optional<int64_t> empty_const_optional_at_in = std::nullopt;
   auto empty_const_optional_et =
-      type_convert<
-          const std::optional<int64_t>,
-          torch::executor::optional<int64_t>>(empty_const_optional_at_in)
+      type_convert<const std::optional<int64_t>, std::optional<int64_t>>(
+          empty_const_optional_at_in)
           .call();
   EXPECT_FALSE(empty_const_optional_et.has_value());
 
@@ -597,7 +587,7 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestConvert_ConstRefOptionals) {
   auto empty_const_optional_tensor_et =
       type_convert<
           const std::optional<at::Tensor>,
-          torch::executor::optional<torch::executor::Tensor>>(
+          std::optional<torch::executor::Tensor>>(
           empty_const_optional_tensor_at_in)
           .call();
   EXPECT_FALSE(empty_const_optional_tensor_et.has_value());
diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp
index bc5c17ef33f..206499b26eb 100644
--- a/extension/data_loader/file_data_loader.cpp
+++ b/extension/data_loader/file_data_loader.cpp
@@ -13,6 +13,7 @@
 #include <cstddef>
 #include <cstring>
 #include <limits>
+#include <new>
 
 #include <executorch/runtime/platform/compat_unistd.h>
 #include <fcntl.h>
@@ -44,7 +45,12 @@ namespace extension {
 
 namespace {
 inline void* et_aligned_alloc(size_t size, std::align_val_t alignment) {
-  return ::operator new(size, alignment);
+  // Use the nothrow form so allocation failure returns nullptr instead of
+  // throwing std::bad_alloc. ExecuTorch is built exception-free and callers
+  // (e.g. FileDataLoader::load) check for nullptr and return
+  // Error::MemoryAllocationFailed; a throw here would unwind with no landing
+  // pad and abort the process.
+  return ::operator new(size, alignment, std::nothrow);
 }
 
 inline void et_aligned_free(void* ptr, std::align_val_t alignment) {
diff --git a/extension/data_loader/test/file_data_loader_test.cpp b/extension/data_loader/test/file_data_loader_test.cpp
index 7dc872995a5..bcf17e4afee 100644
--- a/extension/data_loader/test/file_data_loader_test.cpp
+++ b/extension/data_loader/test/file_data_loader_test.cpp
@@ -8,7 +8,9 @@
 
 #include <executorch/extension/data_loader/file_data_loader.h>
 
+#include <atomic>
 #include <cstring>
+#include <new>
 
 #include <gtest/gtest.h>
 
@@ -25,6 +27,59 @@ using executorch::runtime::Error;
 using executorch::runtime::FreeableBuffer;
 using executorch::runtime::Result;
 
+namespace {
+// When set, the replacement nothrow aligned operator new below returns nullptr,
+// simulating an allocation failure without needing a real OOM.
+std::atomic<bool> g_fail_aligned_nothrow_alloc{false};
+
+// RAII guard to ensure flag is reset even if test asserts early.
+struct FailAllocGuard {
+  FailAllocGuard() {
+    g_fail_aligned_nothrow_alloc.store(true, std::memory_order_relaxed);
+  }
+  ~FailAllocGuard() {
+    g_fail_aligned_nothrow_alloc.store(false, std::memory_order_relaxed);
+  }
+};
+} // namespace
+
+// Detect ASAN: its runtime defines a strong operator new, so skip under it.
+#if defined(__SANITIZE_ADDRESS__) // Set by GCC under -fsanitize=address.
+#define ET_TEST_ASAN_ENABLED 1
+#elif defined(__has_feature) // Probe first; not every compiler provides it.
+#define ET_TEST_ASAN_ENABLED __has_feature(address_sanitizer)
+#else
+#define ET_TEST_ASAN_ENABLED 0
+#endif
+
+#if !ET_TEST_ASAN_ENABLED
+// Replaces the global nothrow aligned allocation function for this test binary
+// so FileDataLoader's segment allocation can be made to fail on demand. When
+// the toggle is off it forwards to the real aligned allocator. We call the
+// throwing aligned new inside a try/catch and convert exceptions to nullptr
+// to emulate nothrow semantics without recursing into this same nothrow
+// overload (calling ::operator new(size, alignment, std::nothrow) here would
+// recurse forever). Memory allocated here is released through the default
+// operator delete, which is not replaced.
+// This is a strong (non-weak) replacement so it reliably overrides libc++'s
+// default on all platforms (a weak definition loses to libc++'s own weak
+// definition on Apple's linker, leaving the override silently unused). Under
+// ASAN this whole block is excluded so it can't clash with ASAN's allocator.
+void* operator new(
+    std::size_t size,
+    std::align_val_t alignment,
+    const std::nothrow_t& /* tag */) noexcept {
+  if (g_fail_aligned_nothrow_alloc.load(std::memory_order_relaxed)) {
+    return nullptr;
+  }
+  try {
+    return ::operator new(size, alignment);
+  } catch (...) {
+    return nullptr;
+  }
+}
+#endif // !ET_TEST_ASAN_ENABLED
+
 class FileDataLoaderTest : public ::testing::TestWithParam<size_t> {
  protected:
   void SetUp() override {
@@ -147,6 +202,46 @@ TEST_P(FileDataLoaderTest, OutOfBoundsLoadFails) {
   }
 }
 
+#if !ET_TEST_ASAN_ENABLED
+TEST_P(FileDataLoaderTest, AllocationFailureDuringLoadReturnsError) {
+  // Create a temp file; contents don't matter.
+  uint8_t data[256] = {};
+  TempFile tf(data, sizeof(data));
+
+  Result<FileDataLoader> fdl =
+      FileDataLoader::from(tf.path().c_str(), alignment());
+  ASSERT_EQ(fdl.error(), Error::Ok);
+
+  // Force the segment allocation inside load() to fail. The loader must surface
+  // Error::MemoryAllocationFailed rather than letting std::bad_alloc escape,
+  // which would abort the process in the exception-free runtime.
+  FailAllocGuard fail_guard;
+  Result<FreeableBuffer> fb = fdl->load(
+      /*offset=*/0,
+      /*size=*/sizeof(data),
+      DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::Program));
+
+  EXPECT_EQ(fb.error(), Error::MemoryAllocationFailed);
+}
+#endif // !ET_TEST_ASAN_ENABLED
+
+#if !ET_TEST_ASAN_ENABLED
+TEST_P(FileDataLoaderTest, AllocationFailureDuringFromReturnsError) {
+  // Create a temp file; contents don't matter.
+  uint8_t data[256] = {};
+  TempFile tf(data, sizeof(data));
+
+  // Force the filename allocation inside from() to fail. FileDataLoader::from
+  // copies the filename using et_aligned_alloc and must return
+  // Error::MemoryAllocationFailed on nullptr rather than throwing.
+  FailAllocGuard fail_guard;
+  Result<FileDataLoader> fdl =
+      FileDataLoader::from(tf.path().c_str(), alignment());
+
+  EXPECT_EQ(fdl.error(), Error::MemoryAllocationFailed);
+}
+#endif // !ET_TEST_ASAN_ENABLED
+
 TEST_P(FileDataLoaderTest, FromMissingFileFails) {
   // Wrapping a file that doesn't exist should fail.
   Result<FileDataLoader> fdl = FileDataLoader::from(
diff --git a/extension/flat_tensor/flat_tensor_data_map.cpp b/extension/flat_tensor/flat_tensor_data_map.cpp
index 845778f45c2..342e29e63fc 100644
--- a/extension/flat_tensor/flat_tensor_data_map.cpp
+++ b/extension/flat_tensor/flat_tensor_data_map.cpp
@@ -49,7 +49,7 @@ bool is_aligned(const void* data) {
 }
 
 Result<const flat_tensor_flatbuffer::NamedData*> get_named_data(
-    executorch::aten::string_view key,
+    std::string_view key,
     const flatbuffers::Vector<
         flatbuffers::Offset<flat_tensor_flatbuffer::NamedData>>* named_data,
     const flatbuffers::Vector<
@@ -127,7 +127,7 @@ Result<const TensorLayout> create_tensor_layout(
 } // namespace
 
 ET_NODISCARD Result<const TensorLayout> FlatTensorDataMap::get_tensor_layout(
-    executorch::aten::string_view key) const {
+    std::string_view key) const {
   Result<uint64_t> segment_end_offset = get_segment_end_offset(header_);
   if (!segment_end_offset.ok()) {
     return segment_end_offset.error();
@@ -144,7 +144,7 @@ ET_NODISCARD Result<const TensorLayout> FlatTensorDataMap::get_tensor_layout(
 }
 
 ET_NODISCARD Result<FreeableBuffer> FlatTensorDataMap::get_data(
-    executorch::aten::string_view key) const {
+    std::string_view key) const {
   Result<uint64_t> segment_end_offset = get_segment_end_offset(header_);
   if (!segment_end_offset.ok()) {
     return segment_end_offset.error();
@@ -170,7 +170,7 @@ ET_NODISCARD Result<FreeableBuffer> FlatTensorDataMap::get_data(
 }
 
 ET_NODISCARD Error FlatTensorDataMap::load_data_into(
-    ET_UNUSED executorch::aten::string_view key,
+    ET_UNUSED std::string_view key,
     ET_UNUSED void* buffer,
     ET_UNUSED size_t size) const {
   Result<uint64_t> segment_end_offset = get_segment_end_offset(header_);
diff --git a/extension/flat_tensor/flat_tensor_data_map.h b/extension/flat_tensor/flat_tensor_data_map.h
index 751e312f7ef..7b66eeab470 100644
--- a/extension/flat_tensor/flat_tensor_data_map.h
+++ b/extension/flat_tensor/flat_tensor_data_map.h
@@ -54,7 +54,7 @@ class FlatTensorDataMap final
   ET_NODISCARD
   executorch::runtime::Result<
       const executorch::ET_RUNTIME_NAMESPACE::TensorLayout>
-  get_tensor_layout(executorch::aten::string_view key) const override;
+  get_tensor_layout(std::string_view key) const override;
 
   /**
    * Retrieve read-only data for the specified key.
@@ -65,7 +65,7 @@ class FlatTensorDataMap final
    */
   ET_NODISCARD
   executorch::runtime::Result<executorch::runtime::FreeableBuffer> get_data(
-      executorch::aten::string_view key) const override;
+      std::string_view key) const override;
 
   /**
    * Loads the data of the specified tensor into the provided buffer.
@@ -78,7 +78,7 @@ class FlatTensorDataMap final
    * @returns an Error indicating if the load was successful.
    */
   ET_NODISCARD executorch::runtime::Error load_data_into(
-      executorch::aten::string_view key,
+      std::string_view key,
       void* buffer,
       size_t size) const override;
 
diff --git a/extension/llm/modules/turboquant/kv_cache.py b/extension/llm/modules/turboquant/kv_cache.py
index 12c01721a15..684f763b44e 100644
--- a/extension/llm/modules/turboquant/kv_cache.py
+++ b/extension/llm/modules/turboquant/kv_cache.py
@@ -158,9 +158,13 @@ def update(self, input_pos, k_val, v_val):
         k_packed, k_norms = self._compress(k_val)
         v_packed, v_norms = self._compress(v_val)
 
-        self.k_packed[:, :, input_pos] = k_packed
-        self.k_norms[:, :, input_pos] = k_norms
-        self.v_packed[:, :, input_pos] = v_packed
-        self.v_norms[:, :, input_pos] = v_norms
+        # index_copy_ (not self.x[:, :, input_pos] = ...) keeps the decode
+        # write CUDA-graph-capturable: a static scatter along the position
+        # dim, matching the model's flat global KV cache. Plain index
+        # assignment lowers to index_put_, which breaks cuda_graph capture.
+        self.k_packed.index_copy_(2, input_pos, k_packed)
+        self.k_norms.index_copy_(2, input_pos, k_norms)
+        self.v_packed.index_copy_(2, input_pos, v_packed)
+        self.v_norms.index_copy_(2, input_pos, v_norms)
 
         return self.k_packed, self.k_norms, self.v_packed, self.v_norms
diff --git a/extension/named_data_map/merged_data_map.cpp b/extension/named_data_map/merged_data_map.cpp
index 630395e006c..d76f741fbf4 100644
--- a/extension/named_data_map/merged_data_map.cpp
+++ b/extension/named_data_map/merged_data_map.cpp
@@ -12,13 +12,13 @@
 #include <unordered_map>
 #include <vector>
 
-using executorch::aten::string_view;
 using executorch::ET_RUNTIME_NAMESPACE::NamedDataMap;
 using executorch::ET_RUNTIME_NAMESPACE::TensorLayout;
 using executorch::runtime::Error;
 using executorch::runtime::FreeableBuffer;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
+using std::string_view;
 
 namespace executorch::extension {
 namespace ET_MERGED_DATA_MAP_NAMESPACE {
diff --git a/extension/named_data_map/merged_data_map.h b/extension/named_data_map/merged_data_map.h
index 42490ec3d58..cc291b4d093 100644
--- a/extension/named_data_map/merged_data_map.h
+++ b/extension/named_data_map/merged_data_map.h
@@ -48,7 +48,7 @@ class MergedDataMap final
   ET_NODISCARD
   executorch::runtime::Result<
       const executorch::ET_RUNTIME_NAMESPACE::TensorLayout>
-  get_tensor_layout(executorch::aten::string_view key) const override;
+  get_tensor_layout(std::string_view key) const override;
 
   /**
    * Retrieve read-only data for the specified key.
@@ -59,7 +59,7 @@ class MergedDataMap final
    */
   ET_NODISCARD
   executorch::runtime::Result<executorch::runtime::FreeableBuffer> get_data(
-      executorch::aten::string_view key) const override;
+      std::string_view key) const override;
 
   /**
    * Loads the data of the specified tensor into the provided buffer.
@@ -72,7 +72,7 @@ class MergedDataMap final
    * @returns an Error indicating if the load was successful.
    */
   ET_NODISCARD executorch::runtime::Error load_data_into(
-      executorch::aten::string_view key,
+      std::string_view key,
       void* buffer,
       size_t size) const override;
 
diff --git a/kernels/portable/cpu/op_index_put.cpp b/kernels/portable/cpu/op_index_put.cpp
index 812d3e8fab3..519842db598 100644
--- a/kernels/portable/cpu/op_index_put.cpp
+++ b/kernels/portable/cpu/op_index_put.cpp
@@ -19,8 +19,7 @@ namespace executor {
 namespace native {
 
 using Tensor = executorch::aten::Tensor;
-using TensorOptList =
-    executorch::aten::ArrayRef<executorch::aten::optional<Tensor>>;
+using TensorOptList = executorch::aten::ArrayRef<std::optional<Tensor>>;
 
 Tensor& index_put_out(
     KernelRuntimeContext& ctx,
diff --git a/kernels/portable/cpu/op_log_softmax.cpp b/kernels/portable/cpu/op_log_softmax.cpp
index 1fa7a903e7f..a2e1afeff32 100644
--- a/kernels/portable/cpu/op_log_softmax.cpp
+++ b/kernels/portable/cpu/op_log_softmax.cpp
@@ -70,7 +70,7 @@ Tensor& log_softmax_out(
                   size,
                   stride);
 
-              ACC temp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
+              const ACC exp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
                   [max_in](const CTYPE val_in) {
                     return std::exp(
                         static_cast<ACC>(val_in) - static_cast<ACC>(max_in));
@@ -81,13 +81,13 @@ Tensor& log_softmax_out(
                   in_data + base,
                   size,
                   stride);
-              temp_sum = std::log(temp_sum);
+              const ACC log_sum = std::log(exp_sum);
 
               apply_unary_map_fn(
-                  [max_in, temp_sum](const CTYPE val_in) {
+                  [max_in, log_sum](const CTYPE val_in) {
                     return static_cast<CTYPE>(
                         static_cast<ACC>(val_in) - static_cast<ACC>(max_in) -
-                        temp_sum);
+                        log_sum);
                   },
                   in_data + base,
                   out_data + base,
diff --git a/kernels/portable/cpu/op_native_dropout.cpp b/kernels/portable/cpu/op_native_dropout.cpp
index fae7928568d..dc72fb54599 100644
--- a/kernels/portable/cpu/op_native_dropout.cpp
+++ b/kernels/portable/cpu/op_native_dropout.cpp
@@ -17,7 +17,7 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
     KernelRuntimeContext& ctx,
     const Tensor& input,
     double prob,
-    torch::executor::optional<bool> train,
+    std::optional<bool> train,
     Tensor& out,
     Tensor& mask) {
   std::tuple<Tensor&, Tensor&> ret(out, mask);
diff --git a/kernels/test/op_native_dropout_test.cpp b/kernels/test/op_native_dropout_test.cpp
index 931205f54a5..fec00c87862 100644
--- a/kernels/test/op_native_dropout_test.cpp
+++ b/kernels/test/op_native_dropout_test.cpp
@@ -25,7 +25,7 @@ class OpNativeDropoutTest : public OperatorTest {
   void op_native_dropout_out(
       const Tensor& self,
       double prob,
-      executorch::aten::optional<bool> train,
+      std::optional<bool> train,
       Tensor& out,
       Tensor& mask) {
     torch::executor::aten::native_dropout_outf(
diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h
index f539414aec9..ac4fb9a126e 100644
--- a/runtime/core/exec_aten/exec_aten.h
+++ b/runtime/core/exec_aten/exec_aten.h
@@ -8,6 +8,9 @@
 
 #pragma once
 
+#include <optional>
+#include <string_view>
+
 #include <executorch/runtime/core/error.h> // @manual
 #include <executorch/runtime/core/result.h> // @manual
 #include <executorch/runtime/core/tensor_shape_dynamism.h> // @manual
@@ -183,8 +186,7 @@ using quint2x4 = torch::executor::quint2x4;
 using IntArrayRef = torch::executor::IntArrayRef;
 
 template <typename T>
-using OptionalArrayRef =
-    torch::executor::optional<torch::executor::ArrayRef<T>>;
+using OptionalArrayRef = std::optional<torch::executor::ArrayRef<T>>;
 using OptionalIntArrayRef = OptionalArrayRef<int64_t>;
 
 using torch::executor::compute_numel;
diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h
index f48b50a0786..3e8e36b442e 100644
--- a/runtime/core/exec_aten/util/scalar_type_util.h
+++ b/runtime/core/exec_aten/util/scalar_type_util.h
@@ -51,7 +51,7 @@ using ScalarType = at::ScalarType;
 namespace executorch {
 namespace aten {
 using ScalarType = torch::executor::ScalarType;
-using string_view = torch::executor::string_view;
+using string_view = std::string_view;
 } // namespace aten
 } // namespace executorch
 #endif // USE_ATEN_LIB
diff --git a/runtime/core/memory_allocator.h b/runtime/core/memory_allocator.h
index 001ebd7ac4f..4d8f8da3b4f 100644
--- a/runtime/core/memory_allocator.h
+++ b/runtime/core/memory_allocator.h
@@ -178,6 +178,22 @@ class MemoryAllocator {
     return size_;
   }
 
+  // Returns the number of bytes currently allocated from this allocator. The
+  // default implementation reports the bump cursor's offset from the base
+  // (cur_ - begin_); subclasses backed by a different allocator should override
+  // this to match their own accounting.
+  virtual size_t used_size() const {
+    return static_cast<size_t>(cur_ - begin_);
+  }
+
+  // Returns the number of bytes still available for allocation, not accounting
+  // for any alignment padding a future allocation may require. The default
+  // implementation reports end_ - cur_; subclasses should override to stay
+  // consistent with used_size().
+  virtual size_t free_size() const {
+    return static_cast<size_t>(end_ - cur_);
+  }
+
   // Resets the current pointer to the base address. It does nothing to
   // the contents.
   virtual void reset() {
diff --git a/runtime/core/named_data_map.h b/runtime/core/named_data_map.h
index c6b6aa4bb7b..dbd5b21a66f 100644
--- a/runtime/core/named_data_map.h
+++ b/runtime/core/named_data_map.h
@@ -31,7 +31,7 @@ class NamedDataMap {
    * @return Result containing TensorLayout.
    */
   ET_NODISCARD virtual Result<const TensorLayout> get_tensor_layout(
-      executorch::aten::string_view key) const = 0;
+      std::string_view key) const = 0;
   /**
    * Get data by key.
    *
@@ -39,7 +39,7 @@ class NamedDataMap {
    * @return Result containing a FreeableBuffer.
    */
   ET_NODISCARD virtual Result<FreeableBuffer> get_data(
-      executorch::aten::string_view key) const = 0;
+      std::string_view key) const = 0;
 
   /**
    * Loads data corresponding to the key into the provided buffer.
@@ -51,10 +51,8 @@ class NamedDataMap {
    * `size` bytes of memory.
    * @returns an Error indicating if the load was successful.
    */
-  ET_NODISCARD virtual Error load_data_into(
-      executorch::aten::string_view key,
-      void* buffer,
-      size_t size) const = 0;
+  ET_NODISCARD virtual Error
+  load_data_into(std::string_view key, void* buffer, size_t size) const = 0;
 
   /**
    * Get the number of keys in the NamedDataMap.
diff --git a/runtime/core/portable_type/optional.h b/runtime/core/portable_type/optional.h
index 31ad06fd093..deff1f1b2cc 100644
--- a/runtime/core/portable_type/optional.h
+++ b/runtime/core/portable_type/optional.h
@@ -10,16 +10,16 @@
 
 #include <optional>
 
+#include <executorch/runtime/platform/compiler.h>
+
 namespace executorch {
 namespace runtime {
 namespace etensor {
 
-// NOLINTNEXTLINE(misc-unused-using-decls)
-using std::nullopt;
-// NOLINTNEXTLINE(misc-unused-using-decls)
-using std::nullopt_t;
-// NOLINTNEXTLINE(misc-unused-using-decls)
-using std::optional;
+template <typename T>
+using optional ET_DEPRECATED = std::optional<T>;
+using nullopt_t ET_DEPRECATED = std::nullopt_t;
+ET_DEPRECATED inline constexpr std::nullopt_t nullopt{std::nullopt};
 
 } // namespace etensor
 } // namespace runtime
@@ -29,8 +29,9 @@ namespace torch {
 namespace executor {
 // TODO(T197294990): Remove these deprecated aliases once all users have moved
 // to the new `::executorch` namespaces.
-using ::executorch::runtime::etensor::nullopt;
-using ::executorch::runtime::etensor::nullopt_t;
-using ::executorch::runtime::etensor::optional;
+template <typename T>
+using optional ET_DEPRECATED = std::optional<T>;
+using nullopt_t ET_DEPRECATED = std::nullopt_t;
+ET_DEPRECATED inline constexpr std::nullopt_t nullopt{std::nullopt};
 } // namespace executor
 } // namespace torch
diff --git a/runtime/core/portable_type/string_view.h b/runtime/core/portable_type/string_view.h
index 8e28fa022cc..f1f25f0b881 100644
--- a/runtime/core/portable_type/string_view.h
+++ b/runtime/core/portable_type/string_view.h
@@ -10,11 +10,13 @@
 
 #include <string_view>
 
+#include <executorch/runtime/platform/compiler.h>
+
 namespace executorch {
 namespace runtime {
 namespace etensor {
 
-using std::string_view;
+using string_view ET_DEPRECATED = std::string_view;
 
 } // namespace etensor
 } // namespace runtime
@@ -24,6 +26,6 @@ namespace torch {
 namespace executor {
 // TODO(T197294990): Remove these deprecated aliases once all users have moved
 // to the new `::executorch` namespaces.
-using ::executorch::runtime::etensor::string_view;
+using string_view ET_DEPRECATED = std::string_view;
 } // namespace executor
 } // namespace torch
diff --git a/runtime/core/portable_type/test/CMakeLists.txt b/runtime/core/portable_type/test/CMakeLists.txt
index b1f57a93ab5..15fda045875 100644
--- a/runtime/core/portable_type/test/CMakeLists.txt
+++ b/runtime/core/portable_type/test/CMakeLists.txt
@@ -19,14 +19,8 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..)
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
-set(_test_srcs
-    bfloat16_test.cpp
-    dont_shadow_complex_test.c
-    half_test.cpp
-    optional_test.cpp
-    scalar_test.cpp
-    tensor_impl_test.cpp
-    tensor_test.cpp
+set(_test_srcs bfloat16_test.cpp dont_shadow_complex_test.c half_test.cpp
+               scalar_test.cpp tensor_impl_test.cpp tensor_test.cpp
 )
 
 et_cxx_test(runtime_core_portable_type_test SOURCES ${_test_srcs} EXTRA_LIBS)
diff --git a/runtime/core/portable_type/test/targets.bzl b/runtime/core/portable_type/test/targets.bzl
index a6671d7d400..ce831f89327 100644
--- a/runtime/core/portable_type/test/targets.bzl
+++ b/runtime/core/portable_type/test/targets.bzl
@@ -15,14 +15,6 @@ def define_common_targets():
         ],
     )
 
-    runtime.cxx_test(
-        name = "optional_test",
-        srcs = ["optional_test.cpp"],
-        deps = [
-            "//executorch/runtime/core/portable_type:portable_type",
-        ],
-    )
-
     runtime.cxx_test(
         name = "tensor_test",
         srcs = ["tensor_test.cpp"],
diff --git a/runtime/core/test/memory_allocator_test.cpp b/runtime/core/test/memory_allocator_test.cpp
index fee95a6407e..922ec9a828c 100644
--- a/runtime/core/test/memory_allocator_test.cpp
+++ b/runtime/core/test/memory_allocator_test.cpp
@@ -52,6 +52,89 @@ TEST_F(MemoryAllocatorTest, MemoryAllocator) {
   ASSERT_NE(nullptr, allocator.allocate(16));
 }
 
+TEST_F(MemoryAllocatorTest, UsedAndFreeSize) {
+  constexpr size_t mem_size = 64;
+  std::array<uint8_t, mem_size> mem_pool{};
+  MemoryAllocator allocator(mem_size, mem_pool.data());
+
+  EXPECT_EQ(allocator.used_size(), 0u);
+  EXPECT_EQ(allocator.free_size(), mem_size);
+
+  void* p1 = allocator.allocate(8, /*alignment=*/8);
+  ASSERT_NE(p1, nullptr);
+  // Independently derive the expected bump offset from the returned block, so
+  // the free_size() check is a real comparison and not the identity
+  // free_size() == size() - used_size().
+  const size_t expected_used1 = static_cast<size_t>(
+      static_cast<uint8_t*>(p1) + 8 - allocator.base_address());
+  EXPECT_EQ(allocator.used_size(), expected_used1);
+  EXPECT_EQ(allocator.free_size(), mem_size - expected_used1);
+
+  void* p2 = allocator.allocate(8, /*alignment=*/8);
+  ASSERT_NE(p2, nullptr);
+  const size_t expected_used2 = static_cast<size_t>(
+      static_cast<uint8_t*>(p2) + 8 - allocator.base_address());
+  EXPECT_GT(expected_used2, expected_used1);
+  EXPECT_EQ(allocator.used_size(), expected_used2);
+  EXPECT_EQ(allocator.free_size(), mem_size - expected_used2);
+
+  allocator.reset();
+  EXPECT_EQ(allocator.used_size(), 0u);
+  EXPECT_EQ(allocator.free_size(), mem_size);
+}
+
+TEST_F(MemoryAllocatorTest, UsedAndFreeSizeZeroCapacity) {
+  MemoryAllocator allocator(0, nullptr);
+  EXPECT_EQ(allocator.used_size(), 0u);
+  EXPECT_EQ(allocator.free_size(), 0u);
+}
+
+namespace {
+// Overrides the accessors with sentinel values to prove base-reference calls
+// dispatch virtually to the override.
+class SentinelAccessorAllocator : public MemoryAllocator {
+ public:
+  using MemoryAllocator::MemoryAllocator;
+  size_t used_size() const override {
+    return 111;
+  }
+  size_t free_size() const override {
+    return 222;
+  }
+};
+} // namespace
+
+TEST_F(MemoryAllocatorTest, UsedAndFreeSizeDispatchVirtually) {
+  std::array<uint8_t, 16> mem_pool{};
+  SentinelAccessorAllocator derived(mem_pool.size(), mem_pool.data());
+  MemoryAllocator& base = derived;
+  EXPECT_EQ(base.used_size(), 111u);
+  EXPECT_EQ(base.free_size(), 222u);
+}
+
+// The base used_size()/free_size() report the true bump-cursor offset
+// (cur_ - begin_): the end of the last block relative to base_address(),
+// including the padding inserted before an aligned allocation. That matches the
+// deleted EspMemoryAllocator's formula (end_block - base_address) exactly. The
+// deleted ArmMemoryAllocator tracker used a different formula that skipped this
+// inter-allocation padding -- for the sequence below it would have reported 17,
+// not 32 -- so dropping it intentionally makes Arm report this corrected value.
+TEST_F(MemoryAllocatorTest, UsedAndFreeSizeAcrossAlignmentPadding) {
+  constexpr size_t mem_size = 128;
+  std::array<uint8_t, mem_size> mem_pool{};
+  MemoryAllocator allocator(mem_size, mem_pool.data());
+
+  // 1 byte + 15 padding + 16 bytes = 32 used, if mem_pool is 16-byte aligned.
+  ASSERT_NE(allocator.allocate(1, /*alignment=*/1), nullptr);
+  void* p2 = allocator.allocate(16, /*alignment=*/16);
+  ASSERT_NE(p2, nullptr);
+
+  const size_t expected_used = static_cast<size_t>(
+      static_cast<uint8_t*>(p2) + 16 - allocator.base_address());
+  EXPECT_EQ(allocator.used_size(), expected_used);
+  EXPECT_EQ(allocator.free_size(), allocator.size() - expected_used);
+}
+
 TEST_F(MemoryAllocatorTest, MemoryAllocatorAlignment) {
   constexpr size_t arr_size = 6;
   size_t allocation[arr_size] = {7, 6, 3, 76, 4, 1};
diff --git a/runtime/executor/merged_data_map.h b/runtime/executor/merged_data_map.h
index d5ae97057f2..aae92d90a9b 100644
--- a/runtime/executor/merged_data_map.h
+++ b/runtime/executor/merged_data_map.h
@@ -57,7 +57,7 @@ class MergedDataMap final : public NamedDataMap {
    */
   ET_NODISCARD
   Result<const TensorLayout> get_tensor_layout(
-      executorch::aten::string_view key) const override {
+      std::string_view key) const override {
     auto layout = first_->get_tensor_layout(key);
     if (layout.ok()) {
       return layout.get();
@@ -76,8 +76,7 @@ class MergedDataMap final : public NamedDataMap {
    * @return error if the key is not present or data cannot be loaded.
    */
   ET_NODISCARD
-  Result<FreeableBuffer> get_data(
-      executorch::aten::string_view key) const override {
+  Result<FreeableBuffer> get_data(std::string_view key) const override {
     auto data = first_->get_data(key);
     if (data.error() != Error::NotFound) {
       return data;
@@ -97,7 +96,7 @@ class MergedDataMap final : public NamedDataMap {
    * @returns an Error indicating if the load was successful.
    */
   ET_NODISCARD Error load_data_into(
-      ET_UNUSED executorch::aten::string_view key,
+      ET_UNUSED std::string_view key,
       ET_UNUSED void* buffer,
       ET_UNUSED size_t size) const override {
     return Error::NotImplemented;
diff --git a/runtime/executor/pte_data_map.cpp b/runtime/executor/pte_data_map.cpp
index 881bfd5165a..e35745e7689 100644
--- a/runtime/executor/pte_data_map.cpp
+++ b/runtime/executor/pte_data_map.cpp
@@ -26,8 +26,7 @@ namespace internal {
 }
 
 ET_NODISCARD
-Result<FreeableBuffer> PteDataMap::get_data(
-    executorch::aten::string_view key) const {
+Result<FreeableBuffer> PteDataMap::get_data(std::string_view key) const {
   for (uint32_t i = 0; i < named_data_->size(); i++) {
     const auto* named_data_item = named_data_->Get(i);
     ET_CHECK_OR_RETURN_ERROR(
diff --git a/runtime/executor/pte_data_map.h b/runtime/executor/pte_data_map.h
index b4b46a6b541..36d33ae3945 100644
--- a/runtime/executor/pte_data_map.h
+++ b/runtime/executor/pte_data_map.h
@@ -79,7 +79,7 @@ class PteDataMap final : public NamedDataMap {
    */
   ET_NODISCARD
   Result<const TensorLayout> get_tensor_layout(
-      ET_UNUSED executorch::aten::string_view key) const override {
+      ET_UNUSED std::string_view key) const override {
     return Error::NotImplemented;
   }
 
@@ -91,14 +91,13 @@ class PteDataMap final : public NamedDataMap {
    * @return error if the key is not present or data cannot be loaded.
    */
   ET_NODISCARD
-  Result<FreeableBuffer> get_data(
-      executorch::aten::string_view key) const override;
+  Result<FreeableBuffer> get_data(std::string_view key) const override;
 
   /**
    * The PteDataMap currently does not implement load_into.
    */
   ET_NODISCARD Error load_data_into(
-      ET_UNUSED executorch::aten::string_view key,
+      ET_UNUSED std::string_view key,
       ET_UNUSED void* buffer,
       ET_UNUSED size_t size) const override {
     return Error::NotImplemented;
diff --git a/runtime/executor/test/method_meta_test.cpp b/runtime/executor/test/method_meta_test.cpp
index a1991a0562c..1324a40cf52 100644
--- a/runtime/executor/test/method_meta_test.cpp
+++ b/runtime/executor/test/method_meta_test.cpp
@@ -38,7 +38,7 @@ class TensorInfoTestFriend final {
       Span<const uint8_t> dim_order,
       executorch::aten::ScalarType scalar_type,
       const bool is_memory_planned,
-      executorch::aten::string_view name) {
+      std::string_view name) {
     return TensorInfo::create(
                Span<const int32_t>(sizes.data(), sizes.size()),
                Span<const uint8_t>(dim_order.data(), dim_order.size()),
@@ -236,7 +236,7 @@ TEST_F(MethodMetaTest, TensorInfoSizeOverflow) {
           Span<const uint8_t>(dim_order.data(), dim_order.size()),
           executorch::aten::ScalarType::Float,
           false, // is_memory_planned
-          executorch::aten::string_view{nullptr, 0}),
+          std::string_view{nullptr, 0}),
       "");
 }
 
diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json
index c0877aac924..d7d0cb08567 100644
--- a/test/utils/OSSTestConfig.json
+++ b/test/utils/OSSTestConfig.json
@@ -75,7 +75,6 @@
             "bfloat16_test.cpp",
             "dont_shadow_complex_test.c",
             "half_test.cpp",
-            "optional_test.cpp",
             "scalar_test.cpp",
             "tensor_impl_test.cpp",
             "tensor_test.cpp"