@@ -18,7 +18,9 @@ class RunMetrics:
1818 """Metrics from a single run."""
1919
2020 generated_tokens : int
21+ prompt_tokens : int
2122 tokens_per_sec : float
23+ prefill_tokens_per_sec : float
2224 model_load_time_ms : float
2325 total_inference_time_ms : float
2426 encoder_time_ms : float
@@ -28,7 +30,8 @@ class RunMetrics:
2830 def __repr__ (self ):
2931 return (
3032 f"Tokens: { self .generated_tokens } , "
31- f"Throughput: { self .tokens_per_sec :.2f} t/s, "
33+ f"Prefill: { self .prefill_tokens_per_sec :.2f} t/s ({ self .prompt_tokens } tokens), "
34+ f"Decode: { self .tokens_per_sec :.2f} t/s, "
3235 f"Model load: { self .model_load_time_ms :.0f} ms, "
3336 f"Total inference: { self .total_inference_time_ms :.0f} ms, "
3437 f"Encoder: { self .encoder_time_ms :.0f} ms, "
@@ -49,6 +52,7 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
4952
5053 # Extract values
5154 generated_tokens = data .get ("generated_tokens" , 0 )
55+ prompt_tokens = data .get ("prompt_tokens" , 0 )
5256 inference_start_ms = data .get ("inference_start_ms" , 0 )
5357 inference_end_ms = data .get ("inference_end_ms" , 0 )
5458 prompt_eval_end_ms = data .get ("prompt_eval_end_ms" , 0 )
@@ -72,12 +76,20 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
7276 if generation_time_ms > 0
7377 else 0
7478 )
79+
80+ # Calculate prefill throughput
81+ prefill_tokens_per_sec = (
82+ (prompt_tokens / encoder_time_ms * 1000 ) if encoder_time_ms > 0 else 0
83+ )
84+
7585 model_load_time_ms = model_load_end_ms - model_load_start_ms
7686 first_token_latency_ms = first_token_ms - prompt_eval_end_ms
7787
7888 return RunMetrics (
7989 generated_tokens = generated_tokens ,
90+ prompt_tokens = prompt_tokens ,
8091 tokens_per_sec = tokens_per_sec ,
92+ prefill_tokens_per_sec = prefill_tokens_per_sec ,
8193 model_load_time_ms = model_load_time_ms ,
8294 total_inference_time_ms = total_inference_time_ms ,
8395 encoder_time_ms = encoder_time_ms ,
@@ -505,6 +517,7 @@ class BenchmarkResults:
505517
506518 # Metrics
507519 throughput : MetricStats
520+ prefill_throughput : MetricStats
508521 model_load_time : MetricStats
509522 total_inference_time : MetricStats
510523 encoder_time : MetricStats
@@ -529,6 +542,10 @@ def to_dict(self) -> dict:
529542 "throughput_min" : self .throughput .min_val ,
530543 "throughput_max" : self .throughput .max_val ,
531544 "throughput_stdev" : self .throughput .stdev ,
545+ "prefill_throughput_mean" : self .prefill_throughput .mean ,
546+ "prefill_throughput_min" : self .prefill_throughput .min_val ,
547+ "prefill_throughput_max" : self .prefill_throughput .max_val ,
548+ "prefill_throughput_stdev" : self .prefill_throughput .stdev ,
532549 "model_load_time_mean" : self .model_load_time .mean ,
533550 "model_load_time_min" : self .model_load_time .min_val ,
534551 "model_load_time_max" : self .model_load_time .max_val ,
@@ -601,6 +618,13 @@ def to_v3_format(
601618 runner_type ,
602619 base_extra_info ,
603620 ),
621+ self .prefill_throughput .create_v3_record (
622+ model_name_with_quant ,
623+ backend ,
624+ runner_name ,
625+ runner_type ,
626+ base_extra_info ,
627+ ),
604628 self .model_load_time .create_v3_record (
605629 model_name_with_quant ,
606630 backend ,
@@ -696,6 +720,11 @@ def create_metric_stats(
696720 "t/s" ,
697721 {"trimmed_runs" : len (trimmed_throughput )},
698722 ),
723+ prefill_throughput = create_metric_stats (
724+ "prefill_encoder_throughput(tokens/sec)" ,
725+ [r .prefill_tokens_per_sec for r in results ],
726+ "t/s" ,
727+ ),
699728 model_load_time = create_metric_stats (
700729 "model_load_time(ms)" ,
701730 [r .model_load_time_ms for r in results ],
@@ -740,6 +769,7 @@ def print_summary(summary: BenchmarkResults) -> None:
740769
741770 # Print all metrics using their print_stats method
742771 summary .throughput .print_stats ()
772+ summary .prefill_throughput .print_stats ()
743773 summary .model_load_time .print_stats ()
744774 summary .total_inference_time .print_stats ()
745775 summary .encoder_time .print_stats ()